Myeunomia bpf/bpf developer tutorial (#5)

* Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md
2026-02-03 10:14:44 +08:00 · 2022-12-12 22:09:34 +08:00
parent 8fcb949d70
commit 42cda5db8d
3 changed files with 351 additions and 11 deletions
--- a/10-hardirqs/README.md
+++ b/10-hardirqs/README.md
@@ -1 +1,170 @@
-## eBPF 入门实践教程：
+## eBPF 入门开发实践指南十：在 eBPF 中使用 hardirqs 监测内核硬中断处理程序
+
+eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。
+
+本文是 eBPF 入门开发实践指南的第十篇，介绍如何在 eBPF 中使用 hardirqs 统计内核硬中断处理程序的执行次数和耗时。
+## hardirqs是什么？
+	hardirqs 是 bcc-tools 工具包的一部分，该工具包是一组用于在 Linux 系统上执行系统跟踪和分析的实用程序。
+	hardirqs 是一种用于跟踪和分析 Linux 内核中的中断处理程序的工具。它使用 BPF（Berkeley Packet Filter）程序来收集有关中断处理程序的数据，
+	并可用于识别内核中的性能问题和其他与中断处理相关的问题。
+## 使用方法：
+	sudo hardirqs：该命令会显示有关内核中断处理程序的信息，包括每个处理程序的名称、统计信息和其他相关数据。
+	hardirqs 提供了多种选项，您可以根据需要使用它们来控制 hardirqs 的输出。一些常用的选项包括：
+	-h：显示帮助信息，包括所有可用选项的描述和示例。
+	-p PID：限制输出仅显示指定进程的中断处理程序。
+	-t：在输出中显示时间戳，以毫秒为单位。
+	-d：以持续的方式运行 hardirqs，并在输出中显示中断处理程序的实时数据。
+	-l：在输出中显示中断处理程序的完整路径。
+## 实现原理：
+	在 Linux 内核中，每个中断处理程序都有一个唯一的名称，称为中断向量。hardirqs 通过检查每个中断处理程序的中断向量，来监控内核中的中断处理程序。当内核接收到一个中断时，它会查找与该中断相关的中断处理程序，并执行该程序。hardirqs 通过检查内核中执行的中断处理程序，来监控内核中的中断处理程序。另外，hardirqs 还可以通过注入 BPF 程序到内核中，来捕获内核中的中断处理程序。这样，hardirqs 就可以监控内核中执行的中断处理程序，并收集有关它们的信息。
+## 代码实现：
+```c
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Wenbo Zhang
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "hardirqs.h"
+#include "bits.bpf.h"
+#include "maps.bpf.h"
+
+#define MAX_ENTRIES	256	/* max_entries of the infos hash map below */
+/*
+ * Read-only switches consulted by the handlers in this file (presumably
+ * filled in by the user-space loader before load — confirm against it):
+ *   filter_cg - skip events unless the current task is under cgroup_map[0]
+ *   targ_dist - handle_exit fills the log2 slots instead of summing durations
+ *   targ_ns   - keep durations in nanoseconds instead of dividing by 1000
+ *   do_count  - handle_entry counts invocations instead of storing a timestamp
+ */
+const volatile bool filter_cg = false;
+const volatile bool targ_dist = false;
+const volatile bool targ_ns = false;
+const volatile bool do_count = false;
+/* Single-slot cgroup array; index 0 is the cgroup tested when filter_cg is set. */
+struct {
+	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 1);
+} cgroup_map SEC(".maps");
+/* Per-CPU single-slot array: timestamp written by handle_entry, read by handle_exit. */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, u64);
+} start SEC(".maps");
+/* Statistics keyed by struct irq_key, whose name field the handlers fill from action->name. */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, struct irq_key);
+	__type(value, struct info);
+} infos SEC(".maps");
+/* Zero-filled (static storage) value passed to bpf_map_lookup_or_try_init() for infos. */
+static struct info zero;
+/*
+ * Common body of the irq_handler_entry programs.
+ *
+ * With do_count set, increments the count field of the infos entry whose key
+ * name is read from action->name. Otherwise stores bpf_ktime_get_ns() into
+ * slot 0 of the per-CPU start map for handle_exit to pick up.
+ * The irq argument is unused. Every path returns 0.
+ */
+static int handle_entry(int irq, struct irqaction *action)
+{
+	if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+		return 0;
+	/* Count-only mode: no timestamp is taken. */
+	if (do_count) {
+		struct irq_key key = {};
+		struct info *info;
+		/* The key starts zeroed; the handler name is then copied into it. */
+		bpf_probe_read_kernel_str(&key.name, sizeof(key.name), BPF_CORE_READ(action, name));
+		info = bpf_map_lookup_or_try_init(&infos, &key, &zero);
+		if (!info)
+			return 0;
+		info->count += 1;
+		return 0;
+	} else {
+		u64 ts = bpf_ktime_get_ns();
+		u32 key = 0;
+		/* NOTE(review): same test as at function entry, which already returned — looks redundant. */
+		if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+			return 0;
+		/* BPF_ANY: create or overwrite the slot. */
+		bpf_map_update_elem(&start, &key, &ts, BPF_ANY);
+		return 0;
+	}
+}
+/*
+ * Common body of the irq_handler_exit programs.
+ *
+ * Computes the time elapsed since the timestamp held in slot 0 of the start
+ * map and records it in the infos entry whose key name is read from
+ * action->name: added to count when targ_dist is clear, otherwise counted in
+ * a log2 slot. The irq argument is unused. Every path returns 0.
+ */
+static int handle_exit(int irq, struct irqaction *action)
+{
+	struct irq_key ikey = {};
+	struct info *info;
+	u32 key = 0;
+	u64 delta;
+	u64 *tsp;
+
+	if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+		return 0;
+	/* Slot 0 of the per-CPU start map is where handle_entry stores its timestamp. */
+	tsp = bpf_map_lookup_elem(&start, &key);
+	if (!tsp)
+		return 0;
+	/* Elapsed nanoseconds; divided by 1000 unless targ_ns is set. */
+	delta = bpf_ktime_get_ns() - *tsp;
+	if (!targ_ns)
+		delta /= 1000U;
+
+	bpf_probe_read_kernel_str(&ikey.name, sizeof(ikey.name), BPF_CORE_READ(action, name));
+	info = bpf_map_lookup_or_try_init(&infos, &ikey, &zero);
+	if (!info)
+		return 0;
+	/* Without targ_dist, count accumulates elapsed time rather than a number of events. */
+	if (!targ_dist) {
+		info->count += delta;
+	} else {
+		u64 slot;
+		/* log2() is defined outside this view; the index is clamped to the last slot. */
+		slot = log2(delta);
+		if (slot >= MAX_SLOTS)
+			slot = MAX_SLOTS - 1;
+		info->slots[slot]++;
+	}
+
+	return 0;
+}
+/* Program in section "tp_btf/irq_handler_entry"; forwards its arguments to handle_entry(). */
+SEC("tp_btf/irq_handler_entry")
+int BPF_PROG(irq_handler_entry_btf, int irq, struct irqaction *action)
+{
+	return handle_entry(irq, action);
+}
+/* Program in section "tp_btf/irq_handler_exit"; forwards its arguments to handle_exit(). */
+SEC("tp_btf/irq_handler_exit")
+int BPF_PROG(irq_handler_exit_btf, int irq, struct irqaction *action)
+{
+	return handle_exit(irq, action);
+}
+/* Program in section "raw_tp/irq_handler_entry"; same body as the tp_btf variant. */
+SEC("raw_tp/irq_handler_entry")
+int BPF_PROG(irq_handler_entry, int irq, struct irqaction *action)
+{
+	return handle_entry(irq, action);
+}
+/* Program in section "raw_tp/irq_handler_exit"; same body as the tp_btf variant. */
+SEC("raw_tp/irq_handler_exit")
+int BPF_PROG(irq_handler_exit, int irq, struct irqaction *action)
+{
+	return handle_exit(irq, action);
+}
+/* License string placed in the "license" section of the object. */
+char LICENSE[] SEC("license") = "GPL";
+```
+这是一个 BPF（Berkeley Packet Filter）程序。BPF 程序是可以直接在 Linux 内核中运行的小型程序，最初用于过滤网络数据包，如今也广泛用于跟踪和性能分析。这个程序用于收集内核中硬中断处理程序的统计信息。它定义了三个映射（map，可以在 BPF 程序和用户态程序之间共享的数据结构）和两个函数：handle_entry 和 handle_exit，分别在内核进入和退出中断处理程序时执行。当 do_count 开启时，handle_entry 只统计各中断处理程序被执行的次数；否则它记录进入时的时间戳，再由 handle_exit 计算中断处理程序花费的时间，并根据 targ_dist 选择累加总耗时或更新以 2 为底的对数直方图。
+
+## 运行代码
+
+要编译这个程序，请使用 ecc 工具：
+
+```console
+$ ecc hardirqs.bpf.c
+Compiling bpf object...
+Packing ebpf object and config into package.json...
+```
+
+然后运行：
+
+```console
+sudo ecli package.json
+```
+
--- a/8-exitsnoop/README.md
+++ b/8-exitsnoop/README.md
@@ -1,13 +1,13 @@
 ## eBPF 入门开发实践指南八：在 eBPF 中使用 exitsnoop 监控 进程退出事件：
 ##exitsnoop
+
+
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_core_read.h>
 #include "exitsnoop.h"
-
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
-
 struct {
 	__uint(type, BPF_MAP_TYPE_RINGBUF);
 	__uint(max_entries, 256 * 1024);
@@ -49,7 +49,6 @@ int handle_exit(struct trace_event_raw_sched_process_template* ctx)
 	return 0;
 }

-=======
 这段代码是一个 BPF 程序，用于监控 Linux 系统中的进程退出事件。eBPF（extended Berkeley Packet Filter）是一种可以在内核中安全运行沙箱程序的技术，开发人员无需编写内核模块即可捕获和处理内核事件。
 该程序通过注册一个 tracepoint，来监控进程退出事件。Tracepoint 是一种内核特性，允许内核模块获取特定事件的通知。在本程序中，注册的 tracepoint 是“tp/sched/sched_process_exit”，表示该程序监控的是进程退出事件。
 当系统中发生进程退出事件时，BPF 程序会捕获该事件，并调用“handle_exit”函数来处理它。该函数首先检查当前退出事件是否是进程退出事件（而不是线程退出事件），然后在 BPF 环形缓冲区（“rb”）中保留一个事件结构体，并填充该结构体中的其他信息，例如进程 ID、进程名称、退出代码和退出信号等信息。最后，该函数还会调用 BPF 的“bpf_ringbuf_submit”函数，将捕获的事件通过环形缓冲区提交给用户空间程序。
--- a/9-runqlat/README.md
+++ b/9-runqlat/README.md
@@ -1,13 +1,173 @@
-## eBPF 入门实践教程：
+## eBPF 入门开发实践指南九：一个 Linux 内核 BPF 程序，通过柱状图来总结调度程序运行队列延迟，显示任务等待运行在 CPU 上的时间长度
+eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。

-## origin
+## runqlat是什么？
+	bcc-tools 是一组用于在 Linux 系统上使用 BPF 程序的工具。runqlat 是 bcc-tools 中的一个工具，用于分析 Linux 系统的调度性能。
+	具体来说，runqlat 用于测量一个任务在被调度到 CPU 上运行之前在运行队列中等待的时间。这些信息对于识别性能瓶颈和提高 Linux 内核
+	调度算法的整体效率非常有用。

-origin from:
+## runqlat原理：
+	runqlat 使用内核的原始跟踪点（raw tracepoint）来测量进程在运行队列中的等待时间。当进程被唤醒并排队时，trace_enqueue 函数会在一个映射中记录时间戳。
+	当进程被调度到 CPU 上运行时，handle_switch 函数会检索时间戳，并计算当前时间与排队时间之间的时间差。这个差值（或 delta）
+	然后用于更新进程的直方图，该直方图记录运行队列延迟的分布。该直方图可用于分析 Linux 内核的调度性能。
+## runqlat代码实现
+```c
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Wenbo Zhang
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include "runqlat.h"
+#include "bits.bpf.h"
+#include "maps.bpf.h"
+#include "core_fixes.bpf.h"
+/* MAX_ENTRIES sizes the start and hists maps; TASK_RUNNING is the state value handle_switch compares against. */
+#define MAX_ENTRIES	10240
+#define TASK_RUNNING 	0
+/*
+ * Read-only switches consulted by the code in this file (presumably filled in
+ * by the user-space loader before load — confirm against it):
+ *   filter_cg        - skip events unless the current task is under cgroup_map[0]
+ *   targ_per_process - histogram key is the tgid of the task switched in
+ *   targ_per_thread  - histogram key is the pid of the task switched in
+ *   targ_per_pidns   - histogram key is the value returned by pid_namespace()
+ *   targ_ms          - divide delays by 1000000 instead of 1000
+ *   targ_tgid        - when non-zero, trace_enqueue only records this tgid
+ */
+const volatile bool filter_cg = false;
+const volatile bool targ_per_process = false;
+const volatile bool targ_per_thread = false;
+const volatile bool targ_per_pidns = false;
+const volatile bool targ_ms = false;
+const volatile pid_t targ_tgid = 0;
+/* Single-slot cgroup array; index 0 is the cgroup tested when filter_cg is set. */
+struct {
+	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+	__type(key, u32);
+	__type(value, u32);
+	__uint(max_entries, 1);
+} cgroup_map SEC(".maps");
+/* pid -> timestamp stored by trace_enqueue and consumed by handle_switch. */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, u32);
+	__type(value, u64);
+} start SEC(".maps");
+/* Zero-filled (static storage) value passed to bpf_map_lookup_or_try_init() for hists. */
+static struct hist zero;
+/* Histograms keyed by the u32 chosen in handle_switch. NOTE(review): the "///" line below looks like a tool annotation — keep it verbatim. */
+/// @sample {"interval": 1000, "type" : "log2_hist"}
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, u32);
+	__type(value, struct hist);
+} hists SEC(".maps");
+/*
+ * Store the current bpf_ktime_get_ns() value for pid in the start map.
+ *
+ * Does nothing for pid 0, or when targ_tgid is non-zero and differs from
+ * tgid. Always returns 0.
+ */
+static int trace_enqueue(u32 tgid, u32 pid)
+{
+	u64 ts;
+
+	if (!pid)
+		return 0;
+	if (targ_tgid && targ_tgid != tgid)
+		return 0;
+
+	ts = bpf_ktime_get_ns();
+	bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);	/* BPF_ANY: insert or overwrite */
+	return 0;
+}
+/*
+ * Return the ns.inum value reached from task: task->thread_pid, then the
+ * upid at index thread_pid->level of its numbers array, then that upid's ns.
+ */
+static unsigned int pid_namespace(struct task_struct *task)
+{
+	struct pid *pid;
+	unsigned int level;
+	struct upid upid;
+	unsigned int inum;
+
+	/*  get the pid namespace by following task_active_pid_ns(),
+	 *  pid->numbers[pid->level].ns
+	 */
+	pid = BPF_CORE_READ(task, thread_pid);
+	level = BPF_CORE_READ(pid, level);
+	bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]);	/* copy the whole upid entry locally */
+	inum = BPF_CORE_READ(upid.ns, ns.inum);
+
+	return inum;
+}
+/*
+ * Body of the sched_switch program: stores a fresh timestamp for prev when
+ * its state equals TASK_RUNNING, then accounts the time elapsed since the
+ * timestamp stored for next in the start map.
+ *
+ * The elapsed time goes into one log2 slot of the hists entry selected by
+ * the targ_per_* switches. preempt is unused. Every path returns 0.
+ */
+static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	struct hist *histp;
+	u64 *tsp, slot;
+	u32 pid, hkey;
+	s64 delta;
+
+	if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+		return 0;
+	/* prev leaves the CPU in TASK_RUNNING state (presumably still runnable), so it gets a new timestamp. */
+	if (get_task_state(prev) == TASK_RUNNING)
+		trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid));
+
+	pid = BPF_CORE_READ(next, pid);
+	/* No stored timestamp for next: nothing to measure. */
+	tsp = bpf_map_lookup_elem(&start, &pid);
+	if (!tsp)
+		return 0;
+	delta = bpf_ktime_get_ns() - *tsp;
+	if (delta < 0)	/* delta is s64, so a stored timestamp ahead of now shows up negative and is dropped */
+		goto cleanup;
+	/* Pick the histogram key; -1 converts to the all-ones u32 used when no targ_per_* switch is set. */
+	if (targ_per_process)
+		hkey = BPF_CORE_READ(next, tgid);
+	else if (targ_per_thread)
+		hkey = pid;
+	else if (targ_per_pidns)
+		hkey = pid_namespace(next);
+	else
+		hkey = -1;
+	histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero);
+	if (!histp)
+		goto cleanup;
+	if (!histp->comm[0])	/* comm still empty: fill it from next */
+		bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm),
+					next->comm);
+	if (targ_ms)	/* divide nanoseconds down to milliseconds, otherwise to microseconds */
+		delta /= 1000000U;
+	else
+		delta /= 1000U;
+	slot = log2l(delta);	/* log2l() is defined outside this view; the index is clamped to the last slot */
+	if (slot >= MAX_SLOTS)
+		slot = MAX_SLOTS - 1;
+	__sync_fetch_and_add(&histp->slots[slot], 1);
+
+cleanup:
+	bpf_map_delete_elem(&start, &pid);	/* the stored timestamp is removed on every path past the lookup */
+	return 0;
+}
+/* Program in section "raw_tp/sched_wakeup": after the optional cgroup filter, timestamps task p via trace_enqueue(). */
+SEC("raw_tp/sched_wakeup")
+int BPF_PROG(handle_sched_wakeup, struct task_struct *p)
+{
+	if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+		return 0;
+
+	return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
+}
+/* Program in section "raw_tp/sched_wakeup_new": same body as handle_sched_wakeup. */
+SEC("raw_tp/sched_wakeup_new")
+int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p)
+{
+	if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
+		return 0;
+
+	return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
+}
+/* Program in section "raw_tp/sched_switch"; forwards its arguments to handle_switch(). */
+SEC("raw_tp/sched_switch")
+int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
+{
+	return handle_switch(preempt, prev, next);
+}
+/* License string placed in the "license" section of the object. */
+char LICENSE[] SEC("license") = "GPL";
+```
+这是一个 Linux 内核 BPF 程序，旨在收集和报告运行队列的延迟。BPF 是 Linux 内核中的一项技术，它允许将程序附加到内核中的特定点并安全高效地执行。这些程序可用于收集有关内核行为的信息，并实现自定义行为。这个 BPF 程序结合使用 BPF 映射（map）以及 bpf_helpers.h 和 bpf_tracing.h 头文件提供的辅助函数，收集任务何时进入内核运行队列、何时被调度执行的信息，并记录任务在被调度执行之前在运行队列上等待的时间。然后，它使用这些信息生成直方图，显示不同组任务的运行队列延迟分布。这些直方图可用于识别和诊断内核调度行为中的性能问题。

-<https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.bpf.c>

-This program summarizes scheduler run queue latency as a histogram, showing
-how long tasks spent waiting their turn to run on-CPU.

 ## Compile and Run

@@ -16,7 +176,7 @@ Compile:
 ```shell
 docker run -it -v `pwd`/:/src/ yunwei37/ebpm:latest
 ```
-
+或者
 ```console
 $ ecc runqlat.bpf.c runqlat.h
 Compiling bpf object...
@@ -664,3 +824,15 @@ examples:
    ./runqlat -p 185     # trace PID 185 only

 ```
+
+## 总结
+runqlat 是一个 Linux 内核 BPF 程序，通过柱状图来总结调度程序运行队列延迟，显示任务等待运行在 CPU 上的时间长度。
+编译这个程序可以使用 ecc 工具，运行时可以使用 ecli 命令。runqlat 是一种用于监控 Linux 内核中进程调度延迟的工具，它可以帮助您了解进程在运行队列中等待执行的时间，并根据这些信息优化进程调度，提高系统的性能。要使用 runqlat，在终端中运行上文给出的编译和运行命令即可。更多的例子和详细的开发指南，请参考 eunomia-bpf 的官方文档：https://github.com/eunomia-bpf/eunomia-bpf
+## origin
+
+origin from:
+
+<https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.bpf.c>
+
+This program summarizes scheduler run queue latency as a histogram, showing
+how long tasks spent waiting their turn to run on-CPU.