mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-05-08 06:42:16 +08:00
init with documents from eunomia-bpf
This commit is contained in:
4
8-runqslower/.gitignore
vendored
Normal file
4
8-runqslower/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
.vscode
|
||||
package.json
|
||||
eunomia-exporter
|
||||
ecli
|
||||
147
8-runqslower/README.md
Normal file
147
8-runqslower/README.md
Normal file
@@ -0,0 +1,147 @@
|
||||
| layout | title | date | category | author | tags | summary |
|
||||
| ------ | ---------- | ---------------- | -------- | -------- | --------------- | ----------------------------------------------- |
|
||||
| post | runqslower | 2022-11-11-20:50 | bpftools | yunwei37 | bpftool syscall | runqslower Trace long process scheduling delays |
|
||||
|
||||
## origin
|
||||
|
||||
origin from:
|
||||
|
||||
https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.c
|
||||
|
||||
result:
|
||||
|
||||
```
|
||||
$ sudo ecli/build/bin/Release/ecli run examples/bpftools/runqslower/package.json
|
||||
|
||||
running and waiting for the ebpf events from perf event...
|
||||
time task prev_task delta_us pid prev_pid
|
||||
20:11:59 gnome-shell swapper/0 32 2202 0
|
||||
20:11:59 ecli swapper/3 23 3437 0
|
||||
20:11:59 rcu_sched swapper/1 1 14 0
|
||||
20:11:59 gnome-terminal- swapper/1 13 2714 0
|
||||
20:11:59 ecli swapper/3 2 3437 0
|
||||
20:11:59 kworker/3:3 swapper/3 3 215 0
|
||||
20:11:59 containerd swapper/1 8 1088 0
|
||||
20:11:59 ecli swapper/2 5 3437 0
|
||||
20:11:59 HangDetector swapper/3 6 854 0
|
||||
20:11:59 ecli swapper/2 60 3437 0
|
||||
20:11:59 rcu_sched swapper/1 26 14 0
|
||||
20:11:59 kworker/0:1 swapper/0 26 3414 0
|
||||
20:11:59 ecli swapper/2 6 3437 0
|
||||
```
|
||||
|
||||
## Compile and Run
|
||||
|
||||
Compile:
|
||||
|
||||
```
|
||||
docker run -it -v `pwd`/:/src/ yunwei37/ebpm:latest
|
||||
```
|
||||
|
||||
Run:
|
||||
|
||||
```
|
||||
sudo ./ecli run examples/bpftools/runqslower/package.json
|
||||
```
|
||||
|
||||
## details in bcc
|
||||
|
||||
Demonstrations of runqslower, the Linux eBPF/bcc version.
|
||||
|
||||
runqslower traces high scheduling delays between a task becoming ready to run and it actually running on a CPU. Example output:
|
||||
|
||||
```
|
||||
# ./runqslower
|
||||
Tracing run queue latency higher than 10000 us
|
||||
TIME COMM TID LAT(us)
|
||||
13:11:43 b'kworker/0:2' 8680 10250
|
||||
13:12:18 b'irq/16-vmwgfx' 422 10838
|
||||
13:12:18 b'systemd-oomd' 753 11012
|
||||
13:12:18 b'containerd' 8272 11254
|
||||
13:12:18 b'HangDetector' 764 12042
|
||||
^C
|
||||
```
|
||||
This measures the time a task spends waiting on a run queue for a turn on-CPU, and shows this time as individual events. This time should be small, but a task may need to wait its turn due to CPU load.
|
||||
|
||||
This measures two types of run queue latency:
|
||||
1. The time from a task being enqueued on a run queue to its context switch and execution. This traces ttwu_do_wakeup(), wake_up_new_task() -> finish_task_switch() with either raw tracepoints (if supported) or kprobes and instruments the run queue latency after a voluntary context switch.
|
||||
2. The time from when a task was involuntarily context switched while still in the runnable state, to when it next executed. This is instrumented from finish_task_switch() alone.
|
||||
|
||||
The overhead of this tool may become significant for some workloads: see the OVERHEAD section.
|
||||
|
||||
This works by tracing various kernel scheduler functions using dynamic tracing, and will need updating to match any changes to these functions.
|
||||
|
||||
Since this uses BPF, only the root user can use this tool.
|
||||
|
||||
```console
|
||||
Usage: runqslower [-h] [-p PID | -t TID | -P] [min_us]
|
||||
```
|
||||
|
||||
The min_us option sets the minimum run queue latency (in microseconds) to trace:
|
||||
|
||||
```
|
||||
# ./runqslower 100
|
||||
Tracing run queue latency higher than 100 us
|
||||
TIME COMM TID LAT(us)
|
||||
20:48:26 b'gnome-shell' 3005 201
|
||||
20:48:26 b'gnome-shell' 3005 202
|
||||
20:48:26 b'gnome-shell' 3005 254
|
||||
20:48:26 b'gnome-shell' 3005 208
|
||||
20:48:26 b'gnome-shell' 3005 132
|
||||
20:48:26 b'gnome-shell' 3005 213
|
||||
20:48:26 b'gnome-shell' 3005 205
|
||||
20:48:26 b'python3' 5224 127
|
||||
20:48:26 b'gnome-shell' 3005 214
|
||||
20:48:26 b'gnome-shell' 3005 126
|
||||
20:48:26 b'gnome-shell' 3005 285
|
||||
20:48:26 b'Xorg' 2869 296
|
||||
20:48:26 b'gnome-shell' 3005 119
|
||||
20:48:26 b'gnome-shell' 3005 206
|
||||
```
|
||||
|
||||
The -p PID option only traces this PID:
|
||||
|
||||
```
|
||||
# ./runqslower -p 3005
|
||||
Tracing run queue latency higher than 10000 us
|
||||
TIME COMM TID LAT(us)
|
||||
20:46:22 b'gnome-shell' 3005 16024
|
||||
20:46:45 b'gnome-shell' 3005 11494
|
||||
20:46:45 b'gnome-shell' 3005 21430
|
||||
20:46:45 b'gnome-shell' 3005 14948
|
||||
20:47:16 b'gnome-shell' 3005 10164
|
||||
20:47:16 b'gnome-shell' 3005 18070
|
||||
20:47:17 b'gnome-shell' 3005 13272
|
||||
20:47:18 b'gnome-shell' 3005 10451
|
||||
20:47:18 b'gnome-shell' 3005 15010
|
||||
20:47:18 b'gnome-shell' 3005 19449
|
||||
20:47:22 b'gnome-shell' 3005 19327
|
||||
20:47:23 b'gnome-shell' 3005 13178
|
||||
20:47:23 b'gnome-shell' 3005 13483
|
||||
20:47:23 b'gnome-shell' 3005 15562
|
||||
20:47:23 b'gnome-shell' 3005 13655
|
||||
20:47:23 b'gnome-shell' 3005 19571
|
||||
```
|
||||
|
||||
The -P option also shows previous task name and TID:
|
||||
|
||||
```
|
||||
# ./runqslower -P
|
||||
Tracing run queue latency higher than 10000 us
|
||||
TIME COMM TID LAT(us) PREV COMM PREV TID
|
||||
20:42:48 b'sysbench' 5159 10562 b'sysbench' 5152
|
||||
20:42:48 b'sysbench' 5159 10367 b'sysbench' 5152
|
||||
20:42:49 b'sysbench' 5158 11818 b'sysbench' 5159
|
||||
20:42:49 b'sysbench' 5160 16913 b'sysbench' 5153
|
||||
20:42:49 b'sysbench' 5157 13742 b'sysbench' 5160
|
||||
20:42:49 b'sysbench' 5152 13746 b'sysbench' 5160
|
||||
20:42:49 b'sysbench' 5153 13731 b'sysbench' 5160
|
||||
20:42:49 b'sysbench' 5158 14688 b'sysbench' 5161
|
||||
20:42:50 b'sysbench' 5155 10468 b'sysbench' 5152
|
||||
20:42:50 b'sysbench' 5156 17695 b'sysbench' 5158
|
||||
20:42:50 b'sysbench' 5155 11251 b'sysbench' 5152
|
||||
20:42:50 b'sysbench' 5154 13283 b'sysbench' 5152
|
||||
20:42:50 b'sysbench' 5158 22278 b'sysbench' 5157
|
||||
```
|
||||
|
||||
For more details, see docs/special_filtering.md
|
||||
112
8-runqslower/core_fixes.h
Normal file
112
8-runqslower/core_fixes.h
Normal file
@@ -0,0 +1,112 @@
|
||||
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||||
/* Copyright (c) 2021 Hengqi Chen */
|
||||
|
||||
#ifndef __CORE_FIXES_BPF_H
|
||||
#define __CORE_FIXES_BPF_H
|
||||
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
|
||||
/**
|
||||
* commit 2f064a59a1 ("sched: Change task_struct::state") changes
|
||||
* the name of task_struct::state to task_struct::__state
|
||||
* see:
|
||||
* https://github.com/torvalds/linux/commit/2f064a59a1
|
||||
*/
|
||||
/*
 * "Old" CO-RE flavor of task_struct: describes only the `state` member,
 * i.e. the name the field had before it was renamed to `__state`.
 */
struct task_struct___o {
	volatile long state;	/* scheduler state under its pre-rename name */
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * "New" CO-RE flavor of task_struct: describes only the `__state` member,
 * i.e. the name the field carries after the rename from `state`.
 */
struct task_struct___x {
	unsigned int __state;	/* scheduler state under its post-rename name */
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * Read the scheduler state of `task` regardless of how the running kernel
 * names the field.
 *
 * `task` is viewed through both CO-RE flavors of task_struct; the one whose
 * field exists (per bpf_core_field_exists()) is used: `__state` when
 * present, `state` otherwise.
 *
 * Returns the state value widened to __s64.
 */
static __always_inline __s64 get_task_state(void *task)
{
	struct task_struct___x *new_task = task;
	struct task_struct___o *old_task = task;

	/* no __state member: fall back to the older field name */
	if (!bpf_core_field_exists(new_task->__state))
		return BPF_CORE_READ(old_task, state);

	return BPF_CORE_READ(new_task, __state);
}
|
||||
|
||||
/**
|
||||
* commit 309dca309fc3 ("block: store a block_device pointer in struct bio")
|
||||
* adds a new member bi_bdev which is a pointer to struct block_device
|
||||
* see:
|
||||
* https://github.com/torvalds/linux/commit/309dca309fc3
|
||||
*/
|
||||
/*
 * "Old" CO-RE flavor of struct bio: describes only `bi_disk`, the member
 * used when the bio has no `bi_bdev` pointer.
 */
struct bio___o {
	struct gendisk *bi_disk;	/* disk referenced directly from the bio */
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * "New" CO-RE flavor of struct bio: describes only `bi_bdev`, the
 * block_device pointer added by commit 309dca309fc3.
 */
struct bio___x {
	struct block_device *bi_bdev;	/* block device the bio targets */
} __attribute__((preserve_access_index));
|
||||
|
||||
static __always_inline struct gendisk *get_gendisk(void *bio)
|
||||
{
|
||||
struct bio___x *b = bio;
|
||||
|
||||
if (bpf_core_field_exists(b->bi_bdev))
|
||||
return BPF_CORE_READ(b, bi_bdev, bd_disk);
|
||||
return BPF_CORE_READ((struct bio___o *)bio, bi_disk);
|
||||
}
|
||||
|
||||
/**
|
||||
* commit d5869fdc189f ("block: introduce block_rq_error tracepoint")
|
||||
* adds a new tracepoint block_rq_error and it shares the same arguments
|
||||
* with tracepoint block_rq_complete. As a result, the kernel BTF now has
|
||||
* a `struct trace_event_raw_block_rq_completion` instead of
|
||||
* `struct trace_event_raw_block_rq_complete`.
|
||||
* see:
|
||||
* https://github.com/torvalds/linux/commit/d5869fdc189f
|
||||
*/
|
||||
struct trace_event_raw_block_rq_complete___x {
|
||||
dev_t dev;
|
||||
sector_t sector;
|
||||
unsigned int nr_sector;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
struct trace_event_raw_block_rq_completion___x {
|
||||
dev_t dev;
|
||||
sector_t sector;
|
||||
unsigned int nr_sector;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * Report whether the running kernel's BTF contains
 * `struct trace_event_raw_block_rq_completion`.
 *
 * Returns true when the type exists, false otherwise (in which case callers
 * are expected to use the older `..._block_rq_complete` layout).
 *
 * The parameter list is spelled `(void)`: an empty `()` declares a function
 * without a prototype in pre-C23 C and disables argument checking.
 */
static __always_inline bool has_block_rq_completion(void)
{
	return bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x);
}
|
||||
|
||||
/**
|
||||
* commit d152c682f03c ("block: add an explicit ->disk backpointer to the
|
||||
* request_queue") and commit f3fa33acca9f ("block: remove the ->rq_disk
|
||||
* field in struct request") make some changes to `struct request` and
|
||||
* `struct request_queue`. Now, to get the `struct gendisk *` field in a CO-RE
|
||||
* way, we need both `struct request` and `struct request_queue`.
|
||||
* see:
|
||||
* https://github.com/torvalds/linux/commit/d152c682f03c
|
||||
* https://github.com/torvalds/linux/commit/f3fa33acca9f
|
||||
*/
|
||||
/*
 * CO-RE view of struct request_queue exposing only the `disk` backpointer
 * added by commit d152c682f03c.
 */
struct request_queue___x {
	struct gendisk *disk;	/* disk that owns this queue */
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * CO-RE view of struct request covering both ways of reaching the disk:
 * the `rq_disk` member (removed by commit f3fa33acca9f) and the request
 * queue pointer `q`.
 */
struct request___x {
	struct request_queue___x *q;	/* owning queue; used as q->disk */
	struct gendisk *rq_disk;	/* direct disk pointer, when the kernel has it */
} __attribute__((preserve_access_index));
|
||||
|
||||
static __always_inline struct gendisk *get_disk(void *request)
|
||||
{
|
||||
struct request___x *r = request;
|
||||
|
||||
if (bpf_core_field_exists(r->rq_disk))
|
||||
return BPF_CORE_READ(r, rq_disk);
|
||||
return BPF_CORE_READ(r, q, disk);
|
||||
}
|
||||
|
||||
#endif /* __CORE_FIXES_BPF_H */
|
||||
117
8-runqslower/runqslower.bpf.c
Normal file
117
8-runqslower/runqslower.bpf.c
Normal file
@@ -0,0 +1,117 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// Copyright (c) 2019 Facebook
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include "runqslower.bpf.h"
|
||||
#include "core_fixes.h"
|
||||
|
||||
#define TASK_RUNNING 0
|
||||
|
||||
const volatile __u64 min_us = 0;
|
||||
const volatile pid_t targ_pid = 0;
|
||||
const volatile pid_t targ_tgid = 0;
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 10240);
|
||||
__type(key, u32);
|
||||
__type(value, u64);
|
||||
} start SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} events SEC(".maps");
|
||||
|
||||
/* record enqueue timestamp */
|
||||
static int trace_enqueue(u32 tgid, u32 pid)
|
||||
{
|
||||
u64 ts;
|
||||
|
||||
if (!pid)
|
||||
return 0;
|
||||
if (targ_tgid && targ_tgid != tgid)
|
||||
return 0;
|
||||
if (targ_pid && targ_pid != pid)
|
||||
return 0;
|
||||
|
||||
ts = bpf_ktime_get_ns();
|
||||
bpf_map_update_elem(&start, &pid, &ts, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int handle_switch(void *ctx, struct task_struct *prev, struct task_struct *next)
|
||||
{
|
||||
struct event event = {};
|
||||
u64 *tsp, delta_us;
|
||||
u32 pid;
|
||||
|
||||
/* ivcsw: treat like an enqueue event and store timestamp */
|
||||
if (get_task_state(prev) == TASK_RUNNING)
|
||||
trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid));
|
||||
|
||||
pid = BPF_CORE_READ(next, pid);
|
||||
|
||||
/* fetch timestamp and calculate delta */
|
||||
tsp = bpf_map_lookup_elem(&start, &pid);
|
||||
if (!tsp)
|
||||
return 0; /* missed enqueue */
|
||||
|
||||
delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
|
||||
if (min_us && delta_us <= min_us)
|
||||
return 0;
|
||||
|
||||
event.pid = pid;
|
||||
event.prev_pid = BPF_CORE_READ(prev, pid);
|
||||
event.delta_us = delta_us;
|
||||
bpf_probe_read_kernel_str(&event.task, sizeof(event.task), next->comm);
|
||||
bpf_probe_read_kernel_str(&event.prev_task, sizeof(event.prev_task), prev->comm);
|
||||
|
||||
/* output */
|
||||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(event));
|
||||
|
||||
bpf_map_delete_elem(&start, &pid);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * tp_btf flavor of the sched_wakeup hook: records an enqueue timestamp for
 * task `p`. The task fields are read directly.
 */
SEC("tp_btf/sched_wakeup")
int BPF_PROG(sched_wakeup, struct task_struct *p)
{
	u32 tgid = p->tgid;
	u32 pid = p->pid;

	return trace_enqueue(tgid, pid);
}
|
||||
|
||||
/*
 * tp_btf flavor of the sched_wakeup_new hook: records an enqueue timestamp
 * for task `p`. The task fields are read directly.
 */
SEC("tp_btf/sched_wakeup_new")
int BPF_PROG(sched_wakeup_new, struct task_struct *p)
{
	u32 tgid = p->tgid;
	u32 pid = p->pid;

	return trace_enqueue(tgid, pid);
}
|
||||
|
||||
/*
 * tp_btf flavor of the sched_switch hook. `preempt` is not used; the
 * outgoing and incoming tasks are handed to handle_switch() together with
 * the program context `ctx`, which is not declared here and so is taken to
 * come from the BPF_PROG() macro.
 */
SEC("tp_btf/sched_switch")
int BPF_PROG(sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
{
	int ret = handle_switch(ctx, prev, next);

	return ret;
}
|
||||
|
||||
/*
 * raw_tp flavor of the sched_wakeup hook: records an enqueue timestamp for
 * task `p`. The task fields are fetched with BPF_CORE_READ().
 */
SEC("raw_tp/sched_wakeup")
int BPF_PROG(handle_sched_wakeup, struct task_struct *p)
{
	u32 tgid = BPF_CORE_READ(p, tgid);
	u32 pid = BPF_CORE_READ(p, pid);

	return trace_enqueue(tgid, pid);
}
|
||||
|
||||
/*
 * raw_tp flavor of the sched_wakeup_new hook: records an enqueue timestamp
 * for task `p`. The task fields are fetched with BPF_CORE_READ().
 */
SEC("raw_tp/sched_wakeup_new")
int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p)
{
	u32 tgid = BPF_CORE_READ(p, tgid);
	u32 pid = BPF_CORE_READ(p, pid);

	return trace_enqueue(tgid, pid);
}
|
||||
|
||||
/*
 * raw_tp flavor of the sched_switch hook. `preempt` is not used; the
 * outgoing and incoming tasks are handed to handle_switch() together with
 * the program context `ctx`, which is not declared here and so is taken to
 * come from the BPF_PROG() macro.
 */
SEC("raw_tp/sched_switch")
int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
{
	int ret = handle_switch(ctx, prev, next);

	return ret;
}
|
||||
|
||||
/* License string of this BPF object, emitted into the "license" section. */
char LICENSE[] SEC("license") = "GPL";
|
||||
15
8-runqslower/runqslower.bpf.h
Normal file
15
8-runqslower/runqslower.bpf.h
Normal file
@@ -0,0 +1,15 @@
|
||||
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||||
#ifndef __RUNQSLOWER_H
|
||||
#define __RUNQSLOWER_H
|
||||
|
||||
#define TASK_COMM_LEN 16
|
||||
|
||||
struct event {
|
||||
char task[TASK_COMM_LEN];
|
||||
char prev_task[TASK_COMM_LEN];
|
||||
__u64 delta_us;
|
||||
int pid;
|
||||
int prev_pid;
|
||||
};
|
||||
|
||||
#endif /* __RUNQSLOWER_H */
|
||||
Reference in New Issue
Block a user