init with documents from eunomia-bpf

This commit is contained in:
yunwei37
2022-12-02 19:18:03 +08:00
parent 1179ec171e
commit 81d749a9cc
85 changed files with 11876 additions and 0 deletions

6
9-runqlat/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
.vscode
package.json
*.o
*.skel.json
*.skel.yaml
package.yaml

675
9-runqlat/README.md Executable file
View File

@@ -0,0 +1,675 @@
---
layout: post
title: runqlat
date: 2022-10-10 16:18
category: bpftools
author: yunwei37
tags: [bpftools, syscall, tracepoint]
summary: Summarize run queue (scheduler) latency as a histogram.
---
## origin
origin from:
<https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqlat.bpf.c>
This program summarizes scheduler run queue latency as a histogram, showing
how long tasks spent waiting their turn to run on-CPU.
## Compile and Run
Compile:
```shell
docker run -it -v `pwd`/:/src/ yunwei37/ebpm:latest
```
```console
$ ecc runqlat.bpf.c runqlat.h
Compiling bpf object...
Generating export types...
Packing ebpf object and config into package.json...
```
Run:
```console
$ sudo ecli examples/bpftools/runqlat/package.json -h
Usage: runqlat_bpf [--help] [--version] [--verbose] [--filter_cg] [--targ_per_process] [--targ_per_thread] [--targ_per_pidns] [--targ_ms] [--targ_tgid VAR]
A simple eBPF program
Optional arguments:
-h, --help shows help message and exits
-v, --version prints version information and exits
--verbose prints libbpf debug information
--filter_cg set value of bool variable filter_cg
--targ_per_process set value of bool variable targ_per_process
--targ_per_thread set value of bool variable targ_per_thread
--targ_per_pidns set value of bool variable targ_per_pidns
--targ_ms set value of bool variable targ_ms
--targ_tgid set value of pid_t variable targ_tgid
Built with eunomia-bpf framework.
See https://github.com/eunomia-bpf/eunomia-bpf for more information.
$ sudo ecli examples/bpftools/runqlat/package.json
key = 4294967295
comm = rcu_preempt
(unit) : count distribution
0 -> 1 : 9 |**** |
2 -> 3 : 6 |** |
4 -> 7 : 12 |***** |
8 -> 15 : 28 |************* |
16 -> 31 : 40 |******************* |
32 -> 63 : 83 |****************************************|
64 -> 127 : 57 |*************************** |
128 -> 255 : 19 |********* |
256 -> 511 : 11 |***** |
512 -> 1023 : 2 | |
1024 -> 2047 : 2 | |
2048 -> 4095 : 0 | |
4096 -> 8191 : 0 | |
8192 -> 16383 : 0 | |
16384 -> 32767 : 1 | |
$ sudo ecli examples/bpftools/runqlat/package.json --targ_per_process
key = 3189
comm = cpptools
(unit) : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 1 |*** |
16 -> 31 : 2 |******* |
32 -> 63 : 11 |****************************************|
64 -> 127 : 8 |***************************** |
128 -> 255 : 3 |********** |
```
## details in bcc
```text
Demonstrations of runqlat, the Linux eBPF/bcc version.
This program summarizes scheduler run queue latency as a histogram, showing
how long tasks spent waiting their turn to run on-CPU.
Here is a heavily loaded system:
# ./runqlat
Tracing run queue latency... Hit Ctrl-C to end.
^C
usecs : count distribution
0 -> 1 : 233 |*********** |
2 -> 3 : 742 |************************************ |
4 -> 7 : 203 |********** |
8 -> 15 : 173 |******** |
16 -> 31 : 24 |* |
32 -> 63 : 0 | |
64 -> 127 : 30 |* |
128 -> 255 : 6 | |
256 -> 511 : 3 | |
512 -> 1023 : 5 | |
1024 -> 2047 : 27 |* |
2048 -> 4095 : 30 |* |
4096 -> 8191 : 20 | |
8192 -> 16383 : 29 |* |
16384 -> 32767 : 809 |****************************************|
32768 -> 65535 : 64 |*** |
The distribution is bimodal, with one mode between 0 and 15 microseconds,
and another between 16 and 65 milliseconds. These modes are visible as the
spikes in the ASCII distribution (which is merely a visual representation
of the "count" column). As an example of reading one line: 809 events fell
into the 16384 to 32767 microsecond range (16 to 32 ms) while tracing.
I would expect the two modes to be due the workload: 16 hot CPU-bound threads,
and many other mostly idle threads doing occasional work. I suspect the mostly
idle threads will run with a higher priority when they wake up, and are
the reason for the low latency mode. The high latency mode will be the
CPU-bound threads. More analysis with this and other tools can confirm.
A -m option can be used to show milliseconds instead, as well as an interval
and a count. For example, showing three x five second summary in milliseconds:
# ./runqlat -m 5 3
Tracing run queue latency... Hit Ctrl-C to end.
msecs : count distribution
0 -> 1 : 3818 |****************************************|
2 -> 3 : 39 | |
4 -> 7 : 39 | |
8 -> 15 : 62 | |
16 -> 31 : 2214 |*********************** |
32 -> 63 : 226 |** |
msecs : count distribution
0 -> 1 : 3775 |****************************************|
2 -> 3 : 52 | |
4 -> 7 : 37 | |
8 -> 15 : 65 | |
16 -> 31 : 2230 |*********************** |
32 -> 63 : 212 |** |
msecs : count distribution
0 -> 1 : 3816 |****************************************|
2 -> 3 : 49 | |
4 -> 7 : 40 | |
8 -> 15 : 53 | |
16 -> 31 : 2228 |*********************** |
32 -> 63 : 221 |** |
This shows a similar distribution across the three summaries.
A -p option can be used to show one PID only, which is filtered in kernel for
efficiency. For example, PID 4505, and one second summaries:
# ./runqlat -mp 4505 1
Tracing run queue latency... Hit Ctrl-C to end.
msecs : count distribution
0 -> 1 : 1 |* |
2 -> 3 : 2 |*** |
4 -> 7 : 1 |* |
8 -> 15 : 0 | |
16 -> 31 : 25 |****************************************|
32 -> 63 : 3 |**** |
msecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 2 |** |
4 -> 7 : 0 | |
8 -> 15 : 1 |* |
16 -> 31 : 30 |****************************************|
32 -> 63 : 1 |* |
msecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 28 |****************************************|
32 -> 63 : 2 |** |
msecs : count distribution
0 -> 1 : 1 |* |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 27 |****************************************|
32 -> 63 : 4 |***** |
[...]
For comparison, here is pidstat(1) for that process:
# pidstat -p 4505 1
Linux 4.4.0-virtual (bgregg-xxxxxxxx) 02/08/2016 _x86_64_ (8 CPU)
08:56:11 AM UID PID %usr %system %guest %CPU CPU Command
08:56:12 AM 0 4505 9.00 3.00 0.00 12.00 0 bash
08:56:13 AM 0 4505 7.00 5.00 0.00 12.00 0 bash
08:56:14 AM 0 4505 10.00 2.00 0.00 12.00 0 bash
08:56:15 AM 0 4505 11.00 2.00 0.00 13.00 0 bash
08:56:16 AM 0 4505 9.00 3.00 0.00 12.00 0 bash
[...]
This is a synthetic workload that is CPU bound. It's only spending 12% on-CPU
each second because of high CPU demand on this server: the remaining time
is spent waiting on a run queue, as visualized by runqlat.
Here is the same system, but when it is CPU idle:
# ./runqlat 5 1
Tracing run queue latency... Hit Ctrl-C to end.
usecs : count distribution
0 -> 1 : 2250 |******************************** |
2 -> 3 : 2340 |********************************** |
4 -> 7 : 2746 |****************************************|
8 -> 15 : 418 |****** |
16 -> 31 : 93 |* |
32 -> 63 : 28 | |
64 -> 127 : 119 |* |
128 -> 255 : 9 | |
256 -> 511 : 4 | |
512 -> 1023 : 20 | |
1024 -> 2047 : 22 | |
2048 -> 4095 : 5 | |
4096 -> 8191 : 2 | |
Back to a microsecond scale, this time there is little run queue latency past 1
millisecond, as would be expected.
Now 16 threads are performing heavy disk I/O:
# ./runqlat 5 1
Tracing run queue latency... Hit Ctrl-C to end.
usecs : count distribution
0 -> 1 : 204 | |
2 -> 3 : 944 |* |
4 -> 7 : 16315 |********************* |
8 -> 15 : 29897 |****************************************|
16 -> 31 : 1044 |* |
32 -> 63 : 23 | |
64 -> 127 : 128 | |
128 -> 255 : 24 | |
256 -> 511 : 5 | |
512 -> 1023 : 13 | |
1024 -> 2047 : 15 | |
2048 -> 4095 : 13 | |
4096 -> 8191 : 10 | |
The distribution hasn't changed too much. While the disks are 100% busy, there
is still plenty of CPU headroom, and threads still don't spend much time
waiting their turn.
A -P option will print a distribution for each PID:
# ./runqlat -P
Tracing run queue latency... Hit Ctrl-C to end.
^C
pid = 0
usecs : count distribution
0 -> 1 : 351 |******************************** |
2 -> 3 : 96 |******** |
4 -> 7 : 437 |****************************************|
8 -> 15 : 12 |* |
16 -> 31 : 10 | |
32 -> 63 : 0 | |
64 -> 127 : 16 |* |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
1024 -> 2047 : 0 | |
2048 -> 4095 : 0 | |
4096 -> 8191 : 0 | |
8192 -> 16383 : 1 | |
pid = 12929
usecs : count distribution
0 -> 1 : 1 |****************************************|
2 -> 3 : 0 | |
4 -> 7 : 1 |****************************************|
pid = 12930
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 1 |****************************************|
32 -> 63 : 0 | |
64 -> 127 : 1 |****************************************|
pid = 12931
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 1 |******************** |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2 |****************************************|
pid = 12932
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 1 |****************************************|
256 -> 511 : 0 | |
512 -> 1023 : 1 |****************************************|
pid = 7
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 426 |************************************* |
4 -> 7 : 457 |****************************************|
8 -> 15 : 16 |* |
pid = 9
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 425 |****************************************|
8 -> 15 : 16 |* |
pid = 11
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 10 |****************************************|
pid = 14
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 8 |****************************************|
4 -> 7 : 2 |********** |
pid = 18
usecs : count distribution
0 -> 1 : 414 |****************************************|
2 -> 3 : 0 | |
4 -> 7 : 20 |* |
8 -> 15 : 8 | |
pid = 12928
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 1 |****************************************|
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 |****************************************|
pid = 1867
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 15 |****************************************|
16 -> 31 : 1 |** |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 4 |********** |
pid = 1871
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 2 |****************************************|
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 1 |******************** |
pid = 1876
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 1 |****************************************|
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 1 |****************************************|
pid = 1878
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 3 |****************************************|
pid = 1880
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 3 |****************************************|
pid = 9307
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 1 |****************************************|
pid = 1886
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 1 |******************** |
8 -> 15 : 2 |****************************************|
pid = 1888
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 3 |****************************************|
pid = 3297
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 1 |****************************************|
pid = 1892
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 1 |******************** |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2 |****************************************|
pid = 7024
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 4 |****************************************|
pid = 16468
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 3 |****************************************|
pid = 12922
usecs : count distribution
0 -> 1 : 1 |****************************************|
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 1 |****************************************|
16 -> 31 : 1 |****************************************|
32 -> 63 : 0 | |
64 -> 127 : 1 |****************************************|
pid = 12923
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 1 |******************** |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 2 |****************************************|
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 1 |******************** |
1024 -> 2047 : 1 |******************** |
pid = 12924
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 2 |******************** |
8 -> 15 : 4 |****************************************|
16 -> 31 : 1 |********** |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 0 | |
1024 -> 2047 : 1 |********** |
pid = 12925
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 1 |****************************************|
pid = 12926
usecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 1 |****************************************|
4 -> 7 : 0 | |
8 -> 15 : 1 |****************************************|
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 1 |****************************************|
pid = 12927
usecs : count distribution
0 -> 1 : 1 |****************************************|
2 -> 3 : 0 | |
4 -> 7 : 1 |****************************************|
A -L option will print a distribution for each TID:
# ./runqlat -L
Tracing run queue latency... Hit Ctrl-C to end.
^C
tid = 0
usecs : count distribution
0 -> 1 : 593 |**************************** |
2 -> 3 : 829 |****************************************|
4 -> 7 : 300 |************** |
8 -> 15 : 321 |*************** |
16 -> 31 : 132 |****** |
32 -> 63 : 58 |** |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 13 | |
tid = 7
usecs : count distribution
0 -> 1 : 8 |******** |
2 -> 3 : 19 |******************** |
4 -> 7 : 37 |****************************************|
[...]
And a --pidnss option (short for PID namespaces) will print for each PID
namespace, for analyzing container performance:
# ./runqlat --pidnss -m
Tracing run queue latency... Hit Ctrl-C to end.
^C
pidns = 4026532870
msecs : count distribution
0 -> 1 : 40 |****************************************|
2 -> 3 : 1 |* |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 2 |** |
64 -> 127 : 5 |***** |
pidns = 4026532809
msecs : count distribution
0 -> 1 : 67 |****************************************|
pidns = 4026532748
msecs : count distribution
0 -> 1 : 63 |****************************************|
pidns = 4026532687
msecs : count distribution
0 -> 1 : 7 |****************************************|
pidns = 4026532626
msecs : count distribution
0 -> 1 : 45 |****************************************|
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 3 |** |
pidns = 4026531836
msecs : count distribution
0 -> 1 : 314 |****************************************|
2 -> 3 : 1 | |
4 -> 7 : 11 |* |
8 -> 15 : 28 |*** |
16 -> 31 : 137 |***************** |
32 -> 63 : 86 |********** |
64 -> 127 : 1 | |
pidns = 4026532382
msecs : count distribution
0 -> 1 : 285 |****************************************|
2 -> 3 : 5 | |
4 -> 7 : 16 |** |
8 -> 15 : 9 |* |
16 -> 31 : 69 |********* |
32 -> 63 : 25 |*** |
Many of these distributions have two modes: the second, in this case, is
caused by capping CPU usage via CPU shares.
USAGE message:
# ./runqlat -h
usage: runqlat.py [-h] [-T] [-m] [-P] [--pidnss] [-L] [-p PID]
[interval] [count]
Summarize run queue (scheduler) latency as a histogram
positional arguments:
interval output interval, in seconds
count number of outputs
optional arguments:
-h, --help show this help message and exit
-T, --timestamp include timestamp on output
-m, --milliseconds millisecond histogram
-P, --pids print a histogram per process ID
--pidnss print a histogram per PID namespace
-L, --tids print a histogram per thread ID
-p PID, --pid PID trace this PID only
examples:
./runqlat # summarize run queue latency as a histogram
./runqlat 1 10 # print 1 second summaries, 10 times
./runqlat -mT 1 # 1s summaries, milliseconds, and timestamps
./runqlat -P # show each PID separately
./runqlat -p 185 # trace PID 185 only
```

31
9-runqlat/bits.bpf.h Normal file
View File

@@ -0,0 +1,31 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __BITS_BPF_H
#define __BITS_BPF_H
#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = val)
static __always_inline u64 log2(u32 v)
{
u32 shift, r;
r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
r |= (v >> 1);
return r;
}
static __always_inline u64 log2l(u64 v)
{
u32 hi = v >> 32;
if (hi)
return log2(hi) + 32;
else
return log2(v);
}
#endif /* __BITS_BPF_H */

112
9-runqlat/core_fixes.bpf.h Normal file
View File

@@ -0,0 +1,112 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* Copyright (c) 2021 Hengqi Chen */
#ifndef __CORE_FIXES_BPF_H
#define __CORE_FIXES_BPF_H
#include <vmlinux.h>
#include <bpf/bpf_core_read.h>
/**
* commit 2f064a59a1 ("sched: Change task_struct::state") changes
* the name of task_struct::state to task_struct::__state
* see:
* https://github.com/torvalds/linux/commit/2f064a59a1
*/
struct task_struct___o {
volatile long int state;
} __attribute__((preserve_access_index));
struct task_struct___x {
unsigned int __state;
} __attribute__((preserve_access_index));
static __always_inline __s64 get_task_state(void *task)
{
struct task_struct___x *t = task;
if (bpf_core_field_exists(t->__state))
return BPF_CORE_READ(t, __state);
return BPF_CORE_READ((struct task_struct___o *)task, state);
}
/**
* commit 309dca309fc3 ("block: store a block_device pointer in struct bio")
* adds a new member bi_bdev which is a pointer to struct block_device
* see:
* https://github.com/torvalds/linux/commit/309dca309fc3
*/
struct bio___o {
struct gendisk *bi_disk;
} __attribute__((preserve_access_index));
struct bio___x {
struct block_device *bi_bdev;
} __attribute__((preserve_access_index));
static __always_inline struct gendisk *get_gendisk(void *bio)
{
struct bio___x *b = bio;
if (bpf_core_field_exists(b->bi_bdev))
return BPF_CORE_READ(b, bi_bdev, bd_disk);
return BPF_CORE_READ((struct bio___o *)bio, bi_disk);
}
/**
* commit d5869fdc189f ("block: introduce block_rq_error tracepoint")
* adds a new tracepoint block_rq_error and it shares the same arguments
* with tracepoint block_rq_complete. As a result, the kernel BTF now has
* a `struct trace_event_raw_block_rq_completion` instead of
* `struct trace_event_raw_block_rq_complete`.
* see:
* https://github.com/torvalds/linux/commit/d5869fdc189f
*/
struct trace_event_raw_block_rq_complete___x {
dev_t dev;
sector_t sector;
unsigned int nr_sector;
} __attribute__((preserve_access_index));
struct trace_event_raw_block_rq_completion___x {
dev_t dev;
sector_t sector;
unsigned int nr_sector;
} __attribute__((preserve_access_index));
static __always_inline bool has_block_rq_completion()
{
if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x))
return true;
return false;
}
/**
* commit d152c682f03c ("block: add an explicit ->disk backpointer to the
* request_queue") and commit f3fa33acca9f ("block: remove the ->rq_disk
* field in struct request") make some changes to `struct request` and
* `struct request_queue`. Now, to get the `struct gendisk *` field in a CO-RE
* way, we need both `struct request` and `struct request_queue`.
* see:
* https://github.com/torvalds/linux/commit/d152c682f03c
* https://github.com/torvalds/linux/commit/f3fa33acca9f
*/
struct request_queue___x {
struct gendisk *disk;
} __attribute__((preserve_access_index));
struct request___x {
struct request_queue___x *q;
struct gendisk *rq_disk;
} __attribute__((preserve_access_index));
static __always_inline struct gendisk *get_disk(void *request)
{
struct request___x *r = request;
if (bpf_core_field_exists(r->rq_disk))
return BPF_CORE_READ(r, rq_disk);
return BPF_CORE_READ(r, q, disk);
}
#endif /* __CORE_FIXES_BPF_H */

26
9-runqlat/maps.bpf.h Normal file
View File

@@ -0,0 +1,26 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
// Copyright (c) 2020 Anton Protopopov
#ifndef __MAPS_BPF_H
#define __MAPS_BPF_H
#include <bpf/bpf_helpers.h>
#include <asm-generic/errno.h>
static __always_inline void *
bpf_map_lookup_or_try_init(void *map, const void *key, const void *init)
{
void *val;
long err;
val = bpf_map_lookup_elem(map, key);
if (val)
return val;
err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
if (err && err != -EEXIST)
return 0;
return bpf_map_lookup_elem(map, key);
}
#endif /* __MAPS_BPF_H */

152
9-runqlat/runqlat.bpf.c Normal file
View File

@@ -0,0 +1,152 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Wenbo Zhang
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include "runqlat.h"
#include "bits.bpf.h"
#include "maps.bpf.h"
#include "core_fixes.bpf.h"
#define MAX_ENTRIES 10240
#define TASK_RUNNING 0
const volatile bool filter_cg = false;
const volatile bool targ_per_process = false;
const volatile bool targ_per_thread = false;
const volatile bool targ_per_pidns = false;
const volatile bool targ_ms = false;
const volatile pid_t targ_tgid = 0;
struct {
__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
__type(key, u32);
__type(value, u32);
__uint(max_entries, 1);
} cgroup_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, MAX_ENTRIES);
__type(key, u32);
__type(value, u64);
} start SEC(".maps");
static struct hist zero;
/// @sample {"interval": 1000, "type" : "log2_hist"}
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, MAX_ENTRIES);
__type(key, u32);
__type(value, struct hist);
} hists SEC(".maps");
static int trace_enqueue(u32 tgid, u32 pid)
{
u64 ts;
if (!pid)
return 0;
if (targ_tgid && targ_tgid != tgid)
return 0;
ts = bpf_ktime_get_ns();
bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
return 0;
}
static unsigned int pid_namespace(struct task_struct *task)
{
struct pid *pid;
unsigned int level;
struct upid upid;
unsigned int inum;
/* get the pid namespace by following task_active_pid_ns(),
* pid->numbers[pid->level].ns
*/
pid = BPF_CORE_READ(task, thread_pid);
level = BPF_CORE_READ(pid, level);
bpf_core_read(&upid, sizeof(upid), &pid->numbers[level]);
inum = BPF_CORE_READ(upid.ns, ns.inum);
return inum;
}
static int handle_switch(bool preempt, struct task_struct *prev, struct task_struct *next)
{
struct hist *histp;
u64 *tsp, slot;
u32 pid, hkey;
s64 delta;
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
return 0;
if (get_task_state(prev) == TASK_RUNNING)
trace_enqueue(BPF_CORE_READ(prev, tgid), BPF_CORE_READ(prev, pid));
pid = BPF_CORE_READ(next, pid);
tsp = bpf_map_lookup_elem(&start, &pid);
if (!tsp)
return 0;
delta = bpf_ktime_get_ns() - *tsp;
if (delta < 0)
goto cleanup;
if (targ_per_process)
hkey = BPF_CORE_READ(next, tgid);
else if (targ_per_thread)
hkey = pid;
else if (targ_per_pidns)
hkey = pid_namespace(next);
else
hkey = -1;
histp = bpf_map_lookup_or_try_init(&hists, &hkey, &zero);
if (!histp)
goto cleanup;
if (!histp->comm[0])
bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm),
next->comm);
if (targ_ms)
delta /= 1000000U;
else
delta /= 1000U;
slot = log2l(delta);
if (slot >= MAX_SLOTS)
slot = MAX_SLOTS - 1;
__sync_fetch_and_add(&histp->slots[slot], 1);
cleanup:
bpf_map_delete_elem(&start, &pid);
return 0;
}
SEC("raw_tp/sched_wakeup")
int BPF_PROG(handle_sched_wakeup, struct task_struct *p)
{
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
return 0;
return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
}
SEC("raw_tp/sched_wakeup_new")
int BPF_PROG(handle_sched_wakeup_new, struct task_struct *p)
{
if (filter_cg && !bpf_current_task_under_cgroup(&cgroup_map, 0))
return 0;
return trace_enqueue(BPF_CORE_READ(p, tgid), BPF_CORE_READ(p, pid));
}
SEC("raw_tp/sched_switch")
int BPF_PROG(handle_sched_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
{
return handle_switch(preempt, prev, next);
}
char LICENSE[] SEC("license") = "GPL";

14
9-runqlat/runqlat.h Normal file
View File

@@ -0,0 +1,14 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
#ifndef __RUNQLAT_H
#define __RUNQLAT_H
#define TASK_COMM_LEN 16
#define MAX_SLOTS 26
struct hist {
__u32 slots[MAX_SLOTS];
char comm[TASK_COMM_LEN];
};
#endif /* __RUNQLAT_H */