eBPF Tutorial by Example
Origin
Adapted from https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpstates.bpf.c
Compile and Run
Compile:
docker run -it -v `pwd`/:/src/ yunwei37/ebpm:latest
Run:
sudo ./ecli run package.json
Details from the bcc version of this tool
Demonstrations of tcpstates, the Linux BPF/bcc version.
tcpstates prints TCP state change information, including the duration in each state as milliseconds. For example, a single TCP session:
# tcpstates
SKADDR C-PID C-COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
ffff9fd7e8192000 22384 curl 100.66.100.185 0 52.33.159.26 80 CLOSE -> SYN_SENT 0.000
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 SYN_SENT -> ESTABLISHED 1.373
ffff9fd7e8192000 22384 curl 100.66.100.185 63446 52.33.159.26 80 ESTABLISHED -> FIN_WAIT1 176.042
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT1 -> FIN_WAIT2 0.536
ffff9fd7e8192000 0 swapper/5 100.66.100.185 63446 52.33.159.26 80 FIN_WAIT2 -> CLOSE 0.006
^C
This shows that the most time was spent in the ESTABLISHED state: 176.042 milliseconds, after which the session transitioned to FIN_WAIT1.
The first column is the socket address, since the output may include interleaved lines from different sessions. The next two columns show the current on-CPU process ID and command name: these may show the process that owns the TCP session, depending on whether the state change executes synchronously in process context. If that's not the case, they may show kernel details instead.
eBPF Tutorial by Example: Using libbpf-bootstrap to Develop a Program that Traces TCP Connection State Changes
Origin
Modified from https://github.com/iovisor/bcc/blob/master/libbpf-tools/tcpstates.bpf.c
Compile and Run
- git clone https://github.com/libbpf/libbpf-bootstrap libbpf-bootstrap-cloned
- Copy the files under the libbpf-bootstrap directory of this example into libbpf-bootstrap-cloned/examples/c
- Edit libbpf-bootstrap-cloned/examples/c/Makefile and add tcpstates to its APPS variable
- In libbpf-bootstrap-cloned/examples/c, run make tcpstates, then sudo ./tcpstates
Result
root@yutong-VirtualBox:~/libbpf-bootstrap/examples/c# ./tcpstates
SKADDR PID COMM LADDR LPORT RADDR RPORT OLDSTATE -> NEWSTATE MS
ffff9bf61bb62bc0 164978 node 192.168.88.15 0 52.178.17.2 443 CLOSE -> SYN_SENT 0.000
ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 SYN_SENT -> ESTABLISHED 225.794
ffff9bf61bb62bc0 0 swapper/0 192.168.88.15 41596 52.178.17.2 443 ESTABLISHED -> CLOSE_WAIT 901.454
ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 CLOSE_WAIT -> LAST_ACK 0.793
ffff9bf61bb62bc0 164978 node 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> LAST_ACK 0.086
ffff9bf61bb62bc0 228759 kworker/u6 192.168.88.15 41596 52.178.17.2 443 LAST_ACK -> CLOSE 0.193
ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 CLOSE -> LISTEN 0.000
ffff9bf6d8ee88c0 229832 redis-serv 0.0.0.0 6379 0.0.0.0 0 LISTEN -> CLOSE 1.763
ffff9bf7109d6900 88750 node 127.0.0.1 39755 127.0.0.1 50966 ESTABLISHED -> FIN_WAIT1 0.000
For a detailed explanation of the output, see README.md.
Explanation of tcpstates.bpf.c
tcpstates is a program that traces the TCP state changes of TCP sockets on the current system. It works mainly by hooking the kernel tracepoint inet_sock_set_state, and the collected data is delivered to user space through perf events.
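Two pieces of BPF state back this design: a hash map that remembers when each socket entered its current state, and a perf event array for streaming events to user space. A minimal sketch of the map definitions, modeled on the upstream tcpstates.bpf.c (MAX_ENTRIES and the key type are taken from that source and may differ in other versions; it assumes vmlinux.h and <bpf/bpf_helpers.h> are included):

#define MAX_ENTRIES 10240

/* last state-change timestamp, keyed by the struct sock pointer */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_ENTRIES);
    __type(key, struct sock *);
    __type(value, __u64);
} timestamps SEC(".maps");

/* perf event array used to push struct event records to user space */
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u32));
} events SEC(".maps");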
SEC("tracepoint/sock/inet_sock_set_state")
int handle_set_state(struct trace_event_raw_inet_sock_set_state *ctx)
This attaches an eBPF handler that runs at the point where a socket changes its TCP state.
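Before the filtering below, the handler reads the fields it needs out of the tracepoint context; roughly (variable names follow the upstream source and are illustrative):

struct sock *sk = (struct sock *)ctx->skaddr; /* socket whose state just changed */
__u16 family = ctx->family;                   /* AF_INET or AF_INET6 */
__u16 sport = ctx->sport;
__u16 dport = ctx->dport;
__u64 *tsp, delta_us, ts;
struct event event = {};                      /* record to send to user space */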
if (ctx->protocol != IPPROTO_TCP)
return 0;
if (target_family && target_family != family)
return 0;
if (filter_by_sport && !bpf_map_lookup_elem(&sports, &sport))
return 0;
if (filter_by_dport && !bpf_map_lookup_elem(&dports, &dport))
return 0;
Once the handler is invoked, it first checks whether the socket that just changed state satisfies the configured filter conditions; if it does not, the event is not recorded.
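The switches and port maps consulted by these checks are declared at the top of the BPF program and are filled in by user space before attachment. A sketch along the lines of the upstream tool (MAX_PORTS and the exact key/value types are assumptions based on that source):

#define MAX_PORTS 1024

const volatile bool filter_by_sport = false; /* set by user space through the skeleton's rodata */
const volatile bool filter_by_dport = false;
const volatile short target_family = 0;      /* 0 means "any address family" */

/* a port passes the filter if it is present as a key in the map */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_PORTS);
    __type(key, __u16);
    __type(value, __u16);
} sports SEC(".maps");

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_PORTS);
    __type(key, __u16);
    __type(value, __u16);
} dports SEC(".maps");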
tsp = bpf_map_lookup_elem(&timestamps, &sk); /* when did this socket enter its previous state? */
ts = bpf_ktime_get_ns();
if (!tsp)
delta_us = 0; /* first event for this socket, no previous timestamp */
else
delta_us = (ts - *tsp) / 1000; /* time spent in the previous state, in microseconds */
event.skaddr = (__u64)sk;
event.ts_us = ts / 1000;
event.delta_us = delta_us;
event.pid = bpf_get_current_pid_tgid() >> 32;
event.oldstate = ctx->oldstate;
event.newstate = ctx->newstate;
event.family = family;
event.sport = sport;
event.dport = dport;
bpf_get_current_comm(&event.task, sizeof(event.task));
if (family == AF_INET) {
bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_rcv_saddr);
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_daddr);
} else { /* family == AF_INET6 */
bpf_probe_read_kernel(&event.saddr, sizeof(event.saddr), &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
bpf_probe_read_kernel(&event.daddr, sizeof(event.daddr), &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
}
The event struct is then filled in with the information about this state change.
- libbpf's CO-RE (Compile Once, Run Everywhere) support is used here, so the struct sock fields can be read correctly across kernel versions.
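The struct event layout is shared between the BPF and user-space programs through a common header (tcpstates.h in the upstream tool), so both sides agree on the record format; a sketch of the fields used above (the 128-bit address fields hold IPv4 addresses in their low 32 bits):

#define TASK_COMM_LEN 16

struct event {
    unsigned __int128 saddr;  /* source address (IPv4 or IPv6) */
    unsigned __int128 daddr;  /* destination address */
    __u64 skaddr;             /* kernel socket address, correlates output lines */
    __u64 ts_us;              /* timestamp of this state change, microseconds */
    __u64 delta_us;           /* time spent in the previous state */
    __u32 pid;
    int oldstate;
    int newstate;
    __u16 family;             /* AF_INET or AF_INET6 */
    __u16 sport;
    __u16 dport;
    char task[TASK_COMM_LEN]; /* current on-CPU command name */
};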
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event));
The event struct is sent to the user-space program through the perf event buffer.
if (ctx->newstate == TCP_CLOSE)
bpf_map_delete_elem(&timestamps, &sk);
else
bpf_map_update_elem(&timestamps, &sk, &ts, BPF_ANY);
Depending on the new state of this TCP connection, the handler either updates the stored timestamp for the socket or, when the connection is closed, stops tracking it and deletes its entry.
As for the user-space program:
while (!exiting) {
err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS);
if (err < 0 && err != -EINTR) {
warn("error polling perf buffer: %s\n", strerror(-err));
goto cleanup;
}
/* reset err to return 0 if exiting */
err = 0;
}
It keeps polling the perf events sent over by the kernel-side program.
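Before entering this loop, the user-space program opens a perf buffer over the events map and registers the two callbacks defined below. A minimal sketch, assuming a recent libbpf perf_buffer__new() signature and a generated skeleton object named obj:

/* 16 pages per CPU for the perf ring; handle_event and handle_lost_events are defined below */
pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 16,
                      handle_event, handle_lost_events, NULL, NULL);
if (!pb) {
    err = -errno;
    warn("failed to open perf buffer: %d\n", err);
    goto cleanup;
}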
static void handle_event(void* ctx, int cpu, void* data, __u32 data_sz) {
char ts[32], saddr[26], daddr[26];
struct event* e = data;
struct tm* tm;
int family;
time_t t;
if (emit_timestamp) {
time(&t);
tm = localtime(&t);
strftime(ts, sizeof(ts), "%H:%M:%S", tm);
printf("%8s ", ts);
}
inet_ntop(e->family, &e->saddr, saddr, sizeof(saddr));
inet_ntop(e->family, &e->daddr, daddr, sizeof(daddr));
if (wide_output) {
family = e->family == AF_INET ? 4 : 6;
printf(
"%-16llx %-7d %-16s %-2d %-26s %-5d %-26s %-5d %-11s -> %-11s "
"%.3f\n",
e->skaddr, e->pid, e->task, family, saddr, e->sport, daddr,
e->dport, tcp_states[e->oldstate], tcp_states[e->newstate],
(double)e->delta_us / 1000);
} else {
printf(
"%-16llx %-7d %-10.10s %-15s %-5d %-15s %-5d %-11s -> %-11s %.3f\n",
e->skaddr, e->pid, e->task, saddr, e->sport, daddr, e->dport,
tcp_states[e->oldstate], tcp_states[e->newstate],
(double)e->delta_us / 1000);
}
}
static void handle_lost_events(void* ctx, int cpu, __u64 lost_cnt) {
warn("lost %llu events on CPU #%d\n", lost_cnt, cpu);
}
When an event is received, the corresponding handler is called to format it and print the output.
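The tcp_states[] table used in handle_event() simply maps the kernel's numeric TCP state values to printable names; in the upstream tool it looks roughly like this:

static const char *tcp_states[] = {
    [1] = "ESTABLISHED",
    [2] = "SYN_SENT",
    [3] = "SYN_RECV",
    [4] = "FIN_WAIT1",
    [5] = "FIN_WAIT2",
    [6] = "TIME_WAIT",
    [7] = "CLOSE",
    [8] = "CLOSE_WAIT",
    [9] = "LAST_ACK",
    [10] = "LISTEN",
    [11] = "CLOSING",
    [12] = "NEW_SYN_RECV",
    [13] = "UNKNOWN",
};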