mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-02-03 02:04:30 +08:00
update tcpconnlat
This commit is contained in:
2
src/13-tcpconnlat/.gitignore
vendored
2
src/13-tcpconnlat/.gitignore
vendored
@@ -1,2 +1,4 @@
|
||||
.vscode
|
||||
package.json
|
||||
tcpconnlat
|
||||
.output
|
||||
|
||||
141
src/13-tcpconnlat/Makefile
Normal file
141
src/13-tcpconnlat/Makefile
Normal file
@@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
OUTPUT := .output
|
||||
CLANG ?= clang
|
||||
LIBBPF_SRC := $(abspath ../../libbpf/src)
|
||||
BPFTOOL_SRC := $(abspath ../../bpftool/src)
|
||||
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
|
||||
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
|
||||
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
|
||||
LIBBLAZESYM_SRC := $(abspath ../../blazesym/)
|
||||
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a)
|
||||
LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h)
|
||||
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
|
||||
| sed 's/arm.*/arm/' \
|
||||
| sed 's/aarch64/arm64/' \
|
||||
| sed 's/ppc64le/powerpc/' \
|
||||
| sed 's/mips.*/mips/' \
|
||||
| sed 's/riscv64/riscv/' \
|
||||
| sed 's/loongarch64/loongarch/')
|
||||
VMLINUX := ../../vmlinux/$(ARCH)/vmlinux.h
|
||||
# Use our own libbpf API headers and Linux UAPI headers distributed with
|
||||
# libbpf to avoid dependency on system-wide headers, which could be missing or
|
||||
# outdated
|
||||
INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX))
|
||||
CFLAGS := -g -Wall
|
||||
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
|
||||
|
||||
APPS = tcpconnlat # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
|
||||
|
||||
CARGO ?= $(shell which cargo)
|
||||
ifeq ($(strip $(CARGO)),)
|
||||
BZS_APPS :=
|
||||
else
|
||||
BZS_APPS := # profile
|
||||
APPS += $(BZS_APPS)
|
||||
# Required by libblazesym
|
||||
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
|
||||
endif
|
||||
|
||||
# Get Clang's default includes on this system. We'll explicitly add these dirs
|
||||
# to the includes list when compiling with `-target bpf` because otherwise some
|
||||
# architecture-specific dirs will be "missing" on some architectures/distros -
|
||||
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
|
||||
# sys/cdefs.h etc. might be missing.
|
||||
#
|
||||
# Use '-idirafter': Don't interfere with include mechanics except where the
|
||||
# build would have failed anyways.
|
||||
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
|
||||
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
|
||||
|
||||
ifeq ($(V),1)
|
||||
Q =
|
||||
msg =
|
||||
else
|
||||
Q = @
|
||||
msg = @printf ' %-8s %s%s\n' \
|
||||
"$(1)" \
|
||||
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
|
||||
"$(if $(3), $(3))";
|
||||
MAKEFLAGS += --no-print-directory
|
||||
endif
|
||||
|
||||
define allow-override
|
||||
$(if $(or $(findstring environment,$(origin $(1))),\
|
||||
$(findstring command line,$(origin $(1)))),,\
|
||||
$(eval $(1) = $(2)))
|
||||
endef
|
||||
|
||||
$(call allow-override,CC,$(CROSS_COMPILE)cc)
|
||||
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||
|
||||
.PHONY: all
|
||||
all: $(APPS)
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
$(call msg,CLEAN)
|
||||
$(Q)rm -rf $(OUTPUT) $(APPS)
|
||||
|
||||
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
|
||||
$(call msg,MKDIR,$@)
|
||||
$(Q)mkdir -p $@
|
||||
|
||||
# Build libbpf
|
||||
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
|
||||
$(call msg,LIB,$@)
|
||||
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
|
||||
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
|
||||
INCLUDEDIR= LIBDIR= UAPIDIR= \
|
||||
install
|
||||
|
||||
# Build bpftool
|
||||
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
|
||||
$(call msg,BPFTOOL,$@)
|
||||
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
|
||||
|
||||
|
||||
$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
|
||||
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
|
||||
|
||||
$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
|
||||
$(call msg,LIB, $@)
|
||||
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
|
||||
|
||||
$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
|
||||
$(call msg,LIB,$@)
|
||||
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
|
||||
|
||||
# Build BPF code
|
||||
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
|
||||
$(call msg,BPF,$@)
|
||||
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
|
||||
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
|
||||
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||
|
||||
# Generate BPF skeletons
|
||||
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
|
||||
$(call msg,GEN-SKEL,$@)
|
||||
$(Q)$(BPFTOOL) gen skeleton $< > $@
|
||||
|
||||
# Build user-space code
|
||||
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
|
||||
|
||||
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
|
||||
$(call msg,CC,$@)
|
||||
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
|
||||
|
||||
$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
|
||||
|
||||
$(BZS_APPS): $(LIBBLAZESYM_OBJ)
|
||||
|
||||
# Build application binary
|
||||
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
|
||||
$(call msg,BINARY,$@)
|
||||
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
|
||||
|
||||
# delete failed targets
|
||||
.DELETE_ON_ERROR:
|
||||
|
||||
# keep intermediate (.skel.h, .bpf.o, etc) targets
|
||||
.SECONDARY:
|
||||
@@ -8,7 +8,7 @@ tcp 连接延时分析对于网络性能分析优化或者故障排查都能起
|
||||
|
||||
## tcpconnlat 的实现原理
|
||||
|
||||
tcpconnlat 这个工具跟踪执行活动TCP连接的内核函数 (例如,通过connect()系统调用),并显示本地测量的连接的延迟(时间),即从发送 SYN 到响应包的时间。
|
||||
tcpconnlat 这个工具跟踪执行活动TCP连接的内核函数(例如,通过connect()系统调用),并显示本地测量的连接的延迟(时间),即从发送 SYN 到响应包的时间。
|
||||
|
||||
### tcp 连接原理
|
||||
|
||||
@@ -141,16 +141,14 @@ cleanup:
|
||||
|
||||
## 编译运行
|
||||
|
||||
- ```git clone https://github.com/libbpf/libbpf-bootstrap libbpf-bootstrap-cloned```
|
||||
- 将 [libbpf-bootstrap](libbpf-bootstrap)目录下的文件复制到 ```libbpf-bootstrap-cloned/examples/c```下
|
||||
- 修改 ```libbpf-bootstrap-cloned/examples/c/Makefile``` ,在其 ```APPS``` 项后添加 ```tcpconnlat```
|
||||
- 在 ```libbpf-bootstrap-cloned/examples/c``` 下运行 ```make tcpconnlat```
|
||||
- ```sudo ./tcpconnlat```
|
||||
|
||||
## 效果
|
||||
|
||||
```plain
|
||||
root@yutong-VirtualBox:~/libbpf-bootstrap/examples/c# ./tcpconnlat
|
||||
```console
|
||||
$ make
|
||||
...
|
||||
BPF .output/tcpconnlat.bpf.o
|
||||
GEN-SKEL .output/tcpconnlat.skel.h
|
||||
CC .output/tcpconnlat.o
|
||||
BINARY tcpconnlat
|
||||
$ sudo ./tcpconnlat
|
||||
PID COMM IP SADDR DADDR DPORT LAT(ms)
|
||||
222564 wget 4 192.168.88.15 110.242.68.3 80 25.29
|
||||
222684 wget 4 192.168.88.15 167.179.101.42 443 246.76
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
// Copyright (c) 2020 Wenbo Zhang
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include "tcpconnlat.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
|
||||
const volatile __u64 targ_min_us = 0;
|
||||
const volatile pid_t targ_tgid = 0;
|
||||
|
||||
struct piddata {
|
||||
char comm[TASK_COMM_LEN];
|
||||
u64 ts;
|
||||
u32 tgid;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, struct sock *);
|
||||
__type(value, struct piddata);
|
||||
} start SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} events SEC(".maps");
|
||||
|
||||
static int trace_connect(struct sock *sk)
|
||||
{
|
||||
u32 tgid = bpf_get_current_pid_tgid() >> 32;
|
||||
struct piddata piddata = {};
|
||||
|
||||
if (targ_tgid && targ_tgid != tgid)
|
||||
return 0;
|
||||
|
||||
bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm));
|
||||
piddata.ts = bpf_ktime_get_ns();
|
||||
piddata.tgid = tgid;
|
||||
bpf_map_update_elem(&start, &sk, &piddata, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk)
|
||||
{
|
||||
struct piddata *piddatap;
|
||||
struct event event = {};
|
||||
s64 delta;
|
||||
u64 ts;
|
||||
|
||||
if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT)
|
||||
return 0;
|
||||
|
||||
piddatap = bpf_map_lookup_elem(&start, &sk);
|
||||
if (!piddatap)
|
||||
return 0;
|
||||
|
||||
ts = bpf_ktime_get_ns();
|
||||
delta = (s64)(ts - piddatap->ts);
|
||||
if (delta < 0)
|
||||
goto cleanup;
|
||||
|
||||
event.delta_us = delta / 1000U;
|
||||
if (targ_min_us && event.delta_us < targ_min_us)
|
||||
goto cleanup;
|
||||
__builtin_memcpy(&event.comm, piddatap->comm,
|
||||
sizeof(event.comm));
|
||||
event.ts_us = ts / 1000;
|
||||
event.tgid = piddatap->tgid;
|
||||
event.lport = BPF_CORE_READ(sk, __sk_common.skc_num);
|
||||
event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport);
|
||||
event.af = BPF_CORE_READ(sk, __sk_common.skc_family);
|
||||
if (event.af == AF_INET) {
|
||||
event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr);
|
||||
event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr);
|
||||
} else {
|
||||
BPF_CORE_READ_INTO(&event.saddr_v6, sk,
|
||||
__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
||||
BPF_CORE_READ_INTO(&event.daddr_v6, sk,
|
||||
__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
||||
}
|
||||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(event));
|
||||
|
||||
cleanup:
|
||||
bpf_map_delete_elem(&start, &sk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("kprobe/tcp_v4_connect")
|
||||
int BPF_KPROBE(tcp_v4_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("kprobe/tcp_v6_connect")
|
||||
int BPF_KPROBE(tcp_v6_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("kprobe/tcp_rcv_state_process")
|
||||
int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk)
|
||||
{
|
||||
return handle_tcp_rcv_state_process(ctx, sk);
|
||||
}
|
||||
|
||||
SEC("fentry/tcp_v4_connect")
|
||||
int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("fentry/tcp_v6_connect")
|
||||
int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("fentry/tcp_rcv_state_process")
|
||||
int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk)
|
||||
{
|
||||
return handle_tcp_rcv_state_process(ctx, sk);
|
||||
}
|
||||
|
||||
char LICENSE[] SEC("license") = "GPL";
|
||||
@@ -4,7 +4,7 @@
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include "tcpconnlat.bpf.h"
|
||||
#include "tcpconnlat.h"
|
||||
|
||||
#define AF_INET 2
|
||||
#define AF_INET6 10
|
||||
@@ -110,4 +110,22 @@ int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk)
|
||||
return handle_tcp_rcv_state_process(ctx, sk);
|
||||
}
|
||||
|
||||
char LICENSE[] SEC("license") = "GPL";
|
||||
SEC("fentry/tcp_v4_connect")
|
||||
int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("fentry/tcp_v6_connect")
|
||||
int BPF_PROG(fentry_tcp_v6_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("fentry/tcp_rcv_state_process")
|
||||
int BPF_PROG(fentry_tcp_rcv_state_process, struct sock *sk)
|
||||
{
|
||||
return handle_tcp_rcv_state_process(ctx, sk);
|
||||
}
|
||||
|
||||
char LICENSE[] SEC("license") = "GPL";
|
||||
@@ -1,26 +0,0 @@
|
||||
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
|
||||
#ifndef __TCPCONNLAT_H
|
||||
#define __TCPCONNLAT_H
|
||||
|
||||
#define TASK_COMM_LEN 16
|
||||
|
||||
struct event {
|
||||
// union {
|
||||
unsigned int saddr_v4;
|
||||
unsigned char saddr_v6[16];
|
||||
// };
|
||||
// union {
|
||||
unsigned int daddr_v4;
|
||||
unsigned char daddr_v6[16];
|
||||
// };
|
||||
char comm[TASK_COMM_LEN];
|
||||
unsigned long long delta_us;
|
||||
unsigned long long ts_us;
|
||||
unsigned int tgid;
|
||||
int af;
|
||||
unsigned short lport;
|
||||
unsigned short dport;
|
||||
};
|
||||
|
||||
|
||||
#endif /* __TCPCONNLAT_H_ */
|
||||
@@ -1,158 +0,0 @@
|
||||
# eBPF 入门实践教程:编写 eBPF 程序 tcpconnlat 测量 tcp 连接延时
|
||||
|
||||
## 代码解释
|
||||
|
||||
### 背景
|
||||
|
||||
在互联网后端日常开发接口的时候中,不管你使用的是C、Java、PHP还是Golang,都避免不了需要调用mysql、redis等组件来获取数据,可能还需要执行一些rpc远程调用,或者再调用一些其它restful api。 在这些调用的底层,基本都是在使用TCP协议进行传输。这是因为在传输层协议中,TCP协议具备可靠的连接,错误重传,拥塞控制等优点,所以目前应用比UDP更广泛一些。但相对而言,tcp 连接也有一些缺点,例如建立连接的延时较长等。因此也会出现像 QUIC ,即 快速UDP网络连接 ( Quick UDP Internet Connections )这样的替代方案。
|
||||
|
||||
tcp 连接延时分析对于网络性能分析优化或者故障排查都能起到不少作用。
|
||||
|
||||
### tcpconnlat 的实现原理
|
||||
|
||||
tcpconnlat 这个工具跟踪执行活动TCP连接的内核函数 (例如,通过connect()系统调用),并显示本地测量的连接的延迟(时间),即从发送 SYN 到响应包的时间。
|
||||
|
||||
### tcp 连接原理
|
||||
|
||||
tcp 连接的整个过程如图所示:
|
||||
|
||||

|
||||
|
||||
在这个连接过程中,我们来简单分析一下每一步的耗时:
|
||||
|
||||
1. 客户端发出SYNC包:客户端一般是通过connect系统调用来发出 SYN 的,这里牵涉到本机的系统调用和软中断的 CPU 耗时开销
|
||||
2. SYN传到服务器:SYN从客户端网卡被发出,这是一次长途远距离的网络传输
|
||||
3. 服务器处理SYN包:内核通过软中断来收包,然后放到半连接队列中,然后再发出SYN/ACK响应。主要是 CPU 耗时开销
|
||||
4. SYC/ACK传到客户端:长途网络跋涉
|
||||
5. 客户端处理 SYN/ACK:客户端内核收包并处理SYN后,经过几us的CPU处理,接着发出 ACK。同样是软中断处理开销
|
||||
6. ACK传到服务器:长途网络跋涉
|
||||
7. 服务端收到ACK:服务器端内核收到并处理ACK,然后把对应的连接从半连接队列中取出来,然后放到全连接队列中。一次软中断CPU开销
|
||||
8. 服务器端用户进程唤醒:正在被accpet系统调用阻塞的用户进程被唤醒,然后从全连接队列中取出来已经建立好的连接。一次上下文切换的CPU开销
|
||||
|
||||
在客户端视角,在正常情况下一次TCP连接总的耗时也就就大约是一次网络RTT的耗时。但在某些情况下,可能会导致连接时的网络传输耗时上涨、CPU处理开销增加、甚至是连接失败。这种时候在发现延时过长之后,就可以结合其他信息进行分析。
|
||||
|
||||
### ebpf 实现原理
|
||||
|
||||
在 TCP 三次握手的时候,Linux 内核会维护两个队列,分别是:
|
||||
|
||||
- 半连接队列,也称 SYN 队列;
|
||||
- 全连接队列,也称 accepet 队列;
|
||||
|
||||
服务端收到客户端发起的 SYN 请求后,内核会把该连接存储到半连接队列,并向客户端响应 SYN+ACK,接着客户端会返回 ACK,服务端收到第三次握手的 ACK 后,内核会把连接从半连接队列移除,然后创建新的完全的连接,并将其添加到 accept 队列,等待进程调用 accept 函数时把连接取出来。
|
||||
|
||||
我们的 ebpf 代码实现在 <https://github.com/yunwei37/Eunomia/blob/master/bpftools/tcpconnlat/tcpconnlat.bpf.c> 中:
|
||||
|
||||
它主要使用了 trace_tcp_rcv_state_process 和 kprobe/tcp_v4_connect 这样的跟踪点:
|
||||
|
||||
```c
|
||||
|
||||
SEC("kprobe/tcp_v4_connect")
|
||||
int BPF_KPROBE(tcp_v4_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("kprobe/tcp_v6_connect")
|
||||
int BPF_KPROBE(tcp_v6_connect, struct sock *sk)
|
||||
{
|
||||
return trace_connect(sk);
|
||||
}
|
||||
|
||||
SEC("kprobe/tcp_rcv_state_process")
|
||||
int BPF_KPROBE(tcp_rcv_state_process, struct sock *sk)
|
||||
{
|
||||
return handle_tcp_rcv_state_process(ctx, sk);
|
||||
}
|
||||
```
|
||||
|
||||
在 trace_connect 中,我们跟踪新的 tcp 连接,记录到达时间,并且把它加入 map 中:
|
||||
|
||||
```c
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, struct sock *);
|
||||
__type(value, struct piddata);
|
||||
} start SEC(".maps");
|
||||
|
||||
static int trace_connect(struct sock *sk)
|
||||
{
|
||||
u32 tgid = bpf_get_current_pid_tgid() >> 32;
|
||||
struct piddata piddata = {};
|
||||
|
||||
if (targ_tgid && targ_tgid != tgid)
|
||||
return 0;
|
||||
|
||||
bpf_get_current_comm(&piddata.comm, sizeof(piddata.comm));
|
||||
piddata.ts = bpf_ktime_get_ns();
|
||||
piddata.tgid = tgid;
|
||||
bpf_map_update_elem(&start, &sk, &piddata, 0);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
在 handle_tcp_rcv_state_process 中,我们跟踪接收到的 tcp 数据包,从 map 从提取出对应的 connect 事件,并且计算延迟:
|
||||
|
||||
```c
|
||||
static int handle_tcp_rcv_state_process(void *ctx, struct sock *sk)
|
||||
{
|
||||
struct piddata *piddatap;
|
||||
struct event event = {};
|
||||
s64 delta;
|
||||
u64 ts;
|
||||
|
||||
if (BPF_CORE_READ(sk, __sk_common.skc_state) != TCP_SYN_SENT)
|
||||
return 0;
|
||||
|
||||
piddatap = bpf_map_lookup_elem(&start, &sk);
|
||||
if (!piddatap)
|
||||
return 0;
|
||||
|
||||
ts = bpf_ktime_get_ns();
|
||||
delta = (s64)(ts - piddatap->ts);
|
||||
if (delta < 0)
|
||||
goto cleanup;
|
||||
|
||||
event.delta_us = delta / 1000U;
|
||||
if (targ_min_us && event.delta_us < targ_min_us)
|
||||
goto cleanup;
|
||||
__builtin_memcpy(&event.comm, piddatap->comm,
|
||||
sizeof(event.comm));
|
||||
event.ts_us = ts / 1000;
|
||||
event.tgid = piddatap->tgid;
|
||||
event.lport = BPF_CORE_READ(sk, __sk_common.skc_num);
|
||||
event.dport = BPF_CORE_READ(sk, __sk_common.skc_dport);
|
||||
event.af = BPF_CORE_READ(sk, __sk_common.skc_family);
|
||||
if (event.af == AF_INET) {
|
||||
event.saddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr);
|
||||
event.daddr_v4 = BPF_CORE_READ(sk, __sk_common.skc_daddr);
|
||||
} else {
|
||||
BPF_CORE_READ_INTO(&event.saddr_v6, sk,
|
||||
__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
|
||||
BPF_CORE_READ_INTO(&event.daddr_v6, sk,
|
||||
__sk_common.skc_v6_daddr.in6_u.u6_addr32);
|
||||
}
|
||||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(event));
|
||||
|
||||
cleanup:
|
||||
bpf_map_delete_elem(&start, &sk);
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
### 编译运行
|
||||
|
||||
TODO
|
||||
### 总结
|
||||
|
||||
通过上面的实验,我们可以看到,tcpconnlat 工具的实现原理是基于内核的TCP连接的跟踪,并且可以跟踪到 tcp 连接的延迟时间;除了命令行使用方式之外,还可以将其和容器、k8s 等元信息综合起来,通过 `prometheus` 和 `grafana` 等工具进行网络性能分析。
|
||||
|
||||
> `Eunomia` 是一个使用 C/C++ 开发的基于 eBPF的轻量级,高性能云原生监控工具,旨在帮助用户了解容器的各项行为、监控可疑的容器安全事件,力求提供覆盖容器全生命周期的轻量级开源监控解决方案。它使用 `Linux` `eBPF` 技术在运行时跟踪您的系统和应用程序,并分析收集的事件以检测可疑的行为模式。目前,它包含性能分析、容器集群网络可视化分析*、容器安全感知告警、一键部署、持久化存储监控等功能,提供了多样化的 ebpf 追踪点。其核心导出器/命令行工具最小仅需要约 4MB 大小的二进制程序,即可在支持的 Linux 内核上启动。
|
||||
|
||||
项目地址:<https://github.com/yunwei37/Eunomia>
|
||||
|
||||
### 参考资料
|
||||
|
||||
1. <http://kerneltravel.net/blog/2020/tcpconnlat/>
|
||||
2. <https://network.51cto.com/article/640631.html>
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 40 KiB |
@@ -39,10 +39,10 @@ const volatile bool targ_ms = false;
|
||||
|
||||
/// @sample {"interval": 1000, "type" : "log2_hist"}
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, MAX_ENTRIES);
|
||||
__type(key, u64);
|
||||
__type(value, struct hist);
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, MAX_ENTRIES);
|
||||
__type(key, u64);
|
||||
__type(value, struct hist);
|
||||
} hists SEC(".maps");
|
||||
|
||||
static struct hist zero;
|
||||
@@ -50,43 +50,43 @@ static struct hist zero;
|
||||
SEC("fentry/tcp_rcv_established")
|
||||
int BPF_PROG(tcp_rcv, struct sock *sk)
|
||||
{
|
||||
const struct inet_sock *inet = (struct inet_sock *)(sk);
|
||||
struct tcp_sock *ts;
|
||||
struct hist *histp;
|
||||
u64 key, slot;
|
||||
u32 srtt;
|
||||
const struct inet_sock *inet = (struct inet_sock *)(sk);
|
||||
struct tcp_sock *ts;
|
||||
struct hist *histp;
|
||||
u64 key, slot;
|
||||
u32 srtt;
|
||||
|
||||
if (targ_sport && targ_sport != inet->inet_sport)
|
||||
return 0;
|
||||
if (targ_dport && targ_dport != sk->__sk_common.skc_dport)
|
||||
return 0;
|
||||
if (targ_saddr && targ_saddr != inet->inet_saddr)
|
||||
return 0;
|
||||
if (targ_daddr && targ_daddr != sk->__sk_common.skc_daddr)
|
||||
return 0;
|
||||
if (targ_sport && targ_sport != inet->inet_sport)
|
||||
return 0;
|
||||
if (targ_dport && targ_dport != sk->__sk_common.skc_dport)
|
||||
return 0;
|
||||
if (targ_saddr && targ_saddr != inet->inet_saddr)
|
||||
return 0;
|
||||
if (targ_daddr && targ_daddr != sk->__sk_common.skc_daddr)
|
||||
return 0;
|
||||
|
||||
if (targ_laddr_hist)
|
||||
key = inet->inet_saddr;
|
||||
else if (targ_raddr_hist)
|
||||
key = inet->sk.__sk_common.skc_daddr;
|
||||
else
|
||||
key = 0;
|
||||
histp = bpf_map_lookup_or_try_init(&hists, &key, &zero);
|
||||
if (!histp)
|
||||
return 0;
|
||||
ts = (struct tcp_sock *)(sk);
|
||||
srtt = BPF_CORE_READ(ts, srtt_us) >> 3;
|
||||
if (targ_ms)
|
||||
srtt /= 1000U;
|
||||
slot = log2l(srtt);
|
||||
if (slot >= MAX_SLOTS)
|
||||
slot = MAX_SLOTS - 1;
|
||||
__sync_fetch_and_add(&histp->slots[slot], 1);
|
||||
if (targ_show_ext) {
|
||||
__sync_fetch_and_add(&histp->latency, srtt);
|
||||
__sync_fetch_and_add(&histp->cnt, 1);
|
||||
}
|
||||
return 0;
|
||||
if (targ_laddr_hist)
|
||||
key = inet->inet_saddr;
|
||||
else if (targ_raddr_hist)
|
||||
key = inet->sk.__sk_common.skc_daddr;
|
||||
else
|
||||
key = 0;
|
||||
histp = bpf_map_lookup_or_try_init(&hists, &key, &zero);
|
||||
if (!histp)
|
||||
return 0;
|
||||
ts = (struct tcp_sock *)(sk);
|
||||
srtt = BPF_CORE_READ(ts, srtt_us) >> 3;
|
||||
if (targ_ms)
|
||||
srtt /= 1000U;
|
||||
slot = log2l(srtt);
|
||||
if (slot >= MAX_SLOTS)
|
||||
slot = MAX_SLOTS - 1;
|
||||
__sync_fetch_and_add(&histp->slots[slot], 1);
|
||||
if (targ_show_ext) {
|
||||
__sync_fetch_and_add(&histp->latency, srtt);
|
||||
__sync_fetch_and_add(&histp->cnt, 1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
```
|
||||
@@ -95,20 +95,15 @@ int BPF_PROG(tcp_rcv, struct sock *sk)
|
||||
这段代码是基于eBPF的网络延迟分析工具,它通过hooking TCP协议栈中的tcp_rcv_established函数来统计TCP连接的RTT分布。下面是这段代码的主要工作原理:
|
||||
|
||||
1. 首先定义了一个名为"hists"的eBPF哈希表,用于保存RTT直方图数据。
|
||||
|
||||
2. 当tcp_rcv_established函数被调用时,它首先从传入的socket结构体中获取TCP相关信息,包括本地/远程IP地址、本地/远程端口号以及TCP状态信息等。
|
||||
|
||||
3. 接下来,代码会检查用户指定的条件是否匹配当前TCP连接。如果匹配失败,则直接返回。
|
||||
|
||||
4. 如果匹配成功,则从"hists"哈希表中查找与本地/远程IP地址匹配的直方图数据。如果该IP地址的直方图不存在,则创建一个新的直方图并插入哈希表中。
|
||||
|
||||
5. 接下来,代码会从socket结构体中获取当前TCP连接的RTT(srtt),并根据用户设置的选项来将srtt值进行处理。如果用户设置了"-ms"选项,则将srtt值除以1000。
|
||||
|
||||
6. 接着,代码会将srtt值转换为直方图的槽位(slot),并将该槽位的计数器+1。
|
||||
|
||||
7. 如果用户设置了"-show-ext"选项,则还会累加直方图的总延迟(latency)和计数(cnt)。
|
||||
|
||||
## 编译运行
|
||||
|
||||
eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发工具链,它的目的是简化 eBPF 程序的开发、构建、分发、运行。可以参考 <https://github.com/eunomia-bpf/eunomia-bpf> 下载和安装 ecc 编译工具链和 ecli 运行时。我们使用 eunomia-bpf 编译运行这个例子。
|
||||
|
||||
Compile:
|
||||
|
||||
Reference in New Issue
Block a user