mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-02-03 02:04:30 +08:00
Add http and translate for signal (#73)
* use BPF_NO_GLOBAL_DATA to avoid error on old kernel versions or libbpf versions, use BPF_NO_GLOBAL_DATA * add translate of replace * add socket * add socket http impl * improve userspace program * add desc for http * update translate to eng * add ci for 32
This commit is contained in:
5
.github/workflows/test-eunomia.yaml
vendored
5
.github/workflows/test-eunomia.yaml
vendored
@@ -64,6 +64,11 @@ jobs:
|
||||
run: |
|
||||
./ecc src/20-tc/tc.bpf.c
|
||||
sudo timeout -s 2 3 ./ecli run src/20-tc/package.json || if [ $? = 124 ]; then exit 0; else exit $?; fi
|
||||
- name: test 23 http
|
||||
run: |
|
||||
./ecc src/23-http/accept.bpf.c src/23-http/accept.h
|
||||
sudo timeout -s 2 3 ./ecli run src/23-http/package.json || if [ $? = 124 ]; then exit 0; else exit $?; fi
|
||||
|
||||
- name: test 25 signal
|
||||
run: |
|
||||
./ecc src/25-signal/signal.bpf.c src/25-signal/signal.h
|
||||
|
||||
4
.github/workflows/test-libbpf.yml
vendored
4
.github/workflows/test-libbpf.yml
vendored
@@ -43,6 +43,10 @@ jobs:
|
||||
run: |
|
||||
make -C src/17-biopattern
|
||||
# sudo timeout -s 2 3 src/17-biopattern/biopattern || if [ $? = 124 ]; then exit 0; else exit $?; fi
|
||||
- name: test 23 http
|
||||
run: |
|
||||
make -C src/23-http
|
||||
# sudo timeout -s 2 3 src/23-http/sockfilter || if [ $? = 124 ]; then exit 0; else exit $?; fi
|
||||
- name: test 30 sslsniff
|
||||
run: |
|
||||
make -C src/30-sslsniff
|
||||
|
||||
@@ -62,7 +62,7 @@ Android:
|
||||
网络和追踪:
|
||||
|
||||
- [使用 uprobe 捕获多种库的 SSL/TLS 明文数据](src/30-sslsniff/README.md)
|
||||
- [使用 eBPF 追踪 HTTP 请求或其他七层协议](src/23-http/README.md)
|
||||
- [使用 eBPF socket filter 或 syscall trace 追踪 HTTP 请求和其他七层协议](src/23-http/README.md)
|
||||
- [使用 sockops 加速网络请求转发](src/29-sockops/README.md)
|
||||
|
||||
安全:
|
||||
|
||||
@@ -49,7 +49,7 @@ Android:
|
||||
|
||||
Networking and tracing:
|
||||
|
||||
- [Tracing HTTP requests or other layer-7 protocols using eBPF](src/23-http/README_en.md)
|
||||
- [Tracing HTTP requests or other layer-7 protocols using eBPF socket filter or syscall trace](src/23-http/README_en.md)
|
||||
- [Accelerating network request forwarding using sockops](src/29-sockops/README_en.md)
|
||||
- [Capturing Plain Text Data of Various Libraries' SSL/TLS Using uprobe](src/30-sslsniff/README_en.md)
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
/* Copyright (c) 2021 Sartura */
|
||||
#define BPF_NO_GLOBAL_DATA
|
||||
#include "vmlinux.h"
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
9
src/23-http/.gitignore
vendored
Normal file
9
src/23-http/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
.vscode
|
||||
package.json
|
||||
*.o
|
||||
*.skel.json
|
||||
*.skel.yaml
|
||||
package.yaml
|
||||
ecli
|
||||
sockfilter
|
||||
*.class
|
||||
141
src/23-http/Makefile
Normal file
141
src/23-http/Makefile
Normal file
@@ -0,0 +1,141 @@
|
||||
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||
OUTPUT := .output
|
||||
CLANG ?= clang
|
||||
LIBBPF_SRC := $(abspath ../third_party/libbpf/src)
|
||||
BPFTOOL_SRC := $(abspath ../third_party/bpftool/src)
|
||||
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
|
||||
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
|
||||
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
|
||||
LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/)
|
||||
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a)
|
||||
LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h)
|
||||
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
|
||||
| sed 's/arm.*/arm/' \
|
||||
| sed 's/aarch64/arm64/' \
|
||||
| sed 's/ppc64le/powerpc/' \
|
||||
| sed 's/mips.*/mips/' \
|
||||
| sed 's/riscv64/riscv/' \
|
||||
| sed 's/loongarch64/loongarch/')
|
||||
VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h
|
||||
# Use our own libbpf API headers and Linux UAPI headers distributed with
|
||||
# libbpf to avoid dependency on system-wide headers, which could be missing or
|
||||
# outdated
|
||||
INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
|
||||
CFLAGS := -g -Wall
|
||||
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
|
||||
|
||||
APPS = sockfilter # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
|
||||
|
||||
CARGO ?= $(shell which cargo)
|
||||
ifeq ($(strip $(CARGO)),)
|
||||
BZS_APPS :=
|
||||
else
|
||||
BZS_APPS := # profile
|
||||
APPS += $(BZS_APPS)
|
||||
# Required by libblazesym
|
||||
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
|
||||
endif
|
||||
|
||||
# Get Clang's default includes on this system. We'll explicitly add these dirs
|
||||
# to the includes list when compiling with `-target bpf` because otherwise some
|
||||
# architecture-specific dirs will be "missing" on some architectures/distros -
|
||||
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
|
||||
# sys/cdefs.h etc. might be missing.
|
||||
#
|
||||
# Use '-idirafter': Don't interfere with include mechanics except where the
|
||||
# build would have failed anyways.
|
||||
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
|
||||
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
|
||||
|
||||
ifeq ($(V),1)
|
||||
Q =
|
||||
msg =
|
||||
else
|
||||
Q = @
|
||||
msg = @printf ' %-8s %s%s\n' \
|
||||
"$(1)" \
|
||||
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
|
||||
"$(if $(3), $(3))";
|
||||
MAKEFLAGS += --no-print-directory
|
||||
endif
|
||||
|
||||
define allow-override
|
||||
$(if $(or $(findstring environment,$(origin $(1))),\
|
||||
$(findstring command line,$(origin $(1)))),,\
|
||||
$(eval $(1) = $(2)))
|
||||
endef
|
||||
|
||||
$(call allow-override,CC,$(CROSS_COMPILE)cc)
|
||||
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||
|
||||
.PHONY: all
|
||||
all: $(APPS)
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
$(call msg,CLEAN)
|
||||
$(Q)rm -rf $(OUTPUT) $(APPS)
|
||||
|
||||
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
|
||||
$(call msg,MKDIR,$@)
|
||||
$(Q)mkdir -p $@
|
||||
|
||||
# Build libbpf
|
||||
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
|
||||
$(call msg,LIB,$@)
|
||||
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
|
||||
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
|
||||
INCLUDEDIR= LIBDIR= UAPIDIR= \
|
||||
install
|
||||
|
||||
# Build bpftool
|
||||
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
|
||||
$(call msg,BPFTOOL,$@)
|
||||
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
|
||||
|
||||
|
||||
$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
|
||||
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
|
||||
|
||||
$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
|
||||
$(call msg,LIB, $@)
|
||||
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
|
||||
|
||||
$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
|
||||
$(call msg,LIB,$@)
|
||||
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
|
||||
|
||||
# Build BPF code
|
||||
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
|
||||
$(call msg,BPF,$@)
|
||||
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
|
||||
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
|
||||
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||
|
||||
# Generate BPF skeletons
|
||||
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
|
||||
$(call msg,GEN-SKEL,$@)
|
||||
$(Q)$(BPFTOOL) gen skeleton $< > $@
|
||||
|
||||
# Build user-space code
|
||||
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
|
||||
|
||||
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
|
||||
$(call msg,CC,$@)
|
||||
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
|
||||
|
||||
$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
|
||||
|
||||
$(BZS_APPS): $(LIBBLAZESYM_OBJ)
|
||||
|
||||
# Build application binary
|
||||
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
|
||||
$(call msg,BINARY,$@)
|
||||
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
|
||||
|
||||
# delete failed targets
|
||||
.DELETE_ON_ERROR:
|
||||
|
||||
# keep intermediate (.skel.h, .bpf.o, etc) targets
|
||||
.SECONDARY:
|
||||
@@ -1,3 +1,648 @@
|
||||
# http
|
||||
# 通过 eBPF socket filter 或 syscall trace 追踪 HTTP 请求等七层协议 - eBPF 实践教程
|
||||
|
||||
TODO
|
||||
在当今的技术环境中,随着微服务、云原生应用和复杂的分布式系统的崛起,系统的可观测性已成为确保其健康、性能和安全的关键要素。特别是在微服务架构中,应用程序的组件可能分布在多个容器和服务器上,这使得传统的监控方法往往难以提供足够的深度和广度来全面了解系统的行为。这就是为什么观测七层协议,如 HTTP、gRPC、MQTT 等,变得尤为重要。
|
||||
|
||||
七层协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。在微服务环境中,了解这些交互是至关重要的,因为它们经常是性能瓶颈、故障和安全问题的根源。然而,监控这些协议并不简单。传统的网络监控工具,如 tcpdump,虽然在捕获网络流量方面非常有效,但在处理七层协议的复杂性和动态性时,它们往往显得力不从心。
|
||||
|
||||
这正是 eBPF 技术发挥作用的地方。eBPF 允许开发者和运维人员深入到系统的内核层,实时观测和分析系统的行为,而无需对应用程序代码进行任何修改或插入埋点。这为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务环境中。
|
||||
|
||||
在本教程中,我们将深入探讨以下内容:
|
||||
|
||||
- 追踪七层协议,如 HTTP,以及与其相关的挑战。
|
||||
- eBPF 的 socket filter 和 syscall 追踪:这两种技术如何帮助我们在不同的内核层次追踪 HTTP 网络请求数据,以及这两种方法的优势和局限性。
|
||||
- eBPF 实践教程:如何开发一个 eBPF 程序,使用 eBPF socket filter 或 syscall 追踪来捕获和分析 HTTP 流量
|
||||
|
||||
随着网络流量的增加和应用程序的复杂性增加,对七层协议的深入了解变得越来越重要。通过本教程,您将获得必要的知识和工具,以便更有效地监控和分析您的网络流量,从而为您的应用程序和服务器提供最佳的性能。
|
||||
|
||||
本文是 eBPF 开发者教程的一部分,更详细的内容可以在这里找到:<https://eunomia.dev/tutorials/> 源代码在 [GitHub 仓库](https://github.com/eunomia-bpf/bpf-developer-tutorial) 中开源。
|
||||
|
||||
## 追踪 HTTP, HTTP/2 等七层协议的挑战
|
||||
|
||||
在现代的网络环境中,七层协议不仅仅局限于 HTTP。实际上,有许多七层协议,如 HTTP/2, gRPC, MQTT, WebSocket, AMQP 和 SMTP,它们都在不同的应用场景中发挥着关键作用。这些协议为我们提供了关于应用程序如何与其他服务和组件交互的详细信息。但是,追踪这些协议并不是一个简单的任务,尤其是在复杂的分布式系统中。
|
||||
|
||||
1. **多样性和复杂性**:每种七层协议都有其特定的设计和工作原理。例如,gRPC 使用了 HTTP/2 作为其传输协议,并支持多种语言。而 MQTT 是为低带宽和不可靠的网络设计的轻量级发布/订阅消息传输协议。
|
||||
|
||||
2. **动态性**:许多七层协议都是动态的,这意味着它们的行为可能会根据网络条件、应用需求或其他因素而变化。
|
||||
|
||||
3. **加密和安全性**:随着安全意识的增强,许多七层协议都采用了加密技术,如 TLS/SSL。这为追踪和分析带来了额外的挑战,因为需要解密流量才能进行深入的分析。
|
||||
|
||||
4. **高性能需求**:在高流量的生产环境中,捕获和分析七层协议的流量可能会对系统性能产生影响。传统的网络监控工具可能无法处理大量的并发会话。
|
||||
|
||||
5. **数据的完整性和连续性**:与 tcpdump 这样的工具只捕获单独的数据包不同,追踪七层协议需要捕获完整的会话,这可能涉及多个数据包。这要求工具能够正确地重组和解析这些数据包,以提供连续的会话视图。
|
||||
|
||||
6. **代码侵入性**:为了深入了解七层协议的行为,开发人员可能需要修改应用程序代码以添加监控功能。这不仅增加了开发和维护的复杂性,而且可能会影响应用程序的性能。
|
||||
|
||||
正如上文所述,eBPF 提供了一个强大的解决方案,允许我们在内核层面捕获和分析七层协议的流量,而无需对应用程序进行任何修改。这种方法为我们提供了一个独特的机会,可以更简单、更高效地处理应用层流量,特别是在微服务和分布式环境中。
|
||||
|
||||
在处理网络流量和系统行为时,选择在内核态而非用户态进行处理有其独特的优势。首先,内核态处理可以直接访问系统资源和硬件,从而提供更高的性能和效率。其次,由于内核是操作系统的核心部分,它可以提供对系统行为的全面视图,而不受任何用户空间应用程序的限制。
|
||||
|
||||
**无插桩追踪("zero-instrumentation observability")**的优势如下:
|
||||
|
||||
1. **性能开销小**:由于不需要修改或添加额外的代码到应用程序中,所以对性能的影响最小化。
|
||||
2. **透明性**:开发者和运维人员不需要知道应用程序的内部工作原理,也不需要访问源代码。
|
||||
3. **灵活性**:可以轻松地在不同的环境和应用程序中部署和使用,无需进行任何特定的配置或修改。
|
||||
4. **安全性**:由于不需要修改应用程序代码,所以降低了引入潜在安全漏洞的风险。
|
||||
|
||||
利用 eBPF 在内核态进行无插桩追踪,我们可以实时捕获和分析系统的行为,而不需要对应用程序进行任何修改。这种方法不仅提供了对系统深入的洞察力,而且确保了最佳的性能和效率。这是为什么 eBPF 成为现代可观测性工具的首选技术,特别是在需要高性能和低延迟的生产环境中。
|
||||
|
||||
## eBPF 中的 socket filter 与 syscall 追踪:深入解析与比较
|
||||
|
||||
### **eBPF Socket Filter**
|
||||
|
||||
**是什么?**
|
||||
eBPF socket filter 是经典的 Berkeley Packet Filter (BPF) 的扩展,允许在内核中直接进行更高级的数据包过滤。它在套接字层操作,使得可以精细地控制哪些数据包被用户空间应用程序处理。
|
||||
|
||||
**主要特点:**
|
||||
|
||||
- **性能**:通过在内核中直接处理数据包,eBPF socket filters 减少了用户和内核空间之间的上下文切换的开销。
|
||||
- **灵活性**:eBPF socket filters 可以附加到任何套接字,为各种协议和套接字类型提供了通用的数据包过滤机制。
|
||||
- **可编程性**:开发者可以编写自定义的 eBPF 程序来定义复杂的过滤逻辑,超越简单的数据包匹配。
|
||||
|
||||
**用途:**
|
||||
|
||||
- **流量控制**:根据自定义条件限制或优先处理流量。
|
||||
- **安全性**:在它们到达用户空间应用程序之前丢弃恶意数据包。
|
||||
- **监控**:捕获特定数据包进行分析,而不影响其它流量。
|
||||
|
||||
### **eBPF Syscall Tracing**
|
||||
|
||||
**是什么?**
|
||||
使用 eBPF 进行的系统调用跟踪允许监视和操作应用程序发出的系统调用。系统调用是用户空间应用程序与内核交互的主要机制,因此跟踪它们可以深入了解应用程序的行为。
|
||||
|
||||
**主要特点:**
|
||||
|
||||
- **粒度**:eBPF 允许跟踪特定的系统调用,甚至是这些系统调用中的特定参数。
|
||||
- **低开销**:与其他跟踪方法相比,eBPF 系统调用跟踪旨在具有最小的性能影响。
|
||||
- **安全性**:内核验证 eBPF 程序,以确保它们不会损害系统稳定性。
|
||||
|
||||
**工作原理:**
|
||||
eBPF 系统调用跟踪通常涉及将 eBPF 程序附加到与系统调用相关的 tracepoints 或 kprobes。当跟踪的系统调用被调用时,执行 eBPF 程序,允许收集数据或甚至修改系统调用参数。
|
||||
|
||||
### eBPF 的 socket filter 和 syscall 追踪的对比
|
||||
|
||||
| 项目 | eBPF Socket Filter | eBPF Syscall Tracing |
|
||||
|------|--------------------|----------------------|
|
||||
| **操作层** | 套接字层,主要处理从套接字接收或发送的网络数据包 | 系统调用层,监视和可能更改应用程序发出的系统调用的行为 |
|
||||
| **主要用途** | 主要用于网络数据包的过滤、监控和操作 | 用于性能分析、安全监控和系统调用交互的调试 |
|
||||
| **粒度** | 专注于单个网络数据包 | 可以监视与网络无关的广泛的系统活动 |
|
||||
| **追踪 HTTP 流量** | 可以用于过滤和捕获通过套接字传递的 HTTP 数据包 | 可以跟踪与网络操作相关的系统调用 |
|
||||
|
||||
总之,eBPF 的 socket filter 和 syscall 追踪都可以用于追踪 HTTP 流量,但 socket filters 更直接且更适合此目的。然而,如果您对应用程序如何与系统交互的更广泛的上下文感兴趣(例如,哪些系统调用导致了 HTTP 流量),那么系统调用跟踪将是非常有价值的。在许多高级的可观察性设置中,这两种工具可能会同时使用,以提供系统和网络行为的全面视图。
|
||||
|
||||
## 使用 eBPF socket filter 来捕获 HTTP 流量
|
||||
|
||||
eBPF 代码由用户态和内核态组成,这里主要关注于内核态代码。这是使用 eBPF socket filter 技术来在内核中捕获HTTP流量的主要逻辑,完整代码如下:
|
||||
|
||||
```c
|
||||
SEC("socket")
|
||||
int socket_handler(struct __sk_buff *skb)
|
||||
{
|
||||
struct so_event *e;
|
||||
__u8 verlen;
|
||||
__u16 proto;
|
||||
__u32 nhoff = ETH_HLEN;
|
||||
__u32 ip_proto = 0;
|
||||
__u32 tcp_hdr_len = 0;
|
||||
__u16 tlen;
|
||||
__u32 payload_offset = 0;
|
||||
__u32 payload_length = 0;
|
||||
__u8 hdr_len;
|
||||
|
||||
bpf_skb_load_bytes(skb, 12, &proto, 2);
|
||||
proto = __bpf_ntohs(proto);
|
||||
if (proto != ETH_P_IP)
|
||||
return 0;
|
||||
|
||||
if (ip_is_fragment(skb, nhoff))
|
||||
return 0;
|
||||
|
||||
// ip4 header lengths are variable
|
||||
// access ihl as a u8 (linux/include/linux/skbuff.h)
|
||||
bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
|
||||
hdr_len &= 0x0f;
|
||||
hdr_len *= 4;
|
||||
|
||||
/* verify hlen meets minimum size requirements */
|
||||
if (hdr_len < sizeof(struct iphdr))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
|
||||
|
||||
if (ip_proto != IPPROTO_TCP)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
tcp_hdr_len = nhoff + hdr_len;
|
||||
bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
|
||||
|
||||
__u8 doff;
|
||||
bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff)); // read the first byte past __tcphdr->ack_seq, we can't do offsetof bit fields
|
||||
doff &= 0xf0; // clean-up res1
|
||||
doff >>= 4; // move the upper 4 bits to low
|
||||
doff *= 4; // convert to bytes length
|
||||
|
||||
payload_offset = ETH_HLEN + hdr_len + doff;
|
||||
payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
|
||||
|
||||
char line_buffer[7];
|
||||
if (payload_length < 7 || payload_offset < 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
|
||||
bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
|
||||
if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "POST") != 0 &&
|
||||
bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
|
||||
bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "HTTP") != 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* reserve sample from BPF ringbuf */
|
||||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||||
if (!e)
|
||||
return 0;
|
||||
|
||||
e->ip_proto = ip_proto;
|
||||
bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4);
|
||||
e->pkt_type = skb->pkt_type;
|
||||
e->ifindex = skb->ifindex;
|
||||
|
||||
e->payload_length = payload_length;
|
||||
bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
|
||||
bpf_ringbuf_submit(e, 0);
|
||||
|
||||
return skb->len;
|
||||
}
|
||||
```
|
||||
|
||||
当分析这段eBPF程序时,我们将按照每个代码块的内容来详细解释,并提供相关的背景知识:
|
||||
|
||||
```c
|
||||
SEC("socket")
|
||||
int socket_handler(struct __sk_buff *skb)
|
||||
{
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
这是eBPF程序的入口点,它定义了一个名为 `socket_handler` 的函数,它会被内核用于处理传入的网络数据包。这个函数位于一个名为 `socket` 的 eBPF 节(section)中,表明这个程序用于套接字处理。
|
||||
|
||||
```c
|
||||
struct so_event *e;
|
||||
__u8 verlen;
|
||||
__u16 proto;
|
||||
__u32 nhoff = ETH_HLEN;
|
||||
__u32 ip_proto = 0;
|
||||
__u32 tcp_hdr_len = 0;
|
||||
__u16 tlen;
|
||||
__u32 payload_offset = 0;
|
||||
__u32 payload_length = 0;
|
||||
__u8 hdr_len;
|
||||
```
|
||||
|
||||
在这个代码块中,我们定义了一些变量来存储在处理数据包时需要的信息。这些变量包括了`struct so_event *e`用于存储事件信息,`verlen`、`proto`、`nhoff`、`ip_proto`、`tcp_hdr_len`、`tlen`、`payload_offset`、`payload_length`、`hdr_len`等用于存储数据包信息的变量。
|
||||
|
||||
- `struct so_event *e;`:这是一个指向`so_event`结构体的指针,用于存储捕获到的事件信息。该结构体的具体定义在程序的其他部分。
|
||||
- `__u8 verlen;`、`__u16 proto;`、`__u32 nhoff = ETH_HLEN;`:这些变量用于存储各种信息,例如协议类型、数据包偏移量等。`nhoff`初始化为以太网帧头部的长度,通常为14字节,因为以太网帧头部包括目标MAC地址、源MAC地址和帧类型字段。
|
||||
- `__u32 ip_proto = 0;`:这个变量用于存储IP协议的类型,初始化为0。
|
||||
- `__u32 tcp_hdr_len = 0;`:这个变量用于存储TCP头部在数据包中的起始偏移量(尽管变量名中带有 `len`,后文它被赋值为 `nhoff + hdr_len`),初始化为0。
|
||||
- `__u16 tlen;`:这个变量用于存储IP数据包的总长度。
|
||||
- `__u32 payload_offset = 0;`、`__u32 payload_length = 0;`:这两个变量用于存储HTTP请求的载荷(payload)的偏移量和长度。
|
||||
- `__u8 hdr_len;`:这个变量用于存储IP头部的长度。
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, 12, &proto, 2);
|
||||
proto = __bpf_ntohs(proto);
|
||||
if (proto != ETH_P_IP)
|
||||
return 0;
|
||||
```
|
||||
|
||||
在这里,代码从数据包中加载了以太网帧的类型字段,这个字段告诉我们数据包使用的网络层协议。然后,使用`__bpf_ntohs`函数将网络字节序的类型字段转换为主机字节序。接下来,代码检查类型字段是否等于IPv4的以太网帧类型(0x0800)。如果不等于,说明这个数据包不是IPv4数据包,直接返回0,放弃处理。
|
||||
|
||||
这里需要了解以下几个概念:
|
||||
|
||||
- 以太网帧(Ethernet Frame):是数据链路层(第二层)的协议,用于在局域网中传输数据帧。以太网帧通常包括目标MAC地址、源MAC地址和帧类型字段。
|
||||
- 网络字节序(Network Byte Order):网络协议通常使用大端字节序(Big-Endian)来表示数据。因此,需要将从网络中接收到的数据转换为主机字节序,以便在主机上正确解释数据。
|
||||
- IPv4帧类型(ETH_P_IP):表示以太网帧中包含的协议类型字段,0x0800表示IPv4。
|
||||
|
||||
```c
|
||||
if (ip_is_fragment(skb, nhoff))
|
||||
return 0;
|
||||
```
|
||||
|
||||
这一部分的代码检查数据包是否为IP分片。IP分片是将较大的IP数据包分割成多个小片段以进行传输的机制。在这里,如果数据包是IP分片,则直接返回0,表示不处理分片,只处理完整的数据包。
|
||||
|
||||
```c
|
||||
static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff)
|
||||
{
|
||||
__u16 frag_off;
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);
|
||||
frag_off = __bpf_ntohs(frag_off);
|
||||
return frag_off & (IP_MF | IP_OFFSET);
|
||||
}
|
||||
```
|
||||
|
||||
上述代码是一个辅助函数,用于检查传入的IPv4数据包是否为IP分片。IP分片是一种机制,当IP数据包的大小超过了网络的最大传输单元(MTU),路由器会将其分割成多个较小的片段,以便在网络上进行传输。这个函数的目的是检查数据包的分片标志(Fragmentation Flag)以及片偏移(Fragment Offset)字段,以确定是否为分片。
|
||||
|
||||
下面是代码的逐行解释:
|
||||
|
||||
1. `__u16 frag_off;`:定义一个16位无符号整数变量`frag_off`,用于存储片偏移字段的值。
|
||||
2. `bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`:这行代码使用`bpf_skb_load_bytes`函数从数据包中加载IPv4头部的片偏移字段(`frag_off`),并加载2个字节。`nhoff`是IPv4头部在数据包中的偏移量,`offsetof(struct iphdr, frag_off)`用于计算片偏移字段在IPv4头部中的偏移量。
|
||||
3. `frag_off = __bpf_ntohs(frag_off);`:将加载的片偏移字段从网络字节序(Big-Endian)转换为主机字节序。网络协议通常使用大端字节序表示数据,而主机可能使用大端或小端字节序。这里将片偏移字段转换为主机字节序,以便进一步处理。
|
||||
4. `return frag_off & (IP_MF | IP_OFFSET);`:这行代码通过使用位运算检查片偏移字段的值,以确定是否为IP分片。具体来说,它使用位与运算符`&`将片偏移字段与两个标志位进行位与运算:
|
||||
- `IP_MF`:表示"更多分片"标志(More Fragments)。如果这个标志位被设置为1,表示数据包是分片的一部分,还有更多分片。
|
||||
- `IP_OFFSET`:表示片偏移字段。如果片偏移字段不为0,表示数据包是分片的一部分,且具有片偏移值。
|
||||
如果"更多分片"标志被置位,或者片偏移字段不为0,那么位与运算的结果就不为零,说明数据包是IP分片。如果结果为零,说明数据包不是分片。
|
||||
|
||||
需要注意的是,IP头部的片偏移字段以8字节为单位,所以实际的片偏移值需要左移3位来得到字节偏移。此外,IP头部的"更多分片"标志(IP_MF)表示数据包是否有更多的分片,通常与片偏移字段一起使用来指示整个数据包的分片情况。这个函数只关心这两个标志位,如果其中一个标志被设置,就认为是IP分片。
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
|
||||
hdr_len &= 0x0f;
|
||||
hdr_len *= 4;
|
||||
```
|
||||
|
||||
这一部分的代码从数据包中加载IP头部的长度字段。IP头部长度字段包含了IP头部的长度信息,以4字节为单位,需要将其转换为字节数。这里通过按位与和乘以4来进行转换。
|
||||
|
||||
需要了解:
|
||||
|
||||
- IP头部(IP Header):IP头部包含了关于数据包的基本信息,如源IP地址、目标IP地址、协议类型和头部校验和等。头部长度字段(IHL,Header Length)表示IP头部的长度,以4字节为单位,通常为20字节(5个4字节的字)。
|
||||
|
||||
```c
|
||||
if (hdr_len < sizeof(struct iphdr))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
这段代码检查IP头部的长度是否满足最小长度要求,通常IP头部的最小长度是20字节。如果IP头部的长度小于20字节,说明数据包不完整或损坏,直接返回0,放弃处理。
|
||||
|
||||
需要了解:
|
||||
|
||||
- `struct iphdr`:这是Linux内核中定义的结构体,表示IPv4头部的格式。它包括了版本、头部长度、服务类型、总长度、标识符、标志位、片偏移、生存时间、协议、头部校验和、源IP地址和目标IP地址等字段。
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
|
||||
if (ip_proto != IPPROTO_TCP)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
在这里,代码从数据包中加载IP头部中的协议字段,以确定数据包使用的传输层协议。然后,它检查协议字段是否为TCP协议(IPPROTO_TCP)。如果不是TCP协议,说明不是HTTP请求或响应,直接返回0。
|
||||
|
||||
需要了解:
|
||||
|
||||
- 传输层协议:IP头部中的协议字段指示了数据包所使用的传输层协议,例如TCP、UDP或ICMP。
|
||||
|
||||
```c
|
||||
tcp_hdr_len = nhoff + hdr_len;
|
||||
```
|
||||
|
||||
这行代码计算了TCP头部的偏移量。它将以太网帧头部的长度(`nhoff`)与IP头部的长度(`hdr_len`)相加,得到TCP头部的起始位置。
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
|
||||
```
|
||||
|
||||
这行代码从数据包中加载IP头部的第一个字节(偏移量为 `nhoff`),该字节包含了IP版本号和IP头部长度(IHL)信息。注意它读取的并不是TCP头部;在这段代码中,`verlen` 读取后并未被进一步使用。
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
|
||||
```
|
||||
|
||||
这行代码从数据包中加载IP头部的总长度字段。IP头部总长度字段表示整个IP数据包的长度,包括IP头部和数据部分。
|
||||
|
||||
```c
|
||||
__u8 doff;
|
||||
bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff));
|
||||
doff &= 0xf0;
|
||||
doff >>= 4;
|
||||
doff *= 4;
|
||||
```
|
||||
|
||||
这段代码用于计算TCP头部的长度。它加载TCP头部中数据偏移字段(Data Offset,也称为头部长度字段)所在的字节,该字段位于这个字节的高四位,以4字节为单位表示TCP头部的长度。代码先将该字节的低四位(保留位)清零,然后将其右移4位,最后乘以4,得到以字节为单位的TCP头部实际长度。
|
||||
|
||||
需要了解:
|
||||
|
||||
- TCP头部(TCP Header):TCP头部包含了TCP协议相关的信息,如源端口、目标端口、序列号、确认号、标志位(如SYN、ACK、FIN等)、窗口大小和校验和等。
|
||||
|
||||
```c
|
||||
payload_offset = ETH_HLEN + hdr_len + doff;
|
||||
payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
|
||||
```
|
||||
|
||||
这两行代码计算HTTP请求的载荷(payload)的偏移量和长度。它们将以太网帧头部长度、IP头部长度和TCP头部长度相加,得到HTTP请求的数据部分的偏移量,然后从IP数据包的总长度(转换为主机字节序后)中减去IP头部长度和TCP头部长度,计算出HTTP请求数据的长度。
|
||||
|
||||
需要了解:
|
||||
|
||||
- HTTP请求载荷(Payload):HTTP请求中包含的实际数据部分,通常是HTTP请求头和请求体。
|
||||
|
||||
```c
|
||||
char line_buffer[7];
|
||||
if (payload_length < 7 || payload_offset < 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
|
||||
bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
|
||||
```
|
||||
|
||||
这部分代码首先检查HTTP请求数据的长度是否小于7字节,或偏移量是否为负数(由于 `payload_offset` 是无符号整数,后一个判断实际上永远不会成立);如果满足条件,说明载荷太短、不足以判断是否为HTTP请求,直接返回0。然后,它加载载荷的前7个字节(即HTTP请求行的开头),存储在名为`line_buffer`的字符数组中。最后,它使用`bpf_printk`函数将这些内容打印到内核日志中,以供调试和分析。
|
||||
|
||||
```c
|
||||
if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "POST") != 0 &&
|
||||
bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
|
||||
bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "HTTP") != 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
这段代码使用`bpf_strncmp`函数比较`line_buffer`中的数据是否以HTTP请求方法(GET、POST、PUT、DELETE)或HTTP响应起始行的前缀(HTTP)开头。如果都不匹配,说明不是HTTP请求或响应,直接返回0,放弃处理。
|
||||
|
||||
```c
|
||||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||||
if (!e)
|
||||
return 0;
|
||||
```
|
||||
|
||||
这部分代码尝试从BPF环形缓冲区中保留一块内存以存储事件信息。如果无法保留内存块,返回0。BPF环形缓冲区用于在eBPF程序和用户空间之间传递事件数据。
|
||||
|
||||
需要了解:
|
||||
|
||||
- BPF环形缓冲区:BPF环形缓冲区是一种在eBPF程序和用户空间之间传递数据的机制。它可以用来存储事件信息,以便用户空间应用程序进行进一步处理或分析。
|
||||
|
||||
```c
|
||||
e->ip_proto = ip_proto;
|
||||
bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4);
|
||||
e->pkt_type = skb->pkt_type;
|
||||
e->ifindex = skb->ifindex;
|
||||
|
||||
e->payload_length = payload_length;
|
||||
bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
|
||||
bpf_ringbuf_submit(e, 0);
|
||||
|
||||
return skb->len;
|
||||
```
|
||||
|
||||
最后,这段代码将捕获到的事件信息存储在`e`结构体中,并将其提交到BPF环形缓冲区。它包括了捕获的IP协议、源端口和目标端口、数据包类型、接口索引、载荷长度、载荷内容、源IP地址和目标IP地址等信息。最后,它返回数据包的长度,表示成功处理了数据包。
|
||||
|
||||
这段代码主要用于将捕获的事件信息存储起来,以便后续的处理和分析。 BPF环形缓冲区用于将这些信息传递到用户空间,供用户空间应用程序进一步处理或记录。
|
||||
|
||||
总结:这段eBPF程序的主要任务是捕获HTTP请求,它通过解析数据包的以太网帧、IP头部和TCP头部来确定数据包是否包含HTTP请求,并将有关请求的信息存储在`so_event`结构体中,然后提交到BPF环形缓冲区。这是一种高效的方法,可以在内核层面捕获HTTP流量,适用于网络监控和安全分析等应用。
|
||||
|
||||
### 潜在缺陷
|
||||
|
||||
上述代码也存在一些潜在的缺陷,其中一个主要缺陷是它无法处理跨多个数据包的URL。
|
||||
|
||||
- 跨包URL:代码通过解析单个数据包来检查HTTP请求中的URL。如果HTTP请求的URL足够长、跨越了多个数据包,那么只会检查第一个数据包中的URL部分。这会导致那些跨多个数据包的长URL丢失或只被部分记录。
|
||||
|
||||
解决这个问题的方法通常需要对多个数据包进行重新组装,以还原完整的HTTP请求。这可能需要在eBPF程序中实现数据包的缓存和组装逻辑,并在检测到HTTP请求结束之前等待并收集所有相关数据包。这需要更复杂的逻辑和额外的内存来处理跨多个数据包的情况。
|
||||
|
||||
### 用户态代码
|
||||
|
||||
用户态代码的主要目的是创建一个原始套接字(raw socket),然后将先前在内核中定义的eBPF程序附加到该套接字上,从而允许eBPF程序捕获和处理从该套接字接收到的网络数据包,例如:
|
||||
|
||||
```c
|
||||
/* Create raw socket for localhost interface */
|
||||
sock = open_raw_sock(interface);
|
||||
if (sock < 0) {
|
||||
err = -2;
|
||||
fprintf(stderr, "Failed to open raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Attach BPF program to raw socket */
|
||||
prog_fd = bpf_program__fd(skel->progs.socket_handler);
|
||||
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) {
|
||||
err = -3;
|
||||
fprintf(stderr, "Failed to attach to raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
```
|
||||
|
||||
1. `sock = open_raw_sock(interface);`:这行代码调用了一个自定义的函数`open_raw_sock`,该函数用于创建一个原始套接字。原始套接字允许用户态应用程序直接处理网络数据包,而不经过协议栈的处理。函数`open_raw_sock`可能需要一个参数 `interface`,用于指定网络接口,以便确定从哪个接口接收数据包。如果创建套接字失败,它将返回一个负数,否则返回套接字的文件描述符`sock`。
|
||||
2. 如果`sock`的值小于0,表示打开原始套接字失败,那么将`err`设置为-2,并在标准错误流上输出一条错误信息。
|
||||
3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`:这行代码获取之前在eBPF程序定义中的套接字过滤器程序(`socket_handler`)的文件描述符,以便后续将它附加到套接字上。`skel`是一个eBPF程序对象的指针,可以通过它来访问程序集合。
|
||||
4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`:这行代码使用`setsockopt`系统调用将eBPF程序附加到原始套接字。它设置了`SO_ATTACH_BPF`选项,将eBPF程序的文件描述符传递给该选项,以便内核知道要将哪个eBPF程序应用于这个套接字。如果附加成功,套接字将开始捕获和处理从中接收到的网络数据包。
|
||||
5. 如果`setsockopt`失败,它将`err`设置为-3,并在标准错误流上输出一条错误信息。
|
||||
|
||||
### 编译运行
|
||||
|
||||
完整的源代码可以在 <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/23-http> 中找到。编译运行上述代码:
|
||||
|
||||
```console
|
||||
$ git submodule update --init --recursive
|
||||
$ make
|
||||
BPF .output/sockfilter.bpf.o
|
||||
GEN-SKEL .output/sockfilter.skel.h
|
||||
CC .output/sockfilter.o
|
||||
BINARY sockfilter
|
||||
$ sudo ./sockfilter
|
||||
...
|
||||
```
|
||||
|
||||
在另外一个窗口中,使用 python 启动一个简单的 web server:
|
||||
|
||||
```console
|
||||
python3 -m http.server
|
||||
Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
|
||||
127.0.0.1 - - [18/Sep/2023 01:05:52] "GET / HTTP/1.1" 200 -
|
||||
```
|
||||
|
||||
可以使用 curl 发起请求:
|
||||
|
||||
```console
|
||||
$ curl http://0.0.0.0:8000/
|
||||
<!DOCTYPE HTML>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Directory listing for /</title>
|
||||
....
|
||||
```
|
||||
|
||||
在 eBPF 程序中,可以看到打印出了 HTTP 请求的内容:
|
||||
|
||||
```console
|
||||
127.0.0.1:34552(src) -> 127.0.0.1:8000(dst)
|
||||
payload: GET / HTTP/1.1
|
||||
Host: 0.0.0.0:8000
|
||||
User-Agent: curl/7.88.1
|
||||
...
|
||||
127.0.0.1:8000(src) -> 127.0.0.1:34552(dst)
|
||||
payload: HTTP/1.0 200 OK
|
||||
Server: SimpleHTTP/0.6 Python/3.11.4
|
||||
...
|
||||
```
|
||||
|
||||
分别包含了请求和响应的内容。
|
||||
|
||||
## 使用 eBPF syscall tracepoint 来捕获 HTTP 流量
|
||||
|
||||
eBPF 提供了一种强大的机制,允许我们在内核级别追踪系统调用。在这个示例中,我们将使用 eBPF 追踪 accept 和 read 系统调用,以捕获 HTTP 流量。由于篇幅有限,这里我们仅仅对代码框架做简要的介绍。
|
||||
|
||||
```c
|
||||
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct accept_args_t);
|
||||
} active_accept_args_map SEC(".maps");
|
||||
|
||||
// 定义在 accept 系统调用入口的追踪点
|
||||
SEC("tracepoint/syscalls/sys_enter_accept")
|
||||
int sys_enter_accept(struct trace_event_raw_sys_enter *ctx)
|
||||
{
|
||||
u64 id = bpf_get_current_pid_tgid();
|
||||
// ... 获取和存储 accept 调用的参数
|
||||
bpf_map_update_elem(&active_accept_args_map, &id, &accept_args, BPF_ANY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 定义在 accept 系统调用退出的追踪点
|
||||
SEC("tracepoint/syscalls/sys_exit_accept")
|
||||
int sys_exit_accept(struct trace_event_raw_sys_exit *ctx)
|
||||
{
|
||||
// ... 处理 accept 调用的结果
|
||||
struct accept_args_t *args =
|
||||
bpf_map_lookup_elem(&active_accept_args_map, &id);
|
||||
// ... 获取和存储 accept 调用获得的 socket 文件描述符
|
||||
__u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd;
|
||||
bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY);
|
||||
// ...
|
||||
}
|
||||
|
||||
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct data_args_t);
|
||||
} active_read_args_map SEC(".maps");
|
||||
|
||||
// 定义在 read 系统调用入口的追踪点
|
||||
SEC("tracepoint/syscalls/sys_enter_read")
|
||||
int sys_enter_read(struct trace_event_raw_sys_enter *ctx)
|
||||
{
|
||||
// ... 获取和存储 read 调用的参数
|
||||
bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// 辅助函数,检查是否为 HTTP 连接
|
||||
static inline bool is_http_connection(const char *line_buffer, u64 bytes_count)
|
||||
{
|
||||
// ... 检查数据是否为 HTTP 请求或响应
|
||||
}
|
||||
|
||||
// 辅助函数,处理读取的数据
|
||||
static inline void process_data(struct trace_event_raw_sys_exit *ctx,
|
||||
u64 id, const struct data_args_t *args, u64 bytes_count)
|
||||
{
|
||||
// ... 处理读取的数据,检查是否为 HTTP 流量,并发送事件
|
||||
if (is_http_connection(line_buffer, bytes_count))
|
||||
{
|
||||
// ...
|
||||
bpf_probe_read_kernel(&event.msg, read_size, args->buf);
|
||||
// ...
|
||||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(struct socket_data_event_t));
|
||||
}
|
||||
}
|
||||
|
||||
// 定义在 read 系统调用退出的追踪点
|
||||
SEC("tracepoint/syscalls/sys_exit_read")
|
||||
int sys_exit_read(struct trace_event_raw_sys_exit *ctx)
|
||||
{
|
||||
// ... 处理 read 调用的结果
|
||||
struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id);
|
||||
if (read_args != NULL)
|
||||
{
|
||||
process_data(ctx, id, read_args, bytes_count);
|
||||
}
|
||||
// ...
|
||||
return 0;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
```
|
||||
|
||||
这段代码简要展示了如何使用eBPF追踪Linux内核中的系统调用来捕获HTTP流量。以下是对代码的hook位置和流程的详细解释,以及需要hook哪些系统调用来实现完整的请求追踪:
|
||||
|
||||
### **Hook 位置和流程**
|
||||
|
||||
- 该代码使用了eBPF的Tracepoint功能,具体来说,它定义了一系列的eBPF程序,并将它们绑定到了特定的系统调用的Tracepoint上,以捕获这些系统调用的入口和退出事件。
|
||||
|
||||
- 首先,它定义了两个eBPF哈希映射(`active_accept_args_map`和`active_read_args_map`)来存储系统调用参数。这些映射用于跟踪`accept`和`read`系统调用。
|
||||
|
||||
- 接着,它定义了多个Tracepoint追踪程序,其中包括:
|
||||
- `sys_enter_accept`:定义在`accept`系统调用的入口处,用于捕获`accept`系统调用的参数,并将它们存储在哈希映射中。
|
||||
- `sys_exit_accept`:定义在`accept`系统调用的退出处,用于处理`accept`系统调用的结果,包括获取和存储新的套接字文件描述符以及建立连接的相关信息。
|
||||
- `sys_enter_read`:定义在`read`系统调用的入口处,用于捕获`read`系统调用的参数,并将它们存储在哈希映射中。
|
||||
- `sys_exit_read`:定义在`read`系统调用的退出处,用于处理`read`系统调用的结果,包括检查读取的数据是否为HTTP流量,如果是,则发送事件。
|
||||
|
||||
- 在`sys_exit_accept`和`sys_exit_read`中,还涉及一些数据处理和事件发送的逻辑,例如检查数据是否为HTTP连接,组装事件数据,并使用`bpf_perf_event_output`将事件发送到用户空间供进一步处理。
|
||||
|
||||
### **需要 Hook 的完整系统调用**
|
||||
|
||||
要实现完整的HTTP请求追踪,通常需要hook的系统调用包括:
|
||||
|
||||
- `socket`:用于捕获套接字创建,以追踪新的连接。
|
||||
- `bind`:用于获取绑定的端口信息。
|
||||
- `listen`:用于开始监听连接请求。
|
||||
- `accept`:用于接受连接请求,获取新的套接字文件描述符。
|
||||
- `read`:用于捕获接收到的数据,以检查其中是否包含 HTTP 请求。
|
||||
- `write`:用于捕获发送的数据,以检查其中是否包含 HTTP 响应。
|
||||
|
||||
上述代码已经涵盖了`accept`和`read`系统调用的追踪。要完整实现HTTP请求的追踪,还需要hook其他系统调用,并实现相应的逻辑来处理这些系统调用的参数和结果。
|
||||
|
||||
完整的源代码可以在 <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/23-http> 中找到。
|
||||
|
||||
## 总结
|
||||
|
||||
在当今复杂的技术环境中,系统的可观测性变得至关重要,特别是在微服务和云原生应用程序的背景下。本文探讨了如何利用eBPF技术来追踪七层协议,以及在这个过程中可能面临的挑战和解决方案。以下是对本文内容的总结:
|
||||
|
||||
1. **背景介绍**:
|
||||
- 现代应用程序通常由多个微服务和分布式组件组成,因此观测整个系统的行为至关重要。
|
||||
- 七层协议(如HTTP、gRPC、MQTT等)提供了深入了解应用程序交互的详细信息,但监控这些协议通常具有挑战性。
|
||||
|
||||
2. **eBPF技术的作用**:
|
||||
- eBPF允许开发者在不修改或插入应用程序代码的情况下,深入内核层来实时观测和分析系统行为。
|
||||
- eBPF技术为监控七层协议提供了一个强大的工具,特别适用于微服务环境。
|
||||
|
||||
3. **追踪七层协议**:
|
||||
- 本文介绍了如何追踪HTTP等七层协议的挑战,包括协议的复杂性和动态性。
|
||||
- 传统的网络监控工具难以应对七层协议的复杂性。
|
||||
|
||||
4. **eBPF的应用**:
|
||||
- eBPF提供两种主要方法来追踪七层协议:socket filter和syscall trace。
|
||||
- 这两种方法可以帮助捕获HTTP等协议的网络请求数据,并分析它们。
|
||||
|
||||
5. **eBPF实践教程**:
|
||||
- 本文提供了一个实际的eBPF教程,演示如何使用eBPF socket filter或syscall trace来捕获和分析HTTP流量。
|
||||
- 教程内容包括开发eBPF程序、使用eBPF工具链和实施HTTP请求的追踪。
|
||||
|
||||
通过这篇文章,读者可以深入了解如何使用 eBPF 技术来追踪七层协议,尤其是 HTTP 流量。这将有助于更好地监控和分析网络流量,从而提高应用程序性能和安全性。如果您希望学习更多关于 eBPF 的知识和实践,可以访问我们的教程代码仓库 <https://github.com/eunomia-bpf/bpf-developer-tutorial> 或网站 <https://eunomia.dev/zh/tutorials/> 以获取更多示例和完整的教程。
|
||||
|
||||
@@ -1,3 +1,635 @@
|
||||
# eBPF Practical Tutorial: trace http requests data
|
||||
# Exploring Seven-Layer Protocol Tracing with eBPF: HTTP and Beyond via Socket Filters and Syscall Tracing
|
||||
|
||||
TODO
|
||||
In today's technology landscape, with the rise of microservices, cloud-native applications, and complex distributed systems, observability of systems has become a crucial factor in ensuring their health, performance, and security. Especially in a microservices architecture, application components may be distributed across multiple containers and servers, making traditional monitoring methods often insufficient to provide the depth and breadth needed to fully understand the behavior of the system. This is where observing seven-layer protocols such as HTTP, gRPC, MQTT, and more becomes particularly important.
|
||||
|
||||
Seven-layer protocols provide detailed insights into how applications interact with other services and components. In a microservices environment, understanding these interactions is vital, as they often serve as the root causes of performance bottlenecks, failures, and security issues. However, monitoring these protocols is not a straightforward task. Traditional network monitoring tools like tcpdump, while effective at capturing network traffic, often fall short when dealing with the complexity and dynamism of seven-layer protocols.
|
||||
|
||||
This is where eBPF (extended Berkeley Packet Filter) technology comes into play. eBPF allows developers and operators to delve deep into the kernel layer, observing and analyzing system behavior in real-time without the need to modify or insert instrumentation into application code. This presents a unique opportunity to handle application layer traffic more simply and efficiently, particularly in microservices environments.
|
||||
|
||||
In this tutorial, we will delve into the following:
|
||||
|
||||
- Tracking seven-layer protocols such as HTTP and the challenges associated with them.
|
||||
- eBPF's socket filter and syscall tracing: How these two technologies assist in tracing HTTP network request data at different kernel layers, and the advantages and limitations of each.
|
||||
- eBPF practical tutorial: How to develop an eBPF program and utilize eBPF socket filter or syscall tracing to capture and analyze HTTP traffic.
|
||||
|
||||
As network traffic increases and applications grow in complexity, gaining a deeper understanding of seven-layer protocols becomes increasingly important. Through this tutorial, you will acquire the necessary knowledge and tools to more effectively monitor and analyze your network traffic, ultimately enhancing the performance of your applications and servers.
|
||||
|
||||
This article is part of the eBPF Developer Tutorial, and for more detailed content, you can visit [here](https://eunomia.dev/tutorials/). The source code is available on the [GitHub repository](https://github.com/eunomia-bpf/bpf-developer-tutorial).
|
||||
|
||||
## Challenges in Tracking HTTP, HTTP/2, and Other Seven-Layer Protocols
|
||||
|
||||
In the modern networking environment, seven-layer protocols extend beyond just HTTP. In fact, there are many seven-layer protocols such as HTTP/2, gRPC, MQTT, WebSocket, AMQP, and SMTP, each serving critical roles in various application scenarios. These protocols provide detailed insights into how applications interact with other services and components. However, tracking these protocols is not a simple task, especially within complex distributed systems.
|
||||
|
||||
1. **Diversity and Complexity**: Each seven-layer protocol has its specific design and workings. For example, gRPC utilizes HTTP/2 as its transport protocol and supports multiple languages, while MQTT is a lightweight publish/subscribe messaging transport protocol designed for low-bandwidth and unreliable networks.
|
||||
|
||||
2. **Dynamism**: Many seven-layer protocols are dynamic, meaning their behavior can change based on network conditions, application requirements, or other factors.
|
||||
|
||||
3. **Encryption and Security**: With increased security awareness, many seven-layer protocols employ encryption technologies such as TLS/SSL. This introduces additional challenges for tracking and analysis, as decrypting traffic is required for in-depth examination.
|
||||
|
||||
4. **High-Performance Requirements**: In high-traffic production environments, capturing and analyzing traffic for seven-layer protocols can impact system performance. Traditional network monitoring tools may struggle to handle a large number of concurrent sessions.
|
||||
|
||||
5. **Data Completeness and Continuity**: Unlike tools like tcpdump, which capture individual packets, tracking seven-layer protocols requires capturing complete sessions, which may involve multiple packets. This necessitates tools capable of correctly reassembling and parsing these packets to provide a continuous session view.
|
||||
|
||||
6. **Code Intrusiveness**: To gain deeper insights into the behavior of seven-layer protocols, developers may need to modify application code to add monitoring functionalities. This not only increases development and maintenance complexity but can also impact application performance.
|
||||
|
||||
As mentioned earlier, eBPF provides a powerful solution, allowing us to capture and analyze seven-layer protocol traffic in the kernel layer without modifying application code. This approach not only offers insights into system behavior but also ensures optimal performance and efficiency. This is why eBPF has become the preferred technology for modern observability tools, especially in production environments that demand high performance and low latency.
|
||||
|
||||
## eBPF Socket Filter vs. Syscall Tracing: In-Depth Analysis and Comparison
|
||||
|
||||
### **eBPF Socket Filter**
|
||||
|
||||
**What Is It?**
|
||||
eBPF socket filter is an extension of the classic Berkeley Packet Filter (BPF) that allows for more advanced packet filtering directly within the kernel. It operates at the socket layer, enabling fine-grained control over which packets are processed by user-space applications.
|
||||
|
||||
**Key Features:**
|
||||
|
||||
- **Performance**: By handling packets directly within the kernel, eBPF socket filters reduce the overhead of context switches between user and kernel spaces.
|
||||
- **Flexibility**: eBPF socket filters can be attached to any socket, providing a universal packet filtering mechanism for various protocols and socket types.
|
||||
- **Programmability**: Developers can write custom eBPF programs to define complex filtering logic beyond simple packet matching.
|
||||
|
||||
**Use Cases:**
|
||||
|
||||
- **Traffic Control**: Restrict or prioritize traffic based on custom conditions.
|
||||
- **Security**: Discard malicious packets before they reach user-space applications.
|
||||
- **Monitoring**: Capture specific packets for analysis without affecting other traffic.
|
||||
|
||||
### **eBPF Syscall Tracing**
|
||||
|
||||
**What Is It?**
|
||||
System call tracing using eBPF allows monitoring and manipulation of system calls made by applications. System calls are the primary mechanism through which user-space applications interact with the kernel, making tracing them a valuable way to understand application behavior.
|
||||
|
||||
**Key Features:**
|
||||
|
||||
- **Granularity**: eBPF allows tracing specific system calls, even specific parameters within those system calls.
|
||||
- **Low Overhead**: Compared to other tracing methods, eBPF syscall tracing is designed to have minimal performance impact.
|
||||
- **Security**: The kernel verifies eBPF programs before they run to ensure they do not compromise system stability.
|
||||
|
||||
**How It Works:**
|
||||
eBPF syscall tracing typically involves attaching eBPF programs to tracepoints or kprobes related to the system calls being traced. When the traced system call is invoked, the eBPF program is executed, allowing data collection or even modification of system call parameters.
|
||||
|
||||
### Comparison of eBPF Socket Filter and Syscall Tracing
|
||||
|
||||
| Aspect | eBPF Socket Filter | eBPF Syscall Tracing |
|
||||
| ------ | ------------------- | --------------------- |
|
||||
| **Operational Layer** | Socket layer, primarily dealing with network packets received from or sent to sockets. | System call layer, monitoring and potentially altering the behavior of system calls made by applications. |
|
||||
| **Primary Use Cases** | Mainly used for filtering, monitoring, and manipulation of network packets. | Used for performance analysis, security monitoring, and debugging of interactions with the network. |
|
||||
| **Granularity** | Focuses on individual network packets. | Can monitor a wide range of system activities, including those unrelated to networking. |
|
||||
| **Tracking HTTP Traffic** | Can be used to filter and capture HTTP packets passed through sockets. | Can trace system calls associated with networking operations, which may include HTTP traffic. |
|
||||
|
||||
In summary, both eBPF socket filters and syscall tracing can be used to trace HTTP traffic, but socket filters are more direct and suitable for this purpose. However, if you are interested in the broader context of how an application interacts with the system (e.g., which system calls lead to HTTP traffic), syscall tracing can be highly valuable. In many advanced observability setups, both tools may be used simultaneously to provide a comprehensive view of system and network behavior.
|
||||
|
||||
## Capturing HTTP Traffic with eBPF Socket Filter
|
||||
|
||||
eBPF code consists of user-space and kernel-space components, and here we primarily focus on the kernel-space code. Below is the main logic for capturing HTTP traffic in the kernel using eBPF socket filter technology, and the complete code is provided:
|
||||
|
||||
```c
|
||||
SEC("socket")
|
||||
int socket_handler(struct __sk_buff *skb)
|
||||
{
|
||||
struct so_event *e;
|
||||
__u8 verlen;
|
||||
__u16 proto;
|
||||
__u32 nhoff = ETH_HLEN;
|
||||
__u32 ip_proto = 0;
|
||||
__u32 tcp_hdr_len = 0;
|
||||
__u16 tlen;
|
||||
__u32 payload_offset = 0;
|
||||
__u32 payload_length = 0;
|
||||
__u8 hdr_len;
|
||||
|
||||
bpf_skb_load_bytes(skb, 12, &proto, 2);
|
||||
proto = __bpf_ntohs(proto);
|
||||
if (proto != ETH_P_IP)
|
||||
return 0;
|
||||
|
||||
if (ip_is_fragment(skb, nhoff))
|
||||
return 0;
|
||||
|
||||
// ip4 header lengths are variable
|
||||
// access ihl as a u8 (linux/include/linux/skbuff.h)
|
||||
bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
|
||||
hdr_len &= 0x0f;
|
||||
hdr_len *= 4;
|
||||
|
||||
/* verify hlen meets minimum size requirements */
|
||||
if (hdr_len < sizeof(struct iphdr))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
|
||||
|
||||
if (ip_proto != IPPROTO_TCP)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
tcp_hdr_len = nhoff + hdr_len;
|
||||
bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
|
||||
|
||||
__u8 doff;
|
||||
bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff)); // read the first byte past __tcphdr->ack_seq, we can't do offsetof bit fields
|
||||
doff &= 0xf0; // clean-up res1
|
||||
doff >>= 4; // move the upper 4 bits to low
|
||||
doff *= 4; // convert to bytes length
|
||||
|
||||
payload_offset = ETH_HLEN + hdr_len + doff;
|
||||
payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
|
||||
|
||||
char line_buffer[7];
|
||||
if (payload_length < 7 || payload_offset < 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
|
||||
bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
|
||||
if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "POST") != 0 &&
|
||||
bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
|
||||
bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "HTTP") != 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* reserve sample from BPF ringbuf */
|
||||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||||
if (!e)
|
||||
return 0;
|
||||
|
||||
e->ip_proto = ip_proto;
|
||||
bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4);
|
||||
e->pkt_type = skb->pkt_type;
|
||||
e->ifindex = skb->ifindex;
|
||||
|
||||
e->payload_length = payload_length;
|
||||
bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
|
||||
bpf_ringbuf_submit(e, 0);
|
||||
|
||||
return skb->len;
|
||||
}
|
||||
```
|
||||
|
||||
When analyzing this eBPF program, we will explain it in detail according to the content of each code block and provide relevant background knowledge:
|
||||
|
||||
```c
|
||||
SEC("socket")
|
||||
int socket_handler(struct __sk_buff *skb)
|
||||
{
|
||||
// ...
|
||||
}
|
||||
```
|
||||
|
||||
This is the entry point of the eBPF program, defining a function named `socket_handler` that the kernel uses to handle incoming network packets. This function is located in an eBPF section named `socket`, indicating that it is intended for socket handling.
|
||||
|
||||
```c
|
||||
struct so_event *e;
|
||||
__u8 verlen;
|
||||
__u16 proto;
|
||||
__u32 nhoff = ETH_HLEN;
|
||||
__u32 ip_proto = 0;
|
||||
__u32 tcp_hdr_len = 0;
|
||||
__u16 tlen;
|
||||
__u32 payload_offset = 0;
|
||||
__u32 payload_length = 0;
|
||||
__u8 hdr_len;
|
||||
```
|
||||
|
||||
In this code block, several variables are defined to store information needed during packet processing. These variables include `struct so_event *e` for storing event information, `verlen`, `proto`, `nhoff`, `ip_proto`, `tcp_hdr_len`, `tlen`, `payload_offset`, `payload_length`, and `hdr_len` for storing packet information.
|
||||
|
||||
- `struct so_event *e;`: This is a pointer to the `so_event` structure for storing captured event information. The specific definition of this structure is located elsewhere in the program.
|
||||
- `__u8 verlen;`, `__u16 proto;`, `__u32 nhoff = ETH_HLEN;`: These variables are used to store various pieces of information, such as protocol types, packet offsets, etc. `nhoff` is initialized to the length of the Ethernet frame header, typically 14 bytes, as Ethernet frame headers include destination MAC address, source MAC address, and frame type fields.
|
||||
- `__u32 ip_proto = 0;`: This variable is used to store the type of the IP protocol and is initialized to 0.
|
||||
- `__u32 tcp_hdr_len = 0;`: Despite its name, this variable stores the offset of the TCP header within the packet (it is later set to `nhoff + hdr_len`) and is initialized to 0.
|
||||
- `__u16 tlen;`: This variable is used to store the total length of the IP packet.
|
||||
- `__u32 payload_offset = 0;`, `__u32 payload_length = 0;`: These two variables are used to store the offset and length of the HTTP request payload.
|
||||
- `__u8 hdr_len;`: This variable is used to store the length of the IP header.
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, 12, &proto, 2);
|
||||
proto = __bpf_ntohs(proto);
|
||||
if (proto != ETH_P_IP)
|
||||
return 0;
|
||||
```
|
||||
|
||||
Here, the code loads the Ethernet frame type field from the packet, which tells us the network layer protocol being used in the packet. It then uses the `__bpf_ntohs` function to convert the network byte order type field into host byte order. Next, the code checks if the type field is not equal to the Ethernet frame type for IPv4 (0x0800). If it's not equal, it means the packet is not an IPv4 packet, and the function returns 0, indicating that the packet should not be processed.
|
||||
|
||||
Key concepts to understand here:
|
||||
|
||||
- Ethernet Frame: The Ethernet frame is a data link layer (Layer 2) protocol used for transmitting data frames within a local area network (LAN). Ethernet frames typically include destination MAC address, source MAC address, and frame type fields.
|
||||
- Network Byte Order: Network protocols often use big-endian byte order to represent data. Therefore, data received from the network needs to be converted into host byte order for proper interpretation on the host. Here, the type field from the network is converted to host byte order for further processing.
|
||||
- IPv4 Frame Type (ETH_P_IP): This represents the frame type field in the Ethernet frame, where 0x0800 indicates IPv4.
|
||||
|
||||
```c
|
||||
if (ip_is_fragment(skb, nhoff))
|
||||
return 0;
|
||||
```
|
||||
|
||||
This part of the code checks whether the packet is an IP fragment. IP fragmentation is a mechanism for splitting larger IP packets into multiple smaller fragments for transmission. Here, if the packet is an IP fragment, the function returns 0, so only unfragmented packets are processed.
|
||||
|
||||
```c
|
||||
static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff)
|
||||
{
|
||||
__u16 frag_off;
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);
|
||||
frag_off = __bpf_ntohs(frag_off);
|
||||
return frag_off & (IP_MF | IP_OFFSET);
|
||||
}
|
||||
```
|
||||
|
||||
The above code is a helper function used to check if the incoming IPv4 packet is an IP fragment. IP fragmentation is a mechanism where, if the size of an IP packet exceeds the Maximum Transmission Unit (MTU) of the network, routers split it into smaller fragments for transmission across the network. The purpose of this function is to examine the fragment flags and fragment offset fields within the packet to determine if it is a fragment.
|
||||
|
||||
Here's an explanation of the code line by line:
|
||||
|
||||
1. `__u16 frag_off;`: Defines a 16-bit unsigned integer variable `frag_off` to store the fragment offset field.
|
||||
2. `bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, frag_off), &frag_off, 2);`: This line of code uses the `bpf_skb_load_bytes` function to load the fragment offset field from the packet. `nhoff` is the offset of the IP header within the packet, and `offsetof(struct iphdr, frag_off)` calculates the offset of the fragment offset field within the IPv4 header.
|
||||
3. `frag_off = __bpf_ntohs(frag_off);`: Converts the loaded fragment offset field from network byte order (big-endian) to host byte order. Network protocols typically use big-endian to represent data, and the conversion to host byte order is done for further processing.
|
||||
4. `return frag_off & (IP_MF | IP_OFFSET);`: This line of code checks the value of the fragment offset field using a bitwise AND operation with two flag values:
|
||||
- `IP_MF`: Represents the "More Fragments" flag. If this flag is set to 1, it indicates that the packet is part of a fragmented sequence and more fragments are expected.
|
||||
- `IP_OFFSET`: Represents the fragment offset field. If the fragment offset field is non-zero, it indicates that the packet is part of a fragmented sequence and has a fragment offset value.
|
||||
If the "More Fragments" flag is set or the fragment offset is non-zero, the result is non-zero, indicating that the packet is an IP fragment. If both are zero, the packet is not fragmented.
|
||||
|
||||
It's important to note that the fragment offset field in the IP header is specified in units of 8 bytes, so the actual byte offset is obtained by left-shifting the value by 3 bits. Additionally, the "More Fragments" flag (IP_MF) in the IP header indicates whether there are more fragments in the sequence and is typically used in conjunction with the fragment offset field to indicate the status of fragmented packets.
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
|
||||
hdr_len &= 0x0f;
|
||||
hdr_len *= 4;
|
||||
```
|
||||
|
||||
In this part of the code, the length of the IP header is loaded from the packet. The IP header length field contains information about the length of the IP header in units of 4 bytes, and it needs to be converted to bytes. Here, it is converted by performing a bitwise AND operation with 0x0f and then multiplying it by 4.
|
||||
|
||||
Key concept:
|
||||
|
||||
- IP Header: The IP header contains fundamental information about a packet, such as the version, header length, total length, identification, flags, fragment offset, time to live (TTL), protocol type, header checksum, source IP address, and destination IP address. Source and destination ports are not part of the IP header; they belong to the transport-layer (TCP/UDP) header.
|
||||
|
||||
```c
|
||||
if (hdr_len < sizeof(struct iphdr))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
This code segment checks if the length of the IP header meets the minimum length requirement, typically 20 bytes. If the length of the IP header is less than 20 bytes, it indicates an incomplete or corrupted packet, and the function returns 0, indicating that the packet should not be processed.
|
||||
|
||||
Key concept:
|
||||
|
||||
- `struct iphdr`: This is a structure defined in the Linux kernel, representing the format of an IPv4 header. It includes fields such as version, header length, service type, total length, identification, flags, fragment offset, time to live, protocol, header checksum, source IP address, and destination IP address, among others.
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);
|
||||
if (ip_proto != IPPROTO_TCP)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
Here, the code loads the protocol field from the IP header to determine the transport layer protocol used in the packet. Then, it checks if the protocol field is not equal to the value for TCP (IPPROTO_TCP). If it's not TCP, it means the packet is not an HTTP request or response, and the function returns 0.
|
||||
|
||||
Key concept:
|
||||
|
||||
- Transport Layer Protocol: The protocol field in the IP header indicates the transport layer protocol used in the packet, such as TCP, UDP, or ICMP.
|
||||
|
||||
```c
|
||||
tcp_hdr_len = nhoff + hdr_len;
|
||||
```
|
||||
|
||||
This line of code calculates the offset of the TCP header. It adds the length of the Ethernet frame header (`nhoff`) to the length of the IP header (`hdr_len`) to obtain the starting position of the TCP header.
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + 0, &verlen, 1);
|
||||
```
|
||||
|
||||
This line of code loads the first byte of the IP header (at offset `nhoff`) into `verlen`. This byte contains the IP version in its upper 4 bits and the IP header length (IHL), expressed in units of 4 bytes, in its lower 4 bits.
|
||||
|
||||
```c
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
|
||||
```
|
||||
|
||||
This line of code loads the total length field of the IP header from the packet. The IP header's total length field represents the overall length of the IP packet, including both the IP header and the data portion.
|
||||
|
||||
```c
|
||||
__u8 doff;
|
||||
bpf_skb_load_bytes(skb, tcp_hdr_len + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff));
|
||||
doff &= 0xf0;
|
||||
doff >>= 4;
|
||||
doff *= 4;
|
||||
```
|
||||
|
||||
This piece of code is used to calculate the length of the TCP header. It loads the byte containing the Data Offset field (also known as the Header Length field) from the TCP header, which represents the length of the TCP header in units of 4 bytes. The Data Offset occupies the upper four bits of that byte, so the code masks off the lower four (reserved) bits with `0xf0`, shifts the value right by 4 bits, and finally multiplies it by 4 to obtain the actual length of the TCP header in bytes.
|
||||
|
||||
Key points to understand:
|
||||
|
||||
- TCP Header: The TCP header contains information related to the TCP protocol, such as source port, destination port, sequence number, acknowledgment number, flags (e.g., SYN, ACK, FIN), window size, and checksum.
|
||||
|
||||
```c
|
||||
payload_offset = ETH_HLEN + hdr_len + doff;
|
||||
payload_length = __bpf_ntohs(tlen) - hdr_len - doff;
|
||||
```
|
||||
|
||||
These two lines of code calculate the offset and length of the HTTP payload. They add the lengths of the Ethernet frame header, IP header, and TCP header together to obtain the offset of the data portion of the packet. Then, by subtracting the IP header length and the TCP header length from the IP total length field (converted to host byte order), they calculate the length of the HTTP data.
|
||||
|
||||
Key point:
|
||||
|
||||
- HTTP Request Payload: The actual data portion included in an HTTP request, typically consisting of the HTTP request headers and request body.
|
||||
|
||||
```c
|
||||
char line_buffer[7];
|
||||
if (payload_length < 7 || payload_offset < 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7);
|
||||
bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
|
||||
```
|
||||
|
||||
This portion of the code first checks whether the payload is shorter than 7 bytes (the `payload_offset < 0` test can never be true, because `payload_offset` is an unsigned `__u32`). If so, the packet cannot contain a recognizable HTTP start line, and the function returns 0. Otherwise, it loads the first 7 bytes of the payload into a character array named `line_buffer` and uses the `bpf_printk` function to print them to the kernel trace log for debugging and analysis.
|
||||
|
||||
```c
|
||||
if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "POST") != 0 &&
|
||||
bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
|
||||
bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
|
||||
bpf_strncmp(line_buffer, 4, "HTTP") != 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
This piece of code uses the `bpf_strncmp` function to compare the data in `line_buffer` with the HTTP request methods `GET`, `POST`, `PUT`, and `DELETE`, as well as the `HTTP` prefix that begins an HTTP response status line. If none of them match, the packet is treated as neither an HTTP request nor an HTTP response, and the function returns 0, indicating that it should not be processed.
|
||||
|
||||
```c
|
||||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||||
if (!e)
|
||||
return 0;
|
||||
```
|
||||
|
||||
This section of the code attempts to reserve a block of memory from the BPF ring buffer to store event information. If it cannot reserve the memory block, it returns 0. The BPF ring buffer is used to pass event data between the eBPF program and user space.
|
||||
|
||||
Key point:
|
||||
|
||||
- BPF Ring Buffer: The BPF ring buffer is a mechanism for passing data between eBPF programs and user space. It can be used to store event information for further processing or analysis by user space applications.
|
||||
|
||||
```c
|
||||
e->ip_proto = ip_proto;
|
||||
bpf_skb_load_bytes(skb, nhoff + hdr_len, &(e->ports), 4);
|
||||
e->pkt_type = skb->pkt_type;
|
||||
e->ifindex = skb->ifindex;
|
||||
|
||||
e->payload_length = payload_length;
|
||||
bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);
|
||||
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
|
||||
bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
|
||||
bpf_ringbuf_submit(e, 0);
|
||||
|
||||
return skb->len;
|
||||
```
|
||||
|
||||
Finally, this code segment stores the captured event information in the `e` structure and submits it to the BPF ring buffer. It includes information such as the captured IP protocol, source and destination ports, packet type, interface index, payload length, payload data, source IP address, and destination IP address. It then returns the length of the packet, indicating that the packet was successfully processed.
|
||||
|
||||
This code is primarily used to store captured event information for further processing. The BPF ring buffer is used to pass this information to user space for additional handling or logging.
|
||||
|
||||
In summary, this eBPF program's main task is to capture HTTP requests. It accomplishes this by parsing the Ethernet frame, IP header, and TCP header of incoming packets to determine if they contain HTTP requests. Information about the requests is then stored in the `so_event` structure and submitted to the BPF ring buffer. This is an efficient method for capturing HTTP traffic at the kernel level and is suitable for applications such as network monitoring and security analysis.
|
||||
|
||||
### Potential Limitations
|
||||
|
||||
The above code has some potential limitations, and one of the main limitations is that it cannot handle URLs that span multiple packets.
|
||||
|
||||
- Cross-Packet URLs: The code checks the URL in an HTTP request by parsing a single data packet. If the URL of an HTTP request spans multiple packets, it will only examine the URL in the first packet. This can lead to missing or partially capturing long URLs that span multiple data packets.
|
||||
|
||||
To address this issue, a solution often involves reassembling multiple packets to reconstruct the complete HTTP request. This may require implementing packet caching and assembly logic within the eBPF program and waiting to collect all relevant packets until the HTTP request is detected. This adds complexity and may require additional memory to handle cases where URLs span multiple packets.
|
||||
|
||||
### User-Space Code
|
||||
|
||||
The user-space code's main purpose is to create a raw socket and then attach the previously defined eBPF program in the kernel to that socket, allowing the eBPF program to capture and process network packets received on that socket. Here's an example of the user-space code:
|
||||
|
||||
```c
|
||||
/* Create raw socket for localhost interface */
|
||||
sock = open_raw_sock(interface);
|
||||
if (sock < 0) {
|
||||
err = -2;
|
||||
fprintf(stderr, "Failed to open raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Attach BPF program to raw socket */
|
||||
prog_fd = bpf_program__fd(skel->progs.socket_handler);
|
||||
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) {
|
||||
err = -3;
|
||||
fprintf(stderr, "Failed to attach to raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
```
|
||||
|
||||
1. `sock = open_raw_sock(interface);`: This line of code calls a custom function `open_raw_sock`, which is used to create a raw socket. Raw sockets allow a user-space application to handle network packets directly without going through the protocol stack. The `interface` parameter might specify the network interface from which to receive packets, determining where to capture packets from. If creating the socket fails, it returns a negative value, otherwise, it returns the file descriptor of the socket `sock`.
|
||||
2. If the value of `sock` is less than 0, indicating a failure to open the raw socket, it sets `err` to -2 and prints an error message on the standard error stream.
|
||||
3. `prog_fd = bpf_program__fd(skel->progs.socket_handler);`: This line of code retrieves the file descriptor of the socket filter program (`socket_handler`) previously defined in the eBPF program. It is necessary to attach this program to the socket. `skel` is a pointer to an eBPF program object, and it provides access to the program collection.
|
||||
4. `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))`: This line of code uses the `setsockopt` system call to attach the eBPF program to the raw socket. It sets the `SO_ATTACH_BPF` option and passes the file descriptor of the eBPF program to the option, letting the kernel know which eBPF program to apply to this socket. If the attachment is successful, the socket starts capturing and processing network packets received on it.
|
||||
5. If `setsockopt` fails, it sets `err` to -3 and prints an error message on the standard error stream.
|
||||
|
||||
### Compilation and Execution
|
||||
|
||||
The complete source code can be found at <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/23-http>. To compile and run the code:
|
||||
|
||||
```console
|
||||
$ git submodule update --init --recursive
|
||||
$ make
|
||||
BPF .output/sockfilter.bpf.o
|
||||
GEN-SKEL .output/sockfilter.skel.h
|
||||
CC .output/sockfilter.o
|
||||
BINARY sockfilter
|
||||
$ sudo ./sockfilter
|
||||
...
|
||||
```
|
||||
|
||||
In another terminal, start a simple web server using Python:
|
||||
|
||||
```console
|
||||
python3 -m http.server
|
||||
Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
|
||||
127.0.0.1 - - [18/Sep/2023 01:05:52] "GET / HTTP/1.1" 200 -
|
||||
```
|
||||
|
||||
You can use `curl` to make requests:
|
||||
|
||||
```console
|
||||
$ curl http://0.0.0.0:8000/
|
||||
<!DOCTYPE HTML>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Directory listing for /</title>
|
||||
....
|
||||
```
|
||||
|
||||
In the output of the eBPF program, you can see the content of the HTTP requests and responses:
|
||||
|
||||
```console
|
||||
127.0.0.1:34552(src) -> 127.0.0.1:8000(dst)
|
||||
payload: GET / HTTP/1.1
|
||||
Host: 0.0.0.0:8000
|
||||
User-Agent: curl/7.88.1
|
||||
...
|
||||
127.0.0.1:8000(src) -> 127.0.0.1:34552(dst)
|
||||
payload: HTTP/1.0 200 OK
|
||||
Server: SimpleHTTP/0.6 Python/3.11.4
|
||||
...
|
||||
```
|
||||
|
||||
It captures both request and response content.
|
||||
|
||||
## Capturing HTTP Traffic Using eBPF Syscall Tracepoints
|
||||
|
||||
eBPF provides a powerful mechanism for tracing system calls at the kernel level. In this example, we'll use eBPF to trace the `accept` and `read` system calls to capture HTTP traffic. Due to space limitations, we'll provide a brief overview of the code framework.
|
||||
|
||||
```c
|
||||
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct accept_args_t);
|
||||
} active_accept_args_map SEC(".maps");
|
||||
|
||||
// Define a tracepoint at the entry of the accept system call
|
||||
SEC("tracepoint/syscalls/sys_enter_accept")
|
||||
int sys_enter_accept(struct trace_event_raw_sys_enter *ctx)
|
||||
{
|
||||
u64 id = bpf_get_current_pid_tgid();
|
||||
// ... Get and store the arguments of the accept call
|
||||
bpf_map_update_elem(&active_accept_args_map, &id, &accept_args, BPF_ANY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Define a tracepoint at the exit of the accept system call
|
||||
SEC("tracepoint/syscalls/sys_exit_accept")
|
||||
int sys_exit_accept(struct trace_event_raw_sys_exit *ctx)
|
||||
{
|
||||
// ... Process the result of the accept call
|
||||
struct accept_args_t *args =
|
||||
bpf_map_lookup_elem(&active_accept_args_map, &id);
|
||||
// ... Get and store the socket file descriptor obtained from the accept call
|
||||
__u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd;
|
||||
bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY);
|
||||
// ...
|
||||
}
|
||||
|
||||
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct data_args_t);
|
||||
} active_read_args_map SEC(".maps");
|
||||
|
||||
// Define a tracepoint at the entry of the read system call
|
||||
SEC("tracepoint/syscalls/sys_enter_read")
|
||||
int sys_enter_read(struct trace_event_raw_sys_enter *ctx)
|
||||
{
|
||||
// ... Get and store the arguments of the read call
|
||||
bpf_map_update_elem(&active_read_args_map, &id, &read_args, BPF_ANY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Helper function to check if it's an HTTP connection
|
||||
static inline bool is_http_connection(const char *line_buffer, u64 bytes_count)
|
||||
{
|
||||
// ... Check if the data is an HTTP request or response
|
||||
}
|
||||
|
||||
// Helper function to process the read data
|
||||
static inline void process_data(struct trace_event_raw_sys_exit *ctx,
|
||||
u64 id, const struct data_args_t *args, u64 bytes_count)
|
||||
{
|
||||
// ... Process the read data, check if it's HTTP traffic, and send events
|
||||
if (is_http_connection(line_buffer, bytes_count))
|
||||
{
|
||||
// ...
|
||||
bpf_probe_read_kernel(&event.msg, read_size, args->buf);
|
||||
// ...
|
||||
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
|
||||
&event, sizeof(struct socket_data_event_t));
|
||||
}
|
||||
}
|
||||
|
||||
// Define a tracepoint at the exit of the read system call
|
||||
SEC("tracepoint/syscalls/sys_exit_read")
|
||||
int sys_exit_read(struct trace_event_raw_sys_exit *ctx)
|
||||
{
|
||||
// ... Process the result of the read call
|
||||
struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id);
|
||||
if (read_args != NULL)
|
||||
{
|
||||
process_data(ctx, id, read_args, bytes_count);
|
||||
}
|
||||
// ...
|
||||
return 0;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
```
|
||||
|
||||
This code briefly demonstrates how to use eBPF to trace system calls in the Linux kernel to capture HTTP traffic. Here's a detailed explanation of the hook locations and the flow, as well as the complete set of system calls that need to be hooked for comprehensive request tracing:
|
||||
|
||||
### Hook Locations and Flow
|
||||
|
||||
- The code uses eBPF Tracepoint functionality. Specifically, it defines a series of eBPF programs and binds them to specific system call Tracepoints to capture entry and exit events of these system calls.
|
||||
|
||||
- First, it defines two eBPF hash maps (`active_accept_args_map` and `active_read_args_map`) to store system call parameters. These maps are used to track `accept` and `read` system calls.
|
||||
|
||||
- Next, it defines multiple Tracepoint tracing programs, including:
|
||||
- `sys_enter_accept`: Defined at the entry of the `accept` system call, used to capture the arguments of the `accept` system call and store them in the hash map.
|
||||
- `sys_exit_accept`: Defined at the exit of the `accept` system call, used to process the result of the `accept` system call, including obtaining and storing the new socket file descriptor and related connection information.
|
||||
- `sys_enter_read`: Defined at the entry of the `read` system call, used to capture the arguments of the `read` system call and store them in the hash map.
|
||||
- `sys_exit_read`: Defined at the exit of the `read` system call, used to process the result of the `read` system call, including checking if the read data is HTTP traffic and sending events.
|
||||
|
||||
- In `sys_exit_accept` and `sys_exit_read`, there is also some data processing and event sending logic, such as checking if the data is an HTTP connection, assembling event data, and using `bpf_perf_event_output` to send events to user space for further processing.
|
||||
|
||||
### Complete Set of System Calls to Hook
|
||||
|
||||
To fully implement HTTP request tracing, the system calls that typically need to be hooked include:
|
||||
|
||||
- `socket`: Used to capture socket creation for tracking new connections.
|
||||
- `bind`: Used to obtain port information where the socket is bound.
|
||||
- `listen`: Used to start listening for connection requests.
|
||||
- `accept`: Used to accept connection requests and obtain new socket file descriptors.
|
||||
- `read`: Used to capture received data and check if it contains HTTP requests.
|
||||
- `write`: Used to capture sent data and check if it contains HTTP responses.
|
||||
|
||||
The provided code already covers the tracing of `accept` and `read` system calls. To complete HTTP request tracing, additional system calls need to be hooked, and corresponding logic needs to be implemented to handle the parameters and results of these system calls.
|
||||
|
||||
The complete source code can be found at <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/23-http>.
|
||||
|
||||
## Summary
|
||||
|
||||
In today's complex technological landscape, system observability has become crucial, especially in the context of microservices and cloud-native applications. This article explores how to leverage eBPF technology for tracing Layer 7 (application-layer) protocols, along with the challenges and solutions that may arise in this process. Here's a summary of the content covered in this article:
|
||||
|
||||
1. **Introduction**:
|
||||
- Modern applications often consist of multiple microservices and distributed components, making it essential to observe the behavior of the entire system.
|
||||
- Layer 7 protocols (such as HTTP, gRPC, and MQTT) provide detailed insights into application interactions, but monitoring these protocols can be challenging.
|
||||
|
||||
2. **Role of eBPF Technology**:
|
||||
- eBPF allows developers to dive deep into the kernel layer for real-time observation and analysis of system behavior without modifying or inserting application code.
|
||||
- eBPF technology offers a powerful tool for monitoring seven-layer protocols, especially in a microservices environment.
|
||||
|
||||
3. **Tracing Seven-Layer Protocols**:
|
||||
- The article discusses the challenges of tracing seven-layer protocols, including their complexity and dynamism.
|
||||
- Traditional network monitoring tools struggle with the complexity of seven-layer protocols.
|
||||
|
||||
4. **Applications of eBPF**:
|
||||
- eBPF provides two primary methods for tracing seven-layer protocols: socket filters and syscall tracing.
|
||||
- Both of these methods help capture network request data for protocols like HTTP and analyze them.
|
||||
|
||||
5. **eBPF Practical Tutorial**:
|
||||
- The article provides a practical eBPF tutorial demonstrating how to capture and analyze HTTP traffic using eBPF socket filters or syscall tracing.
|
||||
- The tutorial covers the development of eBPF programs, the use of the eBPF toolchain, and the implementation of HTTP request tracing.
|
||||
|
||||
Through this article, readers can gain a deep understanding of how to use eBPF technology for tracing Layer 7 protocols, particularly HTTP traffic. This knowledge will help enhance the monitoring and analysis of network traffic, thereby improving application performance and security. If you're interested in learning more about eBPF and its practical applications, you can visit our tutorial code repository at <https://github.com/eunomia-bpf/bpf-developer-tutorial> or our website at <https://eunomia.dev/tutorials/> for more examples and complete tutorials.
|
||||
|
||||
211
src/23-http/accept.bpf.c
Normal file
211
src/23-http/accept.bpf.c
Normal file
@@ -0,0 +1,211 @@
|
||||
#include "vmlinux.h"
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_endian.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include "accept.h"
|
||||
|
||||
// Identifies one accepted connection. sys_exit_accept fills it with the
// process id, the fd returned by accept and a creation timestamp.
struct conn_id_t
|
||||
{
|
||||
// Upper 32 bits of bpf_get_current_pid_tgid() at accept time.
u32 pid;
|
||||
// File descriptor returned by the accept syscall.
int fd;
|
||||
// bpf_ktime_get_ns() value taken when the connection was recorded.
__u64 tsid;
|
||||
};
|
||||
|
||||
// Per-connection state stored in conn_info_map.
struct conn_info_t
|
||||
{
|
||||
// Identity of the connection (pid, fd, creation timestamp).
struct conn_id_t conn_id;
|
||||
// NOTE(review): the three fields below are only zero-initialised by the
// programs in this file and never updated afterwards; presumably byte
// counters and an "HTTP detected" flag - confirm before relying on them.
__s64 wr_bytes;
|
||||
__s64 rd_bytes;
|
||||
bool is_http;
|
||||
};
|
||||
|
||||
// A struct describing the event that we send to the user mode upon a new connection.
|
||||
// NOTE(review): no program visible in this file references this struct;
// sys_exit_accept reports new connections with socket_data_event_t instead.
// Confirm whether it is still needed.
struct socket_open_event_t
|
||||
{
|
||||
// The time of the event.
|
||||
u64 timestamp_ns;
|
||||
|
||||
// A unique ID for the connection.
|
||||
struct conn_id_t conn_id;
|
||||
};
|
||||
|
||||
// Connections recorded by sys_exit_accept.
// Key: ((u64)pid << 32) | fd, value: the connection's conn_info_t.
// process_data looks entries up with the same key to decide whether a read()
// belongs to a tracked connection.
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 131072);
|
||||
__type(key, __u64);
|
||||
__type(value, struct conn_info_t);
|
||||
} conn_info_map SEC(".maps");
|
||||
|
||||
// Perf event array through which socket_data_event_t records are pushed to
// user space with bpf_perf_event_output(..., BPF_F_CURRENT_CPU, ...).
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} events SEC(".maps");
|
||||
|
||||
// Arguments of an in-flight accept syscall, saved at entry for the exit hook.
struct accept_args_t
|
||||
{
|
||||
// Second syscall argument (args[1]) as passed by the caller.
struct sockaddr_in *addr;
|
||||
};
|
||||
|
||||
// In-flight accept calls, keyed by the bpf_get_current_pid_tgid() value of the
// calling thread. Written by sys_enter_accept, consumed by sys_exit_accept.
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct accept_args_t);
|
||||
} active_accept_args_map SEC(".maps");
|
||||
|
||||
// Entry hook of the accept syscall: stash the caller's address argument under
// the current pid_tgid so that the exit hook can pair it with the result.
SEC("tracepoint/syscalls/sys_enter_accept")
int sys_enter_accept(struct trace_event_raw_sys_enter *ctx)
{
	u64 pid_tgid = bpf_get_current_pid_tgid();
	struct accept_args_t pending = {};

	// args[1] is the second syscall argument (the sockaddr pointer).
	pending.addr = (struct sockaddr_in *)BPF_CORE_READ(ctx, args[1]);

	bpf_map_update_elem(&active_accept_args_map, &pid_tgid, &pending, BPF_ANY);
	bpf_printk("enter_accept accept_args.addr: %llx\n", pending.addr);

	return 0;
}
|
||||
|
||||
// Exit hook of the accept syscall.
//
// Pairs the return value with the arguments saved by sys_enter_accept. For a
// successful accept it records the new connection in conn_info_map (keyed by
// pid << 32 | fd) and emits a socket_data_event_t with is_connection = true.
//
// Always returns 0. The pending-arguments entry is removed on every path that
// found one, so failed accepts do not leave stale entries behind.
SEC("tracepoint/syscalls/sys_exit_accept")
int sys_exit_accept(struct trace_event_raw_sys_exit *ctx)
{
	u64 id = bpf_get_current_pid_tgid();

	struct accept_args_t *args =
		bpf_map_lookup_elem(&active_accept_args_map, &id);
	if (args == NULL)
	{
		// No matching entry hook was seen for this thread.
		return 0;
	}
	bpf_printk("exit_accept accept_args.addr: %llx\n", args->addr);

	int ret_fd = (int)BPF_CORE_READ(ctx, ret);
	if (ret_fd <= 0)
	{
		// accept failed (or returned fd 0, which is not tracked):
		// nothing to record, but still drop the pending entry.
		bpf_map_delete_elem(&active_accept_args_map, &id);
		return 0;
	}

	struct conn_info_t conn_info = {};

	u32 pid = id >> 32;
	conn_info.conn_id.pid = pid;
	conn_info.conn_id.fd = ret_fd;
	conn_info.conn_id.tsid = bpf_ktime_get_ns();

	// Same key layout that process_data uses for its lookup.
	__u64 pid_fd = ((__u64)pid << 32) | (u32)ret_fd;
	bpf_map_update_elem(&conn_info_map, &pid_fd, &conn_info, BPF_ANY);

	struct socket_data_event_t open_event = {};
	open_event.timestamp_ns = bpf_ktime_get_ns();
	open_event.pid = conn_info.conn_id.pid;
	open_event.fd = conn_info.conn_id.fd;
	open_event.is_connection = true;

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &open_event, sizeof(struct socket_data_event_t));

	bpf_map_delete_elem(&active_accept_args_map, &id);

	// The original fell off the end of this int function without a value.
	return 0;
}
|
||||
|
||||
// Arguments of an in-flight read syscall, saved at entry for the exit hook.
struct data_args_t
|
||||
{
|
||||
// First syscall argument (args[0]): the file descriptor being read.
__s32 fd;
|
||||
// Second syscall argument (args[1]): the caller's destination buffer.
const char *buf;
|
||||
};
|
||||
|
||||
// In-flight read calls, keyed by the bpf_get_current_pid_tgid() value of the
// calling thread. Written by sys_enter_read, consumed by sys_exit_read.
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(max_entries, 4096);
|
||||
__type(key, u64);
|
||||
__type(value, struct data_args_t);
|
||||
} active_read_args_map SEC(".maps");
|
||||
|
||||
// Entry hook of the read syscall: remember which fd and buffer the calling
// thread passed, keyed by pid_tgid, so that the exit hook can inspect the data.
SEC("tracepoint/syscalls/sys_enter_read")
int sys_enter_read(struct trace_event_raw_sys_enter *ctx)
{
	u64 pid_tgid = bpf_get_current_pid_tgid();
	struct data_args_t pending = {};

	pending.fd = (int)BPF_CORE_READ(ctx, args[0]);          // fd argument
	pending.buf = (char *)BPF_CORE_READ(ctx, args[1]);      // buffer argument

	bpf_map_update_elem(&active_read_args_map, &pid_tgid, &pending, BPF_ANY);
	return 0;
}
|
||||
|
||||
// Returns true when the buffer looks like the start of an HTTP message:
// at least 6 bytes were transferred and the data begins with one of
// "GET", "POST", "PUT", "DELETE" (request) or "HTTP" (response).
static inline bool is_http_connection(const char *line_buffer, u64 bytes_count)
{
	// Too short to hold the longest prefix that is checked below.
	if (bytes_count < 6)
		return 0;

	// Prefixes are tested in the same order as before; evaluation stops at
	// the first match.
	return bpf_strncmp(line_buffer, 3, "GET") == 0 ||
	       bpf_strncmp(line_buffer, 4, "POST") == 0 ||
	       bpf_strncmp(line_buffer, 3, "PUT") == 0 ||
	       bpf_strncmp(line_buffer, 6, "DELETE") == 0 ||
	       bpf_strncmp(line_buffer, 4, "HTTP") == 0;
}
|
||||
|
||||
// Inspects the data returned by a completed read() and, when it starts like an
// HTTP message on a tracked connection, emits a socket_data_event_t.
//
// ctx         - tracepoint context, forwarded to bpf_perf_event_output
// id          - pid_tgid of the calling thread
// args        - fd and user buffer captured at syscall entry
// bytes_count - number of bytes the read returned (must be > 0)
static inline void process_data(struct trace_event_raw_sys_exit *ctx,
				u64 id, const struct data_args_t *args, u64 bytes_count)
{
	if (args->buf == NULL)
	{
		return;
	}

	// Only reads on fds recorded by sys_exit_accept are of interest. The key
	// is built with the same (u32) cast as in sys_exit_accept.
	u32 pid = id >> 32;
	u64 pid_fd = ((u64)pid << 32) | (u32)args->fd;
	struct conn_info_t *conn_info = bpf_map_lookup_elem(&conn_info_map, &pid_fd);
	if (conn_info == NULL)
	{
		return;
	}

	// args->buf is the caller's read() buffer, i.e. user-space memory, so it
	// has to be read with the *_user helper rather than the kernel one.
	char line_buffer[7] = {};
	if (bpf_probe_read_user(line_buffer, sizeof(line_buffer), args->buf) != 0)
	{
		return;
	}

	if (!is_http_connection(line_buffer, bytes_count))
	{
		return;
	}

	struct socket_data_event_t event = {};

	event.timestamp_ns = bpf_ktime_get_ns();
	event.is_connection = false;
	event.pid = conn_info->conn_id.pid;
	event.fd = conn_info->conn_id.fd;

	// Copy at most MAX_MSG_SIZE bytes and tell user space how many are valid.
	unsigned int read_size = bytes_count > MAX_MSG_SIZE ? MAX_MSG_SIZE : bytes_count;
	event.msg_size = read_size;
	bpf_probe_read_user(event.msg, read_size, args->buf);

	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
			      &event, sizeof(struct socket_data_event_t));
}
|
||||
|
||||
// Exit hook of the read syscall.
//
// When the read returned data, hands the saved arguments and the byte count to
// process_data. The saved arguments are removed on every path, including
// failed or empty reads, so the map does not accumulate stale entries.
SEC("tracepoint/syscalls/sys_exit_read")
int sys_exit_read(struct trace_event_raw_sys_exit *ctx)
{
	u64 id = bpf_get_current_pid_tgid();

	// Keep the result signed: failures are negative and must not be
	// mistaken for a huge byte count.
	long nread = BPF_CORE_READ(ctx, ret);

	if (nread > 0)
	{
		struct data_args_t *read_args = bpf_map_lookup_elem(&active_read_args_map, &id);
		if (read_args != NULL)
		{
			process_data(ctx, id, read_args, (u64)nread);
		}
	}

	bpf_map_delete_elem(&active_read_args_map, &id);

	return 0;
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
17
src/23-http/accept.h
Normal file
17
src/23-http/accept.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef BPF_HTTP_ACCEPT_TRACE_H
|
||||
#define BPF_HTTP_ACCEPT_TRACE_H
|
||||
|
||||
#define MAX_MSG_SIZE 256
|
||||
|
||||
// Event record sent from the BPF programs to user space through the perf
// buffer. Used both for "new connection" and for "data" events.
struct socket_data_event_t
|
||||
{
|
||||
// bpf_ktime_get_ns() value at the time the event was produced.
unsigned long long timestamp_ns;
|
||||
// Process id and file descriptor of the connection.
unsigned int pid;
|
||||
int fd;
|
||||
// true for an accept (connection) event, false for a data event.
bool is_connection;
|
||||
// NOTE(review): presumably the number of valid bytes in msg and the byte
// position of msg within the stream - confirm that the producer fills them.
unsigned int msg_size;
|
||||
unsigned long long pos;
|
||||
// Up to MAX_MSG_SIZE bytes copied from the read buffer; unused bytes stay
// zero because producers zero-initialise the struct.
char msg[MAX_MSG_SIZE];
|
||||
};
|
||||
|
||||
#endif // BPF_HTTP_ACCEPT_TRACE_H
|
||||
@@ -1,103 +0,0 @@
|
||||
/*
|
||||
* Copyright 2018- The Pixie Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
bpfwrapper2 "github.com/seek-ret/ebpf-training/workshop1/internal/bpfwrapper"
|
||||
"github.com/seek-ret/ebpf-training/workshop1/internal/connections"
|
||||
"github.com/seek-ret/ebpf-training/workshop1/internal/settings"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"os/user"
|
||||
"runtime/debug"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/iovisor/gobpf/bcc"
|
||||
)
|
||||
|
||||
// abortIfNotRoot checks the current user permissions, if the permissions are not elevated, we abort.
|
||||
func abortIfNotRoot() {
|
||||
current, err := user.Current()
|
||||
if err != nil {
|
||||
log.Panic(err)
|
||||
}
|
||||
|
||||
if current.Uid != "0" {
|
||||
log.Panic("sniffer must run under superuser privileges")
|
||||
}
|
||||
}
|
||||
|
||||
// recoverFromCrashes is a deferred function that catches all panics thrown by the application.
|
||||
func recoverFromCrashes() {
|
||||
if err := recover(); err != nil {
|
||||
log.Printf("Application crashed: %v\nstack: %s\n", err, string(debug.Stack()))
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
if len(os.Args) != 2 {
|
||||
fmt.Println("Usage: go run main.go <path to bpf source code>")
|
||||
os.Exit(1)
|
||||
}
|
||||
bpfSourceCodeFile := os.Args[1]
|
||||
bpfSourceCodeContent, err := ioutil.ReadFile(bpfSourceCodeFile)
|
||||
if err != nil {
|
||||
log.Panic(err)
|
||||
}
|
||||
|
||||
defer recoverFromCrashes()
|
||||
abortIfNotRoot()
|
||||
|
||||
if err := settings.InitRealTimeOffset(); err != nil {
|
||||
log.Printf("Failed fixing BPF clock, timings will be offseted: %v", err)
|
||||
}
|
||||
|
||||
// Catching all termination signals to perform a cleanup when being stopped.
|
||||
sig := make(chan os.Signal, 1)
|
||||
signal.Notify(sig, syscall.SIGHUP, syscall.SIGINT, syscall.SIGQUIT, syscall.SIGTERM)
|
||||
|
||||
bpfModule := bcc.NewModule(string(bpfSourceCodeContent), nil)
|
||||
if bpfModule == nil {
|
||||
log.Panic("bpf is nil")
|
||||
}
|
||||
defer bpfModule.Close()
|
||||
|
||||
connectionFactory := connections.NewFactory(time.Minute)
|
||||
go func() {
|
||||
for {
|
||||
connectionFactory.HandleReadyConnections()
|
||||
time.Sleep(10 * time.Second)
|
||||
}
|
||||
}()
|
||||
if err := bpfwrapper2.LaunchPerfBufferConsumers(bpfModule, connectionFactory); err != nil {
|
||||
log.Panic(err)
|
||||
}
|
||||
|
||||
// Lastly, after everything is ready and configured, attach the kprobes and start capturing traffic.
|
||||
if err := bpfwrapper2.AttachKprobes(bpfModule); err != nil {
|
||||
log.Panic(err)
|
||||
}
|
||||
log.Println("Sniffer is ready")
|
||||
<-sig
|
||||
log.Println("Signaled to terminate")
|
||||
}
|
||||
136
src/23-http/sockfilter.bpf.c
Normal file
136
src/23-http/sockfilter.bpf.c
Normal file
@@ -0,0 +1,136 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
/* Copyright (c) 2022 Jacky Yin */
|
||||
#include <stddef.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/socket.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_endian.h>
|
||||
#include "sockfilter.h"
|
||||
|
||||
#define IP_MF 0x2000
|
||||
#define IP_OFFSET 0x1FFF
|
||||
#define IP_TCP 6
|
||||
#define ETH_HLEN 14
|
||||
|
||||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||||
|
||||
/* Ring buffer (256 KiB) carrying struct so_event records from socket_handler
 * to user space.
 */
struct
|
||||
{
|
||||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||||
__uint(max_entries, 256 * 1024);
|
||||
} rb SEC(".maps");
|
||||
|
||||
// Taken from uapi/linux/tcp.h
|
||||
/* Local copy of the TCP header layout. socket_handler uses it only for
 * offsetof(); the data-offset nibble is read as a raw byte right after
 * ack_seq because offsetof() cannot be applied to bit-fields.
 * NOTE(review): the bit-field order below looks like the little-endian
 * variant of the kernel definition - confirm if the fields are ever accessed
 * directly.
 */
struct __tcphdr
|
||||
{
|
||||
__be16 source;
|
||||
__be16 dest;
|
||||
__be32 seq;
|
||||
__be32 ack_seq;
|
||||
__u16 res1 : 4, doff : 4, fin : 1, syn : 1, rst : 1, psh : 1, ack : 1, urg : 1, ece : 1, cwr : 1;
|
||||
__be16 window;
|
||||
__sum16 check;
|
||||
__be16 urg_ptr;
|
||||
};
|
||||
|
||||
/* Returns non-zero when the IPv4 header at offset nhoff belongs to a
 * fragmented datagram, i.e. the "more fragments" flag is set or the fragment
 * offset is non-zero.
 */
static inline int ip_is_fragment(struct __sk_buff *skb, __u32 nhoff)
{
	const __u32 field_off = nhoff + offsetof(struct iphdr, frag_off);
	__u16 raw;

	bpf_skb_load_bytes(skb, field_off, &raw, 2);

	/* The field is stored in network byte order. */
	__u16 host = __bpf_ntohs(raw);

	return host & (IP_MF | IP_OFFSET);
}
|
||||
|
||||
/* Socket filter attached to a raw packet socket.
 *
 * Parses Ethernet / IPv4 / TCP headers, and for segments whose payload starts
 * with an HTTP method or "HTTP" submits a struct so_event (addresses, ports,
 * first MAX_BUF_SIZE payload bytes) to the ring buffer.
 *
 * Returns 0 for packets that are not reported and skb->len for reported ones.
 */
SEC("socket")
int socket_handler(struct __sk_buff *skb)
{
	struct so_event *e;
	__u16 proto;
	__u32 nhoff = ETH_HLEN;
	__u32 ip_proto = 0;
	__u32 tcp_off;
	__u16 tlen;
	__u32 tot_len;
	__u32 payload_offset;
	__u32 payload_length;
	__u8 hdr_len;
	__u8 doff;
	/* 7 payload bytes plus a terminator, so bpf_printk("%s") stops here. */
	char line_buffer[8] = {};

	/* EtherType lives at byte 12 of the Ethernet header. */
	bpf_skb_load_bytes(skb, 12, &proto, 2);
	proto = __bpf_ntohs(proto);
	if (proto != ETH_P_IP)
		return 0;

	if (ip_is_fragment(skb, nhoff))
		return 0;

	// ip4 header lengths are variable
	// access ihl as a u8 (linux/include/linux/skbuff.h)
	bpf_skb_load_bytes(skb, ETH_HLEN, &hdr_len, sizeof(hdr_len));
	hdr_len &= 0x0f;
	hdr_len *= 4;

	/* verify hlen meets minimum size requirements */
	if (hdr_len < sizeof(struct iphdr))
	{
		return 0;
	}

	bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, protocol), &ip_proto, 1);

	if (ip_proto != IPPROTO_TCP)
	{
		return 0;
	}

	/* Offset of the TCP header inside the packet. */
	tcp_off = nhoff + hdr_len;
	bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, tot_len), &tlen, sizeof(tlen));
	tot_len = __bpf_ntohs(tlen);

	// read the first byte past __tcphdr->ack_seq, we can't do offsetof bit fields
	bpf_skb_load_bytes(skb, tcp_off + offsetof(struct __tcphdr, ack_seq) + 4, &doff, sizeof(doff));
	doff &= 0xf0; // clean-up res1
	doff >>= 4;   // move the upper 4 bits to low
	doff *= 4;    // convert to bytes length

	/* Reject malformed lengths: a TCP header shorter than the fixed part, or
	 * an IP total length smaller than the two headers, would otherwise make
	 * the unsigned payload_length wrap around to a huge value.
	 */
	if (doff < sizeof(struct __tcphdr) || tot_len < (__u32)hdr_len + doff)
	{
		return 0;
	}

	payload_offset = ETH_HLEN + hdr_len + doff;
	payload_length = tot_len - hdr_len - doff;

	if (payload_length < 7)
	{
		return 0;
	}
	if (bpf_skb_load_bytes(skb, payload_offset, line_buffer, 7) < 0)
	{
		return 0;
	}
	bpf_printk("%d len %d buffer: %s", payload_offset, payload_length, line_buffer);
	if (bpf_strncmp(line_buffer, 3, "GET") != 0 &&
	    bpf_strncmp(line_buffer, 4, "POST") != 0 &&
	    bpf_strncmp(line_buffer, 3, "PUT") != 0 &&
	    bpf_strncmp(line_buffer, 6, "DELETE") != 0 &&
	    bpf_strncmp(line_buffer, 4, "HTTP") != 0)
	{
		return 0;
	}

	/* reserve sample from BPF ringbuf */
	e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
	if (!e)
		return 0;

	e->ip_proto = ip_proto;
	/* Source and destination port are the first 4 bytes of the TCP header. */
	bpf_skb_load_bytes(skb, tcp_off, &(e->ports), 4);
	e->pkt_type = skb->pkt_type;
	e->ifindex = skb->ifindex;

	e->payload_length = payload_length;
	/* NOTE(review): this always requests MAX_BUF_SIZE bytes; when fewer bytes
	 * remain in the packet the load presumably fails and the payload is left
	 * without data - confirm and consider a bounded variable-length copy.
	 */
	bpf_skb_load_bytes(skb, payload_offset, e->payload, MAX_BUF_SIZE);

	bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, saddr), &(e->src_addr), 4);
	bpf_skb_load_bytes(skb, nhoff + offsetof(struct iphdr, daddr), &(e->dst_addr), 4);
	bpf_ringbuf_submit(e, 0);

	return skb->len;
}
|
||||
149
src/23-http/sockfilter.c
Normal file
149
src/23-http/sockfilter.c
Normal file
@@ -0,0 +1,149 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
/* Copyright (c) 2022 Jacky Yin */
|
||||
#include <argp.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <assert.h>
|
||||
#include <bpf/libbpf.h>
|
||||
#include <linux/if_packet.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/in.h>
|
||||
#include <net/if.h>
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/socket.h>
|
||||
#include <unistd.h>
|
||||
#include "sockfilter.h"
|
||||
#include "sockfilter.skel.h"
|
||||
|
||||
static int open_raw_sock(const char *name)
|
||||
{
|
||||
struct sockaddr_ll sll;
|
||||
int sock;
|
||||
|
||||
sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
|
||||
if (sock < 0) {
|
||||
fprintf(stderr, "Failed to create raw socket\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
memset(&sll, 0, sizeof(sll));
|
||||
sll.sll_family = AF_PACKET;
|
||||
sll.sll_ifindex = if_nametoindex(name);
|
||||
sll.sll_protocol = htons(ETH_P_ALL);
|
||||
if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
|
||||
fprintf(stderr, "Failed to bind to %s: %s\n", name, strerror(errno));
|
||||
close(sock);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return sock;
|
||||
}
|
||||
|
||||
/* libbpf log callback: forwards every message, regardless of level, to
 * stderr and returns the vfprintf() result.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||||
{
|
||||
return vfprintf(stderr, format, args);
|
||||
}
|
||||
|
||||
/* Formats a host-byte-order IPv4 address as dotted decimal into dst, which
 * must have room for 16 bytes ("255.255.255.255" plus the terminator).
 */
static inline void ltoa(uint32_t addr, char *dst)
{
	const unsigned int octet1 = (addr >> 24) & 0xFF;
	const unsigned int octet2 = (addr >> 16) & 0xFF;
	const unsigned int octet3 = (addr >> 8) & 0xFF;
	const unsigned int octet4 = addr & 0xFF;

	snprintf(dst, 16, "%u.%u.%u.%u", octet1, octet2, octet3, octet4);
}
|
||||
|
||||
static int handle_event(void *ctx, void *data, size_t data_sz)
|
||||
{
|
||||
const struct so_event *e = data;
|
||||
char ifname[IF_NAMESIZE];
|
||||
char sstr[16] = {}, dstr[16] = {};
|
||||
|
||||
if (e->pkt_type != PACKET_HOST)
|
||||
return 0;
|
||||
|
||||
if (e->ip_proto < 0 || e->ip_proto >= IPPROTO_MAX)
|
||||
return 0;
|
||||
|
||||
if (!if_indextoname(e->ifindex, ifname))
|
||||
return 0;
|
||||
|
||||
ltoa(ntohl(e->src_addr), sstr);
|
||||
ltoa(ntohl(e->dst_addr), dstr);
|
||||
|
||||
printf("%s:%d(src) -> %s:%d(dst)\n", sstr, ntohs(e->port16[0]), dstr, ntohs(e->port16[1]));
|
||||
printf("payload: %s\n", e->payload);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static volatile bool exiting = false;
|
||||
|
||||
/* Signal handler: sets the `exiting` flag that the poll loop in main() checks.
 * The signal number is ignored.
 */
static void sig_handler(int sig)
|
||||
{
|
||||
exiting = true;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct ring_buffer *rb = NULL;
|
||||
struct sockfilter_bpf *skel;
|
||||
int err, prog_fd, sock;
|
||||
|
||||
const char* interface = "lo";
|
||||
|
||||
/* Set up libbpf errors and debug info callback */
|
||||
libbpf_set_print(libbpf_print_fn);
|
||||
|
||||
/* Cleaner handling of Ctrl-C */
|
||||
signal(SIGINT, sig_handler);
|
||||
signal(SIGTERM, sig_handler);
|
||||
|
||||
/* Load and verify BPF programs*/
|
||||
skel = sockfilter_bpf__open_and_load();
|
||||
if (!skel) {
|
||||
fprintf(stderr, "Failed to open and load BPF skeleton\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* Set up ring buffer polling */
|
||||
rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
|
||||
if (!rb) {
|
||||
err = -1;
|
||||
fprintf(stderr, "Failed to create ring buffer\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Create raw socket for localhost interface */
|
||||
sock = open_raw_sock(interface);
|
||||
if (sock < 0) {
|
||||
err = -2;
|
||||
fprintf(stderr, "Failed to open raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Attach BPF program to raw socket */
|
||||
prog_fd = bpf_program__fd(skel->progs.socket_handler);
|
||||
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))) {
|
||||
err = -3;
|
||||
fprintf(stderr, "Failed to attach to raw socket\n");
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* Process events */
|
||||
while (!exiting) {
|
||||
err = ring_buffer__poll(rb, 100 /* timeout, ms */);
|
||||
/* Ctrl-C will cause -EINTR */
|
||||
if (err == -EINTR) {
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
if (err < 0) {
|
||||
fprintf(stderr, "Error polling perf buffer: %d\n", err);
|
||||
break;
|
||||
}
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
ring_buffer__free(rb);
|
||||
sockfilter_bpf__destroy(skel);
|
||||
return -err;
|
||||
}
|
||||
22
src/23-http/sockfilter.h
Normal file
22
src/23-http/sockfilter.h
Normal file
@@ -0,0 +1,22 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
/* Copyright (c) 2022 Jacky Yin */
|
||||
#ifndef __SOCKFILTER_H
|
||||
#define __SOCKFILTER_H
|
||||
|
||||
#define MAX_BUF_SIZE 64
|
||||
|
||||
/* Record passed from the socket filter to user space for each reported
 * TCP segment.
 */
struct so_event {
|
||||
/* IPv4 source / destination address, copied raw from the IP header
 * (network byte order; user space converts with ntohl()). */
__be32 src_addr;
|
||||
__be32 dst_addr;
|
||||
/* First four bytes of the TCP header: port16[0] is the source port,
 * port16[1] the destination port, both in network byte order. */
union {
|
||||
__be32 ports;
|
||||
__be16 port16[2];
|
||||
};
|
||||
/* IP protocol number read from the IP header. */
__u32 ip_proto;
|
||||
/* skb->pkt_type and skb->ifindex of the packet. */
__u32 pkt_type;
|
||||
__u32 ifindex;
|
||||
/* TCP payload length derived from the IP total length and header sizes. */
__u32 payload_length;
|
||||
/* Up to MAX_BUF_SIZE payload bytes; not guaranteed to be NUL-terminated. */
__u8 payload[MAX_BUF_SIZE];
|
||||
};
|
||||
|
||||
#endif /* __SOCKFILTER_H */
|
||||
@@ -1,497 +0,0 @@
|
||||
// +build ignore
|
||||
|
||||
/*
|
||||
* Copyright 2018- The Pixie Authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include <linux/in6.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/socket.h>
|
||||
#include <net/inet_sock.h>
|
||||
|
||||
// Defines
|
||||
|
||||
#define socklen_t size_t
|
||||
|
||||
// Data buffer message size. BPF can submit at most this amount of data to a perf buffer.
|
||||
// Kernel size limit is 32KiB. See https://github.com/iovisor/bcc/issues/2519 for more details.
|
||||
#define MAX_MSG_SIZE 30720 // 30KiB
|
||||
|
||||
// This defines how many chunks a perf_submit can support.
|
||||
// This applies to messages that are over MAX_MSG_SIZE,
|
||||
// and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
|
||||
#define CHUNK_LIMIT 4
|
||||
|
||||
enum traffic_direction_t {
|
||||
kEgress,
|
||||
kIngress,
|
||||
};
|
||||
|
||||
// Structs
|
||||
|
||||
// A struct representing a unique ID that is composed of the pid, the file
|
||||
// descriptor and the creation time of the struct.
|
||||
struct conn_id_t {
|
||||
// Process ID
|
||||
uint32_t pid;
|
||||
// The file descriptor to the opened network connection.
|
||||
int32_t fd;
|
||||
// Timestamp at the initialization of the struct.
|
||||
uint64_t tsid;
|
||||
};
|
||||
|
||||
// This struct contains information collected when a connection is established,
|
||||
// via an accept4() syscall.
|
||||
struct conn_info_t {
|
||||
// Connection identifier.
|
||||
struct conn_id_t conn_id;
|
||||
|
||||
// The number of bytes written/read on this connection.
|
||||
int64_t wr_bytes;
|
||||
int64_t rd_bytes;
|
||||
|
||||
// A flag indicating we identified the connection as HTTP.
|
||||
bool is_http;
|
||||
};
|
||||
|
||||
// A helper struct that holds the addr argument of the syscall.
|
||||
struct accept_args_t {
|
||||
struct sockaddr_in* addr;
|
||||
};
|
||||
|
||||
// A helper struct to cache the input arguments of read/write syscalls between the
|
||||
// entry hook and the exit hook.
|
||||
struct data_args_t {
|
||||
int32_t fd;
|
||||
const char* buf;
|
||||
};
|
||||
|
||||
// A helper struct that holds the input arguments of the close syscall.
|
||||
struct close_args_t {
|
||||
int32_t fd;
|
||||
};
|
||||
|
||||
// A struct describing the event that we send to the user mode upon a new connection.
|
||||
struct socket_open_event_t {
|
||||
// The time of the event.
|
||||
uint64_t timestamp_ns;
|
||||
// A unique ID for the connection.
|
||||
struct conn_id_t conn_id;
|
||||
// The address of the client.
|
||||
struct sockaddr_in addr;
|
||||
};
|
||||
|
||||
// The event sent to user mode when a tracked connection is closed.
|
||||
struct socket_close_event_t {
|
||||
// Timestamp of the close syscall (bpf_ktime_get_ns()).
|
||||
uint64_t timestamp_ns;
|
||||
// The unique ID of the connection
|
||||
struct conn_id_t conn_id;
|
||||
// Total number of bytes written on that connection
|
||||
int64_t wr_bytes;
|
||||
// Total number of bytes read on that connection
|
||||
int64_t rd_bytes;
|
||||
};
|
||||
|
||||
// The event sent to user mode for each captured chunk of read/written data:
// a fixed-size attribute header followed by up to MAX_MSG_SIZE payload bytes.
struct socket_data_event_t {
|
||||
// We split attributes into a separate struct, because BPF gets upset if you do lots of
|
||||
// size arithmetic. This makes it so that it's attributes followed by message.
|
||||
struct attr_t {
|
||||
// The timestamp when syscall completed (return probe was triggered).
|
||||
uint64_t timestamp_ns;
|
||||
|
||||
// Connection identifier (PID, FD, etc.).
|
||||
struct conn_id_t conn_id;
|
||||
|
||||
// Whether the payload in msg was written (kEgress) or read (kIngress) by the
|
||||
// traced process; the consumer uses it to decide how to interpret the data.
|
||||
enum traffic_direction_t direction;
|
||||
|
||||
// The number of bytes actually copied into msg (at most MAX_MSG_SIZE). Only this
|
||||
// many payload bytes are submitted, to minimize the amount of data transferred.
|
||||
uint32_t msg_size;
|
||||
|
||||
// A 0-based position number for this event on the connection, in terms of byte position.
|
||||
// The position is for the first byte of this message.
|
||||
uint64_t pos;
|
||||
} attr;
|
||||
// Captured payload bytes; only the first attr.msg_size bytes are meaningful.
char msg[MAX_MSG_SIZE];
|
||||
};
|
||||
|
||||
// Maps
|
||||
|
||||
// A map of the active connections, named conn_info_map. The key is a uint64_t
|
||||
// built as (pid << 32) | fd, the value is a struct conn_info_t,
|
||||
// and the map holds at most 131072 entries (the last argument is an entry count, not a byte size).
|
||||
BPF_HASH(conn_info_map, uint64_t, struct conn_info_t, 131072);
|
||||
// A helper map that caches the input arguments of the accept/accept4 syscalls
|
||||
// between the entry hook and the return hook, keyed by bpf_get_current_pid_tgid().
|
||||
BPF_HASH(active_accept_args_map, uint64_t, struct accept_args_t);
|
||||
// Perf buffer used to send data events to user mode.
|
||||
BPF_PERF_OUTPUT(socket_data_events);
|
||||
// A perf buffer that allows us to send events from kernel to user mode.
|
||||
// This perf buffer is dedicated to one type of event - open events.
|
||||
BPF_PERF_OUTPUT(socket_open_events);
|
||||
// Perf buffer used to send close events to user mode.
|
||||
BPF_PERF_OUTPUT(socket_close_events);
|
||||
// Single-slot per-CPU array used as scratch storage for the data event being built
// (presumably because socket_data_event_t is too large for the BPF stack - TODO confirm).
BPF_PERCPU_ARRAY(socket_data_event_buffer_heap, struct socket_data_event_t, 1);
|
||||
// Helper map to store write syscall arguments between entry and exit hooks.
BPF_HASH(active_write_args_map, uint64_t, struct data_args_t);
|
||||
// Helper map to store read syscall arguments between entry and exit hooks.
|
||||
BPF_HASH(active_read_args_map, uint64_t, struct data_args_t);
|
||||
// A helper map to store close syscall arguments between the entry and exit hooks.
|
||||
BPF_HASH(active_close_args_map, uint64_t, struct close_args_t);
|
||||
|
||||
// Helper functions
|
||||
|
||||
// Packs a tgid (thread group ID, i.e. the user-visible process ID) and a file
// descriptor into a single 64-bit key: the tgid occupies the upper 32 bits and
// the fd, reinterpreted as unsigned, occupies the lower 32 bits.
static __inline uint64_t gen_tgid_fd(uint32_t tgid, int fd) {
    const uint64_t upper = (uint64_t)tgid << 32;
    const uint64_t lower = (uint32_t)fd;
    return upper | lower;
}
|
||||
|
||||
// An helper function that checks if the syscall finished successfully and if it did
|
||||
// saves the new connection in a dedicated map of connections
|
||||
static __inline void process_syscall_accept(struct pt_regs* ctx, uint64_t id, const struct accept_args_t* args) {
|
||||
// Extracting the return code, and checking if it represent a failure,
|
||||
// if it does, we abort the as we have nothing to do.
|
||||
int ret_fd = PT_REGS_RC(ctx);
|
||||
if (ret_fd <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
struct conn_info_t conn_info = {};
|
||||
uint32_t pid = id >> 32;
|
||||
conn_info.conn_id.pid = pid;
|
||||
conn_info.conn_id.fd = ret_fd;
|
||||
conn_info.conn_id.tsid = bpf_ktime_get_ns();
|
||||
|
||||
uint64_t pid_fd = ((uint64_t)pid << 32) | (uint32_t)ret_fd;
|
||||
// Saving the connection info in a global map, so in the other syscalls
|
||||
// (read, write and close) we will be able to know that we have seen
|
||||
// the connection
|
||||
conn_info_map.update(&pid_fd, &conn_info);
|
||||
|
||||
// Sending an open event to the user mode, to let the user mode know that we
|
||||
// have identified a new connection.
|
||||
struct socket_open_event_t open_event = {};
|
||||
open_event.timestamp_ns = bpf_ktime_get_ns();
|
||||
open_event.conn_id = conn_info.conn_id;
|
||||
bpf_probe_read(&open_event.addr, sizeof(open_event.addr), args->addr);
|
||||
|
||||
socket_open_events.perf_submit(ctx, &open_event, sizeof(struct socket_open_event_t));
|
||||
}
|
||||
|
||||
static inline __attribute__((__always_inline__)) void process_syscall_close(struct pt_regs* ctx, uint64_t id,
|
||||
const struct close_args_t* close_args) {
|
||||
int ret_val = PT_REGS_RC(ctx);
|
||||
if (ret_val < 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t tgid = id >> 32;
|
||||
uint64_t tgid_fd = gen_tgid_fd(tgid, close_args->fd);
|
||||
struct conn_info_t* conn_info = conn_info_map.lookup(&tgid_fd);
|
||||
if (conn_info == NULL) {
|
||||
// The FD being closed does not represent an IPv4 socket FD.
|
||||
return;
|
||||
}
|
||||
|
||||
// Send to the user mode an event indicating the connection was closed.
|
||||
struct socket_close_event_t close_event = {};
|
||||
close_event.timestamp_ns = bpf_ktime_get_ns();
|
||||
close_event.conn_id = conn_info->conn_id;
|
||||
close_event.rd_bytes = conn_info->rd_bytes;
|
||||
close_event.wr_bytes = conn_info->wr_bytes;
|
||||
|
||||
socket_close_events.perf_submit(ctx, &close_event, sizeof(struct socket_close_event_t));
|
||||
|
||||
// Remove the connection from the mapping.
|
||||
conn_info_map.delete(&tgid_fd);
|
||||
}
|
||||
|
||||
static inline __attribute__((__always_inline__)) bool is_http_connection(struct conn_info_t* conn_info, const char* buf, size_t count) {
|
||||
// If the connection was already identified as HTTP connection, no need to re-check it.
|
||||
if (conn_info->is_http) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// The minimum length of http request or response.
|
||||
if (count < 16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool res = false;
|
||||
if (buf[0] == 'H' && buf[1] == 'T' && buf[2] == 'T' && buf[3] == 'P') {
|
||||
res = true;
|
||||
}
|
||||
if (buf[0] == 'G' && buf[1] == 'E' && buf[2] == 'T') {
|
||||
res = true;
|
||||
}
|
||||
if (buf[0] == 'P' && buf[1] == 'O' && buf[2] == 'S' && buf[3] == 'T') {
|
||||
res = true;
|
||||
}
|
||||
|
||||
if (res) {
|
||||
conn_info->is_http = true;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static __inline void perf_submit_buf(struct pt_regs* ctx, const enum traffic_direction_t direction,
|
||||
const char* buf, size_t buf_size, size_t offset,
|
||||
struct conn_info_t* conn_info,
|
||||
struct socket_data_event_t* event) {
|
||||
switch (direction) {
|
||||
case kEgress:
|
||||
event->attr.pos = conn_info->wr_bytes + offset;
|
||||
break;
|
||||
case kIngress:
|
||||
event->attr.pos = conn_info->rd_bytes + offset;
|
||||
break;
|
||||
}
|
||||
|
||||
// Note that buf_size_minus_1 will be positive due to the if-statement above.
|
||||
size_t buf_size_minus_1 = buf_size - 1;
|
||||
|
||||
// Clang is too smart for us, and tries to remove some of the obvious hints we are leaving for the
|
||||
// BPF verifier. So we add this NOP volatile statement, so clang can't optimize away some of our
|
||||
// if-statements below.
|
||||
// By telling clang that buf_size_minus_1 is both an input and output to some black box assembly
|
||||
// code, clang has to discard any assumptions on what values this variable can take.
|
||||
asm volatile("" : "+r"(buf_size_minus_1) :);
|
||||
|
||||
buf_size = buf_size_minus_1 + 1;
|
||||
|
||||
// 4.14 kernels reject bpf_probe_read with size that they may think is zero.
|
||||
// Without the if statement, it somehow can't reason that the bpf_probe_read is non-zero.
|
||||
size_t amount_copied = 0;
|
||||
if (buf_size_minus_1 < MAX_MSG_SIZE) {
|
||||
bpf_probe_read(&event->msg, buf_size, buf);
|
||||
amount_copied = buf_size;
|
||||
} else {
|
||||
bpf_probe_read(&event->msg, MAX_MSG_SIZE, buf);
|
||||
amount_copied = MAX_MSG_SIZE;
|
||||
}
|
||||
|
||||
// If-statement is redundant, but is required to keep the 4.14 verifier happy.
|
||||
if (amount_copied > 0) {
|
||||
event->attr.msg_size = amount_copied;
|
||||
socket_data_events.perf_submit(ctx, event, sizeof(event->attr) + amount_copied);
|
||||
}
|
||||
}
|
||||
|
||||
static __inline void perf_submit_wrapper(struct pt_regs* ctx,
|
||||
const enum traffic_direction_t direction, const char* buf,
|
||||
const size_t buf_size, struct conn_info_t* conn_info,
|
||||
struct socket_data_event_t* event) {
|
||||
int bytes_sent = 0;
|
||||
unsigned int i;
|
||||
#pragma unroll
|
||||
for (i = 0; i < CHUNK_LIMIT; ++i) {
|
||||
const int bytes_remaining = buf_size - bytes_sent;
|
||||
const size_t current_size = (bytes_remaining > MAX_MSG_SIZE && (i != CHUNK_LIMIT - 1)) ? MAX_MSG_SIZE : bytes_remaining;
|
||||
perf_submit_buf(ctx, direction, buf + bytes_sent, current_size, bytes_sent, conn_info, event);
|
||||
bytes_sent += current_size;
|
||||
if (buf_size == bytes_sent) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline __attribute__((__always_inline__)) void process_data(struct pt_regs* ctx, uint64_t id,
|
||||
enum traffic_direction_t direction,
|
||||
const struct data_args_t* args, ssize_t bytes_count) {
|
||||
// Always check access to pointer before accessing them.
|
||||
if (args->buf == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// For read and write syscall, the return code is the number of bytes written or read, so zero means nothing
|
||||
// was written or read, and negative means that the syscall failed. Anyhow, we have nothing to do with that syscall.
|
||||
if (bytes_count <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
uint32_t pid = id >> 32;
|
||||
uint64_t pid_fd = ((uint64_t)pid << 32) | (uint32_t)args->fd;
|
||||
struct conn_info_t* conn_info = conn_info_map.lookup(&pid_fd);
|
||||
if (conn_info == NULL) {
|
||||
// The FD being read/written does not represent an IPv4 socket FD.
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if the connection is already HTTP, or check if that's a new connection, check protocol and return true if that's HTTP.
|
||||
if (is_http_connection(conn_info, args->buf, bytes_count)) {
|
||||
// allocate new event.
|
||||
uint32_t kZero = 0;
|
||||
struct socket_data_event_t* event = socket_data_event_buffer_heap.lookup(&kZero);
|
||||
if (event == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Fill the metadata of the data event.
|
||||
event->attr.timestamp_ns = bpf_ktime_get_ns();
|
||||
event->attr.direction = direction;
|
||||
event->attr.conn_id = conn_info->conn_id;
|
||||
|
||||
perf_submit_wrapper(ctx, direction, args->buf, bytes_count, conn_info, event);
|
||||
}
|
||||
|
||||
// Update the conn_info total written/read bytes.
|
||||
switch (direction) {
|
||||
case kEgress:
|
||||
conn_info->wr_bytes += bytes_count;
|
||||
break;
|
||||
case kIngress:
|
||||
conn_info->rd_bytes += bytes_count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Hooks
|
||||
// Entry probe for accept(): stashes the caller's addr pointer, keyed by the
// current pid/tgid value, so the return probe can read the peer address.
int syscall__probe_entry_accept(struct pt_regs* ctx, int sockfd, struct sockaddr* addr, socklen_t* addrlen) {
    struct accept_args_t stash = {};
    stash.addr = (struct sockaddr_in *)addr;

    uint64_t caller = bpf_get_current_pid_tgid();
    active_accept_args_map.update(&caller, &stash);
    return 0;
}
|
||||
|
||||
int syscall__probe_ret_accept(struct pt_regs* ctx) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
// Pulling the addr from the map.
|
||||
struct accept_args_t* accept_args = active_accept_args_map.lookup(&id);
|
||||
if (accept_args != NULL) {
|
||||
process_syscall_accept(ctx, id, accept_args);
|
||||
}
|
||||
|
||||
active_accept_args_map.delete(&id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Hooking the entry of accept4
|
||||
// the signature of the syscall is int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
|
||||
int syscall__probe_entry_accept4(struct pt_regs* ctx, int sockfd, struct sockaddr* addr, socklen_t* addrlen) {
|
||||
// Getting a unique ID for the relevant thread in the relevant pid.
|
||||
// That way we can link different calls from the same thread.
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
// Keep the addr in a map to use during the accpet4 exit hook.
|
||||
struct accept_args_t accept_args = {};
|
||||
accept_args.addr = (struct sockaddr_in *)addr;
|
||||
active_accept_args_map.update(&id, &accept_args);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Hooking the exit of accept4
|
||||
int syscall__probe_ret_accept4(struct pt_regs* ctx) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
// Pulling the addr from the map.
|
||||
struct accept_args_t* accept_args = active_accept_args_map.lookup(&id);
|
||||
// If the id exist in the map, we will get a non empty pointer that holds
|
||||
// the input address argument from the entry of the syscall.
|
||||
if (accept_args != NULL) {
|
||||
process_syscall_accept(ctx, id, accept_args);
|
||||
}
|
||||
|
||||
// Anyway, in the end clean the map.
|
||||
active_accept_args_map.delete(&id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// original signature: ssize_t write(int fd, const void *buf, size_t count);
|
||||
int syscall__probe_entry_write(struct pt_regs* ctx, int fd, char* buf, size_t count) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
struct data_args_t write_args = {};
|
||||
write_args.fd = fd;
|
||||
write_args.buf = buf;
|
||||
active_write_args_map.update(&id, &write_args);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int syscall__probe_ret_write(struct pt_regs* ctx) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
ssize_t bytes_count = PT_REGS_RC(ctx); // Also stands for return code.
|
||||
|
||||
// Unstash arguments, and process syscall.
|
||||
struct data_args_t* write_args = active_write_args_map.lookup(&id);
|
||||
if (write_args != NULL) {
|
||||
process_data(ctx, id, kEgress, write_args, bytes_count);
|
||||
}
|
||||
|
||||
active_write_args_map.delete(&id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// original signature: ssize_t read(int fd, void *buf, size_t count);
|
||||
int syscall__probe_entry_read(struct pt_regs* ctx, int fd, char* buf, size_t count) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
// Stash arguments.
|
||||
struct data_args_t read_args = {};
|
||||
read_args.fd = fd;
|
||||
read_args.buf = buf;
|
||||
active_read_args_map.update(&id, &read_args);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int syscall__probe_ret_read(struct pt_regs* ctx) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
|
||||
// The return code the syscall is the number of bytes read as well.
|
||||
ssize_t bytes_count = PT_REGS_RC(ctx);
|
||||
struct data_args_t* read_args = active_read_args_map.lookup(&id);
|
||||
if (read_args != NULL) {
|
||||
// kIngress is an enum value that let's the process_data function
|
||||
// to know whether the input buffer is incoming or outgoing.
|
||||
process_data(ctx, id, kIngress, read_args, bytes_count);
|
||||
}
|
||||
|
||||
active_read_args_map.delete(&id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// original signature: int close(int fd)
|
||||
int syscall__probe_entry_close(struct pt_regs* ctx, int fd) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
struct close_args_t close_args;
|
||||
close_args.fd = fd;
|
||||
active_close_args_map.update(&id, &close_args);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int syscall__probe_ret_close(struct pt_regs* ctx) {
|
||||
uint64_t id = bpf_get_current_pid_tgid();
|
||||
const struct close_args_t* close_args = active_close_args_map.lookup(&id);
|
||||
if (close_args != NULL) {
|
||||
process_syscall_close(ctx, id, close_args);
|
||||
}
|
||||
|
||||
active_close_args_map.delete(&id);
|
||||
return 0;
|
||||
}
|
||||
@@ -160,6 +160,7 @@ eunomia-bpf 是一个结合 Wasm 的开源 eBPF 动态加载运行时和开发
|
||||
|
||||
```console
|
||||
$ sudo ./ecli package.json
|
||||
TIME PID COMM SUCCESS
|
||||
```
|
||||
|
||||
这个程序会对任何试图使用 `ptrace` 系统调用的程序,例如 `strace`,发出 `SIGKILL` 信号。
|
||||
|
||||
@@ -1,24 +1,198 @@
|
||||
# eBPF Practical Tutorial: Terminate Malicious Processes Using bpf_send_signal
|
||||
# Using bpf_send_signal to Terminate Malicious Processes in eBPF
|
||||
|
||||
Compile:
|
||||
eBPF (Extended Berkeley Packet Filter) is a revolutionary technology in the Linux kernel that allows users to execute custom programs in kernel space without modifying the kernel source code or loading any kernel modules. This provides developers with great flexibility to observe, modify, and control the Linux system.
|
||||
|
||||
This article introduces how to use the `bpf_send_signal` feature of eBPF to intervene by sending signals to specified processes. For more tutorial documentation, please refer to <https://github.com/eunomia-bpf/bpf-developer-tutorial>.
|
||||
|
||||
## Use Cases
|
||||
|
||||
**1. Performance Issues:**
|
||||
|
||||
Optimizing the performance of applications is a core task for developers and system administrators in the modern software ecosystem. When applications, such as hhvm, run slowly or have abnormally high resource utilization, they can adversely affect the entire system. Therefore, pinpointing these performance bottlenecks and resolving them promptly is crucial.
|
||||
|
||||
**2. Anomaly Detection and Response:**
|
||||
|
||||
Any system running in a production environment may face various anomalies, from simple resource leaks to complex malware attacks. In these situations, the system needs to detect these anomalies quickly and accurately and take appropriate countermeasures.
|
||||
|
||||
**3. Dynamic System Management:**
|
||||
|
||||
With the rise of cloud computing and microservice architectures, dynamically adjusting resource configurations and application behaviors based on the current system state has become a key requirement. For example, auto-scaling based on traffic fluctuations or reducing CPU frequency when detecting system overheating.
|
||||
|
||||
### Limitations of Existing Solutions
|
||||
|
||||
To meet the needs of the above use cases, traditional technical methods are as follows:
|
||||
|
||||
- Install a bpf program that continuously monitors the system while polling a map.
|
||||
- When an event triggers specific conditions defined in the bpf program, it writes related data to this map.
|
||||
- Then, external analysis tools read data from this map and send signals to the target process based on the retrieved information.
|
||||
|
||||
Although this method is feasible in many scenarios, it has a major flaw: the time delay from when the event occurs to when the external tool responds can be relatively large. This delay can affect the speed of event response, making performance analysis results inaccurate or failing to respond promptly to malicious activity.
|
||||
|
||||
### Advantages of the New Solution
|
||||
|
||||
To overcome the limitations of traditional methods, the Linux kernel offers the `bpf_send_signal` and `bpf_send_signal_thread` helper functions.
|
||||
|
||||
The main advantages of these functions include:
|
||||
|
||||
**1. Real-time Response:**
|
||||
|
||||
By sending signals directly from kernel space, avoiding extra overhead in user space, signals can be sent immediately after an event occurs, significantly reducing latency.
|
||||
|
||||
**2. Accuracy:**
|
||||
|
||||
Thanks to reduced latency, we can now obtain a more accurate snapshot of the system state, especially important for performance analysis and anomaly detection.
|
||||
|
||||
**3. Flexibility:**
|
||||
|
||||
These new helper functions provide developers with more flexibility. They can customize the signal sending logic according to different use cases and needs, allowing for more precise control and management of system behavior.
|
||||
|
||||
## Kernel Code Analysis
|
||||
|
||||
In modern operating systems, a common security strategy is to monitor and control interactions between processes. Especially in Linux systems, the `ptrace` system call is a powerful tool that allows one process to observe and control the execution of another process, modifying its registers and memory. This makes it the primary mechanism for debugging and tracing tools like `strace` and `gdb`. However, malicious use of `ptrace` can also pose security risks.
|
||||
|
||||
The goal of this program is to monitor `ptrace` calls in kernel mode. When specific conditions are met, it sends a `SIGKILL` signal to terminate the calling process. Additionally, for debugging or auditing purposes, the program logs this intervention and sends related information to user space.
|
||||
|
||||
## Code Analysis
|
||||
|
||||
### 1. Data Structure Definition (`signal.h`)
|
||||
|
||||
signal.h
|
||||
|
||||
```c
|
||||
// Simple message structure to get events from eBPF Programs
|
||||
// in the kernel to user space
|
||||
#define TASK_COMM_LEN 16
|
||||
struct event {
|
||||
int pid;
|
||||
char comm[TASK_COMM_LEN];
|
||||
bool success;
|
||||
};
|
||||
```
|
||||
|
||||
This section defines a simple message structure used to pass events from eBPF programs in the kernel to user space. The structure includes the process ID, command name, and a boolean value indicating whether the signal was successfully sent.
|
||||
|
||||
### 2. eBPF Program (`signal.bpf.c`)
|
||||
|
||||
signal.bpf.c
|
||||
|
||||
```c
|
||||
// SPDX-License-Identifier: BSD-3-Clause
|
||||
#include "vmlinux.h"
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include "common.h"
|
||||
|
||||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||||
|
||||
// Ringbuffer Map to pass messages from kernel to user
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_RINGBUF);
|
||||
__uint(max_entries, 256 * 1024);
|
||||
} rb SEC(".maps");
|
||||
|
||||
// Optional Target Parent PID
|
||||
const volatile int target_ppid = 0;
|
||||
|
||||
SEC("tp/syscalls/sys_enter_ptrace")
|
||||
int bpf_dos(struct trace_event_raw_sys_enter *ctx)
|
||||
{
|
||||
long ret = 0;
|
||||
size_t pid_tgid = bpf_get_current_pid_tgid();
|
||||
int pid = pid_tgid >> 32;
|
||||
|
||||
// if target_ppid is 0 then we target all pids
|
||||
if (target_ppid != 0) {
|
||||
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
|
||||
int ppid = BPF_CORE_READ(task, real_parent, tgid);
|
||||
if (ppid != target_ppid) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Send signal. 9 == SIGKILL
|
||||
ret = bpf_send_signal(9);
|
||||
|
||||
// Log event
|
||||
struct event *e;
|
||||
e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
|
||||
if (e) {
|
||||
e->success = (ret == 0);
|
||||
e->pid = pid;
|
||||
bpf_get_current_comm(&e->comm, sizeof(e->comm));
|
||||
bpf_ringbuf_submit(e, 0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
```
|
||||
|
||||
- **License Declaration**
|
||||
|
||||
The program's license is declared as "Dual BSD/GPL". This is to meet the Linux kernel's licensing requirements for eBPF programs.
|
||||
|
||||
- **Ringbuffer Map**
|
||||
|
||||
This is a ring buffer type map that allows messages generated by the eBPF program in kernel space to be efficiently read by user space programs.
|
||||
|
||||
- **Target Parent Process ID**
|
||||
|
||||
`target_ppid` is an optional parent process ID used to limit which processes are affected. If set to a non-zero value, only processes that match it will be targeted.
|
||||
|
||||
- **Main Function `bpf_dos`**
|
||||
|
||||
- **Process Check**
|
||||
The program first retrieves the current process's ID. If `target_ppid` is set, it also retrieves the current process's parent process ID and compares them. If they don't match, it returns immediately.
|
||||
|
||||
- **Sending Signal**
|
||||
It uses `bpf_send_signal(9)` to send a `SIGKILL` signal. This terminates the process calling `ptrace`.
|
||||
|
||||
- **Logging the Event**
|
||||
The event is logged using the ring buffer map. This includes whether the signal was successfully sent, the process ID, and the process's command name.
|
||||
|
||||
In summary, this eBPF program provides a method that allows system administrators or security teams to monitor and intervene in `ptrace` calls at the kernel level, offering an additional layer of defense against potential malicious activities or misoperations.
|
||||
|
||||
## Compilation and Execution
|
||||
|
||||
eunomia-bpf is an open-source eBPF dynamic loading runtime and development toolchain combined with Wasm. Its purpose is to simplify the development, building, distribution, and execution of eBPF programs. You can refer to <https://github.com/eunomia-bpf/eunomia-bpf> to download and install the `ecc` compiler toolchain and `ecli` runtime. We use eunomia-bpf to compile and run this example.
|
||||
|
||||
Compilation:
|
||||
|
||||
```bash
|
||||
make
|
||||
./ecc signal.bpf.c signal.h
|
||||
```
|
||||
|
||||
Usage:
|
||||
|
||||
```bash
|
||||
sudo ./bpfdos
|
||||
```console
|
||||
$ sudo ./ecli package.json
|
||||
TIME PID COMM SUCCESS
|
||||
```
|
||||
|
||||
This program sends a `SIG_KILL` signal to any program that tries to use the `ptrace` system call, such as `strace`.
|
||||
Once bpf-dos starts running, you can test it by running the following command:
|
||||
This program will send a `SIGKILL` signal to any program attempting to use the `ptrace` system call, such as `strace`. Once the eBPF program starts running, you can test it by running the following command:
|
||||
|
||||
```bash
|
||||
strace /bin/whoami
|
||||
$ strace /bin/whoami
|
||||
Killed
|
||||
```
|
||||
|
||||
The original console will output:
|
||||
|
||||
```txt
|
||||
INFO [bpf_loader_lib::skeleton] Running ebpf program...
|
||||
TIME PID COMM SUCCESS
|
||||
13:54:45 8857 strace true
|
||||
```
|
||||
|
||||
The complete source code can be found at: <https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/25-signal>
|
||||
|
||||
## Conclusion
|
||||
|
||||
Through this example, we delved into how to combine eBPF programs with user-space programs to monitor and intervene in system calls. eBPF provides a mechanism for executing programs in kernel space. This technology is not limited to monitoring but can also be used for performance optimization, security defense, system diagnostics, and various other scenarios. For developers, it offers a powerful and flexible tool for performance tuning and troubleshooting in Linux systems.
|
||||
|
||||
Lastly, if you are interested in eBPF technology and wish to further understand and practice, you can visit our tutorial code repository <https://github.com/eunomia-bpf/bpf-developer-tutorial> and tutorial website <https://eunomia.dev/zh/tutorials/>.
|
||||
|
||||
## References
|
||||
|
||||
- <https://github.com/pathtofile/bad-bpf>.
|
||||
- <https://github.com/pathtofile/bad-bpf>
|
||||
- <https://www.mail-archive.com/netdev@vger.kernel.org/msg296358.html>
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include "common.h"
|
||||
#include "replace.h"
|
||||
|
||||
char LICENSE[] SEC("license") = "Dual BSD/GPL";
|
||||
|
||||
@@ -268,7 +268,8 @@ int check_possible_addresses(struct trace_event_raw_sys_exit *ctx) {
|
||||
// break;
|
||||
// }
|
||||
// }
|
||||
// we can use bpf_strncmp here, but it's not available in the kernel version older
|
||||
// we can use bpf_strncmp here,
|
||||
// but it's not available in the kernel version older than 5.17
|
||||
if (bpf_strncmp(name, text_len_max, (const char *)text_find) == 0) {
|
||||
// ***********
|
||||
// We've found out text!
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
#include <argp.h>
|
||||
#include <unistd.h>
|
||||
#include "replace.skel.h"
|
||||
#include "common.h"
|
||||
#include "replace.h"
|
||||
|
||||
|
||||
#include <bpf/bpf.h>
|
||||
|
||||
@@ -26,14 +26,4 @@ struct event {
|
||||
bool success;
|
||||
};
|
||||
|
||||
struct tr_file {
|
||||
char filename[FILENAME_LEN_MAX];
|
||||
unsigned int filename_len;
|
||||
};
|
||||
|
||||
struct tr_text {
|
||||
char text[TEXT_LEN_MAX];
|
||||
unsigned int text_len;
|
||||
};
|
||||
|
||||
#endif // BAD_BPF_COMMON_H
|
||||
@@ -1,5 +1,6 @@
|
||||
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
|
||||
/* Copyright (c) 2021 Sartura */
|
||||
#define BPF_NO_GLOBAL_DATA
|
||||
#include "vmlinux.h"
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define BPF_NO_GLOBAL_DATA
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define BPF_NO_GLOBAL_DATA
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#define BPF_NO_GLOBAL_DATA
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
# eBPF 高级特性与进阶主题
|
||||
|
||||
- [在 Android 上使用 eBPF 程序](22-android/README.md)
|
||||
- [使用 eBPF 追踪 HTTP 请求或其他七层协议](23-http/README.md)
|
||||
- [使用 eBPF socket filter 或 syscall tracepoint 追踪 HTTP 请求等七层协议](23-http/README.md)
|
||||
- [使用 uprobe 捕获多种库的 SSL/TLS 明文数据](30-sslsniff/README.md)
|
||||
- [使用 sockops 加速网络请求转发](29-sockops/README.md)
|
||||
- [使用 eBPF 隐藏进程或文件信息](24-hide/README.md)
|
||||
|
||||
2
src/third_party/libbpf
vendored
2
src/third_party/libbpf
vendored
Submodule src/third_party/libbpf updated: 05f94ddbb8...56069cda78
Reference in New Issue
Block a user