diff --git a/src/28-detach/README.md b/src/28-detach/README.md
index c9a364f..2f888c4 100644
--- a/src/28-detach/README.md
+++ b/src/28-detach/README.md
@@ -1 +1,5 @@
-# detach
\ No newline at end of file
+# detach
+
+## reference
+
+- https://github.com/pathtofile/bad-bpf
diff --git a/src/29-sockops/.gitignore b/src/29-sockops/.gitignore
new file mode 100644
index 0000000..024ee36
--- /dev/null
+++ b/src/29-sockops/.gitignore
@@ -0,0 +1,8 @@
+.vscode
+package.json
+*.o
+*.skel.json
+*.skel.yaml
+package.yaml
+ecli
+ecc
diff --git a/src/29-sockops/README.md b/src/29-sockops/README.md
index e69de29..60f5057 100644
--- a/src/29-sockops/README.md
+++ b/src/29-sockops/README.md
@@ -0,0 +1,76 @@
+# eBPF sockops 示例
+
+## 利用 eBPF 的 sockops 进行性能优化
+
+网络连接本质上是 socket 之间的通讯,eBPF 提供了一个 [bpf_msg_redirect_hash](https://man7.org/linux/man-pages/man7/bpf-helpers.7.html) 函数,用来将应用发出的包直接转发到对端的 socket,可以极大地加速包在内核中的处理流程。
+
+这里 sock_map 是记录 socket 规则的关键部分,即根据当前的数据包信息,从 sock_map 中挑选一个存在的 socket 连接来转发请求。所以需要先在 sockops 的 hook 处或者其它地方,将 socket 信息保存到 sock_map,并提供一个规则 (一般为四元组) 根据 key 查找到 socket。
+
+Merbridge 项目就是这样实现了用 eBPF 代替 iptables 为 Istio 进行加速。在使用 Merbridge (eBPF) 优化之后,出入口流量会直接跳过很多内核模块,明显提高性能,如下图所示:
+
+![merbridge](merbridge.png)
+
+## 运行样例
+
+此示例程序从发送者的套接字(出口)重定向流量至接收者的套接字(入口),**跳过 TCP/IP 内核网络栈**。在这个示例中,我们假定发送者和接收者都在**同一台**机器上运行。
+
+### 编译 eBPF 程序
+
+```shell
+# Compile the bpf_sockops program
+clang -O2 -g -Wall -target bpf -c bpf_sockops.c -o bpf_sockops.o
+clang -O2 -g -Wall -target bpf -c bpf_redir.c -o bpf_redir.o
+```
+
+### 加载 eBPF 程序
+
+```shell
+sudo ./load.sh
+```
+
+您可以使用 [bpftool utility](https://github.com/torvalds/linux/blob/master/tools/bpf/bpftool/Documentation/bpftool-prog.rst) 检查这两个 eBPF 程序是否已经加载。
+
+```console
+$ sudo bpftool prog show
+63: sock_ops name bpf_sockmap tag 275467be1d69253d gpl
+        loaded_at 2019-01-24T13:07:17+0200 uid 0
+        xlated 1232B jited 750B memlock 4096B map_ids 58
+64: sk_msg name bpf_redir tag bc78074aa9dd96f4 gpl
+        loaded_at 2019-01-24T13:07:17+0200 uid 0
+        xlated 304B jited 233B memlock 4096B map_ids 58
+```
+
+### 运行 [iperf3](https://iperf.fr/) 服务器
+
+```shell
+iperf3 -s -p 10000
+```
+
+### 运行 [iperf3](https://iperf.fr/) 客户端
+
+```shell
+iperf3 -c 127.0.0.1 -t 10 -l 64k -p 10000
+```
+
+### 收集追踪
+
+```console
+$ ./trace.sh
+iperf3-9516 [001] .... 22500.634108: 0: <<< ipv4 op = 4, port 18583 --> 4135
+iperf3-9516 [001] ..s1 22500.634137: 0: <<< ipv4 op = 5, port 4135 --> 18583
+iperf3-9516 [001] .... 22500.634523: 0: <<< ipv4 op = 4, port 19095 --> 4135
+iperf3-9516 [001] ..s1 22500.634536: 0: <<< ipv4 op = 5, port 4135 --> 19095
+```
+
+你应该可以看到 4 个用于套接字建立的事件。如果你没有看到任何事件,那么 eBPF 程序可能没有正确地附加上。
+
+### 卸载 eBPF 程序
+
+```shell
+sudo ./unload.sh
+```
+
+## 参考资料和源代码来源
+
+-
+-
diff --git a/src/29-sockops/bpf_redir.c b/src/29-sockops/bpf_redir.c
new file mode 100644
index 0000000..654587b
--- /dev/null
+++ b/src/29-sockops/bpf_redir.c
@@ -0,0 +1,42 @@
+#include
+#include
+/* NOTE(review): the header names on the two #include lines above are missing in this patch text; restore them. */
+
+#include "bpf_sockops.h"
+
+/*
+ * sk_msg program for the sockops redirect example.
+ *
+ * Builds a sock_ops_map key from @msg (see sk_msg_extract4_key) and, for
+ * loopback connections that use the target port, asks the kernel to redirect
+ * the message via that map entry with BPF_F_INGRESS. The helper's return
+ * value is ignored and the verdict is always SK_PASS.
+ */
+__section("sk_msg")
+int bpf_redir(struct sk_msg_md *msg)
+{
+    __u64 flags = BPF_F_INGRESS;
+    struct sock_key key = {};
+    int len1;
+
+    sk_msg_extract4_key(msg, &key);
+
+    /* Only traffic with a loopback source or destination is considered. */
+    if (key.sip4 != SOCKOPS_LOOPBACK_IP4 && key.dip4 != SOCKOPS_LOOPBACK_IP4) {
+        return SK_PASS;
+    }
+
+    /* Only connections to or from port 10000 are redirected. */
+    if (key.sport != SOCKOPS_TARGET_PORT && key.dport != SOCKOPS_TARGET_PORT) {
+        return SK_PASS;
+    }
+
+    len1 = (__u64)msg->data_end - (__u64)msg->data;
+    printk("<<< redir_proxy port %d --> %d (%d)\n", key.sport, key.dport, len1);
+    msg_redirect_hash(msg, &sock_ops_map, &key, flags);
+
+    return SK_PASS;
+}
+
+BPF_LICENSE("GPL");
+int _version __section("version") = 1;
diff --git a/src/29-sockops/bpf_sockops.c b/src/29-sockops/bpf_sockops.c
new file mode 100644
index 0000000..0820c82
--- /dev/null
+++ b/src/29-sockops/bpf_sockops.c
@@ -0,0 +1,74 @@
+#include
+#include
+#include
+/* NOTE(review): the header names on the three #include lines above are missing in this patch text; restore them. */
+
+#include "bpf_sockops.h"
+
+/*
+ * Record an established IPv4 socket in sock_ops_map.
+ *
+ * Only sockets with a loopback endpoint that use port 10000 are tracked.
+ * BPF_NOEXIST asks the helper to add the entry only if the key is not present
+ * yet; a non-zero helper result is reported through printk().
+ */
+static inline void bpf_sock_ops_ipv4(struct bpf_sock_ops *skops)
+{
+    struct sock_key key = {};
+    int ret;
+
+    sk_extract4_key(skops, &key);
+
+    if (key.dip4 != SOCKOPS_LOOPBACK_IP4 && key.sip4 != SOCKOPS_LOOPBACK_IP4) {
+        return;
+    }
+    if (key.dport != SOCKOPS_TARGET_PORT && key.sport != SOCKOPS_TARGET_PORT) {
+        return;
+    }
+
+    ret = sock_hash_update(skops, &sock_ops_map, &key, BPF_NOEXIST);
+    printk("<<< ipv4 op = %d, port %d --> %d\n", skops->op, key.sport, key.dport);
+    if (ret != 0) {
+        printk("*** FAILED %d ***\n", ret);
+    }
+}
+
+/*
+ * Handle an AF_INET6 socket: it is processed as IPv4 when remote_ip4 is set
+ * and ignored otherwise.
+ */
+static inline void bpf_sock_ops_ipv6(struct bpf_sock_ops *skops)
+{
+    if (skops->remote_ip4) {
+        bpf_sock_ops_ipv4(skops);
+    }
+}
+
+/*
+ * sock_ops entry point: on active or passive connection establishment the
+ * socket is handed to the family-specific handler. Always returns 0.
+ */
+__section("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+    __u32 family = skops->family;
+    __u32 op = skops->op;
+
+    switch (op) {
+    case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+    case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+        if (family == AF_INET6) {
+            bpf_sock_ops_ipv6(skops);
+        } else if (family == AF_INET) {
+            bpf_sock_ops_ipv4(skops);
+        }
+        break;
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+BPF_LICENSE("GPL");
+int _version __section("version") = 1;
diff --git a/src/29-sockops/bpf_sockops.h b/src/29-sockops/bpf_sockops.h
new file mode 100644
index 0000000..c625da2
--- /dev/null
+++ b/src/29-sockops/bpf_sockops.h
@@ -0,0 +1,197 @@
+#include
+#include
+/* NOTE(review): the header names on the two #include lines above are missing in this patch text; restore them. */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define __bpf_ntohs(x) __builtin_bswap16(x)
+# define __bpf_htons(x) __builtin_bswap16(x)
+# define __bpf_constant_ntohs(x) ___constant_swab16(x)
+# define __bpf_constant_htons(x) ___constant_swab16(x)
+# define __bpf_ntohl(x) __builtin_bswap32(x)
+# define __bpf_htonl(x) __builtin_bswap32(x)
+# define __bpf_constant_ntohl(x) ___constant_swab32(x)
+# define __bpf_constant_htonl(x) ___constant_swab32(x)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# define __bpf_ntohs(x) (x)
+# define __bpf_htons(x) (x)
+# define __bpf_constant_ntohs(x) (x)
+# define __bpf_constant_htons(x) (x)
+# define __bpf_ntohl(x) (x)
+# define __bpf_htonl(x) (x)
+# define __bpf_constant_ntohl(x) (x)
+# define __bpf_constant_htonl(x) (x)
+#else
+# error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+
+#define bpf_htons(x) \
+    (__builtin_constant_p(x) ? \
+     __bpf_constant_htons(x) : __bpf_htons(x))
+#define bpf_ntohs(x) \
+    (__builtin_constant_p(x) ? \
+     __bpf_constant_ntohs(x) : __bpf_ntohs(x))
+#define bpf_htonl(x) \
+    (__builtin_constant_p(x) ? \
+     __bpf_constant_htonl(x) : __bpf_htonl(x))
+#define bpf_ntohl(x) \
+    (__builtin_constant_p(x) ? \
+     __bpf_constant_ntohl(x) : __bpf_ntohl(x))
+
+/*
+ * Traffic selectors in network byte order, the form stored in struct sock_key.
+ * On little-endian targets they evaluate to 16777343 (127.0.0.1) and
+ * 4135 (port 10000), the literals the programs compared against before.
+ */
+#define SOCKOPS_LOOPBACK_IP4 bpf_htonl(0x7f000001)
+#define SOCKOPS_TARGET_PORT bpf_htons(10000)
+
+/** Section helper macros. */
+
+#ifndef __section
+# define __section(NAME) \
+    __attribute__((section(NAME), used))
+#endif
+
+#ifndef __section_tail
+# define __section_tail(ID, KEY) \
+    __section(__stringify(ID) "/" __stringify(KEY))
+#endif
+
+#ifndef __section_cls_entry
+# define __section_cls_entry \
+    __section("classifier")
+#endif
+
+#ifndef __section_act_entry
+# define __section_act_entry \
+    __section("action")
+#endif
+
+#ifndef __section_license
+# define __section_license \
+    __section("license")
+#endif
+
+#ifndef __section_maps
+# define __section_maps \
+    __section("maps")
+#endif
+
+/** Declaration helper macros. */
+
+#ifndef BPF_LICENSE
+# define BPF_LICENSE(NAME) \
+    char ____license[] __section_license = NAME
+#endif
+
+#ifndef BPF_FUNC
+# define BPF_FUNC(NAME, ...) \
+    (*NAME)(__VA_ARGS__) = (void *)BPF_FUNC_##NAME
+#endif
+
+static int BPF_FUNC(sock_hash_update, struct bpf_sock_ops *skops, void *map, void *key, uint64_t flags);
+static int BPF_FUNC(msg_redirect_hash, struct sk_msg_md *md, void *map, void *key, uint64_t flags);
+static void BPF_FUNC(trace_printk, const char *fmt, int fmt_size, ...);
+
+/* Copy fmt into a local array and pass it to trace_printk(); fmt must be a string literal. */
+#ifndef printk
+# define printk(fmt, ...) \
+    ({ \
+        char ____fmt[] = fmt; \
+        trace_printk(____fmt, sizeof(____fmt), ##__VA_ARGS__); \
+    })
+#endif
+
+/* Map definition layout used for objects placed in the "maps" section. */
+struct bpf_map_def {
+    __u32 type;
+    __u32 key_size;
+    __u32 value_size;
+    __u32 max_entries;
+    __u32 map_flags;
+};
+
+/* IPv6 address, viewable as four 32-bit words or sixteen bytes. */
+union v6addr {
+    struct {
+        __u32 p1;
+        __u32 p2;
+        __u32 p3;
+        __u32 p4;
+    };
+    __u8 addr[16];
+};
+
+/*
+ * Key of sock_ops_map: source/destination address, family tag and ports.
+ * The IPv4 fields overlay the first word of the IPv6-sized address slots.
+ */
+struct sock_key {
+    union {
+        struct {
+            __u32 sip4;
+            __u32 pad1;
+            __u32 pad2;
+            __u32 pad3;
+        };
+        union v6addr sip6;
+    };
+    union {
+        struct {
+            __u32 dip4;
+            __u32 pad4;
+            __u32 pad5;
+            __u32 pad6;
+        };
+        union v6addr dip6;
+    };
+    __u8 family;
+    __u8 pad7;
+    __u16 pad8;
+    __u32 sport;
+    __u32 dport;
+} __attribute__((packed));
+
+/* Socket hash keyed by struct sock_key; referenced by both example programs. */
+struct bpf_map_def __section_maps sock_ops_map = {
+    .type = BPF_MAP_TYPE_SOCKHASH,
+    .key_size = sizeof(struct sock_key),
+    .value_size = sizeof(int),
+    .max_entries = 65535,
+    .map_flags = 0,
+};
+
+/*
+ * Fill @key from a sock_ops context, with the local endpoint as source and
+ * the remote endpoint as destination.
+ *
+ * NOTE(review): key->family is set to 1 but never read in this example, and
+ * the ">> 16" port handling looks little-endian specific - confirm before
+ * building for a big-endian target.
+ */
+static inline void sk_extract4_key(struct bpf_sock_ops *ops,
+                                   struct sock_key *key)
+{
+    key->dip4 = ops->remote_ip4;
+    key->sip4 = ops->local_ip4;
+    key->family = 1;
+
+    key->sport = (bpf_htonl(ops->local_port) >> 16);
+    key->dport = ops->remote_port >> 16;
+}
+
+/*
+ * Fill @key from an sk_msg context with the endpoints swapped (remote as
+ * source, local as destination), i.e. the mirror image of what
+ * sk_extract4_key() produces for the same socket.
+ */
+static inline void sk_msg_extract4_key(struct sk_msg_md *msg,
+                                       struct sock_key *key)
+{
+    key->sip4 = msg->remote_ip4;
+    key->dip4 = msg->local_ip4;
+    key->family = 1;
+
+    key->dport = (bpf_htonl(msg->local_port) >> 16);
+    key->sport = msg->remote_port >> 16;
+}
diff --git a/src/29-sockops/envoy/Dockerfile b/src/29-sockops/envoy/Dockerfile
new file mode 100644
index 0000000..1f1da7f
--- /dev/null
+++ b/src/29-sockops/envoy/Dockerfile
@@ -0,0 +1,3 @@
+FROM envoyproxy/envoy:latest
+COPY envoy.yaml /etc/envoy/envoy.yaml
+EXPOSE 9901
diff --git a/src/29-sockops/envoy/envoy.yaml b/src/29-sockops/envoy/envoy.yaml
new file mode 100644
index 0000000..6225a4f
--- /dev/null
+++ b/src/29-sockops/envoy/envoy.yaml
@@ -0,0 +1,30 @@
+admin:
+  access_log_path: /tmp/admin_access.log
+  address:
+    socket_address:
+      protocol: TCP
+      address: 0.0.0.0
+      port_value: 9901
+static_resources:
+  listeners:
+  - name: iperf3-listener
+    address:
+      socket_address:
+        protocol: TCP
+        address: 0.0.0.0
+        port_value: 10000
+    filter_chains:
+    - filters:
+      - name: envoy.tcp_proxy
+        config:
+          stat_prefix: iperf3-listener
+          cluster: iperf3_server
+  clusters:
+  - name: iperf3_server
+    connect_timeout: 1.0s
+    type: static
+    lb_policy: ROUND_ROBIN
+    hosts:
+    - socket_address:
+        address: 127.0.0.1
+        port_value: 5201
diff --git a/src/29-sockops/load.sh b/src/29-sockops/load.sh
new file mode 100755
index 0000000..df2073b
--- /dev/null
+++ b/src/29-sockops/load.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Load and attach the sockops example programs.
+# Usage: ./load.sh [SKIP_REDIR]
+#   With any non-empty argument only bpf_sockops is loaded; bpf_redir is skipped.
+set -x
+set -e
+
+# Mount bpf filesystem
+sudo mount -t bpf bpf /sys/fs/bpf/
+
+# Load the bpf_sockops program
+sudo bpftool prog load bpf_sockops.o "/sys/fs/bpf/bpf_sockop"
+sudo bpftool cgroup attach "/sys/fs/cgroup/unified/" sock_ops pinned "/sys/fs/bpf/bpf_sockop"
+
+# Pin the map of the loaded program; stop if its id cannot be determined
+MAP_ID=$(sudo bpftool prog show pinned "/sys/fs/bpf/bpf_sockop" | grep -o -E 'map_ids [0-9]+' | awk '{print $2}')
+if [ -z "$MAP_ID" ]
+then
+    echo "load.sh: could not determine the map id of bpf_sockop" >&2
+    exit 1
+fi
+sudo bpftool map pin id "$MAP_ID" "/sys/fs/bpf/sock_ops_map"
+
+# Load the bpf_redir program
+if [ -z "${1:-}" ]
+then
+    sudo bpftool prog load bpf_redir.o "/sys/fs/bpf/bpf_redir" map name sock_ops_map pinned "/sys/fs/bpf/sock_ops_map"
+    sudo bpftool prog attach pinned "/sys/fs/bpf/bpf_redir" msg_verdict pinned "/sys/fs/bpf/sock_ops_map"
+fi
diff --git a/src/29-sockops/merbridge.png b/src/29-sockops/merbridge.png
new file mode 100644
index 0000000..f122315
Binary files /dev/null and b/src/29-sockops/merbridge.png differ
diff --git a/src/29-sockops/trace.sh b/src/29-sockops/trace.sh
new file mode 100755
index 0000000..4589b36
--- /dev/null
+++ b/src/29-sockops/trace.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+sudo cat /sys/kernel/debug/tracing/trace_pipe
diff --git a/src/29-sockops/unload.sh b/src/29-sockops/unload.sh
new file mode 100755
index 0000000..32d8659
--- /dev/null
+++ b/src/29-sockops/unload.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -x
+
+# UnLoad the bpf_redir program
+sudo bpftool prog detach pinned "/sys/fs/bpf/bpf_redir" msg_verdict pinned "/sys/fs/bpf/sock_ops_map"
+sudo rm "/sys/fs/bpf/bpf_redir"
+
+# UnLoad the bpf_sockops program
+sudo bpftool cgroup detach "/sys/fs/cgroup/unified/" sock_ops pinned "/sys/fs/bpf/bpf_sockop"
+sudo rm "/sys/fs/bpf/bpf_sockop"
+
+# Delete the map
+sudo rm "/sys/fs/bpf/sock_ops_map"