From ba1a6a472e6f50063f7c6ac2534051a383288f26 Mon Sep 17 00:00:00 2001 From: yunwei37 Date: Sat, 4 Oct 2025 21:39:24 -0700 Subject: [PATCH] bpf: Implement arena list management with allocation and deletion This commit introduces a new BPF program that manages an arena list, allowing for the addition and deletion of elements. The following changes were made: - Added `arena_list.bpf.c` to implement BPF functions for adding and deleting elements in an arena list. - Created `arena_list.c` for user-space testing of the BPF program, including functions to sum elements and validate the arena list operations. - Introduced `bpf_arena_alloc.h` and `bpf_arena_common.h` for memory allocation and common definitions related to arena management. - Defined `bpf_arena_list.h` to establish the structure of the arena list nodes and heads. - Added `bpf_experimental.h` to include experimental BPF features and helper functions for object management. These changes enhance the BPF capabilities for managing memory in a structured way, facilitating efficient allocation and deallocation of resources. --- src/features/bpf_arena/.gitignore | 4 + src/features/bpf_arena/Makefile | 112 ++++ src/features/bpf_arena/arena_list.bpf.c | 88 ++++ src/features/bpf_arena/arena_list.c | 92 ++++ src/features/bpf_arena/bpf_arena_alloc.h | 67 +++ src/features/bpf_arena/bpf_arena_common.h | 72 +++ src/features/bpf_arena/bpf_arena_list.h | 92 ++++ src/features/bpf_arena/bpf_experimental.h | 591 ++++++++++++++++++++++ 8 files changed, 1118 insertions(+) create mode 100644 src/features/bpf_arena/.gitignore create mode 100644 src/features/bpf_arena/Makefile create mode 100644 src/features/bpf_arena/arena_list.bpf.c create mode 100644 src/features/bpf_arena/arena_list.c create mode 100644 src/features/bpf_arena/bpf_arena_alloc.h create mode 100644 src/features/bpf_arena/bpf_arena_common.h create mode 100644 src/features/bpf_arena/bpf_arena_list.h create mode 100644 src/features/bpf_arena/bpf_experimental.h diff --git a/src/features/bpf_arena/.gitignore b/src/features/bpf_arena/.gitignore new file mode 100644 index 0000000..d83e46a --- /dev/null +++ b/src/features/bpf_arena/.gitignore @@ -0,0 +1,4 @@ +/.output +/bootstrap +xdp-pktgen +arena_list diff --git a/src/features/bpf_arena/Makefile b/src/features/bpf_arena/Makefile new file mode 100644 index 0000000..bedc161 --- /dev/null +++ b/src/features/bpf_arena/Makefile @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../../third_party/libbpf/src) +BPFTOOL_SRC := $(abspath ../../third_party/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../../third_party/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) -I. +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = arena_list + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. +CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) -D__BPF_FEATURE_ADDR_SPACE_CAST \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +# Build application binary +$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/src/features/bpf_arena/arena_list.bpf.c b/src/features/bpf_arena/arena_list.bpf.c new file mode 100644 index 0000000..3a2ddca --- /dev/null +++ b/src/features/bpf_arena/arena_list.bpf.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include +#include "bpf_experimental.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +#include "bpf_arena_alloc.h" +#include "bpf_arena_list.h" + +struct elem { + struct arena_list_node node; + __u64 value; +}; + +struct arena_list_head __arena *list_head; +int list_sum; +int cnt; +bool skip = false; + +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST +long __arena arena_sum; +int __arena test_val = 1; +struct arena_list_head __arena global_head; +#else +long arena_sum SEC(".addr_space.1"); +int test_val SEC(".addr_space.1"); +#endif + +int zero; + +SEC("syscall") +int arena_list_add(void *ctx) +{ +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST + __u64 i; + + list_head = &global_head; + + for (i = zero; i < cnt && can_loop; i++) { + struct elem __arena *n = bpf_alloc(sizeof(*n)); + + test_val++; + n->value = i; + arena_sum += i; + list_add_head(&n->node, list_head); + } +#else + skip = true; +#endif + return 0; +} + +SEC("syscall") +int arena_list_del(void *ctx) +{ +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST + struct elem __arena *n; + int sum = 0; + + arena_sum = 0; + list_for_each_entry(n, list_head, node) { + sum += n->value; + arena_sum += n->value; + list_del(&n->node); + bpf_free(n); + } + list_sum = sum; +#else + skip = true; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/src/features/bpf_arena/arena_list.c b/src/features/bpf_arena/arena_list.c new file mode 100644 index 0000000..371e4e1 --- /dev/null +++ b/src/features/bpf_arena/arena_list.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include + +#include "bpf_arena_list.h" +#include "arena_list.skel.h" + +struct elem { + struct arena_list_node node; + uint64_t value; +}; + +static int list_sum(struct arena_list_head *head) +{ + struct elem __arena *n; + int sum = 0; + + list_for_each_entry(n, head, node) + sum += n->value; + return sum; +} + +static void test_arena_list_add_del(int cnt) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct arena_list_bpf *skel; + int expected_sum = (u_int64_t)cnt * (cnt - 1) / 2; + int ret, sum; + + skel = arena_list_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return; + } + + skel->bss->cnt = cnt; + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); + if (ret != 0) { + fprintf(stderr, "Failed to run arena_list_add: %d\n", ret); + goto out; + } + if (opts.retval != 0) { + fprintf(stderr, "arena_list_add returned %d\n", opts.retval); + goto out; + } + if (skel->bss->skip) { + printf("SKIP: compiler doesn't support arena_cast\n"); + goto out; + } + sum = list_sum(skel->bss->list_head); + printf("Sum of elements: %d (expected: %d)\n", sum, expected_sum); + printf("Arena sum: %ld (expected: %d)\n", skel->bss->arena_sum, expected_sum); + printf("Number of elements: %d (expected: %d)\n", skel->data->test_val, cnt + 1); + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_del), &opts); + if (ret != 0) { + fprintf(stderr, "Failed to run arena_list_del: %d\n", ret); + goto out; + } + sum = list_sum(skel->bss->list_head); + printf("Sum after deletion: %d (expected: 0)\n", sum); + printf("Sum computed by BPF: %d (expected: %d)\n", skel->bss->list_sum, expected_sum); + printf("Arena sum after deletion: %ld (expected: %d)\n", skel->bss->arena_sum, expected_sum); + + printf("\nTest passed!\n"); +out: + arena_list_bpf__destroy(skel); +} + +int main(int argc, char **argv) +{ + int cnt = 10; + + if (argc > 1) { + cnt = atoi(argv[1]); + if (cnt <= 0) { + fprintf(stderr, "Invalid count: %s\n", argv[1]); + return 1; + } + } + + printf("Testing arena list with %d elements\n", cnt); + test_arena_list_add_del(cnt); + + return 0; +} diff --git a/src/features/bpf_arena/bpf_arena_alloc.h b/src/features/bpf_arena/bpf_arena_alloc.h new file mode 100644 index 0000000..c276782 --- /dev/null +++ b/src/features/bpf_arena/bpf_arena_alloc.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +#ifndef __round_mask +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#endif +#ifndef round_up +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#endif + +#ifdef __BPF__ +#define NR_CPUS (sizeof(struct cpumask) * 8) + +static void __arena * __arena page_frag_cur_page[NR_CPUS]; +static int __arena page_frag_cur_offset[NR_CPUS]; + +/* Simple page_frag allocator */ +static inline void __arena* bpf_alloc(unsigned int size) +{ + __u64 __arena *obj_cnt; + __u32 cpu = bpf_get_smp_processor_id(); + void __arena *page = page_frag_cur_page[cpu]; + int __arena *cur_offset = &page_frag_cur_offset[cpu]; + int offset; + + size = round_up(size, 8); + if (size >= PAGE_SIZE - 8) + return NULL; + if (!page) { +refill: + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return NULL; + cast_kern(page); + page_frag_cur_page[cpu] = page; + *cur_offset = PAGE_SIZE - 8; + obj_cnt = page + PAGE_SIZE - 8; + *obj_cnt = 0; + } else { + cast_kern(page); + obj_cnt = page + PAGE_SIZE - 8; + } + + offset = *cur_offset - size; + if (offset < 0) + goto refill; + + (*obj_cnt)++; + *cur_offset = offset; + return page + offset; +} + +static inline void bpf_free(void __arena *addr) +{ + __u64 __arena *obj_cnt; + + addr = (void __arena *)(((long)addr) & ~(PAGE_SIZE - 1)); + obj_cnt = addr + PAGE_SIZE - 8; + if (--(*obj_cnt) == 0) + bpf_arena_free_pages(&arena, addr, 1); +} +#else +static inline void __arena* bpf_alloc(unsigned int size) { return NULL; } +static inline void bpf_free(void __arena *addr) {} +#endif diff --git a/src/features/bpf_arena/bpf_arena_common.h b/src/features/bpf_arena/bpf_arena_common.h new file mode 100644 index 0000000..d8cd82a --- /dev/null +++ b/src/features/bpf_arena/bpf_arena_common.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef WRITE_ONCE +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val)) +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +#ifdef __BPF__ /* when compiled as bpf program */ + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +#else /* when compiled as user space code */ + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +extern char arena[1] __attribute__((weak)); + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} + +#endif diff --git a/src/features/bpf_arena/bpf_arena_list.h b/src/features/bpf_arena/bpf_arena_list.h new file mode 100644 index 0000000..85dbc3e --- /dev/null +++ b/src/features/bpf_arena/bpf_arena_list.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once +#include "bpf_arena_common.h" + +struct arena_list_node; + +typedef struct arena_list_node __arena arena_list_node_t; + +struct arena_list_node { + arena_list_node_t *next; + arena_list_node_t * __arena *pprev; +}; + +struct arena_list_head { + struct arena_list_node __arena *first; +}; +typedef struct arena_list_head __arena arena_list_head_t; + +#define list_entry(ptr, type, member) arena_container_of(ptr, type, member) + +#define list_entry_safe(ptr, type, member) \ + ({ typeof(*ptr) * ___ptr = (ptr); \ + ___ptr ? ({ cast_kern(___ptr); list_entry(___ptr, type, member); }) : NULL; \ + }) + +#ifndef __BPF__ +static inline void *bpf_iter_num_new(struct bpf_iter_num *it, int i, int j) { return NULL; } +static inline void bpf_iter_num_destroy(struct bpf_iter_num *it) {} +static inline bool bpf_iter_num_next(struct bpf_iter_num *it) { return true; } +#define cond_break ({}) +#define can_loop true +#endif + +/* Safely walk link list elements. Deletion of elements is allowed. */ +#define list_for_each_entry(pos, head, member) \ + for (void * ___tmp = (pos = list_entry_safe((head)->first, \ + typeof(*(pos)), member), \ + (void *)0); \ + pos && ({ ___tmp = (void *)pos->member.next; 1; }) && can_loop; \ + pos = list_entry_safe((void __arena *)___tmp, typeof(*(pos)), member)) + +static inline void list_add_head(arena_list_node_t *n, arena_list_head_t *h) +{ + arena_list_node_t *first = h->first, * __arena *tmp; + + cast_user(first); + cast_kern(n); + WRITE_ONCE(n->next, first); + cast_kern(first); + if (first) { + tmp = &n->next; + cast_user(tmp); + WRITE_ONCE(first->pprev, tmp); + } + cast_user(n); + WRITE_ONCE(h->first, n); + + tmp = &h->first; + cast_user(tmp); + cast_kern(n); + WRITE_ONCE(n->pprev, tmp); +} + +static inline void __list_del(arena_list_node_t *n) +{ + arena_list_node_t *next = n->next, *tmp; + arena_list_node_t * __arena *pprev = n->pprev; + + cast_user(next); + cast_kern(pprev); + tmp = *pprev; + cast_kern(tmp); + WRITE_ONCE(tmp, next); + if (next) { + cast_user(pprev); + cast_kern(next); + WRITE_ONCE(next->pprev, pprev); + } +} + +#define POISON_POINTER_DELTA 0 + +#define LIST_POISON1 ((void __arena *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void __arena *) 0x122 + POISON_POINTER_DELTA) + +static inline void list_del(arena_list_node_t *n) +{ + __list_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} diff --git a/src/features/bpf_arena/bpf_experimental.h b/src/features/bpf_arena/bpf_experimental.h new file mode 100644 index 0000000..cd8ecd3 --- /dev/null +++ b/src/features/bpf_arena/bpf_experimental.h @@ -0,0 +1,591 @@ +#ifndef __BPF_EXPERIMENTAL__ +#define __BPF_EXPERIMENTAL__ + +#include +#include +#include +#include + +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) + +/* Description + * Allocates an object of the type represented by 'local_type_id' in + * program BTF. User may use the bpf_core_type_id_local macro to pass the + * type ID of a struct in program BTF. + * + * The 'local_type_id' parameter must be a known constant. + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. + * Returns + * A pointer to an object of the type corresponding to the passed in + * 'local_type_id', or NULL on failure. + */ +extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_obj_new_impl */ +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) + +/* Description + * Free an allocated object. All fields of the object that require + * destruction will be destructed before the storage is freed. + * + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. + * Returns + * Void. + */ +extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_obj_drop_impl */ +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +/* Description + * Increment the refcount on a refcounted local kptr, turning the + * non-owning reference input into an owning reference in the process. + * + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. + * Returns + * An owning reference to the object pointed to by 'kptr' + */ +extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_refcount_acquire_impl */ +#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) + +/* Description + * Add a new entry to the beginning of the BPF linked list. + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them + * Returns + * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a list + */ +extern int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_list_push_front_impl */ +#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0) + +/* Description + * Add a new entry to the end of the BPF linked list. + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them + * Returns + * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a list + */ +extern int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_list_push_back_impl */ +#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0) + +/* Description + * Remove the entry at the beginning of the BPF linked list. + * Returns + * Pointer to bpf_list_node of deleted entry, or NULL if list is empty. + */ +extern struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; + +/* Description + * Remove the entry at the end of the BPF linked list. + * Returns + * Pointer to bpf_list_node of deleted entry, or NULL if list is empty. + */ +extern struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; + +/* Description + * Remove 'node' from rbtree with root 'root' + * Returns + * Pointer to the removed node, or NULL if 'root' didn't contain 'node' + */ +extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; + +/* Description + * Add 'node' to rbtree with root 'root' using comparator 'less' + * + * The 'meta' and 'off' parameters are rewritten by the verifier, no need + * for BPF programs to set them + * Returns + * 0 if the node was successfully added + * -EINVAL if the node wasn't added because it's already in a tree + */ +extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta, __u64 off) __ksym; + +/* Convenience macro to wrap over bpf_rbtree_add_impl */ +#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) + +/* Description + * Return the first (leftmost) node in input tree + * Returns + * Pointer to the node, which is _not_ removed from the tree. If the tree + * contains no nodes, returns NULL. + */ +extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +/* Description + * Allocates a percpu object of the type represented by 'local_type_id' in + * program BTF. User may use the bpf_core_type_id_local macro to pass the + * type ID of a struct in program BTF. + * + * The 'local_type_id' parameter must be a known constant. + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. + * Returns + * A pointer to a percpu object of the type corresponding to the passed in + * 'local_type_id', or NULL on failure. + */ +extern void *bpf_percpu_obj_new_impl(__u64 local_type_id, void *meta) __ksym; + +/* Convenience macro to wrap over bpf_percpu_obj_new_impl */ +#define bpf_percpu_obj_new(type) ((type __percpu_kptr *)bpf_percpu_obj_new_impl(bpf_core_type_id_local(type), NULL)) + +/* Description + * Free an allocated percpu object. All fields of the object that require + * destruction will be destructed before the storage is freed. + * + * The 'meta' parameter is rewritten by the verifier, no need for BPF + * program to set it. + * Returns + * Void. + */ +extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym; + +struct bpf_iter_task_vma; + +extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, + struct task_struct *task, + __u64 addr) __ksym; +extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __ksym; +extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym; + +/* Convenience macro to wrap over bpf_obj_drop_impl */ +#define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL) + +/* Description + * Throw a BPF exception from the program, immediately terminating its + * execution and unwinding the stack. The supplied 'cookie' parameter + * will be the return value of the program when an exception is thrown, + * and the default exception callback is used. Otherwise, if an exception + * callback is set using the '__exception_cb(callback)' declaration tag + * on the main program, the 'cookie' parameter will be the callback's only + * input argument. + * + * Thus, in case of default exception callback, 'cookie' is subjected to + * constraints on the program's return value (as with R0 on exit). + * Otherwise, the return value of the marked exception callback will be + * subjected to the same checks. + * + * Note that throwing an exception with lingering resources (locks, + * references, etc.) will lead to a verification error. + * + * Note that callbacks *cannot* call this helper. + * Returns + * Never. + * Throws + * An exception with the specified 'cookie' value. + */ +extern void bpf_throw(u64 cookie) __ksym; + +/* Description + * Acquire a reference on the exe_file member field belonging to the + * mm_struct that is nested within the supplied task_struct. The supplied + * task_struct must be trusted/referenced. + * Returns + * A referenced file pointer pointing to the exe_file member field of the + * mm_struct nested in the supplied task_struct, or NULL. + */ +extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym; + +/* Description + * Release a reference on the supplied file. The supplied file must be + * acquired. + */ +extern void bpf_put_file(struct file *file) __ksym; + +/* Description + * Resolve a pathname for the supplied path and store it in the supplied + * buffer. The supplied path must be trusted/referenced. + * Returns + * A positive integer corresponding to the length of the resolved pathname, + * including the NULL termination character, stored in the supplied + * buffer. On error, a negative integer is returned. + */ +extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym; + +/* This macro must be used to mark the exception callback corresponding to the + * main program. For example: + * + * int exception_cb(u64 cookie) { + * return cookie; + * } + * + * SEC("tc") + * __exception_cb(exception_cb) + * int main_prog(struct __sk_buff *ctx) { + * ... + * return TC_ACT_OK; + * } + * + * Here, exception callback for the main program will be 'exception_cb'. Note + * that this attribute can only be used once, and multiple exception callbacks + * specified for the main program will lead to verification error. + */ +#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name))) + +#define __bpf_assert_signed(x) _Generic((x), \ + unsigned long: 0, \ + unsigned long long: 0, \ + signed long: 1, \ + signed long long: 1 \ +) + +#define __bpf_assert_check(LHS, op, RHS) \ + _Static_assert(sizeof(&(LHS)), "1st argument must be an lvalue expression"); \ + _Static_assert(sizeof(LHS) == 8, "Only 8-byte integers are supported\n"); \ + _Static_assert(__builtin_constant_p(__bpf_assert_signed(LHS)), "internal static assert"); \ + _Static_assert(__builtin_constant_p((RHS)), "2nd argument must be a constant expression") + +#define __bpf_assert(LHS, op, cons, RHS, VAL) \ + ({ \ + (void)bpf_throw; \ + asm volatile ("if %[lhs] " op " %[rhs] goto +2; r1 = %[value]; call bpf_throw" \ + : : [lhs] "r"(LHS), [rhs] cons(RHS), [value] "ri"(VAL) : ); \ + }) + +#define __bpf_assert_op_sign(LHS, op, cons, RHS, VAL, supp_sign) \ + ({ \ + __bpf_assert_check(LHS, op, RHS); \ + if (__bpf_assert_signed(LHS) && !(supp_sign)) \ + __bpf_assert(LHS, "s" #op, cons, RHS, VAL); \ + else \ + __bpf_assert(LHS, #op, cons, RHS, VAL); \ + }) + +#define __bpf_assert_op(LHS, op, RHS, VAL, supp_sign) \ + ({ \ + if (sizeof(typeof(RHS)) == 8) { \ + const typeof(RHS) rhs_var = (RHS); \ + __bpf_assert_op_sign(LHS, op, "r", rhs_var, VAL, supp_sign); \ + } else { \ + __bpf_assert_op_sign(LHS, op, "i", RHS, VAL, supp_sign); \ + } \ + }) + +#define __cmp_cannot_be_signed(x) \ + __builtin_strcmp(#x, "==") == 0 || __builtin_strcmp(#x, "!=") == 0 || \ + __builtin_strcmp(#x, "&") == 0 + +#define __is_signed_type(type) (((type)(-1)) < (type)1) + +#define __bpf_cmp(LHS, OP, PRED, RHS, DEFAULT) \ + ({ \ + __label__ l_true; \ + bool ret = DEFAULT; \ + asm volatile goto("if %[lhs] " OP " %[rhs] goto %l[l_true]" \ + :: [lhs] "r"((short)LHS), [rhs] PRED (RHS) :: l_true); \ + ret = !DEFAULT; \ +l_true: \ + ret; \ + }) + +/* C type conversions coupled with comparison operator are tricky. + * Make sure BPF program is compiled with -Wsign-compare then + * __lhs OP __rhs below will catch the mistake. + * Be aware that we check only __lhs to figure out the sign of compare. + */ +#define _bpf_cmp(LHS, OP, RHS, UNLIKELY) \ + ({ \ + typeof(LHS) __lhs = (LHS); \ + typeof(RHS) __rhs = (RHS); \ + bool ret; \ + _Static_assert(sizeof(&(LHS)), "1st argument must be an lvalue expression"); \ + (void)(__lhs OP __rhs); \ + if (__cmp_cannot_be_signed(OP) || !__is_signed_type(typeof(__lhs))) { \ + if (sizeof(__rhs) == 8) \ + /* "i" will truncate 64-bit constant into s32, \ + * so we have to use extra register via "r". \ + */ \ + ret = __bpf_cmp(__lhs, #OP, "r", __rhs, UNLIKELY); \ + else \ + ret = __bpf_cmp(__lhs, #OP, "ri", __rhs, UNLIKELY); \ + } else { \ + if (sizeof(__rhs) == 8) \ + ret = __bpf_cmp(__lhs, "s"#OP, "r", __rhs, UNLIKELY); \ + else \ + ret = __bpf_cmp(__lhs, "s"#OP, "ri", __rhs, UNLIKELY); \ + } \ + ret; \ + }) + +#ifndef bpf_cmp_unlikely +#define bpf_cmp_unlikely(LHS, OP, RHS) _bpf_cmp(LHS, OP, RHS, true) +#endif + +#ifndef bpf_cmp_likely +#define bpf_cmp_likely(LHS, OP, RHS) \ + ({ \ + bool ret = 0; \ + if (__builtin_strcmp(#OP, "==") == 0) \ + ret = _bpf_cmp(LHS, !=, RHS, false); \ + else if (__builtin_strcmp(#OP, "!=") == 0) \ + ret = _bpf_cmp(LHS, ==, RHS, false); \ + else if (__builtin_strcmp(#OP, "<=") == 0) \ + ret = _bpf_cmp(LHS, >, RHS, false); \ + else if (__builtin_strcmp(#OP, "<") == 0) \ + ret = _bpf_cmp(LHS, >=, RHS, false); \ + else if (__builtin_strcmp(#OP, ">") == 0) \ + ret = _bpf_cmp(LHS, <=, RHS, false); \ + else if (__builtin_strcmp(#OP, ">=") == 0) \ + ret = _bpf_cmp(LHS, <, RHS, false); \ + else \ + asm volatile("r0 " #OP " invalid compare"); \ + ret; \ + }) +#endif + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. + */ +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define cond_break \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: break; \ + l_continue:; \ + }) +#endif +#endif + +#ifndef bpf_nop_mov +#define bpf_nop_mov(var) \ + asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) +#endif + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; + +typedef struct { +} __bpf_preempt_t; + +static inline __bpf_preempt_t __bpf_preempt_constructor(void) +{ + __bpf_preempt_t ret = {}; + + bpf_preempt_disable(); + return ret; +} +static inline void __bpf_preempt_destructor(__bpf_preempt_t *t) +{ + bpf_preempt_enable(); +} +#define bpf_guard_preempt() \ + __bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \ + __attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \ + __bpf_preempt_constructor() + +/* Description + * Assert that a conditional expression is true. + * Returns + * Void. + * Throws + * An exception with the value zero when the assertion fails. + */ +#define bpf_assert(cond) if (!(cond)) bpf_throw(0); + +/* Description + * Assert that a conditional expression is true. + * Returns + * Void. + * Throws + * An exception with the specified value when the assertion fails. + */ +#define bpf_assert_with(cond, value) if (!(cond)) bpf_throw(value); + +/* Description + * Assert that LHS is in the range [BEG, END] (inclusive of both). This + * statement updates the known bounds of LHS during verification. Note + * that both BEG and END must be constant values, and must fit within the + * data type of LHS. + * Returns + * Void. + * Throws + * An exception with the value zero when the assertion fails. + */ +#define bpf_assert_range(LHS, BEG, END) \ + ({ \ + _Static_assert(BEG <= END, "BEG must be <= END"); \ + barrier_var(LHS); \ + __bpf_assert_op(LHS, >=, BEG, 0, false); \ + __bpf_assert_op(LHS, <=, END, 0, false); \ + }) + +/* Description + * Assert that LHS is in the range [BEG, END] (inclusive of both). This + * statement updates the known bounds of LHS during verification. Note + * that both BEG and END must be constant values, and must fit within the + * data type of LHS. + * Returns + * Void. + * Throws + * An exception with the specified value when the assertion fails. + */ +#define bpf_assert_range_with(LHS, BEG, END, value) \ + ({ \ + _Static_assert(BEG <= END, "BEG must be <= END"); \ + barrier_var(LHS); \ + __bpf_assert_op(LHS, >=, BEG, value, false); \ + __bpf_assert_op(LHS, <=, END, value, false); \ + }) + +struct bpf_iter_css_task; +struct cgroup_subsys_state; +extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it, + struct cgroup_subsys_state *css, unsigned int flags) __weak __ksym; +extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym; +extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym; + +struct bpf_iter_task; +extern int bpf_iter_task_new(struct bpf_iter_task *it, + struct task_struct *task, unsigned int flags) __weak __ksym; +extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym; +extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym; + +struct bpf_iter_css; +extern int bpf_iter_css_new(struct bpf_iter_css *it, + struct cgroup_subsys_state *start, unsigned int flags) __weak __ksym; +extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; +extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; + +extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; +extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; +extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags__k, void *aux__ign) __ksym; +#define bpf_wq_set_callback(timer, cb, flags) \ + bpf_wq_set_callback_impl(timer, cb, flags, NULL) + +struct bpf_iter_kmem_cache; +extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; +extern struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) __weak __ksym; +extern void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) __weak __ksym; + +#endif