diff --git a/.mdbookignore b/.mdbookignore
new file mode 100644
index 0000000..912eacc
--- /dev/null
+++ b/.mdbookignore
@@ -0,0 +1 @@
+third_party
diff --git a/43-kfuncs/.gitignore b/43-kfuncs/.gitignore
new file mode 100644
index 0000000..c71a9ec
--- /dev/null
+++ b/43-kfuncs/.gitignore
@@ -0,0 +1,10 @@
+.vscode
+package.json
+*.o
+*.skel.json
+*.skel.yaml
+package.yaml
+ecli
+ecc
+.output
+kfunc
diff --git a/43-kfuncs/Makefile b/43-kfuncs/Makefile
new file mode 100644
index 0000000..71649c3
--- /dev/null
+++ b/43-kfuncs/Makefile
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+OUTPUT := .output
+CLANG ?= clang
+LIBBPF_SRC := $(abspath ../third_party/libbpf/src)
+BPFTOOL_SRC := $(abspath ../third_party/bpftool/src)
+LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
+BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
+BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/)
+LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a)
+LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h)
+ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
+	  | sed 's/arm.*/arm/' \
+	  | sed 's/aarch64/arm64/' \
+	  | sed 's/ppc64le/powerpc/' \
+	  | sed 's/mips.*/mips/' \
+	  | sed 's/riscv64/riscv/' \
+	  | sed 's/loongarch64/loongarch/')
+VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h
+# Use our own libbpf API headers and Linux UAPI headers distributed with
+# libbpf to avoid dependency on system-wide headers, which could be missing or
+# outdated
+INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
+CFLAGS := -g -Wall
+ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
+
+APPS = kfunc # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
+
+CARGO ?= $(shell which cargo)
+ifeq ($(strip $(CARGO)),)
+BZS_APPS :=
+else
+BZS_APPS := # profile
+APPS += $(BZS_APPS)
+# Required by libblazesym
+ALL_LDFLAGS += -lrt -ldl -lpthread -lm
+endif
+
+# Get Clang's default includes on this system. We'll explicitly add these dirs
+# to the includes list when compiling with `-target bpf` because otherwise some
+# architecture-specific dirs will be "missing" on some architectures/distros -
+# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
+# sys/cdefs.h etc. might be missing.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		      "$(1)" \
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		      "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS)
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/43-kfuncs/kfunc.bpf.c b/43-kfuncs/kfunc.bpf.c
new file mode 100644
index 0000000..f282bc4
--- /dev/null
+++ b/43-kfuncs/kfunc.bpf.c
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#define BPF_NO_GLOBAL_DATA
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+typedef unsigned int u32;
+typedef unsigned long long u64;
+typedef int pid_t;
+
+extern u64 bpf_kfunc_call_test(u32 a, u64 b, u32 c, u64 d) __ksym;
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+SEC("kprobe/do_unlinkat")
+int handle_kprobe(void *ctx)
+{
+    pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    u64 result = bpf_kfunc_call_test(1, 2, 3, 4);
+    bpf_printk("BPF triggered do_unlinkat from PID %d. Result: %lld\n", pid, result);
+    return 0;
+}
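/*
 * Illustrative sketch (not part of this patch): if the module that provides
 * bpf_kfunc_call_test() might not be loaded, the extern can instead be
 * declared __weak, and the call guarded with the bpf_ksym_exists() macro from
 * bpf_helpers.h, so the program still passes verification without the kfunc.
 * This assumes a libbpf recent enough to ship bpf_ksym_exists(); the program
 * name handle_kprobe_guarded is hypothetical and reuses the headers and
 * typedefs from kfunc.bpf.c above.
 */
extern u64 bpf_kfunc_call_test(u32 a, u64 b, u32 c, u64 d) __weak __ksym;

SEC("kprobe/do_unlinkat")
int handle_kprobe_guarded(void *ctx)
{
    if (!bpf_ksym_exists(bpf_kfunc_call_test))
        return 0; /* kfunc unavailable on this kernel/module setup */
    bpf_printk("kfunc sum = %lld\n", bpf_kfunc_call_test(1, 2, 3, 4));
    return 0;
}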
Result: %lld\n", pid, result); + return 0; +} diff --git a/43-kfuncs/kfunc.c b/43-kfuncs/kfunc.c new file mode 100644 index 0000000..d7bb134 --- /dev/null +++ b/43-kfuncs/kfunc.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include + +#include "kfunc.skel.h" // Include the generated skeleton header + +static volatile bool exiting = false; + +// Signal handler for graceful termination +void handle_signal(int sig) { + exiting = true; +} + +int main(int argc, char **argv) { + struct kfunc_bpf *skel; + int err; + + // Handle SIGINT and SIGTERM for graceful shutdown + signal(SIGINT, handle_signal); + + // Open the BPF application + skel = kfunc_bpf__open(); + if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + + // Load & verify the BPF program + err = kfunc_bpf__load(skel); + if (err) { + fprintf(stderr, "Failed to load and verify BPF skeleton: %d\n", err); + goto cleanup; + } + + // Attach the BPF program (e.g., attach kprobe) + err = kfunc_bpf__attach(skel); + if (err) { + fprintf(stderr, "Failed to attach BPF skeleton: %d\n", err); + goto cleanup; + } + + printf("BPF program loaded and attached successfully. Press Ctrl-C to exit.\n"); + + // Optionally, read the trace_pipe to see bpf_printk outputs + FILE *trace_pipe = fopen("/sys/kernel/debug/tracing/trace_pipe", "r"); + if (!trace_pipe) { + perror("fopen trace_pipe"); + // Continue without reading trace_pipe + } + + // Main loop + while (!exiting) { + if (trace_pipe) { + char buffer[256]; + if (fgets(buffer, sizeof(buffer), trace_pipe)) { + printf("%s", buffer); + } else { + if (errno == EINTR) + break; + } + } else { + // If trace_pipe is not available, just sleep + sleep(1); + } + } + + if (trace_pipe) + fclose(trace_pipe); + +cleanup: + // Clean up and destroy the BPF program + kfunc_bpf__destroy(skel); + return err < 0 ? -err : 0; +} diff --git a/43-kfuncs/module/.gitignore b/43-kfuncs/module/.gitignore new file mode 100644 index 0000000..9523173 --- /dev/null +++ b/43-kfuncs/module/.gitignore @@ -0,0 +1,23 @@ +# Ignore object files and kernel modules +*.o +*.ko +*.mod +*.mod.c +*.symvers +*.order + +# Ignore temporary and backup files +*~ +*.bak +*.tmp +*.swp + +# Ignore build directory if generated +/Module.symvers +/Modules.markers +/Module.markers +/modules.order + +# Ignore other automatically generated files +*.cmd +.tmp_versions/ diff --git a/43-kfuncs/module/Makefile b/43-kfuncs/module/Makefile new file mode 100644 index 0000000..44afa31 --- /dev/null +++ b/43-kfuncs/module/Makefile @@ -0,0 +1,11 @@ +obj-m += hello.o # hello.o is the target + +# Enable BTF generation +KBUILD_CFLAGS += -g -O2 + +all: + # Compile the module with BTF information + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules + +clean: + make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean diff --git a/43-kfuncs/module/compact.h b/43-kfuncs/module/compact.h new file mode 100644 index 0000000..5ff2115 --- /dev/null +++ b/43-kfuncs/module/compact.h @@ -0,0 +1,11 @@ +// Compatible for lower kernel versions. No need in 6.11. 
diff --git a/43-kfuncs/module/compact.h b/43-kfuncs/module/compact.h
new file mode 100644
index 0000000..5ff2115
--- /dev/null
+++ b/43-kfuncs/module/compact.h
@@ -0,0 +1,11 @@
+// Compatibility for lower kernel versions; not needed on 6.11.
+#ifndef BTF_SET8_KFUNCS
+/* This flag implies BTF_SET8 holds kfunc(s) */
+#define BTF_SET8_KFUNCS (1 << 0)
+#endif
+#ifndef BTF_KFUNCS_START
+#define BTF_KFUNCS_START(name) static struct btf_id_set8 __maybe_unused name = { .flags = BTF_SET8_KFUNCS };
+#endif
+#ifndef BTF_KFUNCS_END
+#define BTF_KFUNCS_END(name)
+#endif
diff --git a/43-kfuncs/module/hello.c b/43-kfuncs/module/hello.c
new file mode 100644
index 0000000..9552e1d
--- /dev/null
+++ b/43-kfuncs/module/hello.c
@@ -0,0 +1,61 @@
+#include <linux/init.h>    // Macros for module initialization
+#include <linux/module.h>  // Core header for loading modules
+#include <linux/kernel.h>  // Kernel logging macros
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
+__bpf_kfunc u64 bpf_kfunc_call_test(u32 a, u64 b, u32 c, u64 d);
+
+/* Define a kfunc function */
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc u64 bpf_kfunc_call_test(u32 a, u64 b, u32 c, u64 d)
+{
+    return a + b + c + d;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_kfunc_example_ids_set)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test)
+BTF_KFUNCS_END(bpf_kfunc_example_ids_set)
+
+// Register the kfunc ID set
+static const struct btf_kfunc_id_set bpf_kfunc_example_set = {
+    .owner = THIS_MODULE,
+    .set = &bpf_kfunc_example_ids_set,
+};
+
+// Function executed when the module is loaded
+static int __init hello_init(void)
+{
+    int ret;
+
+    printk(KERN_INFO "Hello, world!\n");
+    // Register the BTF kfunc ID set
+    ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kfunc_example_set);
+    if (ret) {
+        pr_err("bpf_kfunc_example: Failed to register BTF kfunc ID set\n");
+        return ret;
+    }
+    printk(KERN_INFO "bpf_kfunc_example: Module loaded successfully\n");
+    return 0; // Return 0 if successful
+}
+
+// Function executed when the module is removed
+static void __exit hello_exit(void)
+{
+    // Unregister the BTF kfunc ID set
+    // unregister_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_kfunc_example_set);
+    printk(KERN_INFO "Goodbye, world!\n");
+}
+
+// Macros to define the module’s init and exit points
+module_init(hello_init);
+module_exit(hello_exit);
+
+MODULE_LICENSE("GPL");                  // License type (GPL)
+MODULE_AUTHOR("Your Name");             // Module author
+MODULE_DESCRIPTION("A simple module");  // Module description
+MODULE_VERSION("1.0");                  // Module version
diff --git a/third_party/blazesym/.gitignore b/third_party/blazesym/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/third_party/blazesym/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/third_party/blazesym/.rustfmt.toml b/third_party/blazesym/.rustfmt.toml
new file mode 100644
index 0000000..0a8dd1d
--- /dev/null
+++ b/third_party/blazesym/.rustfmt.toml
@@ -0,0 +1 @@
+blank_lines_upper_bound = 2
diff --git a/third_party/blazesym/Cargo.toml b/third_party/blazesym/Cargo.toml
new file mode 100644
index 0000000..d89237c
--- /dev/null
+++ b/third_party/blazesym/Cargo.toml
@@ -0,0 +1,38 @@
+[package]
+name = "blazesym"
+description = "BlazeSym is a library that symbolizes addresses where symbol names, source file names, and line numbers can be acquired."
+version = "0.1.0" +authors = ["Kui-Feng "] +license-file = "LICENSE" +repository = "https://github.com/libbpf/blazesym" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +name = "blazesym" +crate-type = ["cdylib", "rlib", "staticlib"] + +[package.metadata.docs.rs] +features = ["dont-generate-test-files"] + +[dependencies] +nix = "0.24" +regex = "1.6" +crossbeam-channel = "0.5" +libc = "0.2.137" + +[build-dependencies] +anyhow = "1.0.68" +cbindgen = {version = "0.24", optional = true} + +[features] +cheader = ["cbindgen"] +# Enable this feature to opt out of the generation of test files. That may be +# useful when certain utilities are not installed or when there is no intention +# to run tests. +dont-generate-test-files = [] +# Enable code paths requiring a nightly toolchain. This feature is only meant to +# be used for testing and benchmarking purposes, not for the core library, which +# is expected to work on stable. +nightly = [] diff --git a/third_party/blazesym/LICENSE b/third_party/blazesym/LICENSE new file mode 100644 index 0000000..a7c1bbc --- /dev/null +++ b/third_party/blazesym/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, Kuifeng Lee +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/blazesym/build.rs b/third_party/blazesym/build.rs new file mode 100644 index 0000000..20c2162 --- /dev/null +++ b/third_party/blazesym/build.rs @@ -0,0 +1,153 @@ +use std::env; +use std::ffi::OsStr; +use std::ffi::OsString; +use std::ops::Deref as _; +use std::path::Path; +use std::process::Command; +use std::process::Stdio; + +use anyhow::bail; +use anyhow::Context as _; +use anyhow::Result; + +/// Format a command with the given list of arguments as a string. 
+fn format_command(command: C, args: A) -> String +where + C: AsRef, + A: IntoIterator, + S: AsRef, +{ + args.into_iter().fold( + command.as_ref().to_string_lossy().into_owned(), + |mut cmd, arg| { + cmd += " "; + cmd += arg.as_ref().to_string_lossy().deref(); + cmd + }, + ) +} + +/// Run a command with the provided arguments. +fn run(command: C, args: A) -> Result<()> +where + C: AsRef, + A: IntoIterator + Clone, + S: AsRef, +{ + let instance = Command::new(command.as_ref()) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .args(args.clone()) + .output() + .with_context(|| { + format!( + "failed to run `{}`", + format_command(command.as_ref(), args.clone()) + ) + })?; + + if !instance.status.success() { + let code = if let Some(code) = instance.status.code() { + format!(" ({code})") + } else { + " (terminated by signal)".to_string() + }; + + let stderr = String::from_utf8_lossy(&instance.stderr); + let stderr = stderr.trim_end(); + let stderr = if !stderr.is_empty() { + format!(": {stderr}") + } else { + String::new() + }; + + bail!( + "`{}` reported non-zero exit-status{code}{stderr}", + format_command(command, args), + ); + } + + Ok(()) +} + +/// Compile `src` into `dst` using `cc`. +fn cc(src: &Path, dst: &str, options: &[&str]) { + let dst = src.with_file_name(dst); + println!("cargo:rerun-if-changed={}", src.display()); + println!("cargo:rerun-if-changed={}", dst.display()); + + // Ideally we'd use the `cc` crate here, but it seemingly can't be convinced + // to create binaries. + run( + "cc", + options + .iter() + .map(OsStr::new) + .chain([src.as_os_str(), "-o".as_ref(), dst.as_os_str()]), + ) + .expect("failed to run `cc`") +} + +/// Convert debug information contained in `src` into GSYM in `dst` using +/// `llvm-gsymutil`. +fn gsym(src: &Path, dst: &str) { + let dst = src.with_file_name(dst); + println!("cargo:rerun-if-changed={}", src.display()); + println!("cargo:rerun-if-changed={}", dst.display()); + + let gsymutil = env::var_os("LLVM_GSYMUTIL").unwrap_or_else(|| OsString::from("llvm-gsymutil")); + + run( + gsymutil, + ["--convert".as_ref(), src, "--out-file".as_ref(), &dst], + ) + .expect("failed to run `llvm-gsymutil`") +} + +/// Build the various test binaries. 
+fn build_test_bins(crate_root: &Path) { + let src = crate_root.join("data").join("test.c"); + cc(&src, "test-no-debug.bin", &["-g0"]); + cc(&src, "test-dwarf-v4.bin", &["-gdwarf-4"]); + + let src = crate_root.join("data").join("test-gsym.c"); + let ld_script = crate_root.join("data").join("test-gsym.ld"); + let ld_script = ld_script.to_str().unwrap(); + println!("cargo:rerun-if-changed={ld_script}"); + cc( + &src, + "test-gsym.bin", + &[ + "-gdwarf-4", + "-T", + ld_script, + "-Wl,--build-id=none", + "-O0", + "-nostdlib", + ], + ); + + let src = crate_root.join("data").join("test-gsym.bin"); + gsym(&src, "test.gsym"); +} + +fn main() { + let crate_dir = env!("CARGO_MANIFEST_DIR"); + + if !cfg!(feature = "dont-generate-test-files") { + build_test_bins(crate_dir.as_ref()); + } + + #[cfg(feature = "cheader")] + { + let build_type = env::var("PROFILE").unwrap(); + let target_path = Path::new(&crate_dir).join("target").join(build_type); + + cbindgen::Builder::new() + .with_crate(crate_dir) + .with_config(cbindgen::Config::from_root_or_default(crate_dir)) + .generate() + .expect("Unable to generate bindings") + .write_to_file(target_path.join("blazesym.h")); + } +} diff --git a/third_party/blazesym/cbindgen.toml b/third_party/blazesym/cbindgen.toml new file mode 100644 index 0000000..fa99f24 --- /dev/null +++ b/third_party/blazesym/cbindgen.toml @@ -0,0 +1,22 @@ +language = "C" +include_guard = "__blazesym_h_" + +[export] +item_types = ["globals", "enums", "structs", "unions", "typedefs", "opaque", "functions"] + +[fn] +args = "Vertical" +rename_args = "GeckoCase" + +[struct] +associated_constants_in_body = true +derive_eq = true +derive_ostream = true + +[enum] +add_sentinel = false +derive_helper_methods = true +derive_ostream = true + +[macro_expansion] +bitflags = true diff --git a/third_party/blazesym/data/dwarf-example b/third_party/blazesym/data/dwarf-example new file mode 100755 index 0000000..6e8be8c Binary files /dev/null and b/third_party/blazesym/data/dwarf-example differ diff --git a/third_party/blazesym/data/test-gsym.c b/third_party/blazesym/data/test-gsym.c new file mode 100644 index 0000000..af8ec1a --- /dev/null +++ b/third_party/blazesym/data/test-gsym.c @@ -0,0 +1,22 @@ +/* The sample program is used to generate test.gsym. + * + * Chosen functions are placed in dedicated sections to allow for control placement. + */ + +__attribute__((section(".text.factorial"))) unsigned int +factorial(unsigned int n) { + if (n == 0) + return 1; + return factorial(n - 1) * n; +} + +static inline void +factorial_inline_wrapper() { + factorial(5); +} + +__attribute__((section(".text.main"))) int +main(int argc, const char *argv[]) { + factorial_inline_wrapper(); + return 0; +} diff --git a/third_party/blazesym/data/test-gsym.ld b/third_party/blazesym/data/test-gsym.ld new file mode 100644 index 0000000..1020af9 --- /dev/null +++ b/third_party/blazesym/data/test-gsym.ld @@ -0,0 +1,40 @@ +SECTIONS { + .text (0x2000000) : { + *(.text.main) + *(.text) + . = ABSOLUTE(0x2000100); + *(.text.factorial) + } + .data : { + *(.data) + } + .bss : { + *(.bss) + } + + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. + */ + /* DWARF 1. */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions. */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2. 
*/ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2. */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + + /DISCARD/ : { + *(.*) + } +} diff --git a/third_party/blazesym/data/test.c b/third_party/blazesym/data/test.c new file mode 100644 index 0000000..741442f --- /dev/null +++ b/third_party/blazesym/data/test.c @@ -0,0 +1,22 @@ +/* + * The sample program is used to generate test.bin. + */ + +#include + +static +unsigned int fibonacci(unsigned int n) { + if (n <= 1) + return n; + return fibonacci(n - 1) + fibonacci(n - 2); +} + +int +main() { + int i; + printf("calculate fibonacci(n); n = "); + scanf("%d", &i); + + printf("fibonacci(%d) = %d\n", i, fibonacci(i)); + return 0; +} diff --git a/third_party/blazesym/examples/addr2ln.rs b/third_party/blazesym/examples/addr2ln.rs new file mode 100644 index 0000000..5680ab2 --- /dev/null +++ b/third_party/blazesym/examples/addr2ln.rs @@ -0,0 +1,44 @@ +extern crate blazesym; + +use blazesym::{BlazeSymbolizer, SymbolSrcCfg}; +use std::env; +use std::path; + +fn show_usage() { + let args: Vec = env::args().collect(); + println!("Usage: {}
", args[0]); +} + +fn main() { + let args: Vec = env::args().collect(); + + if args.len() != 3 { + show_usage(); + return; + } + + let bin_name = &args[1]; + let mut addr_str = &args[2][..]; + let sym_srcs = [SymbolSrcCfg::Elf { + file_name: path::PathBuf::from(bin_name), + base_address: 0x0, + }]; + let resolver = BlazeSymbolizer::new().unwrap(); + + if &addr_str[0..2] == "0x" { + // Remove prefixed 0x + addr_str = &addr_str[2..]; + } + let addr = u64::from_str_radix(addr_str, 16).unwrap(); + + let results = resolver.symbolize(&sym_srcs, &[addr]); + if results.len() == 1 && !results[0].is_empty() { + let result = &results[0][0]; + println!( + "0x{:x} @ {} {}:{}", + addr, result.symbol, result.path, result.line_no + ); + } else { + println!("0x{addr:x} is not found"); + } +} diff --git a/third_party/blazesym/examples/addr2ln_pid.rs b/third_party/blazesym/examples/addr2ln_pid.rs new file mode 100644 index 0000000..3e5271a --- /dev/null +++ b/third_party/blazesym/examples/addr2ln_pid.rs @@ -0,0 +1,54 @@ +extern crate blazesym; + +use blazesym::{BlazeSymbolizer, SymbolSrcCfg, SymbolizedResult}; +use std::env; + +fn show_usage() { + let args: Vec = env::args().collect(); + println!("Usage: {}
", args[0]); + println!("Resolve an address in the process of the given pid, and"); + println!("print its symbol, the file name of the source, and the line number."); +} + +fn main() { + let args: Vec = env::args().collect(); + + if args.len() != 3 { + show_usage(); + return; + } + + let pid = args[1].parse::().unwrap(); + let mut addr_str = &args[2][..]; + println!("PID: {pid}"); + + if addr_str.len() > 2 && &addr_str[0..2] == "0x" { + // Remove prefixed 0x + addr_str = &addr_str[2..]; + } + let addr = u64::from_str_radix(addr_str, 16).unwrap(); + + let sym_files = [SymbolSrcCfg::Process { pid: Some(pid) }]; + let resolver = BlazeSymbolizer::new().unwrap(); + let symlist = resolver.symbolize(&sym_files, &[addr]); + if !symlist[0].is_empty() { + let SymbolizedResult { + symbol, + start_address, + path, + line_no, + column: _, + } = &symlist[0][0]; + println!( + "0x{:x} {}@0x{:x}+{} {}:{}", + addr, + symbol, + start_address, + addr - start_address, + path, + line_no + ); + } else { + println!("0x{addr:x} is not found"); + } +} diff --git a/third_party/blazesym/scripts/gen-one-page.py b/third_party/blazesym/scripts/gen-one-page.py new file mode 100644 index 0000000..423caa5 --- /dev/null +++ b/third_party/blazesym/scripts/gen-one-page.py @@ -0,0 +1,196 @@ +'''Generate one page API document from rustdoc. + +This script parses HTML files generated by rustoc and generates a single page API +document. + +''' +from html.parser import HTMLParser + +pages = ['index.html', + 'struct.BlazeSymbolizer.html', + 'struct.SymbolizedResult.html', + 'enum.SymbolSrcCfg.html', + 'enum.SymbolizerFeature.html', + 'fn.blazesym_new.html', + 'fn.blazesym_free.html', + 'fn.blazesym_symbolize.html', + 'fn.blazesym_result_free.html', + 'struct.blazesym.html', + 'struct.sym_src_cfg.html', + 'enum.blazesym_src_type.html', + 'union.ssc_params.html', + 'struct.ssc_elf.html', + 'struct.ssc_kernel.html', + 'struct.ssc_process.html', + 'struct.blazesym_result.html', + 'struct.blazesym_entry.html', + 'struct.blazesym_csym.html', + ] + +def replace_text(src, replace, start, stop): + lines = src.split('\n') + if start[0] == stop[0]: + lineno = start[0] + lines[lineno] = lines[lineno][:start[1]] + replace + lines[lineno][stop[1]:] + else: + lines[start[0]] = lines[start[0]][:start[1]] + lines[stop[0]] = lines[stop[0]][stop[1]:] + lines[start[0] + 1: stop[0]] = [replace] + pass + return '\n'.join(lines) + +class MyParser(HTMLParser): + def get_doc_range(self, start, end): + lines = self.raw_data.split('\n') + lines = lines[start[0] - 1: end[0]] + lines[-1] = lines[-1][:end[1]] + lines[0] = lines[0][start[1]:] + content = '\n'.join(lines) + if hasattr(self, 'replaces'): + for replace, rstart, rstop in reversed(self.replaces): + rstart = list(rstart) + rstop = list(rstop) + rstart[0] -= start[0] + rstop[0] -= start[0] + if rstart[0] == 0: + rstart[1] -= start[1] + pass + if rstop[0] == 0: + rstop[1] -= start[1] + pass + content = replace_text(content, replace, rstart, rstop) + pass + pass + return content + + def get_pos_end_tag(self): + pos = self.getpos() + lines = self.raw_data.split('\n') + line = lines[pos[0] - 1] + col = line.find('>', pos[1]) + 1 + return (pos[0], col) + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if tag == 'body': + self.doc_type = attrs['class'].split()[-1] + elif tag == 'section' and 'id' in attrs and attrs['id'] == 'main-content': + if self.doc_type == 'crate': + self.start_pos = self.getpos() + elif self.doc_type in ('struct', 'enum', 'union'): + if not hasattr(self, 'start_pos'): + 
self.start_pos = self.getpos() + pass + pass + elif self.doc_type == 'fn': + if not hasattr(self, 'start_pos'): + self.start_pos = self.getpos() + pass + pass + pass + elif 'class' in attrs and 'example-wrap' in attrs['class'].split(): + self.replacing = '  ......<<EXAMPLE>>......
' + self.replace_depth = 0 + self.replace_start = self.getpos(); + elif 'class' in attrs and 'item-decl' in attrs['class'].split(): + self.replacing = '  ......<<DECLARATION>>......
' + self.replace_depth = 0 + self.replace_start = self.getpos() + elif hasattr(self, 'replace_depth'): + self.replace_depth += 1 + elif hasattr(self, 'depth'): + self.depth += 1 + pass + elif hasattr(self, 'start_pos'): + if self.doc_type in ('struct', 'enum', 'union'): + if 'id' in attrs and attrs['id'].endswith('implementations'): + print('%s\n' % self.get_doc_range(self.start_pos, self.getpos())) + self.doc_type = '' + pass + elif self.doc_type == 'fn' and not hasattr(self, 'depth'): + self.depth = 0 + pass + elif self.doc_type == 'crate' and 'class' in attrs and attrs['class'].endswith('top-doc'): + self.depth = 0 + pass + pass + pass + + def handle_endtag(self, tag): + if hasattr(self, 'replace_depth'): + if self.replace_depth > 0: + self.replace_depth -= 1 + else: + del self.replace_depth + if not hasattr(self, 'replaces'): + self.replaces = [] + pass + self.replaces.append((self.replacing, self.replace_start, self.get_pos_end_tag())) + pass + elif hasattr(self, 'depth'): + if self.depth > 0: + self.depth -= 1 + pass + else: + del self.depth + if self.doc_type == 'crate': + content = self.get_doc_range(self.start_pos, self.get_pos_end_tag()) + print('%s\n' % content) + pass + elif self.doc_type == 'fn': + print('%s\n' % self.get_doc_range(self.start_pos, self.get_pos_end_tag())) + self.doc_type = '' + pass + pass + pass + pass + pass + + +print('') +print('') +print('') +print('') +print('') +print(''' +''') +print('') +print('') + +for page in pages: + print('' % page) + p = MyParser() + fo = open('blazesym/' + page, 'r') + data = fo.read() + p.raw_data = data + p.feed(data) + print('') + pass + +print('') +print('') diff --git a/third_party/blazesym/src/c_api.rs b/third_party/blazesym/src/c_api.rs new file mode 100644 index 0000000..721b93f --- /dev/null +++ b/third_party/blazesym/src/c_api.rs @@ -0,0 +1,948 @@ +use std::alloc::{alloc, dealloc, Layout}; +use std::ffi::CStr; +use std::ffi::OsStr; +use std::mem; +use std::os::raw::c_char; +use std::os::unix::ffi::OsStrExt as _; +use std::path::PathBuf; +use std::ptr; +use std::u64; + +use crate::BlazeSymbolizer; +use crate::FindAddrFeature; +use crate::SymbolInfo; +use crate::SymbolSrcCfg; +use crate::SymbolType; +use crate::SymbolizedResult; +use crate::SymbolizerFeature; + + +/// Types of symbol sources and debug information for C API. +#[repr(C)] +#[allow(non_camel_case_types, unused)] +pub enum blazesym_src_type { + /// Symbols and debug information from an ELF file. + SRC_T_ELF, + /// Symbols and debug information from a kernel image and its kallsyms. + SRC_T_KERNEL, + /// Symbols and debug information from a process, including loaded object files. + SRC_T_PROCESS, +} + +/// The parameters to load symbols and debug information from an ELF. +/// +/// Describes the path and address of an ELF file loaded in a +/// process. +#[repr(C)] +pub struct ssc_elf { + /// The file name of an ELF file. + /// + /// It can be an executable or shared object. + /// For example, passing "/bin/sh" will load symbols and debug information from `sh`. + /// Whereas passing "/lib/libc.so.xxx" will load symbols and debug information from the libc. + pub file_name: *const c_char, + /// The base address is where the file's executable segment(s) is loaded. + /// + /// It should be the address + /// in the process mapping to the executable segment's first byte. 
+ /// For example, in /proc/<pid>/maps + /// + /// ```text + /// 7fe1b2dc4000-7fe1b2f80000 r-xp 00000000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b2f80000-7fe1b3180000 ---p 001bc000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b3180000-7fe1b3184000 r--p 001bc000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b3184000-7fe1b3186000 rw-p 001c0000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// ``` + /// + /// It reveals that the executable segment of libc-2.28.so was + /// loaded at 0x7fe1b2dc4000. This base address is used to + /// translate an address in the segment to the corresponding + /// address in the ELF file. + /// + /// A loader would load an executable segment with the permission of `x` + /// (executable). For example, the first block is with the + /// permission of `r-xp`. + pub base_address: u64, +} + +/// The parameters to load symbols and debug information from a kernel. +/// +/// Use a kernel image and a snapshot of its kallsyms as a source of symbols and +/// debug information. +#[repr(C)] +pub struct ssc_kernel { + /// The path of a copy of kallsyms. + /// + /// It can be `"/proc/kallsyms"` for the running kernel on the + /// device. However, you can make copies for later. In that situation, + /// you should give the path of a copy. + /// Passing a `NULL`, by default, will result in `"/proc/kallsyms"`. + pub kallsyms: *const c_char, + /// The path of a kernel image. + /// + /// The path of a kernel image should be, for instance, + /// `"/boot/vmlinux-xxxx"`. For a `NULL` value, it will locate the + /// kernel image of the running kernel in `"/boot/"` or + /// `"/usr/lib/debug/boot/"`. + pub kernel_image: *const c_char, +} + +/// The parameters to load symbols and debug information from a process. +/// +/// Load all ELF files in a process as the sources of symbols and debug +/// information. +#[repr(C)] +pub struct ssc_process { + /// It is the PID of a process to symbolize. + /// + /// BlazeSym will parse `/proc//maps` and load all the object + /// files. + pub pid: u32, +} + +/// Parameters of a symbol source. +#[repr(C)] +pub union ssc_params { + /// The variant for SRC_T_ELF + pub elf: mem::ManuallyDrop, + /// The variant for SRC_T_KERNEL + pub kernel: mem::ManuallyDrop, + /// The variant for SRC_T_PROCESS + pub process: mem::ManuallyDrop, +} + +/// Description of a source of symbols and debug information for C API. +#[repr(C)] +pub struct sym_src_cfg { + /// A type of symbol source. + pub src_type: blazesym_src_type, + pub params: ssc_params, +} + +/// Names of the BlazeSym features. +#[repr(C)] +#[allow(non_camel_case_types, unused)] +pub enum blazesym_feature_name { + /// Enable or disable returning line numbers of addresses. + /// + /// Users should set `blazesym_feature.params.enable` to enabe or + /// disable the feature, + LINE_NUMBER_INFO, + /// Enable or disable loading symbols from DWARF. + /// + /// Users should `blazesym_feature.params.enable` to enable or + /// disable the feature. This feature is disabled by default. + DEBUG_INFO_SYMBOLS, +} + +#[repr(C)] +pub union blazesym_feature_params { + enable: bool, +} + +/// Setting of the blazesym features. +/// +/// Contain parameters to enable, disable, or customize a feature. +#[repr(C)] +pub struct blazesym_feature { + pub feature: blazesym_feature_name, + pub params: blazesym_feature_params, +} + +/// A placeholder symbolizer for C API. +/// +/// It is returned by [`blazesym_new()`] and should be free by +/// [`blazesym_free()`]. 
+#[repr(C)] +pub struct blazesym { + symbolizer: *mut BlazeSymbolizer, +} + +/// The result of symbolization of an address for C API. +/// +/// A `blazesym_csym` is the information of a symbol found for an +/// address. One address may result in several symbols. +#[repr(C)] +pub struct blazesym_csym { + /// The symbol name is where the given address should belong to. + pub symbol: *const c_char, + /// The address (i.e.,the first byte) is where the symbol is located. + /// + /// The address is already relocated to the address space of + /// the process. + pub start_address: u64, + /// The path of the source code defines the symbol. + pub path: *const c_char, + /// The instruction of the address is in the line number of the source code. + pub line_no: usize, + pub column: usize, +} + +/// `blazesym_entry` is the output of symbolization for an address for C API. +/// +/// Every address has an `blazesym_entry` in +/// [`blazesym_result::entries`] to collect symbols found by BlazeSym. +#[repr(C)] +pub struct blazesym_entry { + /// The number of symbols found for an address. + pub size: usize, + /// All symbols found. + /// + /// `syms` is an array of blazesym_csym in the size `size`. + pub syms: *const blazesym_csym, +} + +/// `blazesym_result` is the result of symbolization for C API. +/// +/// The instances of blazesym_result are returned from +/// [`blazesym_symbolize()`]. They should be free by calling +/// [`blazesym_result_free()`]. +#[repr(C)] +pub struct blazesym_result { + /// The number of addresses being symbolized. + pub size: usize, + /// The entries for addresses. + /// + /// Symbolization occurs based on the order of addresses. + /// Therefore, every address must have an entry here on the same + /// order. + pub entries: [blazesym_entry; 0], +} + +/// Create a `PathBuf` from a pointer of C string +/// +/// # Safety +/// +/// C string should be terminated with a null byte. +/// +unsafe fn from_cstr(cstr: *const c_char) -> PathBuf { + PathBuf::from(unsafe { CStr::from_ptr(cstr) }.to_str().unwrap()) +} + +unsafe fn symbolsrccfg_to_rust(cfg: *const sym_src_cfg, cfg_len: u32) -> Option> { + let mut cfg_rs = Vec::::with_capacity(cfg_len as usize); + + for i in 0..cfg_len { + let c = unsafe { cfg.offset(i as isize) }; + match unsafe { &(*c).src_type } { + blazesym_src_type::SRC_T_ELF => { + cfg_rs.push(SymbolSrcCfg::Elf { + file_name: unsafe { from_cstr((*c).params.elf.file_name) }, + base_address: unsafe { (*c).params.elf.base_address }, + }); + } + blazesym_src_type::SRC_T_KERNEL => { + let kallsyms = unsafe { (*c).params.kernel.kallsyms }; + let kernel_image = unsafe { (*c).params.kernel.kernel_image }; + cfg_rs.push(SymbolSrcCfg::Kernel { + kallsyms: if !kallsyms.is_null() { + Some(unsafe { from_cstr(kallsyms) }) + } else { + None + }, + kernel_image: if !kernel_image.is_null() { + Some(unsafe { from_cstr(kernel_image) }) + } else { + None + }, + }); + } + blazesym_src_type::SRC_T_PROCESS => { + let pid = unsafe { (*c).params.process.pid }; + cfg_rs.push(SymbolSrcCfg::Process { + pid: if pid > 0 { Some(pid) } else { None }, + }); + } + } + } + + Some(cfg_rs) +} + +/// Create an instance of blazesym a symbolizer for C API. +/// +/// # Safety +/// +/// Free the pointer with [`blazesym_free()`]. 
+/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_new() -> *mut blazesym { + let symbolizer = match BlazeSymbolizer::new() { + Ok(s) => s, + Err(_) => { + return ptr::null_mut(); + } + }; + let symbolizer_box = Box::new(symbolizer); + let c_box = Box::new(blazesym { + symbolizer: Box::into_raw(symbolizer_box), + }); + Box::into_raw(c_box) +} + +/// Create an instance of blazesym a symbolizer for C API. +/// +/// # Safety +/// +/// Free the pointer with [`blazesym_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_new_opts( + features: *const blazesym_feature, + nfeatures: usize, +) -> *mut blazesym { + let features_v = unsafe { + Vec::::from_raw_parts( + features as *mut blazesym_feature, + nfeatures, + nfeatures, + ) + }; + let features_v = mem::ManuallyDrop::new(features_v); + let features_r: Vec<_> = features_v + .iter() + .map(|x| -> SymbolizerFeature { + match x.feature { + blazesym_feature_name::LINE_NUMBER_INFO => { + SymbolizerFeature::LineNumberInfo(unsafe { x.params.enable }) + } + blazesym_feature_name::DEBUG_INFO_SYMBOLS => { + SymbolizerFeature::DebugInfoSymbols(unsafe { x.params.enable }) + } + } + }) + .collect(); + + let symbolizer = match BlazeSymbolizer::new_opt(&features_r) { + Ok(s) => s, + Err(_) => { + return ptr::null_mut(); + } + }; + let symbolizer_box = Box::new(symbolizer); + let c_box = Box::new(blazesym { + symbolizer: Box::into_raw(symbolizer_box), + }); + Box::into_raw(c_box) +} + +/// Free an instance of blazesym a symbolizer for C API. +/// +/// # Safety +/// +/// The pointer must be returned by [`blazesym_new()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_free(symbolizer: *mut blazesym) { + if !symbolizer.is_null() { + drop(unsafe { Box::from_raw((*symbolizer).symbolizer) }); + drop(unsafe { Box::from_raw(symbolizer) }); + } +} + +/// Convert SymbolizedResults to blazesym_results. +/// +/// # Safety +/// +/// The returned pointer should be freed by [`blazesym_result_free()`]. +/// +unsafe fn convert_symbolizedresults_to_c( + results: Vec>, +) -> *const blazesym_result { + // Allocate a buffer to contain a blazesym_result, all + // blazesym_csym, and C strings of symbol and path. + let strtab_size = results.iter().flatten().fold(0, |acc, result| { + acc + result.symbol.len() + result.path.len() + 2 + }); + let all_csym_size = results.iter().flatten().count(); + let buf_size = strtab_size + + mem::size_of::() + + mem::size_of::() * results.len() + + mem::size_of::() * all_csym_size; + let raw_buf_with_sz = + unsafe { alloc(Layout::from_size_align(buf_size + mem::size_of::(), 8).unwrap()) }; + if raw_buf_with_sz.is_null() { + return ptr::null(); + } + + // prepend an u64 to keep the size of the buffer. 
+ unsafe { *(raw_buf_with_sz as *mut u64) = buf_size as u64 }; + + let raw_buf = unsafe { raw_buf_with_sz.add(mem::size_of::()) }; + + let result_ptr = raw_buf as *mut blazesym_result; + let mut entry_last = unsafe { &mut (*result_ptr).entries as *mut blazesym_entry }; + let mut csym_last = unsafe { + raw_buf.add( + mem::size_of::() + mem::size_of::() * results.len(), + ) + } as *mut blazesym_csym; + let mut cstr_last = unsafe { + raw_buf.add( + mem::size_of::() + + mem::size_of::() * results.len() + + mem::size_of::() * all_csym_size, + ) + } as *mut c_char; + + let mut make_cstr = |src: &str| { + let cstr = cstr_last; + unsafe { ptr::copy(src.as_ptr(), cstr as *mut u8, src.len()) }; + unsafe { *cstr.add(src.len()) = 0 }; + cstr_last = unsafe { cstr_last.add(src.len() + 1) }; + + cstr + }; + + unsafe { (*result_ptr).size = results.len() }; + + // Convert all SymbolizedResults to blazesym_entrys and blazesym_csyms + for entry in results { + unsafe { (*entry_last).size = entry.len() }; + unsafe { (*entry_last).syms = csym_last }; + entry_last = unsafe { entry_last.add(1) }; + + for r in entry { + let symbol_ptr = make_cstr(&r.symbol); + + let path_ptr = make_cstr(&r.path); + + let csym_ref = unsafe { &mut *csym_last }; + csym_ref.symbol = symbol_ptr; + csym_ref.start_address = r.start_address; + csym_ref.path = path_ptr; + csym_ref.line_no = r.line_no; + csym_ref.column = r.column; + + csym_last = unsafe { csym_last.add(1) }; + } + } + + result_ptr +} + +/// Symbolize addresses with the sources of symbols and debug info. +/// +/// Return an array of [`blazesym_result`] with the same size as the +/// number of input addresses. The caller should free the returned +/// array by calling [`blazesym_result_free()`]. +/// +/// # Safety +/// +/// The returned pointer should be freed by [`blazesym_result_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_symbolize( + symbolizer: *mut blazesym, + sym_srcs: *const sym_src_cfg, + sym_srcs_len: u32, + addrs: *const u64, + addr_cnt: usize, +) -> *const blazesym_result { + let sym_srcs_rs = + if let Some(sym_srcs_rs) = unsafe { symbolsrccfg_to_rust(sym_srcs, sym_srcs_len) } { + sym_srcs_rs + } else { + #[cfg(debug_assertions)] + eprintln!("Fail to transform configurations of symbolizer from C to Rust"); + return ptr::null_mut(); + }; + + let symbolizer = unsafe { &*(*symbolizer).symbolizer }; + let addresses = unsafe { Vec::from_raw_parts(addrs as *mut u64, addr_cnt, addr_cnt) }; + + let results = symbolizer.symbolize(&sym_srcs_rs, &addresses); + + addresses.leak(); + + if results.is_empty() { + #[cfg(debug_assertions)] + eprintln!("Empty result while request for {addr_cnt}"); + return ptr::null(); + } + + unsafe { convert_symbolizedresults_to_c(results) } +} + +/// Free an array returned by blazesym_symbolize. +/// +/// # Safety +/// +/// The pointer must be returned by [`blazesym_symbolize()`]. 
+/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_result_free(results: *const blazesym_result) { + if results.is_null() { + #[cfg(debug_assertions)] + eprintln!("blazesym_result_free(null)"); + return; + } + + let raw_buf_with_sz = unsafe { (results as *mut u8).offset(-(mem::size_of::() as isize)) }; + let sz = unsafe { *(raw_buf_with_sz as *mut u64) } as usize + mem::size_of::(); + unsafe { dealloc(raw_buf_with_sz, Layout::from_size_align(sz, 8).unwrap()) }; +} + +#[repr(C)] +pub struct blazesym_sym_info { + name: *const u8, + address: u64, + size: u64, + sym_type: blazesym_sym_type, + file_offset: u64, + obj_file_name: *const u8, +} + +/// Convert SymbolInfos returned by BlazeSymbolizer::find_addresses() to a C array. +unsafe fn convert_syms_list_to_c( + syms_list: Vec>, +) -> *const *const blazesym_sym_info { + let mut sym_cnt = 0; + let mut str_buf_sz = 0; + + for syms in &syms_list { + sym_cnt += syms.len() + 1; + for sym in syms { + str_buf_sz += sym.name.len() + 1; + if let Some(fname) = sym.obj_file_name.as_ref() { + str_buf_sz += AsRef::::as_ref(fname).as_bytes().len() + 1; + } + } + } + + let array_sz = ((mem::size_of::<*const u64>() * syms_list.len() + mem::size_of::() - 1) + % mem::size_of::()) + * mem::size_of::(); + let sym_buf_sz = mem::size_of::() * sym_cnt; + let buf_size = array_sz + sym_buf_sz + str_buf_sz; + let raw_buf_with_sz = + unsafe { alloc(Layout::from_size_align(buf_size + mem::size_of::(), 8).unwrap()) }; + + unsafe { *(raw_buf_with_sz as *mut u64) = buf_size as u64 }; + + let raw_buf = unsafe { raw_buf_with_sz.add(mem::size_of::()) }; + let mut syms_ptr = raw_buf as *mut *mut blazesym_sym_info; + let mut sym_ptr = unsafe { raw_buf.add(array_sz) } as *mut blazesym_sym_info; + let mut str_ptr = unsafe { raw_buf.add(array_sz + sym_buf_sz) } as *mut u8; + + for syms in syms_list { + unsafe { *syms_ptr = sym_ptr }; + for SymbolInfo { + name, + address, + size, + sym_type, + file_offset, + obj_file_name, + } in syms + { + let name_ptr = str_ptr as *const u8; + unsafe { ptr::copy_nonoverlapping(name.as_ptr(), str_ptr, name.len()) }; + str_ptr = unsafe { str_ptr.add(name.len()) }; + unsafe { *str_ptr = 0 }; + str_ptr = unsafe { str_ptr.add(1) }; + let obj_file_name = if let Some(fname) = obj_file_name.as_ref() { + let fname = AsRef::::as_ref(fname).as_bytes(); + let obj_fname_ptr = str_ptr; + unsafe { ptr::copy_nonoverlapping(fname.as_ptr(), str_ptr, fname.len()) }; + str_ptr = unsafe { str_ptr.add(fname.len()) }; + unsafe { *str_ptr = 0 }; + str_ptr = unsafe { str_ptr.add(1) }; + obj_fname_ptr + } else { + ptr::null() + }; + + unsafe { + (*sym_ptr) = blazesym_sym_info { + name: name_ptr, + address, + size, + sym_type: match sym_type { + SymbolType::Function => blazesym_sym_type::SYM_T_FUNC, + SymbolType::Variable => blazesym_sym_type::SYM_T_VAR, + _ => blazesym_sym_type::SYM_T_UNKNOWN, + }, + file_offset, + obj_file_name, + } + }; + sym_ptr = unsafe { sym_ptr.add(1) }; + } + unsafe { + (*sym_ptr) = blazesym_sym_info { + name: ptr::null(), + address: 0, + size: 0, + sym_type: blazesym_sym_type::SYM_T_UNKNOWN, + file_offset: 0, + obj_file_name: ptr::null(), + } + }; + sym_ptr = unsafe { sym_ptr.add(1) }; + + syms_ptr = unsafe { syms_ptr.add(1) }; + } + + raw_buf as *const *const blazesym_sym_info +} + +/// Convert SymbolInfos returned by BlazeSymbolizer::find_address_regex() to a C array. 
+unsafe fn convert_syms_to_c(syms: Vec) -> *const blazesym_sym_info { + let mut str_buf_sz = 0; + + for sym in &syms { + str_buf_sz += sym.name.len() + 1; + if let Some(fname) = sym.obj_file_name.as_ref() { + str_buf_sz += AsRef::::as_ref(fname).as_bytes().len() + 1; + } + } + + let sym_buf_sz = mem::size_of::() * (syms.len() + 1); + let buf_size = sym_buf_sz + str_buf_sz; + let raw_buf_with_sz = + unsafe { alloc(Layout::from_size_align(buf_size + mem::size_of::(), 8).unwrap()) }; + + unsafe { *(raw_buf_with_sz as *mut u64) = buf_size as u64 }; + + let raw_buf = unsafe { raw_buf_with_sz.add(mem::size_of::()) }; + let mut sym_ptr = raw_buf as *mut blazesym_sym_info; + let mut str_ptr = unsafe { raw_buf.add(sym_buf_sz) } as *mut u8; + + for sym in syms { + let SymbolInfo { + name, + address, + size, + sym_type, + file_offset, + obj_file_name, + } = sym; + let name_ptr = str_ptr as *const u8; + unsafe { ptr::copy_nonoverlapping(name.as_ptr(), str_ptr, name.len()) }; + str_ptr = unsafe { str_ptr.add(name.len()) }; + unsafe { *str_ptr = 0 }; + str_ptr = unsafe { str_ptr.add(1) }; + let obj_file_name = if let Some(fname) = obj_file_name.as_ref() { + let fname = AsRef::::as_ref(fname).as_bytes(); + let obj_fname_ptr = str_ptr; + unsafe { ptr::copy_nonoverlapping(fname.as_ptr(), str_ptr, fname.len()) }; + str_ptr = unsafe { str_ptr.add(fname.len()) }; + unsafe { *str_ptr = 0 }; + str_ptr = unsafe { str_ptr.add(1) }; + obj_fname_ptr + } else { + ptr::null() + }; + + unsafe { + (*sym_ptr) = blazesym_sym_info { + name: name_ptr, + address, + size, + sym_type: match sym_type { + SymbolType::Function => blazesym_sym_type::SYM_T_FUNC, + SymbolType::Variable => blazesym_sym_type::SYM_T_VAR, + _ => blazesym_sym_type::SYM_T_UNKNOWN, + }, + file_offset, + obj_file_name, + } + }; + sym_ptr = unsafe { sym_ptr.add(1) }; + } + unsafe { + (*sym_ptr) = blazesym_sym_info { + name: ptr::null(), + address: 0, + size: 0, + sym_type: blazesym_sym_type::SYM_T_UNKNOWN, + file_offset: 0, + obj_file_name: ptr::null(), + } + }; + + raw_buf as *const blazesym_sym_info +} + +/// The types of symbols. +/// +/// This type is used to choice what type of symbols you like to find +/// and indicate the types of symbols found. +#[repr(C)] +#[allow(non_camel_case_types, unused)] +#[derive(Copy, Clone)] +pub enum blazesym_sym_type { + /// Invalid type + SYM_T_INVALID, + /// You want to find a symbol of any type. + SYM_T_UNKNOWN, + /// The returned symbol is a function, or you want to find a function. + SYM_T_FUNC, + /// The returned symbol is a variable, or you want to find a variable. + SYM_T_VAR, +} + +/// Feature names of looking up addresses of symbols. +#[repr(C)] +#[allow(non_camel_case_types, unused)] +pub enum blazesym_faf_type { + /// Invalid type + FAF_T_INVALID, + /// Return the offset in the file. (enable) + FAF_T_OFFSET_IN_FILE, + /// Return the file name of the shared object. (enable) + FAF_T_OBJ_FILE_NAME, + /// Return symbols having the given type. (sym_type) + FAF_T_SYMBOL_TYPE, +} + +/// The parameter parts of `blazesym_faddr_feature`. +#[repr(C)] +pub union blazesym_faf_param { + enable: bool, + sym_type: blazesym_sym_type, +} + +/// Switches and settings of features of looking up addresses of +/// symbols. +/// +/// See [`FindAddrFeature`] for details. 
+#[repr(C)] +pub struct blazesym_faddr_feature { + ftype: blazesym_faf_type, + param: blazesym_faf_param, +} + +unsafe fn convert_find_addr_features( + features: *const blazesym_faddr_feature, + num_features: usize, +) -> Vec { + let mut feature = features; + let mut features_ret = vec![]; + for _ in 0..num_features { + match unsafe { &(*feature).ftype } { + blazesym_faf_type::FAF_T_SYMBOL_TYPE => { + features_ret.push(match unsafe { (*feature).param.sym_type } { + blazesym_sym_type::SYM_T_UNKNOWN => { + FindAddrFeature::SymbolType(SymbolType::Unknown) + } + blazesym_sym_type::SYM_T_FUNC => { + FindAddrFeature::SymbolType(SymbolType::Function) + } + blazesym_sym_type::SYM_T_VAR => { + FindAddrFeature::SymbolType(SymbolType::Variable) + } + _ => { + panic!("Invalid symbol type"); + } + }); + } + blazesym_faf_type::FAF_T_OFFSET_IN_FILE => { + features_ret.push(FindAddrFeature::OffsetInFile(unsafe { + (*feature).param.enable + })); + } + blazesym_faf_type::FAF_T_OBJ_FILE_NAME => { + features_ret.push(FindAddrFeature::ObjFileName(unsafe { + (*feature).param.enable + })); + } + _ => { + panic!("Unknown find_address feature type"); + } + } + feature = unsafe { feature.add(1) }; + } + + features_ret +} + +/// Find the addresses of symbols matching a pattern. +/// +/// Return an array of `blazesym_sym_info` ending with an item having a null address. +/// input names. The caller should free the returned array by calling +/// [`blazesym_syms_free()`]. +/// +/// It works the same as [`blazesym_find_address_regex()`] with +/// additional controls on features. +/// +/// # Safety +/// +/// The returned pointer should be free by [`blazesym_syms_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_find_address_regex_opt( + symbolizer: *mut blazesym, + sym_srcs: *const sym_src_cfg, + sym_srcs_len: u32, + pattern: *const c_char, + features: *const blazesym_faddr_feature, + num_features: usize, +) -> *const blazesym_sym_info { + let sym_srcs_rs = + if let Some(sym_srcs_rs) = unsafe { symbolsrccfg_to_rust(sym_srcs, sym_srcs_len) } { + sym_srcs_rs + } else { + #[cfg(debug_assertions)] + eprintln!("Fail to transform configurations of symbolizer from C to Rust"); + return ptr::null_mut(); + }; + + let symbolizer = unsafe { &*(*symbolizer).symbolizer }; + + let pattern = unsafe { CStr::from_ptr(pattern) }; + let features = unsafe { convert_find_addr_features(features, num_features) }; + let syms = + { symbolizer.find_address_regex_opt(&sym_srcs_rs, pattern.to_str().unwrap(), features) }; + + if syms.is_none() { + return ptr::null_mut(); + } + + unsafe { convert_syms_to_c(syms.unwrap()) } +} + +/// Find the addresses of symbols matching a pattern. +/// +/// Return an array of `blazesym_sym_info` ending with an item having a null address. +/// input names. The caller should free the returned array by calling +/// [`blazesym_syms_free()`]. +/// +/// # Safety +/// +/// The returned pointer should be free by [`blazesym_syms_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_find_address_regex( + symbolizer: *mut blazesym, + sym_srcs: *const sym_src_cfg, + sym_srcs_len: u32, + pattern: *const c_char, +) -> *const blazesym_sym_info { + unsafe { + blazesym_find_address_regex_opt(symbolizer, sym_srcs, sym_srcs_len, pattern, ptr::null(), 0) + } +} + +/// Free an array returned by blazesym_find_addr_regex() or +/// blazesym_find_addr_regex_opt(). +/// +/// # Safety +/// +/// The `syms` pointer should have been allocated by one of the +/// `blazesym_find_address*` variants. 
+#[no_mangle] +pub unsafe extern "C" fn blazesym_syms_free(syms: *const blazesym_sym_info) { + if syms.is_null() { + #[cfg(debug_assertions)] + eprintln!("blazesym_sym_info_free(null)"); + return; + } + + let raw_buf_with_sz = unsafe { (syms as *mut u8).offset(-(mem::size_of::() as isize)) }; + let sz = unsafe { *(raw_buf_with_sz as *mut u64) } as usize + mem::size_of::(); + unsafe { dealloc(raw_buf_with_sz, Layout::from_size_align(sz, 8).unwrap()) }; +} + +/// Find the addresses of a list of symbols. +/// +/// Return an array of `*const u64` with the same size as the +/// input names. The caller should free the returned array by calling +/// [`blazesym_syms_list_free()`]. +/// +/// Every name in the input name list may have more than one address. +/// The respective entry in the returned array is an array containing +/// all addresses and ended with a null (0x0). +/// +/// # Safety +/// +/// The returned pointer should be free by [`blazesym_syms_list_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_find_addresses_opt( + symbolizer: *mut blazesym, + sym_srcs: *const sym_src_cfg, + sym_srcs_len: u32, + names: *const *const c_char, + name_cnt: usize, + features: *const blazesym_faddr_feature, + num_features: usize, +) -> *const *const blazesym_sym_info { + let sym_srcs_rs = + if let Some(sym_srcs_rs) = unsafe { symbolsrccfg_to_rust(sym_srcs, sym_srcs_len) } { + sym_srcs_rs + } else { + #[cfg(debug_assertions)] + eprintln!("Fail to transform configurations of symbolizer from C to Rust"); + return ptr::null_mut(); + }; + + let symbolizer = unsafe { &*(*symbolizer).symbolizer }; + + let mut names_cstr = vec![]; + for i in 0..name_cnt { + let name_c = unsafe { *names.add(i) }; + let name_r = unsafe { CStr::from_ptr(name_c) }; + names_cstr.push(name_r); + } + let features = unsafe { convert_find_addr_features(features, num_features) }; + let syms = { + let mut names_r = vec![]; + for name in names_cstr.iter().take(name_cnt) { + names_r.push(name.to_str().unwrap()); + } + symbolizer.find_addresses_opt(&sym_srcs_rs, &names_r, features) + }; + + unsafe { convert_syms_list_to_c(syms) } +} + +/// Find addresses of a symbol name. +/// +/// A symbol may have multiple addressses. +/// +/// # Safety +/// +/// The returned data should be free by [`blazesym_syms_list_free()`]. +/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_find_addresses( + symbolizer: *mut blazesym, + sym_srcs: *const sym_src_cfg, + sym_srcs_len: u32, + names: *const *const c_char, + name_cnt: usize, +) -> *const *const blazesym_sym_info { + unsafe { + blazesym_find_addresses_opt( + symbolizer, + sym_srcs, + sym_srcs_len, + names, + name_cnt, + ptr::null(), + 0, + ) + } +} + +/// Free an array returned by blazesym_find_addresses. +/// +/// # Safety +/// +/// The pointer must be returned by [`blazesym_find_addresses()`]. 
+/// +#[no_mangle] +pub unsafe extern "C" fn blazesym_syms_list_free(syms_list: *const *const blazesym_sym_info) { + if syms_list.is_null() { + #[cfg(debug_assertions)] + eprintln!("blazesym_syms_list_free(null)"); + return; + } + + let raw_buf_with_sz = + unsafe { (syms_list as *mut u8).offset(-(mem::size_of::() as isize)) }; + let sz = unsafe { *(raw_buf_with_sz as *mut u64) } as usize + mem::size_of::(); + unsafe { dealloc(raw_buf_with_sz, Layout::from_size_align(sz, 8).unwrap()) }; +} diff --git a/third_party/blazesym/src/dwarf.rs b/third_party/blazesym/src/dwarf.rs new file mode 100644 index 0000000..517da75 --- /dev/null +++ b/third_party/blazesym/src/dwarf.rs @@ -0,0 +1,1489 @@ +use super::elf::ElfParser; +use super::util::{ + decode_leb128, decode_leb128_s, decode_udword, decode_uhalf, decode_uword, search_address_key, +}; +use super::{FindAddrOpts, SymbolInfo, SymbolType}; +use crossbeam_channel::unbounded; + +use std::cell::RefCell; +use std::io::{Error, ErrorKind}; +use std::iter::Iterator; +use std::mem; +#[cfg(test)] +use std::path::Path; +use std::rc::Rc; + +#[cfg(test)] +use std::env; + +use std::clone::Clone; +use std::ffi::CStr; +use std::sync::mpsc; +use std::thread; + +use regex::Regex; + +#[allow(non_upper_case_globals, unused)] +mod constants; +#[allow(non_upper_case_globals)] +mod debug_info; + +#[repr(C, packed)] +struct DebugLinePrologueV2 { + total_length: u32, + version: u16, + prologue_length: u32, + minimum_instruction_length: u8, + default_is_stmt: u8, + line_base: i8, + line_range: u8, + opcode_base: u8, +} + +/// DebugLinePrologue is actually a V4. +/// +/// DebugLinePrologueV2 will be converted to this type. +#[repr(C, packed)] +struct DebugLinePrologue { + total_length: u32, + version: u16, + prologue_length: u32, + minimum_instruction_length: u8, + maximum_ops_per_instruction: u8, + default_is_stmt: u8, + line_base: i8, + line_range: u8, + opcode_base: u8, +} + +/// The file information of a file for a CU. +struct DebugLineFileInfo { + name: String, + dir_idx: u32, // Index to include_directories of DebugLineCU. + _mod_tm: u64, + _size: usize, +} + +/// Represent a Compile Unit (CU) in a .debug_line section. +struct DebugLineCU { + prologue: DebugLinePrologue, + _standard_opcode_lengths: Vec, + include_directories: Vec, + files: Vec, + matrix: Vec, +} + +impl DebugLineCU { + fn find_line(&self, address: u64) -> Option<(&str, &str, usize)> { + let idx = search_address_key(&self.matrix, address, &|x: &DebugLineStates| -> u64 { + x.address + })?; + + let states = &self.matrix[idx]; + if states.end_sequence { + // This is the first byte after the last instruction + return None; + } + + self.stringify_row(idx) + } + + fn stringify_row(&self, idx: usize) -> Option<(&str, &str, usize)> { + let states = &self.matrix[idx]; + let (dir, file) = { + if states.file > 0 { + let file = &self.files[states.file - 1]; + let dir = { + if file.dir_idx == 0 { + "" + } else { + self.include_directories[file.dir_idx as usize - 1].as_str() + } + }; + (dir, file.name.as_str()) + } else { + ("", "") + } + }; + + Some((dir, file, states.line)) + } +} + +/// Parse the list of directory paths for a CU. 
+fn parse_debug_line_dirs(data_buf: &[u8]) -> Result<(Vec, usize), Error> { + let mut strs = Vec::::new(); + let mut pos = 0; + + while pos < data_buf.len() { + if data_buf[pos] == 0 { + return Ok((strs, pos + 1)); + } + + // Find NUL byte + let mut end = pos; + while end < data_buf.len() && data_buf[end] != 0 { + end += 1; + } + if end < data_buf.len() { + let mut str_vec = Vec::::with_capacity(end - pos); + str_vec.extend_from_slice(&data_buf[pos..end]); + + let str_r = String::from_utf8(str_vec) + .map_err(|_| Error::new(ErrorKind::InvalidData, "Invalid UTF-8 string"))?; + strs.push(str_r); + end += 1; + } + pos = end; + } + + Err(Error::new( + ErrorKind::InvalidData, + "Did not find NULL terminated string", + )) +} + +/// Parse the list of file information for a CU. +fn parse_debug_line_files(data_buf: &[u8]) -> Result<(Vec, usize), Error> { + let mut strs = Vec::::new(); + let mut pos = 0; + + while pos < data_buf.len() { + if data_buf[pos] == 0 { + return Ok((strs, pos + 1)); + } + + // Find NULL byte + let mut end = pos; + while end < data_buf.len() && data_buf[end] != 0 { + end += 1; + } + if end < data_buf.len() { + // Null terminated file name string + let mut str_vec = Vec::::with_capacity(end - pos); + str_vec.extend_from_slice(&data_buf[pos..end]); + + let str_r = String::from_utf8(str_vec); + if str_r.is_err() { + return Err(Error::new(ErrorKind::InvalidData, "Invalid UTF-8 string")); + } + end += 1; + + // LEB128 directory index + let dir_idx_r = decode_leb128(&data_buf[end..]); + if dir_idx_r.is_none() { + return Err(Error::new( + ErrorKind::InvalidData, + "Invalid directory index", + )); + } + let (dir_idx, bytes) = dir_idx_r.unwrap(); + end += bytes as usize; + + // LEB128 last modified time + let mod_tm_r = decode_leb128(&data_buf[end..]); + if mod_tm_r.is_none() { + return Err(Error::new( + ErrorKind::InvalidData, + "Invalid last modified time", + )); + } + let (mod_tm, bytes) = mod_tm_r.unwrap(); + end += bytes as usize; + + // LEB128 file size + let flen_r = decode_leb128(&data_buf[end..]); + if flen_r.is_none() { + return Err(Error::new(ErrorKind::InvalidData, "Invalid file size")); + } + let (flen, bytes) = flen_r.unwrap(); + end += bytes as usize; + + strs.push(DebugLineFileInfo { + name: str_r.unwrap(), + dir_idx: dir_idx as u32, + _mod_tm: mod_tm, + _size: flen as usize, + }); + } + pos = end; + } + + Err(Error::new( + ErrorKind::InvalidData, + "Do not found null string", + )) +} + +fn parse_debug_line_cu( + parser: &ElfParser, + addresses: &[u64], + reused_buf: &mut Vec, +) -> Result { + let mut prologue_sz: usize = mem::size_of::(); + let prologue_v4_sz: usize = mem::size_of::(); + let buf = reused_buf; + + buf.resize(prologue_sz, 0); + unsafe { parser.read_raw(buf.as_mut_slice()) }?; + let prologue_raw = buf.as_mut_ptr() as *mut DebugLinePrologueV2; + // SAFETY: `prologue_raw` is valid for reads and `DebugLinePrologueV2` is + // comprised only of objects that are valid for any bit pattern. + let v2 = unsafe { prologue_raw.read_unaligned() }; + + if v2.version != 0x2 && v2.version != 0x4 { + let version = v2.version; + return Err(Error::new( + ErrorKind::Unsupported, + format!("Support DWARF version 2 & 4 (version: {version})"), + )); + } + + let prologue = if v2.version == 0x4 { + // Upgrade to V4. + // V4 has more fields to read. 
+ buf.resize(prologue_v4_sz, 0); + unsafe { parser.read_raw(&mut buf.as_mut_slice()[prologue_sz..]) }?; + let prologue_raw = buf.as_mut_ptr() as *mut DebugLinePrologue; + // SAFETY: `prologue_raw` is valid for reads and `DebugLinePrologue` is + // comprised only of objects that are valid for any bit pattern. + let prologue_v4 = unsafe { prologue_raw.read_unaligned() }; + prologue_sz = prologue_v4_sz; + prologue_v4 + } else { + // Convert V2 to V4 + let prologue_v4 = DebugLinePrologue { + total_length: v2.total_length, + version: v2.version, + prologue_length: v2.prologue_length, + minimum_instruction_length: v2.minimum_instruction_length, + maximum_ops_per_instruction: 0, + default_is_stmt: v2.default_is_stmt, + line_base: v2.line_base, + line_range: v2.line_range, + opcode_base: v2.opcode_base, + }; + prologue_v4 + }; + + let to_read = prologue.total_length as usize + 4 - prologue_sz; + let data_buf = buf; + if to_read <= data_buf.capacity() { + // Gain better performance by skipping initialization. + unsafe { data_buf.set_len(to_read) }; + } else { + data_buf.resize(to_read, 0); + } + unsafe { parser.read_raw(data_buf.as_mut_slice())? }; + + let mut pos = 0; + + let std_op_num = (prologue.opcode_base - 1) as usize; + let mut std_op_lengths = Vec::::with_capacity(std_op_num); + std_op_lengths.extend_from_slice(&data_buf[pos..pos + std_op_num]); + pos += std_op_num; + + let (inc_dirs, bytes) = parse_debug_line_dirs(&data_buf[pos..])?; + pos += bytes; + + let (files, bytes) = parse_debug_line_files(&data_buf[pos..])?; + pos += bytes; + + let matrix = run_debug_line_stmts(&data_buf[pos..], &prologue, addresses)?; + + #[cfg(debug_assertions)] + for i in 1..matrix.len() { + if matrix[i].address < matrix[i - 1].address && !matrix[i - 1].end_sequence { + panic!( + "Not in ascending order @ [{}] {:?} [{}] {:?}", + i - 1, + matrix[i - 1], + i, + matrix[i] + ); + } + } + + Ok(DebugLineCU { + prologue, + _standard_opcode_lengths: std_op_lengths, + include_directories: inc_dirs, + files, + matrix, + }) +} + +#[derive(Clone, Debug)] +struct DebugLineStates { + address: u64, + file: usize, + line: usize, + column: usize, + discriminator: u64, + is_stmt: bool, + basic_block: bool, + end_sequence: bool, + prologue_end: bool, + should_reset: bool, +} + +impl DebugLineStates { + fn new(prologue: &DebugLinePrologue) -> DebugLineStates { + DebugLineStates { + address: 0, + file: 1, + line: 1, + column: 0, + discriminator: 0, + is_stmt: prologue.default_is_stmt != 0, + basic_block: false, + end_sequence: false, + prologue_end: false, + should_reset: false, + } + } + + fn reset(&mut self, prologue: &DebugLinePrologue) { + self.address = 0; + self.file = 1; + self.line = 1; + self.column = 0; + self.discriminator = 0; + self.is_stmt = prologue.default_is_stmt != 0; + self.basic_block = false; + self.end_sequence = false; + self.prologue_end = false; + self.should_reset = false; + } +} + +/// Return `Ok((insn_bytes, emit))` if success. `insn_bytes1 is the +/// size of the instruction at the position given by ip. `emit` is +/// true if this instruction emit a new row to describe line +/// information of an address. Not every instructions emit rows. +/// Some instructions create only intermediate states for the next row +/// going to emit. 
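// Worked example of the "special opcode" arithmetic handled in the default
// match arm below, using the prologue values from test_run_debug_line_stmts_2
// (opcode_base = 13, line_base = -5, line_range = 14,
// minimum_instruction_length = 1). For special opcode 0xbc (188):
//
//   adjusted     = 188 - 13        = 175
//   address step = 175 / 14        = 12   -> 0x18c70 + 12 = 0x18c7c
//   line step    = -5 + (175 % 14) = 2    -> 789 + 2 = 791
//
// which matches the second row that test expects (address 0x18c7c, line 791).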
+fn run_debug_line_stmt( + stmts: &[u8], + prologue: &DebugLinePrologue, + ip: usize, + states: &mut DebugLineStates, +) -> Result<(usize, bool), Error> { + // Standard opcodes + const DW_LNS_EXT: u8 = 0; + const DW_LNS_COPY: u8 = 1; + const DW_LNS_ADVANCE_PC: u8 = 2; + const DW_LNS_ADVANCE_LINE: u8 = 3; + const DW_LNS_SET_FILE: u8 = 4; + const DW_LNS_SET_COLUMN: u8 = 5; + const DW_LNS_NEGATE_STMT: u8 = 6; + const DW_LNS_SET_BASIC_BLOCK: u8 = 7; + const DW_LNS_CONST_ADD_PC: u8 = 8; + const DW_LNS_FIXED_ADVANCE_PC: u8 = 9; + const DW_LNS_SET_PROLOGUE_END: u8 = 10; + + // Extended opcodes + const DW_LINE_END_SEQUENCE: u8 = 1; + const DW_LINE_SET_ADDRESS: u8 = 2; + const DW_LINE_DEFINE_FILE: u8 = 3; + const DW_LINE_SET_DISCRIMINATOR: u8 = 4; + + let opcode_base = prologue.opcode_base; + let opcode = stmts[ip]; + + match opcode { + DW_LNS_EXT => { + // Extended opcodes + if let Some((insn_size, bytes)) = decode_leb128(&stmts[(ip + 1)..]) { + if insn_size < 1 { + return Err(Error::new( + ErrorKind::InvalidData, + format!("invalid extended opcode (ip=0x{ip:x}, insn_size=0x{insn_size:x}"), + )); + } + let ext_opcode = stmts[ip + 1 + bytes as usize]; + match ext_opcode { + DW_LINE_END_SEQUENCE => { + states.end_sequence = true; + states.should_reset = true; + Ok((1 + bytes as usize + insn_size as usize, true)) + } + DW_LINE_SET_ADDRESS => match insn_size - 1 { + 4 => { + let address = decode_uword(&stmts[(ip + 1 + bytes as usize + 1)..]); + states.address = address as u64; + Ok((1 + bytes as usize + insn_size as usize, false)) + } + 8 => { + let address = decode_udword(&stmts[(ip + 1 + bytes as usize + 1)..]); + states.address = address; + Ok((1 + bytes as usize + insn_size as usize, false)) + } + _ => Err(Error::new( + ErrorKind::Unsupported, + format!("unsupported address size ({insn_size})"), + )), + }, + DW_LINE_DEFINE_FILE => Err(Error::new( + ErrorKind::Unsupported, + "DW_LINE_define_file is not supported yet", + )), + DW_LINE_SET_DISCRIMINATOR => { + if let Some((discriminator, discr_bytes)) = + decode_leb128(&stmts[(ip + 1 + bytes as usize + 1)..]) + { + if discr_bytes as u64 + 1 == insn_size { + states.discriminator = discriminator; + Ok((1 + bytes as usize + insn_size as usize, false)) + } else { + Err(Error::new( + ErrorKind::InvalidData, + "unmatched instruction size for DW_LINE_set_discriminator", + )) + } + } else { + Err(Error::new( + ErrorKind::InvalidData, + "discriminator is broken", + )) + } + } + _ => Err(Error::new( + ErrorKind::Unsupported, + format!( + "invalid extended opcode (ip=0x{ip:x}, ext_opcode=0x{ext_opcode:x})" + ), + )), + } + } else { + Err(Error::new( + ErrorKind::InvalidData, + format!("invalid extended opcode (ip=0x{ip:x})"), + )) + } + } + DW_LNS_COPY => Ok((1, true)), + DW_LNS_ADVANCE_PC => { + if let Some((adv, bytes)) = decode_leb128(&stmts[(ip + 1)..]) { + states.address += adv * prologue.minimum_instruction_length as u64; + Ok((1 + bytes as usize, false)) + } else { + Err(Error::new( + ErrorKind::InvalidData, + "the operand of advance_pc is broken", + )) + } + } + DW_LNS_ADVANCE_LINE => { + if let Some((adv, bytes)) = decode_leb128_s(&stmts[(ip + 1)..]) { + states.line = (states.line as i64 + adv) as usize; + Ok((1 + bytes as usize, false)) + } else { + Err(Error::new( + ErrorKind::InvalidData, + "the operand of advance_line is broken", + )) + } + } + DW_LNS_SET_FILE => { + if let Some((file_idx, bytes)) = decode_leb128(&stmts[(ip + 1)..]) { + states.file = file_idx as usize; + Ok((1 + bytes as usize, false)) + } else { + Err(Error::new( + 
ErrorKind::InvalidData, + "the operand of set_file is broken", + )) + } + } + DW_LNS_SET_COLUMN => { + if let Some((column, bytes)) = decode_leb128(&stmts[(ip + 1)..]) { + states.column = column as usize; + Ok((1 + bytes as usize, false)) + } else { + Err(Error::new( + ErrorKind::InvalidData, + "the operand of set_column is broken", + )) + } + } + DW_LNS_NEGATE_STMT => { + states.is_stmt = !states.is_stmt; + Ok((1, false)) + } + DW_LNS_SET_BASIC_BLOCK => { + states.basic_block = true; + Ok((1, false)) + } + DW_LNS_CONST_ADD_PC => { + let addr_adv = (255 - opcode_base) / prologue.line_range; + states.address += addr_adv as u64 * prologue.minimum_instruction_length as u64; + Ok((1, false)) + } + DW_LNS_FIXED_ADVANCE_PC => { + if (ip + 3) < stmts.len() { + let addr_adv = decode_uhalf(&stmts[(ip + 1)..]); + states.address += addr_adv as u64 * prologue.minimum_instruction_length as u64; + Ok((1, false)) + } else { + Err(Error::new( + ErrorKind::InvalidData, + "the operand of fixed_advance_pc is broken", + )) + } + } + DW_LNS_SET_PROLOGUE_END => { + states.prologue_end = true; + Ok((1, false)) + } + _ => { + // Special opcodes + let desired_line_incr = (opcode - opcode_base) % prologue.line_range; + let addr_adv = (opcode - opcode_base) / prologue.line_range; + states.address += addr_adv as u64 * prologue.minimum_instruction_length as u64; + states.line = (states.line as i64 + + (desired_line_incr as i16 + prologue.line_base as i16) as i64 + * prologue.minimum_instruction_length as i64) + as usize; + Ok((1, true)) + } + } +} + +fn run_debug_line_stmts( + stmts: &[u8], + prologue: &DebugLinePrologue, + addresses: &[u64], +) -> Result, Error> { + let mut ip = 0; + let mut matrix = Vec::::new(); + let mut should_sort = false; + let mut states_cur = DebugLineStates::new(prologue); + let mut states_last = states_cur.clone(); + let mut last_ip_pushed = false; + let mut force_no_emit = false; + + while ip < stmts.len() { + match run_debug_line_stmt(stmts, prologue, ip, &mut states_cur) { + Ok((sz, emit)) => { + ip += sz; + if emit { + if states_cur.address == 0 { + // This is a special case. Somehow, rust + // compiler generate debug_line for some + // builtin code starting from 0. And, it + // causes incorrect behavior. + force_no_emit = true; + } + if !force_no_emit { + if !addresses.is_empty() { + let mut pushed = false; + for addr in addresses { + if *addr == states_cur.address + || (states_last.address != 0 + && !states_last.end_sequence + && *addr < states_cur.address + && *addr > states_last.address) + { + if !last_ip_pushed && *addr != states_cur.address { + // The address falls between current and last emitted row. + matrix.push(states_last.clone()); + } + matrix.push(states_cur.clone()); + pushed = true; + break; + } + } + last_ip_pushed = pushed; + states_last = states_cur.clone(); + } else { + matrix.push(states_cur.clone()); + } + if states_last.address > states_cur.address { + should_sort = true; + } + } + } + if states_cur.should_reset { + states_cur.reset(prologue); + force_no_emit = false; + } + } + Err(e) => { + return Err(e); + } + } + } + + if should_sort { + matrix.sort_by_key(|x| x.address); + } + + Ok(matrix) +} + +/// If addresses is empty, it returns a full version of debug_line matrix. +/// If addresses is not empty, return only data needed to resolve given addresses . 
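// Hedged usage sketch of the two modes described above, assuming an already
// opened `ElfParser` named `parser` (the addresses shown are illustrative
// values taken from the tests at the bottom of this file):
//
//   // Full matrix: parse every CU found in .debug_line.
//   let all_cus = parse_debug_line_elf_parser(&parser, &[])?;
//   // Targeted: keep only the CUs/rows needed to resolve two addresses.
//   let some_cus = parse_debug_line_elf_parser(&parser, &[0x18b30, 0x18c7c])?;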
+fn parse_debug_line_elf_parser( + parser: &ElfParser, + addresses: &[u64], +) -> Result, Error> { + let debug_line_idx = parser.find_section(".debug_line")?; + let debug_line_sz = parser.get_section_size(debug_line_idx)?; + let mut remain_sz = debug_line_sz; + let prologue_size: usize = mem::size_of::(); + let mut not_found = Vec::from(addresses); + + parser.section_seek(debug_line_idx)?; + + let mut all_cus = Vec::::new(); + let mut buf = Vec::::new(); + while remain_sz > prologue_size { + let debug_line_cu = parse_debug_line_cu(parser, ¬_found, &mut buf)?; + let prologue = &debug_line_cu.prologue; + remain_sz -= prologue.total_length as usize + 4; + + if debug_line_cu.matrix.is_empty() { + continue; + } + + if !addresses.is_empty() { + let mut last_row = &debug_line_cu.matrix[0]; + for row in debug_line_cu.matrix.as_slice() { + let mut i = 0; + // Remove addresses found in this CU from not_found. + while i < not_found.len() { + let addr = addresses[i]; + if addr == row.address || (addr < row.address && addr > last_row.address) { + not_found.remove(i); + } else { + i += 1; + } + } + last_row = row; + } + + all_cus.push(debug_line_cu); + + if not_found.is_empty() { + return Ok(all_cus); + } + } else { + all_cus.push(debug_line_cu); + } + } + + if remain_sz != 0 { + return Err(Error::new( + ErrorKind::InvalidData, + "encountered remaining garbage data at the end", + )); + } + + Ok(all_cus) +} + +/// DwarfResolver provides abilities to query DWARF information of binaries. +pub struct DwarfResolver { + parser: Rc, + debug_line_cus: Vec, + addr_to_dlcu: Vec<(u64, u32)>, + enable_debug_info_syms: bool, + debug_info_syms: RefCell>>>, +} + +impl DwarfResolver { + pub fn get_parser(&self) -> &ElfParser { + &self.parser + } + + pub fn from_parser_for_addresses( + parser: Rc, + addresses: &[u64], + line_number_info: bool, + debug_info_symbols: bool, + ) -> Result { + let debug_line_cus: Vec = if line_number_info { + parse_debug_line_elf_parser(&parser, addresses).unwrap_or_default() + } else { + vec![] + }; + + let mut addr_to_dlcu = Vec::with_capacity(debug_line_cus.len()); + for (idx, dlcu) in debug_line_cus.iter().enumerate() { + if dlcu.matrix.is_empty() { + continue; + } + let first_addr = dlcu.matrix[0].address; + addr_to_dlcu.push((first_addr, idx as u32)); + } + addr_to_dlcu.sort_by_key(|v| v.0); + + Ok(DwarfResolver { + parser, + debug_line_cus, + addr_to_dlcu, + enable_debug_info_syms: debug_info_symbols, + debug_info_syms: RefCell::new(None), + }) + } + + /// Open a binary to load .debug_line only enough for a given list of addresses. + /// + /// When `addresses` is not empty, the returned instance only has + /// data that related to these addresses. For this case, the + /// isntance have the ability that can serve only these addresses. + /// This would be much faster. + /// + /// If `addresses` is empty, the returned instance has all data + /// from the given file. If the instance will be used for long + /// running, you would want to load all data into memory to have + /// the ability of handling all possible addresses. + #[cfg(test)] + fn open_for_addresses( + filename: &Path, + addresses: &[u64], + line_number_info: bool, + debug_info_symbols: bool, + ) -> Result { + let parser = ElfParser::open(filename)?; + Self::from_parser_for_addresses( + Rc::new(parser), + addresses, + line_number_info, + debug_info_symbols, + ) + } + + /// Open a binary to load and parse .debug_line for later uses. 
+ /// + /// `filename` is the name of an ELF binary/or shared object that + /// has .debug_line section. + #[cfg(test)] + pub fn open( + filename: &Path, + debug_line_info: bool, + debug_info_symbols: bool, + ) -> Result { + Self::open_for_addresses(filename, &[], debug_line_info, debug_info_symbols) + } + + fn find_dlcu_index(&self, address: u64) -> Option { + let a2a = &self.addr_to_dlcu; + let a2a_idx = search_address_key(a2a, address, &|x: &(u64, u32)| -> u64 { x.0 })?; + let dlcu_idx = a2a[a2a_idx].1 as usize; + + Some(dlcu_idx) + } + + /// Find line information of an address. + /// + /// `address` is an offset from the head of the loaded binary/or + /// shared object. This function returns a tuple of `(dir_name, file_name, line_no)`. + pub fn find_line_as_ref(&self, address: u64) -> Option<(&str, &str, usize)> { + let idx = self.find_dlcu_index(address)?; + let dlcu = &self.debug_line_cus[idx]; + + dlcu.find_line(address) + } + + /// Find line information of an address. + /// + /// `address` is an offset from the head of the loaded binary/or + /// shared object. This function returns a tuple of `(dir_name, file_name, line_no)`. + /// + /// This function is pretty much the same as `find_line_as_ref()` + /// except returning a copies of `String` instead of `&str`. + #[cfg(test)] + fn find_line(&self, address: u64) -> Option<(String, String, usize)> { + let (dir, file, line_no) = self.find_line_as_ref(address)?; + Some((String::from(dir), String::from(file), line_no)) + } + + /// Extract the symbol information from DWARf if having not did it + /// before. + fn ensure_debug_info_syms(&self) -> Result<(), Error> { + if self.enable_debug_info_syms { + let mut dis_ref = self.debug_info_syms.borrow_mut(); + if dis_ref.is_some() { + return Ok(()); + } + let mut debug_info_syms = debug_info_parse_symbols(&self.parser, None, 1)?; + debug_info_syms.sort_by_key(|v: &DWSymInfo| -> &str { v.name }); + *dis_ref = Some(unsafe { mem::transmute(debug_info_syms) }); + } + Ok(()) + } + + /// Find the address of a symbol from DWARF. + /// + /// # Arguments + /// + /// * `name` - is the symbol name to find. + /// * `opts` - is the context giving additional parameters. + pub fn find_address(&self, name: &str, opts: &FindAddrOpts) -> Result, Error> { + if let SymbolType::Variable = opts.sym_type { + return Err(Error::new(ErrorKind::Unsupported, "Not implemented")); + } + let elf_r = self.parser.find_address(name, opts)?; + if !elf_r.is_empty() { + // Since it is found from symtab, symtab should be + // complete and DWARF shouldn't provide more information. + return Ok(elf_r); + } + + self.ensure_debug_info_syms()?; + let dis_ref = self.debug_info_syms.borrow(); + let debug_info_syms = dis_ref.as_ref().unwrap(); + let mut idx = + match debug_info_syms.binary_search_by_key(&name.to_string(), |v| v.name.to_string()) { + Ok(idx) => idx, + _ => { + return Ok(vec![]); + } + }; + while idx > 0 && debug_info_syms[idx].name.eq(name) { + idx -= 1; + } + if !debug_info_syms[idx].name.eq(name) { + idx += 1; + } + let mut found = vec![]; + while debug_info_syms[idx].name.eq(name) { + let DWSymInfo { + address, + size, + sym_type, + .. + } = debug_info_syms[idx]; + found.push(SymbolInfo { + name: name.to_string(), + address, + size, + sym_type, + ..Default::default() + }); + idx += 1; + } + Ok(found) + } + + /// Find the address of symbols matching a pattern from DWARF. + /// + /// #Arguments + /// + /// * `pattern` - is a regex pattern to match symbols. + /// * `opts` - is the context giving additional parameters. 
+ /// + /// Return a list of symbols including addresses and other information. + pub fn find_address_regex( + &self, + pattern: &str, + opts: &FindAddrOpts, + ) -> Result, Error> { + if let SymbolType::Variable = opts.sym_type { + return Err(Error::new(ErrorKind::Unsupported, "Not implemented")); + } + let r = self.parser.find_address_regex(pattern, opts)?; + if !r.is_empty() { + return Ok(r); + } + + self.ensure_debug_info_syms()?; + + let dis_ref = self.debug_info_syms.borrow(); + if dis_ref.is_none() { + return Ok(vec![]); + } + let debug_info_syms = dis_ref.as_ref().unwrap(); + let mut syms = vec![]; + let re = Regex::new(pattern).unwrap(); + for sym in debug_info_syms { + if re.is_match(sym.name) { + let DWSymInfo { + address, + size, + sym_type, + .. + } = sym; + syms.push(SymbolInfo { + name: sym.name.to_string(), + address: *address, + size: *size, + sym_type: *sym_type, + ..Default::default() + }); + } + } + + Ok(syms) + } + + #[cfg(test)] + fn pick_address_for_test(&self) -> (u64, &str, &str, usize) { + let (addr, idx) = self.addr_to_dlcu[self.addr_to_dlcu.len() / 3]; + let dlcu = &self.debug_line_cus[idx as usize]; + let (dir, file, line) = dlcu.stringify_row(0).unwrap(); + (addr, dir, file, line) + } +} + +/// The symbol information extracted out of DWARF. +#[derive(Clone)] +struct DWSymInfo<'a> { + name: &'a str, + address: u64, + size: u64, + sym_type: SymbolType, // A function or a variable. +} + +fn find_die_sibling(die: &mut debug_info::DIE<'_>) -> Option { + for (name, _form, _opt, value) in die { + if name == constants::DW_AT_sibling { + if let debug_info::AttrValue::Unsigned(off) = value { + return Some(off as usize); + } + return None; + } + } + None +} + +/// Parse a DIE that declares a subprogram. (a function) +/// +/// We already know the given DIE is a declaration of a subprogram. +/// This function trys to extract the address of the subprogram and +/// other information from the DIE. +/// +/// # Arguments +/// +/// * `die` - is a DIE. +/// * `str_data` - is the content of the `.debug_str` section. +/// +/// Return a [`DWSymInfo`] if it finds the address of the subprogram. +fn parse_die_subprogram<'a>( + die: &mut debug_info::DIE<'a>, + str_data: &'a [u8], +) -> Result>, Error> { + let mut addr: Option = None; + let mut name_str: Option<&str> = None; + let mut size = 0; + + for (name, _form, _opt, value) in die { + match name { + constants::DW_AT_linkage_name | constants::DW_AT_name => { + if name_str.is_some() { + continue; + } + name_str = Some(match value { + debug_info::AttrValue::Unsigned(str_off) => unsafe { + CStr::from_ptr(str_data[str_off as usize..].as_ptr() as *const i8) + .to_str() + .map_err(|_e| { + Error::new( + ErrorKind::InvalidData, + "fail to extract the name of a subprogram", + ) + })? 
+ }, + debug_info::AttrValue::String(s) => s, + _ => { + return Err(Error::new( + ErrorKind::InvalidData, + "fail to parse DW_AT_linkage_name {}", + )); + } + }); + } + constants::DW_AT_lo_pc => match value { + debug_info::AttrValue::Unsigned(pc) => { + addr = Some(pc); + } + _ => { + return Err(Error::new( + ErrorKind::InvalidData, + "fail to parse DW_AT_lo_pc", + )); + } + }, + constants::DW_AT_hi_pc => match value { + debug_info::AttrValue::Unsigned(sz) => { + size = sz; + } + _ => { + return Err(Error::new( + ErrorKind::InvalidData, + "fail to parse DW_AT_lo_pc", + )); + } + }, + _ => {} + } + } + + match (addr, name_str) { + (Some(address), Some(name)) => Ok(Some(DWSymInfo { + name, + address, + size, + sym_type: SymbolType::Function, + })), + _ => Ok(None), + } +} + +/// Walk through all DIEs of a compile unit to extract symbols. +/// +/// # Arguments +/// +/// * `dieiter` - is an iterator returned by the iterator that is +/// returned by an [`UnitIter`]. [`UnitIter`] returns +/// an [`UnitHeader`] and an [`DIEIter`]. +/// * `str_data` - is the content of the `.debug_str` section. +/// * `found_syms` - the Vec to append the found symbols. +fn debug_info_parse_symbols_cu<'a>( + mut dieiter: debug_info::DIEIter<'a>, + str_data: &'a [u8], + found_syms: &mut Vec>, +) { + while let Some(mut die) = dieiter.next() { + if die.tag == 0 || die.tag == constants::DW_TAG_namespace { + continue; + } + + assert!(die.abbrev.is_some()); + if die.tag != constants::DW_TAG_subprogram { + if die.abbrev.unwrap().has_children { + if let Some(sibling_off) = find_die_sibling(&mut die) { + dieiter.seek_to_sibling(sibling_off); + continue; + } + // Skip this DIE quickly, or the iterator will + // recalculate the size of the DIE. + die.exhaust().unwrap(); + } + continue; + } + + if let Ok(Some(syminfo)) = parse_die_subprogram(&mut die, str_data) { + found_syms.push(syminfo); + } + } +} + +/// The parse result of the `.debug_info` section. +/// +/// This type is used by the worker threads to pass results to the +/// coordinator after finishing an Unit. `Stop` is used to nofity the +/// coordinator that a matching condition is met. It could be that +/// the given symbol is already found, so that the coordinator should +/// stop producing more tasks. +enum DIParseResult<'a> { + Symbols(Vec>), + Stop, +} + +/// Parse the addresses of symbols from the `.debug_info` section. +/// +/// # Arguments +/// +/// * `parser` - is an ELF parser. +/// * `cond` - is a function to check if we have found the information +/// we need. The function will stop earlier if the +/// condition is met. +/// * `nthreads` - is the number of worker threads to create. 0 or 1 +/// means single thread. +fn debug_info_parse_symbols<'a>( + parser: &'a ElfParser, + cond: Option<&(dyn Fn(&DWSymInfo<'a>) -> bool + Send + Sync)>, + nthreads: usize, +) -> Result>, Error> { + let info_sect_idx = parser.find_section(".debug_info")?; + let info_data = parser.read_section_raw_cache(info_sect_idx)?; + let abbrev_sect_idx = parser.find_section(".debug_abbrev")?; + let abbrev_data = parser.read_section_raw_cache(abbrev_sect_idx)?; + let units = debug_info::UnitIter::new(info_data, abbrev_data); + let str_sect_idx = parser.find_section(".debug_str")?; + let str_data = parser.read_section_raw_cache(str_sect_idx)?; + + let mut syms = Vec::::new(); + + if nthreads > 1 { + thread::scope(|s| { + // Create worker threads to process tasks (Units) in a work + // queue. 
+ let mut handles = vec![]; + let (qsend, qrecv) = unbounded::>(); + let (result_tx, result_rx) = mpsc::channel::(); + + for _ in 0..nthreads { + let result_tx = result_tx.clone(); + let qrecv = qrecv.clone(); + + let handle = s.spawn(move || { + let mut syms: Vec = vec![]; + if let Some(cond) = cond { + while let Ok(dieiterholder) = qrecv.recv() { + let saved_sz = syms.len(); + debug_info_parse_symbols_cu(dieiterholder, str_data, &mut syms); + for sym in &syms[saved_sz..] { + if !cond(sym) { + result_tx.send(DIParseResult::Stop).unwrap(); + } + } + } + } else { + while let Ok(dieiterholder) = qrecv.recv() { + debug_info_parse_symbols_cu(dieiterholder, str_data, &mut syms); + } + } + result_tx.send(DIParseResult::Symbols(syms)).unwrap(); + }); + + handles.push(handle); + } + + for (uhdr, dieiter) in units { + if let debug_info::UnitHeader::CompileV4(_) = uhdr { + qsend.send(dieiter).unwrap(); + } + + if let Ok(result) = result_rx.try_recv() { + if let DIParseResult::Stop = result { + break; + } else { + return Err(Error::new( + ErrorKind::UnexpectedEof, + "Receive an unexpected result", + )); + } + } + } + + drop(qsend); + + drop(result_tx); + while let Ok(result) = result_rx.recv() { + if let DIParseResult::Symbols(mut thread_syms) = result { + syms.append(&mut thread_syms); + } + } + for handle in handles { + handle.join().unwrap(); + } + Ok(()) + })?; + } else if let Some(cond) = cond { + 'outer: for (uhdr, dieiter) in units { + if let debug_info::UnitHeader::CompileV4(_) = uhdr { + let saved_sz = syms.len(); + debug_info_parse_symbols_cu(dieiter, str_data, &mut syms); + for sym in &syms[saved_sz..] { + if !cond(sym) { + break 'outer; + } + } + } + } + } else { + for (uhdr, dieiter) in units { + if let debug_info::UnitHeader::CompileV4(_) = uhdr { + debug_info_parse_symbols_cu(dieiter, str_data, &mut syms); + } + } + } + Ok(syms) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(feature = "nightly")] + use test::Bencher; + + + fn parse_debug_line_elf(filename: &Path) -> Result, Error> { + let parser = ElfParser::open(filename)?; + parse_debug_line_elf_parser(&parser, &[]) + } + + #[allow(unused)] + struct ArangesCU { + debug_line_off: usize, + aranges: Vec<(u64, u64)>, + } + + fn parse_aranges_cu(data: &[u8]) -> Result<(ArangesCU, usize), Error> { + if data.len() < 12 { + return Err(Error::new( + ErrorKind::InvalidData, + "invalid arange header (too small)", + )); + } + let len = decode_uword(data); + let version = decode_uhalf(&data[4..]); + let offset = decode_uword(&data[6..]); + let addr_sz = data[10]; + let _seg_sz = data[11]; + + if data.len() < (len + 4) as usize { + return Err(Error::new( + ErrorKind::InvalidData, + "data is broken (too small)", + )); + } + + // Size of the header + let mut pos = 12; + + // Padding to align with the size of addresses on the target system. 
+ pos += addr_sz as usize - 1; + pos -= pos % addr_sz as usize; + + let mut aranges = Vec::<(u64, u64)>::new(); + match addr_sz { + 4 => { + while pos < (len + 4 - 8) as usize { + let start = decode_uword(&data[pos..]); + pos += 4; + let size = decode_uword(&data[pos..]); + pos += 4; + + if start == 0 && size == 0 { + break; + } + aranges.push((start as u64, size as u64)); + } + } + 8 => { + while pos < (len + 4 - 16) as usize { + let start = decode_udword(&data[pos..]); + pos += 8; + let size = decode_udword(&data[pos..]); + pos += 8; + + if start == 0 && size == 0 { + break; + } + aranges.push((start, size)); + } + } + _ => { + return Err(Error::new( + ErrorKind::Unsupported, + format!("unsupported address size {addr_sz} ver {version} off 0x{offset:x}"), + )); + } + } + + Ok(( + ArangesCU { + debug_line_off: offset as usize, + aranges, + }, + len as usize + 4, + )) + } + + fn parse_aranges_elf_parser(parser: &ElfParser) -> Result, Error> { + let debug_aranges_idx = parser.find_section(".debug_aranges")?; + + let raw_data = parser.read_section_raw(debug_aranges_idx)?; + + let mut pos = 0; + let mut acus = Vec::::new(); + while pos < raw_data.len() { + let (acu, bytes) = parse_aranges_cu(&raw_data[pos..])?; + acus.push(acu); + pos += bytes; + } + + Ok(acus) + } + + fn parse_aranges_elf(filename: &Path) -> Result, Error> { + let parser = ElfParser::open(filename)?; + parse_aranges_elf_parser(&parser) + } + + #[test] + fn test_parse_debug_line_elf() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-dwarf-v4.bin"); + + let _ = parse_debug_line_elf(bin_name.as_ref()).unwrap(); + } + + #[test] + fn test_run_debug_line_stmts_1() { + let stmts = [ + 0x00, 0x09, 0x02, 0x30, 0x8b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0xa0, 0x04, + 0x01, 0x05, 0x06, 0x0a, 0x08, 0x30, 0x02, 0x05, 0x00, 0x01, 0x01, + ]; + let prologue = DebugLinePrologue { + total_length: 0, + version: 4, + prologue_length: 0, + minimum_instruction_length: 1, + maximum_ops_per_instruction: 1, + default_is_stmt: 1, + line_base: -5, + line_range: 14, + opcode_base: 13, + }; + + let result = run_debug_line_stmts(&stmts, &prologue, &[]); + if result.is_err() { + let e = result.as_ref().err().unwrap(); + println!("result {e:?}"); + } + assert!(result.is_ok()); + let matrix = result.unwrap(); + assert_eq!(matrix.len(), 3); + assert_eq!(matrix[0].line, 545); + assert_eq!(matrix[0].address, 0x18b30); + assert_eq!(matrix[1].line, 547); + assert_eq!(matrix[1].address, 0x18b43); + assert_eq!(matrix[2].line, 547); + assert_eq!(matrix[2].address, 0x18b48); + } + + #[test] + fn test_run_debug_line_stmts_2() { + // File name Line number Starting address View Stmt + // methods.rs 789 0x18c70 x + // methods.rs 791 0x18c7c x + // methods.rs 791 0x18c81 + // methods.rs 790 0x18c86 x + // methods.rs 0 0x18c88 + // methods.rs 791 0x18c8c x + // methods.rs 0 0x18c95 + // methods.rs 792 0x18c99 x + // methods.rs 792 0x18c9d + // methods.rs 0 0x18ca4 + // methods.rs 791 0x18ca8 x + // methods.rs 792 0x18caf x + // methods.rs 0 0x18cb6 + // methods.rs 792 0x18cba + // methods.rs 0 0x18cc4 + // methods.rs 792 0x18cc8 + // methods.rs 790 0x18cce x + // methods.rs 794 0x18cd0 x + // methods.rs 794 0x18cde x + let stmts = [ + 0x00, 0x09, 0x02, 0x70, 0x8c, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x94, 0x06, + 0x01, 0x05, 0x0d, 0x0a, 0xbc, 0x05, 0x26, 0x06, 0x58, 0x05, 0x09, 0x06, 0x57, 0x06, + 0x03, 0xea, 0x79, 0x2e, 0x05, 0x13, 0x06, 0x03, 0x97, 0x06, 0x4a, 0x06, 0x03, 0xe9, + 0x79, 0x90, 0x05, 0x0d, 0x06, 0x03, 0x98, 
0x06, 0x4a, 0x05, 0x12, 0x06, 0x4a, 0x03, + 0xe8, 0x79, 0x74, 0x05, 0x13, 0x06, 0x03, 0x97, 0x06, 0x4a, 0x05, 0x12, 0x75, 0x06, + 0x03, 0xe8, 0x79, 0x74, 0x05, 0x20, 0x03, 0x98, 0x06, 0x4a, 0x03, 0xe8, 0x79, 0x9e, + 0x05, 0x12, 0x03, 0x98, 0x06, 0x4a, 0x05, 0x09, 0x06, 0x64, 0x05, 0x06, 0x32, 0x02, + 0x0e, 0x00, 0x01, 0x01, + ]; + let prologue = DebugLinePrologue { + total_length: 0, + version: 4, + prologue_length: 0, + minimum_instruction_length: 1, + maximum_ops_per_instruction: 1, + default_is_stmt: 1, + line_base: -5, + line_range: 14, + opcode_base: 13, + }; + + let result = run_debug_line_stmts(&stmts, &prologue, &[]); + if result.is_err() { + let e = result.as_ref().err().unwrap(); + println!("result {e:?}"); + } + assert!(result.is_ok()); + let matrix = result.unwrap(); + + assert_eq!(matrix.len(), 19); + assert_eq!(matrix[0].line, 789); + assert_eq!(matrix[0].address, 0x18c70); + assert!(matrix[0].is_stmt); + + assert_eq!(matrix[1].line, 791); + assert_eq!(matrix[1].address, 0x18c7c); + assert!(matrix[1].is_stmt); + + assert_eq!(matrix[2].line, 791); + assert_eq!(matrix[2].address, 0x18c81); + assert!(!matrix[2].is_stmt); + + assert_eq!(matrix[13].line, 792); + assert_eq!(matrix[13].address, 0x18cba); + assert!(!matrix[13].is_stmt); + + assert_eq!(matrix[14].line, 0); + assert_eq!(matrix[14].address, 0x18cc4); + assert!(!matrix[14].is_stmt); + + assert_eq!(matrix[18].line, 794); + assert_eq!(matrix[18].address, 0x18cde); + assert!(matrix[18].is_stmt); + } + + #[test] + fn test_parse_aranges_elf() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-dwarf-v4.bin"); + + let _aranges = parse_aranges_elf(bin_name.as_ref()).unwrap(); + } + + #[test] + fn test_dwarf_resolver() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-dwarf-v4.bin"); + let resolver = DwarfResolver::open(bin_name.as_ref(), true, false).unwrap(); + let (addr, dir, file, line) = resolver.pick_address_for_test(); + + let (dir_ret, file_ret, line_ret) = resolver.find_line(addr).unwrap(); + assert_eq!(dir, dir_ret); + assert_eq!(file, file_ret); + assert_eq!(line, line_ret); + } + + #[test] + fn test_debug_info_parse_symbols() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-dwarf-v4.bin"); + + let parser = ElfParser::open(bin_name.as_ref()).unwrap(); + let syms = debug_info_parse_symbols(&parser, None, 4).unwrap(); + assert!(syms.iter().any(|sym| sym.name == "fibonacci")) + } + + #[test] + fn test_dwarf_find_addr_regex() { + let bin_name = env::args().next().unwrap(); + let dwarf = DwarfResolver::open(bin_name.as_ref(), false, true).unwrap(); + let opts = FindAddrOpts { + offset_in_file: false, + obj_file_name: false, + sym_type: SymbolType::Unknown, + }; + let syms = dwarf + .find_address_regex("DwarfResolver.*find_address_regex.*", &opts) + .unwrap(); + assert!(!syms.is_empty()); + } + + /// Benchmark the [`debug_info_parse_symbols`] function. 
+ #[cfg(feature = "nightly")] + #[bench] + fn debug_info_parse_single_threaded(b: &mut Bencher) { + let bin_name = env::args().next().unwrap(); + let parser = ElfParser::open(bin_name.as_ref()).unwrap(); + + let () = b.iter(|| debug_info_parse_symbols(&parser, None, 1).unwrap()); + } +} diff --git a/third_party/blazesym/src/dwarf/constants.rs b/third_party/blazesym/src/dwarf/constants.rs new file mode 100644 index 0000000..c6e6a45 --- /dev/null +++ b/third_party/blazesym/src/dwarf/constants.rs @@ -0,0 +1,64 @@ +pub const DW_UT_compile: u8 = 0x1; +pub const DW_UT_type: u8 = 0x2; + +pub const DW_TAG_array_type: u8 = 0x1; +pub const DW_TAG_enumeration_type: u8 = 0x4; +pub const DW_TAG_compile_unit: u8 = 0x11; +pub const DW_TAG_subprogram: u8 = 0x2e; +pub const DW_TAG_variable: u8 = 0x34; +pub const DW_TAG_namespace: u8 = 0x39; + +pub const DW_CHILDREN_no: u8 = 0x00; +pub const DW_CHILDREN_yes: u8 = 0x01; + +pub const DW_AT_sibling: u8 = 0x01; +pub const DW_AT_location: u8 = 0x02; +pub const DW_AT_name: u8 = 0x03; +pub const DW_AT_lo_pc: u8 = 0x11; +pub const DW_AT_hi_pc: u8 = 0x12; +pub const DW_AT_entry_pc: u8 = 0x52; +pub const DW_AT_linkage_name: u8 = 0x6e; + +pub const DW_FORM_addr: u8 = 0x01; +pub const DW_FORM_block2: u8 = 0x03; +pub const DW_FORM_block4: u8 = 0x04; +pub const DW_FORM_data2: u8 = 0x05; +pub const DW_FORM_data4: u8 = 0x06; +pub const DW_FORM_data8: u8 = 0x07; +pub const DW_FORM_string: u8 = 0x08; +pub const DW_FORM_block: u8 = 0x09; +pub const DW_FORM_block1: u8 = 0x0a; +pub const DW_FORM_data1: u8 = 0x0b; +pub const DW_FORM_flag: u8 = 0x0c; +pub const DW_FORM_sdata: u8 = 0x0d; +pub const DW_FORM_strp: u8 = 0x0e; +pub const DW_FORM_udata: u8 = 0x0f; +pub const DW_FORM_ref_addr: u8 = 0x10; +pub const DW_FORM_ref1: u8 = 0x11; +pub const DW_FORM_ref2: u8 = 0x12; +pub const DW_FORM_ref4: u8 = 0x13; +pub const DW_FORM_ref8: u8 = 0x14; +pub const DW_FORM_ref_udata: u8 = 0x15; +pub const DW_FORM_indirect: u8 = 0x16; +pub const DW_FORM_sec_offset: u8 = 0x17; +pub const DW_FORM_exprloc: u8 = 0x18; +pub const DW_FORM_flag_present: u8 = 0x19; +pub const DW_FORM_strx: u8 = 0x1a; +pub const DW_FORM_addrx: u8 = 0x1b; +pub const DW_FORM_ref_sup4: u8 = 0x1c; +pub const DW_FORM_strp_sup: u8 = 0x1d; +pub const DW_FORM_data16: u8 = 0x1e; +pub const DW_FORM_line_strp: u8 = 0x1f; +pub const DW_FORM_ref_sig8: u8 = 0x20; +pub const DW_FORM_implicit_const: u8 = 0x21; +pub const DW_FORM_loclistx: u8 = 0x22; +pub const DW_FORM_rnglistx: u8 = 0x23; +pub const DW_FORM_ref_sup8: u8 = 0x24; +pub const DW_FORM_str1: u8 = 0x25; +pub const DW_FORM_str2: u8 = 0x26; +pub const DW_FORM_str3: u8 = 0x27; +pub const DW_FORM_str4: u8 = 0x28; +pub const DW_FORM_addrx1: u8 = 0x29; +pub const DW_FORM_addrx2: u8 = 0x2a; +pub const DW_FORM_addrx3: u8 = 0x2b; +pub const DW_FORM_addrx4: u8 = 0x2c; diff --git a/third_party/blazesym/src/dwarf/debug_info.rs b/third_party/blazesym/src/dwarf/debug_info.rs new file mode 100644 index 0000000..106e844 --- /dev/null +++ b/third_party/blazesym/src/dwarf/debug_info.rs @@ -0,0 +1,869 @@ +//! Parse the `.debug_info` section to get Debug Information Entries. +//! +//! It supports DWARFv4 now. (See ) +//! It parse DIEs from the `.debug_info` section and Abbreviations +//! from the `.debg_abbrev` section. +//! +//! The `.debug_info` section is a list of (Compile-)Units. Every +//! Unit comprises DIEs to carry debug information of a source file. +//! A Unit starts with a header to describe the size of this unit in +//! the section, the offset of its abbreviation table in the +//! 
`.debug_abbrev` section, ..., and DWARF version. (version 4) +//! +//! A DIE starts with an index encoded in LEB128 to the abbreviation +//! table of the Unit. The abbreviation given by the index describle +//! the content, format and layout of a DIE. The abbreviation index +//! starts from 1. 0 means a null entry. DIEs in an Unit are +//! organized as a tree, parent-children. Null entries are used to +//! signal the last sibling to end a level. +//! +//! A user starts a parser by creating an instance of [`UnitIter`]. +//! It will walk through the data in the `.debug_info` and +//! `.debug_abbrev` section to return Units. + +use std::io::{Error, ErrorKind}; +use std::iter::Iterator; +use std::mem; + +use crate::util::decode_leb128_128; +use crate::util::decode_udword; +use crate::util::decode_uhalf; +use crate::util::decode_uword; +use crate::util::ReadRaw as _; + +use super::constants::*; + + +fn read_3bytes(data: &mut &[u8]) -> Option { + if let [b1, b2, b3] = data.read_slice(3)? { + Some(*b1 as u32 | ((*b2 as u32) << 8) | ((*b3 as u32) << 16)) + } else { + unreachable!() + } +} + +#[allow(unused)] +pub struct UnknownHeader { + init_length: usize, + bits64: bool, + version: u16, + unit_type: u8, + hdr_size: usize, +} + +#[allow(unused)] +pub struct CUHeaderV5 { + init_length: usize, + bits64: bool, + version: u16, + unit_type: u8, + address_size: u8, + debug_abbrev_offset: u64, + hdr_size: usize, +} + +#[allow(unused)] +pub struct CUHeaderV4 { + init_length: usize, + bits64: bool, + version: u16, + address_size: u8, + debug_abbrev_offset: u64, // The location of the abbreviation table. + hdr_size: usize, +} + +/// The Unit header. +/// +/// With DWARFv4, an unit header describe a compile unit followed by +/// DIEs of the unit in the `.debug_info` section. DWARFv5 is much +/// more complicated. +/// +/// So far, BlazeSym supports only DWARFv4, that is common used. +pub enum UnitHeader { + CompileV4(CUHeaderV4), + CompileV5(CUHeaderV5), + Unknown(UnknownHeader), +} + +impl UnitHeader { + fn unit_size(&self) -> usize { + match self { + UnitHeader::CompileV4(h) => h.init_length + (if h.bits64 { 12 } else { 4 }), + UnitHeader::CompileV5(h) => h.init_length + (if h.bits64 { 12 } else { 4 }), + UnitHeader::Unknown(h) => h.init_length + (if h.bits64 { 12 } else { 4 }), + } + } + + fn header_size(&self) -> usize { + match self { + UnitHeader::CompileV4(h) => h.hdr_size, + UnitHeader::CompileV5(h) => h.hdr_size, + UnitHeader::Unknown(h) => h.hdr_size, + } + } +} + +#[derive(Clone)] +pub struct AbbrevAttr { + name: u8, + form: u8, + opt: u128, +} + +/// An abbreviation. +/// +/// An abbrivation describes the format of a DIE. it comprises a list +/// of specifications that describe the names and the formats of +/// attributes. A DIE will be formated in the way described by it's +/// abbreviation. +pub struct Abbrev { + /// The index to the abbreviation table. + pub abbrev_code: u32, + /// The type of the abbreviation. + /// + /// It can be a DW_TAG_compile (a compile unit), + /// DW_TAG_subprogram, DW_TAG_variable, ... etc. + pub tag: u8, + pub has_children: bool, + + parsed_attrs: Vec, +} + +impl Abbrev { + #[inline] + pub fn all_attrs(&self) -> &[AbbrevAttr] { + &self.parsed_attrs[..] + } +} + +/// Parse an abbreviation from a buffer. +/// +/// Include all attributes, names and forms. 
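// A minimal ULEB128 decoder sketch, shown here only to illustrate the
// variable-length encoding that the abbreviation parser below relies on via
// decode_leb128_128(); it is not the crate's implementation, and the guard
// against malformed input is deliberately simplistic.
#[allow(dead_code)]
fn uleb128_sketch(data: &[u8]) -> Option<(u128, usize)> {
    let mut value: u128 = 0;
    let mut shift = 0u32;
    for (i, byte) in data.iter().enumerate() {
        if shift >= 128 {
            return None; // Too many continuation bytes for a u128.
        }
        // Each byte contributes its low 7 bits, least significant group first.
        value |= u128::from(byte & 0x7f) << shift;
        if byte & 0x80 == 0 {
            // A clear high bit marks the final byte of the encoding.
            return Some((value, i + 1));
        }
        shift += 7;
    }
    None // Input ended before the terminating byte.
}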
+#[inline] +fn parse_abbrev(data: &[u8]) -> Option<(Abbrev, usize)> { + let (abbrev_code, bytes) = decode_leb128_128(data)?; + if abbrev_code == 0 { + return Some(( + Abbrev { + abbrev_code: 0, + tag: 0, + has_children: false, + parsed_attrs: vec![], + }, + 1, + )); + } + + let mut pos = bytes as usize; + let (tag, bytes) = decode_leb128_128(&data[pos..])?; + pos += bytes as usize; + let has_children = data[pos] == DW_CHILDREN_yes; + pos += 1; + + let mut parsed_attrs = Vec::::new(); + while pos < data.len() { + if let Some((name, form, opt, bytes)) = parse_abbrev_attr(&data[pos..]) { + pos += bytes; + parsed_attrs.push(AbbrevAttr { name, form, opt }); + if form == 0 { + break; + } + } else { + break; + } + } + + Some(( + Abbrev { + abbrev_code: abbrev_code as u32, + tag: tag as u8, + has_children, + parsed_attrs, + }, + pos, + )) +} + +/// Parse an attribute specification from a buffer. +/// +/// Return the name, form, optional value and size of an abbreviation. +#[inline] +fn parse_abbrev_attr(data: &[u8]) -> Option<(u8, u8, u128, usize)> { + let mut pos = 0; // Track the size of this abbreviation. + let (name, bytes) = decode_leb128_128(&data[pos..])?; + pos += bytes as usize; + let (form, bytes) = decode_leb128_128(&data[pos..])?; + pos += bytes as usize; + let opt = if form as u8 == DW_FORM_implicit_const || form as u8 == DW_FORM_indirect { + let (c, bytes) = decode_leb128_128(&data[pos..])?; + pos += bytes as usize; + c + } else { + 0 + }; + Some((name as u8, form as u8, opt, pos)) +} + +#[derive(Clone, Debug)] +pub enum AttrValue<'a> { + Signed128(i128), + Unsigned(u64), + Unsigned128(u128), + Bytes(&'a [u8]), + String(&'a str), +} + +fn extract_attr_value_impl<'data>( + data: &mut &'data [u8], + form: u8, + dwarf_sz: usize, + addr_sz: usize, +) -> Option> { + match form { + DW_FORM_addr => { + if addr_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()? 
as u64)) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_block2 => { + let value = data.read_u16()?; + Some(AttrValue::Bytes(data.read_slice(value.into())?)) + } + DW_FORM_block4 => { + let value = data.read_u32()?; + Some(AttrValue::Bytes(data.read_slice(value as usize)?)) + } + DW_FORM_data2 => Some(AttrValue::Unsigned(data.read_u16()?.into())), + DW_FORM_data4 => Some(AttrValue::Unsigned(data.read_u32()?.into())), + DW_FORM_data8 => Some(AttrValue::Unsigned(data.read_u64()?)), + DW_FORM_string => { + let string = data.read_cstr()?; + Some(AttrValue::String(string.to_str().ok()?)) + } + DW_FORM_block => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Bytes(data.read_slice(value as usize)?)) + } + DW_FORM_block1 => { + let value = data.read_u8()?; + Some(AttrValue::Bytes(data.read_slice(value.into())?)) + } + DW_FORM_data1 => Some(AttrValue::Unsigned(data.read_u8()?.into())), + DW_FORM_flag => Some(AttrValue::Unsigned(data.read_u8()?.into())), + DW_FORM_sdata => { + let (value, _bytes) = data.read_i128_leb128()?; + Some(AttrValue::Signed128(value)) + } + DW_FORM_strp => { + if dwarf_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()?.into())) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_udata => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned128(value)) + } + DW_FORM_ref_addr => { + if dwarf_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()?.into())) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_ref1 => Some(AttrValue::Unsigned(data.read_u8()?.into())), + DW_FORM_ref2 => Some(AttrValue::Unsigned(data.read_u16()?.into())), + DW_FORM_ref4 => Some(AttrValue::Unsigned(data.read_u32()?.into())), + DW_FORM_ref8 => Some(AttrValue::Unsigned(data.read_u64()?)), + DW_FORM_ref_udata => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned128(value)) + } + DW_FORM_indirect => { + let (f, _bytes) = data.read_u128_leb128()?; + extract_attr_value_impl(data, f as u8, dwarf_sz, addr_sz) + } + DW_FORM_sec_offset => { + if dwarf_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()?.into())) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_exprloc => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Bytes(data.read_slice(value as usize)?)) + } + DW_FORM_flag_present => Some(AttrValue::Unsigned(0)), + DW_FORM_strx => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned(value as u64)) + } + DW_FORM_addrx => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned(value as u64)) + } + DW_FORM_ref_sup4 => Some(AttrValue::Unsigned(data.read_u32()?.into())), + DW_FORM_strp_sup => { + if dwarf_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()?.into())) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_data16 => Some(AttrValue::Bytes(data.read_slice(16)?)), + DW_FORM_line_strp => { + if dwarf_sz == 0x4 { + Some(AttrValue::Unsigned(data.read_u32()?.into())) + } else { + Some(AttrValue::Unsigned(data.read_u64()?)) + } + } + DW_FORM_ref_sig8 => Some(AttrValue::Bytes(data.read_slice(8)?)), + DW_FORM_implicit_const => Some(AttrValue::Unsigned(0)), + DW_FORM_loclistx => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned(value as u64)) + } + DW_FORM_rnglistx => { + let (value, _bytes) = data.read_u128_leb128()?; + Some(AttrValue::Unsigned(value as u64)) + } + DW_FORM_ref_sup8 => 
Some(AttrValue::Unsigned(data.read_u64()?)), + DW_FORM_str1 => Some(AttrValue::Unsigned(data.read_u8()?.into())), + DW_FORM_str2 => Some(AttrValue::Unsigned(data.read_u16()?.into())), + DW_FORM_str3 => Some(AttrValue::Unsigned(read_3bytes(data)?.into())), + DW_FORM_str4 => Some(AttrValue::Unsigned(data.read_u32()?.into())), + DW_FORM_addrx1 => Some(AttrValue::Unsigned(data.read_u8()?.into())), + DW_FORM_addrx2 => Some(AttrValue::Unsigned(data.read_u16()?.into())), + DW_FORM_addrx3 => Some(AttrValue::Unsigned(read_3bytes(data)?.into())), + DW_FORM_addrx4 => Some(AttrValue::Unsigned(data.read_u32()?.into())), + _ => None, + } +} + +/// Extract the value of an attribute from a data buffer. +/// +/// This function works with [`parse_abbrev_attr()`], that parse the +/// attribute specifications of DIEs delcared in the abbreviation +/// table in the .debug_abbrev section, by using the result of +/// [`parse_abbrev_attr()`] to parse the value of an attribute of a +/// DIE. +/// +/// # Arguments +/// +/// * `data` - A buffer where the value is in. +/// * `form` - The format of the value. (DW_FORM_*) +/// * `dwarf_sz` - Describe the DWARF format. (4 for 32-bits and 8 for 64-bits) +/// * `addr_sz` - The size of an address of the target platform. (4 for 32-bits and 8 for 64-bits) +/// +/// Return AttrValue and the number of bytes it takes. +fn extract_attr_value( + mut data: &[u8], + form: u8, + dwarf_sz: usize, + addr_sz: usize, +) -> Option<(AttrValue<'_>, usize)> { + let data = &mut data; + let before = (*data).as_ptr(); + let value = extract_attr_value_impl(data, form, dwarf_sz, addr_sz)?; + let after = (*data).as_ptr(); + // TODO: Remove this workaround once callers no longer require an explicit + // byte count being passed out. + // SAFETY: Both pointers point into the same underlying byte array. + let count = unsafe { after.offset_from(before) }; + Some((value, count.try_into().unwrap())) +} + +/// Parse all abbreviations of an abbreviation table for a compile +/// unit. +/// +/// An abbreviation table is usually for a compile unit, but not always. +/// +/// Return a list of abbreviations and the number of bytes they take. +fn parse_cu_abbrevs(data: &[u8]) -> Option<(Vec, usize)> { + let mut pos = 0; + let mut abbrevs = Vec::::with_capacity(data.len() / 50); // Heuristic! + + while pos < data.len() { + let (abbrev, bytes) = parse_abbrev(&data[pos..])?; + pos += bytes; + if abbrev.abbrev_code == 0x0 { + return Some((abbrevs, pos)); + } + abbrevs.push(abbrev); + } + None +} + +/// Parse an Unit Header from a buffer. +/// +/// An Unit Header is the header of a compile unit, at leat for v4. +/// +/// # Arguments +/// +/// * `data` - is the data from the `.debug_info` section. 
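// For reference, the DWARFv4 compile-unit header layout decoded below
// (32-bit DWARF; 64-bit DWARF starts with 0xffffffff followed by an 8-byte
// length and widens debug_abbrev_offset to 8 bytes):
//
//   offset  0: unit_length          u32
//   offset  4: version              u16  (0x0004)
//   offset  6: debug_abbrev_offset  u32
//   offset 10: address_size         u8
//
// DWARFv5 inserts a one-byte unit_type after the version and places
// address_size before debug_abbrev_offset, which is what the v5 branch of
// this function handles.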
+fn parse_unit_header(data: &[u8]) -> Option { + if data.len() < 4 { + return None; + } + + let mut pos = 0; + let mut init_length = decode_uword(data) as usize; + pos += 4; + + let bits64 = init_length == 0xffffffff; + if bits64 { + if (pos + 8) > data.len() { + return None; + } + init_length = decode_udword(&data[pos..]) as usize; + pos += 8; + } + + if (pos + 2) > data.len() { + return None; + } + let version = decode_uhalf(&data[pos..]); + pos += 2; + + if version == 0x4 { + let debug_abbrev_offset: u64 = if bits64 { + if (pos + 8) > data.len() { + return None; + } + let v = decode_udword(&data[pos..]); + pos += 8; + v + } else { + if (pos + 4) > data.len() { + return None; + } + let v = decode_uword(&data[pos..]); + pos += 4; + v as u64 + }; + let address_size = data[pos]; + pos += 1; + return Some(UnitHeader::CompileV4(CUHeaderV4 { + init_length, + bits64, + version, + debug_abbrev_offset, + address_size, + hdr_size: pos, + })); + } + + if (pos + 1) > data.len() { + return None; + } + let unit_type = data[pos]; + pos += 1; + + match unit_type { + DW_UT_compile => { + if (pos + 1) > data.len() { + return None; + } + let address_size = data[pos]; + pos += 1; + + let debug_abbrev_offset: u64 = if bits64 { + if (pos + 8) > data.len() { + return None; + } + let v = decode_udword(&data[pos..]); + pos += 8; + v + } else { + if (pos + 4) > data.len() { + return None; + } + let v = decode_uword(&data[pos..]); + pos += 4; + v as u64 + }; + Some(UnitHeader::CompileV5(CUHeaderV5 { + init_length, + bits64, + version, + unit_type, + address_size, + debug_abbrev_offset, + hdr_size: pos, + })) + } + _ => Some(UnitHeader::Unknown(UnknownHeader { + init_length, + bits64, + version, + unit_type, + hdr_size: pos, + })), + } +} + +/// Debug Information Entry. +/// +/// A DIE starts with the code of its abbreviation followed by the +/// attribute values described by the abbreviation. The code of an +/// abbreviation is an index to the abbreviation table of the compile +/// unit. 
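// Illustration (abbreviation numbers are hypothetical): how a DIE byte stream
// maps onto the tree described above. Assuming abbrev #1 is a
// DW_TAG_compile_unit with children and abbrev #2 a DW_TAG_subprogram
// without children:
//
//   01 <attr values...>   root DIE, opens one level of children
//   02 <attr values...>   first child
//   02 <attr values...>   second child
//   00                    null entry: ends the sibling list, closes the level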
+#[allow(clippy::upper_case_acronyms)] +pub struct DIE<'a> { + pub tag: u8, + pub abbrev: Option<&'a Abbrev>, + abbrev_attrs: &'a [AbbrevAttr], + abbrev_attrs_idx: usize, + data: &'a [u8], + dieiter: &'a mut DIEIter<'a>, + reading_offset: usize, + done: bool, +} + +impl<'a> DIE<'a> { + #[inline] + pub fn exhaust(&mut self) -> Result<(), Error> { + let abbrev_attrs = self.abbrev_attrs; + + if self.done { + return Ok(()); + } + + while self.abbrev_attrs_idx < abbrev_attrs.len() { + let attr = &abbrev_attrs[self.abbrev_attrs_idx]; + self.abbrev_attrs_idx += 1; + + if attr.form == 0 { + continue; + } + let (_value, bytes) = extract_attr_value( + &self.data[self.reading_offset..], + attr.form, + self.dieiter.dwarf_sz, + self.dieiter.addr_sz, + ) + .ok_or_else(|| { + Error::new(ErrorKind::InvalidData, "failed to parse attribute values") + })?; + self.reading_offset += bytes; + } + self.dieiter.die_finish_reading(self.reading_offset); + self.done = true; + Ok(()) + } +} + +impl<'a> Iterator for DIE<'a> { + // name, form, opt, value + type Item = (u8, u8, u128, AttrValue<'a>); + + #[inline] + fn next(&mut self) -> Option { + if self.done { + return None; + } + self.abbrev?; + + if self.abbrev_attrs_idx < self.abbrev_attrs.len() { + let AbbrevAttr { name, form, opt } = self.abbrev_attrs[self.abbrev_attrs_idx]; + self.abbrev_attrs_idx += 1; + + #[cfg(debug)] + if form == 0 { + assert_eq!(self.abbrev_off, abbrev.attrs.len()); + } + if form == 0 { + self.dieiter.die_finish_reading(self.reading_offset); + self.done = true; + return None; + } + + let (value, bytes) = extract_attr_value( + &self.data[self.reading_offset..], + form, + self.dieiter.dwarf_sz, + self.dieiter.addr_sz, + )?; + self.reading_offset += bytes; + Some((name, form, opt, value)) + } else { + self.dieiter.die_finish_reading(self.reading_offset); + self.done = true; + None + } + } +} + +/// The iterator of DIEs in an Unit. +pub struct DIEIter<'a> { + data: &'a [u8], + dwarf_sz: usize, + addr_sz: usize, + off: usize, + off_delta: usize, + cur_depth: usize, + abbrevs: Vec, + abbrev: Option<&'a Abbrev>, + die_reading_done: bool, + done: bool, +} + +impl<'a> DIEIter<'a> { + pub fn die_finish_reading(&mut self, size: usize) { + self.die_reading_done = true; + self.off += size; + } + + pub fn seek_to_sibling(&mut self, off: usize) { + self.off = off - self.off_delta; + self.cur_depth -= 1; + self.die_reading_done = true; + } + + #[inline] + pub fn exhaust_die(&mut self) -> Result<(), Error> { + assert!( + !self.die_reading_done, + "DIE should not have been exhausted!" 
+ ); + let abbrev = self.abbrev.unwrap(); + for attr in abbrev.all_attrs() { + if attr.form == 0 { + continue; + } + let (_value, bytes) = extract_attr_value( + &self.data[self.off..], + attr.form, + self.dwarf_sz, + self.addr_sz, + ) + .ok_or_else(|| { + Error::new(ErrorKind::InvalidData, "failed to parse attribute values") + })?; + self.off += bytes; + } + self.die_reading_done = true; + Ok(()) + } +} + +impl<'a> Iterator for DIEIter<'a> { + type Item = DIE<'a>; + + fn next(&mut self) -> Option { + if !self.die_reading_done { + self.exhaust_die().unwrap(); + } + if self.done { + return None; + } + + let (abbrev_idx, bytes) = decode_leb128_128(&self.data[self.off..])?; + self.off += bytes as usize; + + if abbrev_idx == 0 { + self.cur_depth -= 1; + if self.cur_depth == 0 { + self.done = true; + } + Some(DIE { + tag: 0, + abbrev: None, + abbrev_attrs: &[], + abbrev_attrs_idx: 0, + data: &self.data[self.off..], + dieiter: unsafe { mem::transmute(self) }, + reading_offset: 0, + done: false, + }) + } else { + let abbrev = unsafe { mem::transmute(&self.abbrevs[abbrev_idx as usize - 1]) }; + self.abbrev = Some(abbrev); + if abbrev.has_children { + self.cur_depth += 1; + } + + self.die_reading_done = false; + Some(DIE { + tag: abbrev.tag, + abbrev: Some(abbrev), + abbrev_attrs: abbrev.all_attrs(), + abbrev_attrs_idx: 0, + data: &self.data[self.off..], + dieiter: unsafe { mem::transmute(self) }, + reading_offset: 0, + done: false, + }) + } + } +} + +/// An iterator of Units in the `.debug_info` section. +/// +/// An iterator is built from the content of `.debug_info` section, +/// which is a list of compile units. A compile unit usually refers +/// to a source file. In the compile units, it is a forest of DIEs, +/// which presents functions, variables and other debug information. +pub struct UnitIter<'a> { + info_data: &'a [u8], + abbrev_data: &'a [u8], + off: usize, +} + +impl<'a> UnitIter<'a> { + /// Build an iterator from the content of `.debug_info` & `.debug_abbrev`. + /// + /// # Arguments + /// + /// * `info_data` is the content of the `.debug_info` section. + /// * `abbrev_data` is the content of the `.debug_abbrev` section. + pub fn new(info_data: &'a [u8], abbrev_data: &'a [u8]) -> UnitIter<'a> { + UnitIter { + info_data, + abbrev_data, + off: 0, + } + } +} + +impl<'a> Iterator for UnitIter<'a> { + type Item = (UnitHeader, DIEIter<'a>); + + fn next(&mut self) -> Option { + let off = self.off; + let uh = parse_unit_header(&self.info_data[off..])?; + let hdr_sz = uh.header_size(); + self.off += uh.unit_size(); + + match uh { + UnitHeader::CompileV4(ref cuh) => { + let dwarf_sz = if cuh.bits64 { 8 } else { 4 }; + let addr_sz = cuh.address_size as usize; + let (abbrevs, _) = + parse_cu_abbrevs(&self.abbrev_data[cuh.debug_abbrev_offset as usize..])?; + Some(( + uh, + DIEIter { + data: &self.info_data[off + hdr_sz..], + dwarf_sz, + addr_sz, + off: 0, + off_delta: hdr_sz, + cur_depth: 0, + abbrevs, + abbrev: None, + die_reading_done: true, + done: false, + }, + )) + } + UnitHeader::CompileV5(ref _cuh) => { + todo!(); // BlazeSym supports only v4 so far. 
+ } + _ => self.next(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::elf::ElfParser; + use std::env; + use std::path::Path; + + #[test] + fn test_parse_abbrev() { + let raw = [ + 0x01, 0x11, 0x01, 0x25, 0x0e, 0x13, 0x05, 0x03, 0x0e, 0x10, 0x17, 0x1b, 0x0e, 0xb4, + 0x42, 0x19, 0x11, 0x01, 0x55, 0x17, 0x00, 0x00, 0x02, 0x39, 0x01, 0x03, 0x0e, 0x00, + 0x00, 0x03, 0x04, 0x01, 0x49, 0x13, 0x6d, 0x19, 0x03, 0x0e, 0x0b, 0x0b, 0x88, 0x01, + 0x0f, 0x00, 0x00, 0x04, 0x28, 0x00, 0x03, 0x0e, 0x1c, 0x0f, 0x00, 0x00, 0x05, 0x13, + 0x01, 0x03, 0x0e, 0x0b, 0x0b, 0x88, 0x01, 0x0f, 0x00, 0x00, + ]; + let (abbrev, bytes) = parse_abbrev(&raw).unwrap(); + assert_eq!(bytes, 22); + assert_eq!(abbrev.abbrev_code, 0x1); + assert_eq!(abbrev.tag, DW_TAG_compile_unit); + assert!(abbrev.has_children); + let mut pos = bytes; + + let (abbrev, bytes) = parse_abbrev(&raw[pos..]).unwrap(); + assert_eq!(bytes, 7); + assert_eq!(abbrev.abbrev_code, 0x2); + assert_eq!(abbrev.tag, DW_TAG_namespace); + assert!(abbrev.has_children); + pos += bytes; + + let (abbrev, bytes) = parse_abbrev(&raw[pos..]).unwrap(); + assert_eq!(bytes, 16); + assert_eq!(abbrev.abbrev_code, 0x3); + assert_eq!(abbrev.tag, DW_TAG_enumeration_type); + assert!(abbrev.has_children); + pos += bytes; + + let (abbrev, bytes) = parse_abbrev(&raw[pos..]).unwrap(); + assert_eq!(bytes, 9); + assert_eq!(abbrev.abbrev_code, 0x4); + assert!(!abbrev.has_children); + pos += bytes; + + let (abbrev, bytes) = parse_abbrev(&raw[pos..]).unwrap(); + assert_eq!(bytes, 12); + assert_eq!(abbrev.abbrev_code, 0x5); + assert!(abbrev.has_children); + } + + #[test] + fn test_parse_cu_abbrevs() { + let raw = [ + 0x01, 0x11, 0x01, 0x25, 0x0e, 0x13, 0x05, 0x03, 0x0e, 0x10, 0x17, 0x1b, 0x0e, 0xb4, + 0x42, 0x19, 0x11, 0x01, 0x55, 0x17, 0x00, 0x00, 0x02, 0x39, 0x01, 0x03, 0x0e, 0x00, + 0x00, 0x03, 0x04, 0x01, 0x49, 0x13, 0x6d, 0x19, 0x03, 0x0e, 0x0b, 0x0b, 0x88, 0x01, + 0x0f, 0x00, 0x00, 0x04, 0x28, 0x00, 0x03, 0x0e, 0x1c, 0x0f, 0x00, 0x00, 0x05, 0x13, + 0x01, 0x03, 0x0e, 0x0b, 0x0b, 0x88, 0x01, 0x0f, 0x00, 0x00, 0x00, + ]; + let (abbrevs, bytes) = parse_cu_abbrevs(&raw).unwrap(); + assert_eq!(abbrevs.len(), 0x5); + assert_eq!(bytes, raw.len()); + } + + #[test] + fn test_unititer() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("dwarf-example"); + let elfparser = ElfParser::open(&bin_name).unwrap(); + let abbrev_idx = elfparser.find_section(".debug_abbrev").unwrap(); + let abbrev = elfparser.read_section_raw(abbrev_idx).unwrap(); + let info_idx = elfparser.find_section(".debug_info").unwrap(); + let info = elfparser.read_section_raw(info_idx).unwrap(); + + let iter = UnitIter::new(&info, &abbrev); + let mut cnt = 0; + let mut die_cnt = 0; + let mut attr_cnt = 0; + let mut subprog_cnt = 0; + for (_uh, dieiter) in iter { + cnt += 1; + for die in dieiter { + die_cnt += 1; + if die.tag == DW_TAG_subprogram { + subprog_cnt += 1; + } + for (_name, _form, _opt, _value) in die { + attr_cnt += 1; + } + } + } + assert_eq!(cnt, 9); + assert_eq!(die_cnt, 78752); + assert_eq!(subprog_cnt, 12451); + assert_eq!(attr_cnt, 275310); + } +} diff --git a/third_party/blazesym/src/elf/cache.rs b/third_party/blazesym/src/elf/cache.rs new file mode 100644 index 0000000..999c85d --- /dev/null +++ b/third_party/blazesym/src/elf/cache.rs @@ -0,0 +1,312 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::fs::File; +use std::io::Error; +use std::path::{Path, PathBuf}; +use std::ptr; +use std::rc::Rc; + +use nix::sys::stat::{fstat, FileStat}; 
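+// Illustrative usage sketch (the path below is hypothetical):
+//
+//     let cache = ElfCache::new(/* line_number_info */ true,
+//                               /* debug_info_symbols */ false);
+//     match cache.find(Path::new("/tmp/some-binary"))? {
+//         ElfBackend::Dwarf(_) => { /* ELF with DWARF debug data */ }
+//         ElfBackend::Elf(_) => { /* plain ELF, symbol tables only */ }
+//     }
+//
+// Repeated `find()` calls for an unchanged file return the cached backend;
+// an entry is rebuilt when the file's dev/inode/size/mtime no longer match.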
+use std::os::unix::io::AsRawFd; + +use crate::dwarf::DwarfResolver; + +use super::ElfParser; + +type ElfCacheEntryKey = PathBuf; + +const DFL_CACHE_MAX: usize = 1024; + +#[derive(Clone)] +pub enum ElfBackend { + Dwarf(Rc), // ELF w/ DWARF + Elf(Rc), // ELF w/o DWARF +} + +#[cfg(test)] +impl ElfBackend { + pub fn to_dwarf(&self) -> Option> { + if let Self::Dwarf(dwarf) = self { + Some(Rc::clone(dwarf)) + } else { + None + } + } + + pub fn is_dwarf(&self) -> bool { + matches!(self, Self::Dwarf(_)) + } +} + +struct ElfCacheEntry { + // LRU links + prev: *mut ElfCacheEntry, + next: *mut ElfCacheEntry, + + file_name: PathBuf, + + dev: libc::dev_t, + inode: libc::ino_t, + size: libc::off_t, + mtime_sec: libc::time_t, + mtime_nsec: i64, + backend: ElfBackend, +} + +impl ElfCacheEntry { + pub fn new( + file_name: &Path, + file: File, + line_number_info: bool, + debug_info_symbols: bool, + ) -> Result { + let stat = fstat(file.as_raw_fd())?; + let parser = Rc::new(ElfParser::open_file(file)?); + let backend = if let Ok(dwarf) = DwarfResolver::from_parser_for_addresses( + Rc::clone(&parser), + &[], + line_number_info, + debug_info_symbols, + ) { + ElfBackend::Dwarf(Rc::new(dwarf)) + } else { + ElfBackend::Elf(parser) + }; + + Ok(ElfCacheEntry { + prev: ptr::null_mut(), + next: ptr::null_mut(), + file_name: file_name.to_path_buf(), + dev: stat.st_dev, + inode: stat.st_ino, + size: stat.st_size, + mtime_sec: stat.st_mtime, + mtime_nsec: stat.st_mtime_nsec, + backend, + }) + } + + fn get_key(&self) -> &Path { + &self.file_name + } + + fn is_valid(&self, stat: &FileStat) -> bool { + stat.st_dev == self.dev + && stat.st_ino == self.inode + && stat.st_size == self.size + && stat.st_mtime == self.mtime_sec + && stat.st_mtime_nsec == self.mtime_nsec + } + + fn get_backend(&self) -> ElfBackend { + self.backend.clone() + } +} + +/// Maintain a LRU linked list of entries +struct ElfCacheLru { + head: *mut ElfCacheEntry, + tail: *mut ElfCacheEntry, +} + +impl ElfCacheLru { + /// # Safety + /// + /// Make all entries are valid. + unsafe fn touch(&mut self, ent: &ElfCacheEntry) { + unsafe { self.remove(ent) }; + unsafe { self.push_back(ent) }; + } + + /// # Safety + /// + /// Make all entries are valid. + unsafe fn remove(&mut self, ent: &ElfCacheEntry) { + let ent_ptr = ent as *const ElfCacheEntry as *mut ElfCacheEntry; + let prev = unsafe { (*ent_ptr).prev }; + let next = unsafe { (*ent_ptr).next }; + if !prev.is_null() { + unsafe { (*prev).next = next }; + } else { + self.head = next; + } + if !next.is_null() { + unsafe { (*next).prev = prev }; + } else { + self.tail = prev; + } + } + + /// # Safety + /// + /// Make all entries are valid. + unsafe fn push_back(&mut self, ent: &ElfCacheEntry) { + let ent_ptr = ent as *const ElfCacheEntry as *mut ElfCacheEntry; + if self.head.is_null() { + unsafe { (*ent_ptr).next = ptr::null_mut() }; + unsafe { (*ent_ptr).prev = ptr::null_mut() }; + self.head = ent_ptr; + self.tail = ent_ptr; + } else { + unsafe { (*ent_ptr).next = ptr::null_mut() }; + unsafe { (*self.tail).next = ent_ptr }; + unsafe { (*ent_ptr).prev = self.tail }; + self.tail = ent_ptr; + } + } + + /// # Safety + /// + /// Make all entries are valid. 
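+ /// (In other words, the caller must guarantee that every entry still
+ /// linked into the list is alive and has not been dropped.)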
+ unsafe fn pop_head(&mut self) -> *mut ElfCacheEntry { + let ent = self.head; + if !ent.is_null() { + unsafe { self.remove(&*ent) }; + } + ent + } +} + +struct _ElfCache { + elfs: HashMap>, + lru: ElfCacheLru, + max_elfs: usize, + line_number_info: bool, + debug_info_symbols: bool, +} + +impl _ElfCache { + fn new(line_number_info: bool, debug_info_symbols: bool) -> _ElfCache { + _ElfCache { + elfs: HashMap::new(), + lru: ElfCacheLru { + head: ptr::null_mut(), + tail: ptr::null_mut(), + }, + max_elfs: DFL_CACHE_MAX, + line_number_info, + debug_info_symbols, + } + } + + /// # Safety + /// + /// The returned reference is only valid before next time calling + /// create_entry(). + /// + unsafe fn find_entry(&mut self, file_name: &Path) -> Option<&ElfCacheEntry> { + let ent = self.elfs.get(file_name)?; + unsafe { self.lru.touch(ent) }; + + Some(ent.as_ref()) + } + + /// # Safety + /// + /// The returned reference is only valid before next time calling + /// create_entry(). + /// + unsafe fn create_entry( + &mut self, + file_name: &Path, + file: File, + ) -> Result<&ElfCacheEntry, Error> { + let ent = Box::new(ElfCacheEntry::new( + file_name, + file, + self.line_number_info, + self.debug_info_symbols, + )?); + let key = ent.get_key().to_path_buf(); + + self.elfs.insert(key.clone(), ent); + unsafe { self.lru.push_back(self.elfs.get(&key).unwrap().as_ref()) }; + unsafe { self.ensure_size() }; + + Ok(unsafe { &*self.lru.tail }) // Get 'static lifetime + } + + /// # Safety + /// + /// This funciton may make some cache entries invalid. Callers + /// should be careful about all references of cache entries they + /// are holding. + unsafe fn ensure_size(&mut self) { + if self.elfs.len() > self.max_elfs { + let to_remove = unsafe { self.lru.pop_head() }; + self.elfs.remove(unsafe { (*to_remove).get_key() }).unwrap(); + } + } + + fn find_or_create_backend( + &mut self, + file_name: &Path, + file: File, + ) -> Result { + if let Some(ent) = unsafe { self.find_entry(file_name) } { + let stat = fstat(file.as_raw_fd())?; + + if ent.is_valid(&stat) { + return Ok(ent.get_backend()); + } + + // Purge the entry and load it from the filesystem. + unsafe { + let ent = &*(ent as *const ElfCacheEntry); // static lifetime to decouple borrowing + self.lru.remove(ent) + }; + self.elfs.remove(file_name); + } + + Ok(unsafe { self.create_entry(file_name, file)? 
}.get_backend()) + } + + pub fn find(&mut self, path: &Path) -> Result { + let file = File::open(path)?; + self.find_or_create_backend(path, file) + } +} + +pub struct ElfCache { + cache: RefCell<_ElfCache>, +} + +impl ElfCache { + pub fn new(line_number_info: bool, debug_info_symbols: bool) -> ElfCache { + ElfCache { + cache: RefCell::new(_ElfCache::new(line_number_info, debug_info_symbols)), + } + } + + pub fn find(&self, path: &Path) -> Result { + let mut cache = self.cache.borrow_mut(); + cache.find(path) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + + #[test] + fn test_cache() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-no-debug.bin"); + + let cache = ElfCache::new(true, false); + let backend_first = cache.find(Path::new(&bin_name)); + let backend_second = cache.find(Path::new(&bin_name)); + assert!(backend_first.is_ok()); + assert!(backend_second.is_ok()); + let backend_first = backend_first.unwrap(); + let backend_second = backend_second.unwrap(); + assert!(backend_first.is_dwarf()); + assert!(backend_second.is_dwarf()); + assert_eq!( + ptr::addr_of!(*backend_first.to_dwarf().unwrap().get_parser()), + ptr::addr_of!(*backend_second.to_dwarf().unwrap().get_parser()) + ); + } +} diff --git a/third_party/blazesym/src/elf/mod.rs b/third_party/blazesym/src/elf/mod.rs new file mode 100644 index 0000000..97dfcf0 --- /dev/null +++ b/third_party/blazesym/src/elf/mod.rs @@ -0,0 +1,8 @@ +mod cache; +mod parser; +mod resolver; +mod types; + +pub use cache::ElfCache; +pub use parser::ElfParser; +pub use resolver::ElfResolver; diff --git a/third_party/blazesym/src/elf/parser.rs b/third_party/blazesym/src/elf/parser.rs new file mode 100644 index 0000000..68af702 --- /dev/null +++ b/third_party/blazesym/src/elf/parser.rs @@ -0,0 +1,699 @@ +use std::cell::RefCell; +use std::ffi::CStr; +use std::fs::File; +use std::io::{Error, ErrorKind, Read, Seek, SeekFrom}; +use std::mem; +#[cfg(test)] +use std::path::Path; + +use regex::Regex; + +use crate::util::{extract_string, search_address_opt_key}; +use crate::FindAddrOpts; +use crate::SymbolInfo; +use crate::SymbolType; + +use super::types::Elf64_Ehdr; +use super::types::Elf64_Phdr; +use super::types::Elf64_Shdr; +use super::types::Elf64_Sym; +use super::types::SHN_UNDEF; +#[cfg(test)] +use super::types::STT_FUNC; + + +fn read_u8(file: &mut File, off: u64, size: usize) -> Result, Error> { + let mut buf = vec![0; size]; + + file.seek(SeekFrom::Start(off))?; + file.read_exact(buf.as_mut_slice())?; + + Ok(buf) +} + +fn read_elf_header(file: &mut File) -> Result { + let mut buffer = [0u8; mem::size_of::()]; + let () = file.read_exact(&mut buffer)?; + + let pointer = buffer.as_ptr() as *const Elf64_Ehdr; + // SAFETY: `buffer` is valid for reads and the `Elf64_Ehdr` object that we + // read is comprised only of members that are valid for any bit + // pattern. 
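+ // `read_unaligned` is used (rather than a plain dereference) because the
+ // byte buffer carries no alignment guarantee for `Elf64_Ehdr`.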
+ let elf_header = unsafe { pointer.read_unaligned() }; + + Ok(elf_header) +} + +fn read_elf_sections(file: &mut File, ehdr: &Elf64_Ehdr) -> Result, Error> { + const HDRSIZE: usize = mem::size_of::(); + let off = ehdr.e_shoff as usize; + let num = ehdr.e_shnum as usize; + + let mut buf = read_u8(file, off as u64, num * HDRSIZE)?; + + let shdrs: Vec = unsafe { + let shdrs_ptr = buf.as_mut_ptr() as *mut Elf64_Shdr; + buf.leak(); + Vec::from_raw_parts(shdrs_ptr, num, num) + }; + Ok(shdrs) +} + +fn read_elf_program_headers(file: &mut File, ehdr: &Elf64_Ehdr) -> Result, Error> { + const HDRSIZE: usize = mem::size_of::(); + let off = ehdr.e_phoff as usize; + let num = ehdr.e_phnum as usize; + + let mut buf = read_u8(file, off as u64, num * HDRSIZE)?; + + let phdrs: Vec = unsafe { + let phdrs_ptr = buf.as_mut_ptr() as *mut Elf64_Phdr; + buf.leak(); + Vec::from_raw_parts(phdrs_ptr, num, num) + }; + Ok(phdrs) +} + +fn read_elf_section_raw(file: &mut File, section: &Elf64_Shdr) -> Result, Error> { + read_u8(file, section.sh_offset, section.sh_size as usize) +} + +fn read_elf_section_seek(file: &mut File, section: &Elf64_Shdr) -> Result<(), Error> { + file.seek(SeekFrom::Start(section.sh_offset))?; + Ok(()) +} + +fn get_elf_section_name<'a>(sect: &Elf64_Shdr, strtab: &'a [u8]) -> Option<&'a str> { + extract_string(strtab, sect.sh_name as usize) +} + +struct ElfParserBack { + ehdr: Option, + shdrs: Option>, + shstrtab: Option>, + phdrs: Option>, + symtab: Option>, // in address order + symtab_origin: Option>, // The copy in the same order as the file + strtab: Option>, + str2symtab: Option>, // strtab offset to symtab in the dictionary order + sect_cache: Vec>>, +} + +/// A parser against ELF64 format. +/// +pub struct ElfParser { + file: RefCell, + backobj: RefCell, +} + +impl ElfParser { + pub fn open_file(file: File) -> Result { + let parser = ElfParser { + file: RefCell::new(file), + backobj: RefCell::new(ElfParserBack { + ehdr: None, + shdrs: None, + shstrtab: None, + phdrs: None, + symtab: None, + symtab_origin: None, + strtab: None, + str2symtab: None, + sect_cache: vec![], + }), + }; + Ok(parser) + } + + #[cfg(test)] + pub fn open(filename: &Path) -> Result { + let file = File::open(filename)?; + let parser = Self::open_file(file); + if let Ok(parser) = parser { + Ok(parser) + } else { + parser + } + } + + fn ensure_ehdr(&self) -> Result<(), Error> { + let mut me = self.backobj.borrow_mut(); + + if me.ehdr.is_some() { + return Ok(()); + } + + let ehdr = read_elf_header(&mut self.file.borrow_mut())?; + if !(ehdr.e_ident[0] == 0x7f + && ehdr.e_ident[1] == 0x45 + && ehdr.e_ident[2] == 0x4c + && ehdr.e_ident[3] == 0x46) + { + return Err(Error::new(ErrorKind::InvalidData, "e_ident is wrong")); + } + + me.ehdr = Some(ehdr); + + Ok(()) + } + + fn ensure_shdrs(&self) -> Result<(), Error> { + self.ensure_ehdr()?; + + let mut me = self.backobj.borrow_mut(); + + if me.shdrs.is_some() { + return Ok(()); + } + + let shdrs = read_elf_sections(&mut self.file.borrow_mut(), me.ehdr.as_ref().unwrap())?; + me.sect_cache.resize(shdrs.len(), None); + me.shdrs = Some(shdrs); + + Ok(()) + } + + fn ensure_phdrs(&self) -> Result<(), Error> { + self.ensure_ehdr()?; + + let mut me = self.backobj.borrow_mut(); + + if me.phdrs.is_some() { + return Ok(()); + } + + let phdrs = + read_elf_program_headers(&mut self.file.borrow_mut(), me.ehdr.as_ref().unwrap())?; + me.phdrs = Some(phdrs); + + Ok(()) + } + + fn ensure_shstrtab(&self) -> Result<(), Error> { + self.ensure_shdrs()?; + + let mut me = self.backobj.borrow_mut(); + + 
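+ // As with the other ensure_* helpers, the section-header string table is
+ // loaded lazily on first use and kept for subsequent calls.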
if me.shstrtab.is_some() { + return Ok(()); + } + + let shstrndx = me.ehdr.as_ref().unwrap().e_shstrndx; + let shstrtab_sec = &me.shdrs.as_ref().unwrap()[shstrndx as usize]; + let shstrtab = read_elf_section_raw(&mut self.file.borrow_mut(), shstrtab_sec)?; + me.shstrtab = Some(shstrtab); + + Ok(()) + } + + fn ensure_symtab(&self) -> Result<(), Error> { + { + let me = self.backobj.borrow(); + + if me.symtab.is_some() { + return Ok(()); + } + } + + let sect_idx = if let Ok(idx) = self.find_section(".symtab") { + idx + } else { + self.find_section(".dynsym")? + }; + let symtab_raw = self.read_section_raw(sect_idx)?; + + if symtab_raw.len() % mem::size_of::() != 0 { + return Err(Error::new( + ErrorKind::InvalidData, + "size of the .symtab section does not match", + )); + } + let cnt = symtab_raw.len() / mem::size_of::(); + let mut symtab: Vec = unsafe { + let symtab_ptr = symtab_raw.as_ptr() as *mut Elf64_Sym; + symtab_raw.leak(); + Vec::from_raw_parts(symtab_ptr, cnt, cnt) + }; + let origin = symtab.clone(); + symtab.sort_by_key(|x| x.st_value); + + let mut me = self.backobj.borrow_mut(); + me.symtab = Some(symtab); + me.symtab_origin = Some(origin); + + Ok(()) + } + + fn ensure_strtab(&self) -> Result<(), Error> { + { + let me = self.backobj.borrow(); + + if me.strtab.is_some() { + return Ok(()); + } + } + + let sect_idx = if let Ok(idx) = self.find_section(".strtab") { + idx + } else { + self.find_section(".dynstr")? + }; + let strtab = self.read_section_raw(sect_idx)?; + + let mut me = self.backobj.borrow_mut(); + me.strtab = Some(strtab); + + Ok(()) + } + + fn ensure_str2symtab(&self) -> Result<(), Error> { + self.ensure_symtab()?; + self.ensure_strtab()?; + + let mut me = self.backobj.borrow_mut(); + if me.str2symtab.is_some() { + return Ok(()); + } + + // Build strtab offsets to symtab indices + let strtab = me.strtab.as_ref().unwrap(); + let symtab = me.symtab.as_ref().unwrap(); + let mut str2symtab = Vec::<(usize, usize)>::with_capacity(symtab.len()); + for (sym_i, sym) in symtab.iter().enumerate() { + let name_off = sym.st_name; + str2symtab.push((name_off as usize, sym_i)); + } + + // Sort in the dictionary order + str2symtab + .sort_by_key(|&x| unsafe { CStr::from_ptr(&strtab[x.0] as *const u8 as *const i8) }); + + me.str2symtab = Some(str2symtab); + + Ok(()) + } + + pub fn get_elf_file_type(&self) -> Result { + self.ensure_ehdr()?; + + let me = self.backobj.borrow(); + + Ok(me.ehdr.as_ref().unwrap().e_type) + } + + fn check_section_index(&self, sect_idx: usize) -> Result<(), Error> { + let nsects = self.get_num_sections()?; + + if nsects <= sect_idx { + return Err(Error::new(ErrorKind::InvalidInput, "the index is too big")); + } + Ok(()) + } + + pub fn section_seek(&self, sect_idx: usize) -> Result<(), Error> { + self.check_section_index(sect_idx)?; + self.ensure_shdrs()?; + let me = self.backobj.borrow(); + read_elf_section_seek( + &mut self.file.borrow_mut(), + &me.shdrs.as_ref().unwrap()[sect_idx], + ) + } + + /// Read the raw data of the section of a given index. + pub fn read_section_raw(&self, sect_idx: usize) -> Result, Error> { + self.check_section_index(sect_idx)?; + self.ensure_shdrs()?; + + let me = self.backobj.borrow(); + read_elf_section_raw( + &mut self.file.borrow_mut(), + &me.shdrs.as_ref().unwrap()[sect_idx], + ) + } + + /// Read the raw data of the section of a given index. 
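+ ///
+ /// Unlike [`ElfParser::read_section_raw`], the bytes read here are kept in
+ /// an internal per-section cache, so repeated calls with the same index do
+ /// not re-read the file.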
+ pub fn read_section_raw_cache(&self, sect_idx: usize) -> Result<&[u8], Error> { + self.check_section_index(sect_idx)?; + self.ensure_shdrs()?; + + let mut me = self.backobj.borrow_mut(); + if me.sect_cache[sect_idx].is_none() { + let buf = read_elf_section_raw( + &mut self.file.borrow_mut(), + &me.shdrs.as_ref().unwrap()[sect_idx], + )?; + me.sect_cache[sect_idx] = Some(buf); + } + + Ok(unsafe { mem::transmute(me.sect_cache[sect_idx].as_ref().unwrap().as_slice()) }) + } + + /// Get the name of the section of a given index. + pub fn get_section_name(&self, sect_idx: usize) -> Result<&str, Error> { + self.check_section_index(sect_idx)?; + + self.ensure_shstrtab()?; + + let me = self.backobj.borrow(); + + let sect = &me.shdrs.as_ref().unwrap()[sect_idx]; + let name = get_elf_section_name(sect, unsafe { + (*self.backobj.as_ptr()).shstrtab.as_ref().unwrap() + }); + if name.is_none() { + return Err(Error::new(ErrorKind::InvalidData, "invalid section name")); + } + Ok(name.unwrap()) + } + + pub fn get_section_size(&self, sect_idx: usize) -> Result { + self.check_section_index(sect_idx)?; + self.ensure_shdrs()?; + + let me = self.backobj.borrow(); + let sect = &me.shdrs.as_ref().unwrap()[sect_idx]; + Ok(sect.sh_size as usize) + } + + pub fn get_num_sections(&self) -> Result { + self.ensure_ehdr()?; + let me = self.backobj.borrow(); + Ok(me.ehdr.as_ref().unwrap().e_shnum as usize) + } + + /// Find the section of a given name. + /// + /// This function return the index of the section if found. + pub fn find_section(&self, name: &str) -> Result { + let nsects = self.get_num_sections()?; + for i in 0..nsects { + if self.get_section_name(i)? == name { + return Ok(i); + } + } + Err(Error::new( + ErrorKind::NotFound, + format!("unable to find ELF section: {name}"), + )) + } + + pub fn find_symbol(&self, address: u64, st_type: u8) -> Result<(&str, u64), Error> { + self.ensure_symtab()?; + self.ensure_strtab()?; + + let me = self.backobj.borrow(); + let idx_r = + search_address_opt_key(me.symtab.as_ref().unwrap(), address, &|sym: &Elf64_Sym| { + if sym.st_info & 0xf != st_type || sym.st_shndx == SHN_UNDEF { + None + } else { + Some(sym.st_value) + } + }); + if idx_r.is_none() { + return Err(Error::new( + ErrorKind::NotFound, + "Does not found a symbol for the given address", + )); + } + let idx = idx_r.unwrap(); + + let sym = &me.symtab.as_ref().unwrap()[idx]; + let sym_name = match extract_string( + unsafe { (*self.backobj.as_ptr()).strtab.as_ref().unwrap().as_slice() }, + sym.st_name as usize, + ) { + Some(sym_name) => sym_name, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "invalid symbol name string/offset", + )); + } + }; + Ok((sym_name, sym.st_value)) + } + + pub fn find_address(&self, name: &str, opts: &FindAddrOpts) -> Result, Error> { + if let SymbolType::Variable = opts.sym_type { + return Err(Error::new(ErrorKind::Unsupported, "Not implemented")); + } + + self.ensure_str2symtab()?; + + let me = self.backobj.borrow(); + let str2symtab = me.str2symtab.as_ref().unwrap(); + let strtab = me.strtab.as_ref().unwrap(); + let r = str2symtab.binary_search_by_key(&name.to_string(), |&x| { + String::from( + unsafe { CStr::from_ptr(&strtab[x.0] as *const u8 as *const i8) } + .to_str() + .unwrap(), + ) + }); + + match r { + Ok(str2sym_i) => { + let mut idx = str2sym_i; + while idx > 0 { + let name_seek = unsafe { + CStr::from_ptr(&strtab[str2symtab[idx].0] as *const u8 as *const i8) + .to_str() + .unwrap() + }; + if !name_seek.eq(name) { + idx += 1; + break; + } + idx -= 1; + } + + let 
mut found = vec![]; + for idx in idx..str2symtab.len() { + let name_visit = unsafe { + CStr::from_ptr(&strtab[str2symtab[idx].0] as *const u8 as *const i8) + .to_str() + .unwrap() + }; + if !name_visit.eq(name) { + break; + } + let sym_i = str2symtab[idx].1; + let sym_ref = &me.symtab.as_ref().unwrap()[sym_i]; + if sym_ref.st_shndx != SHN_UNDEF { + found.push(SymbolInfo { + name: name.to_string(), + address: sym_ref.st_value, + size: sym_ref.st_size, + sym_type: SymbolType::Function, + ..Default::default() + }); + } + } + Ok(found) + } + Err(_) => Ok(vec![]), + } + } + + pub fn find_address_regex( + &self, + pattern: &str, + opts: &FindAddrOpts, + ) -> Result, Error> { + if let SymbolType::Variable = opts.sym_type { + return Err(Error::new(ErrorKind::Unsupported, "Not implemented")); + } + + self.ensure_str2symtab()?; + + let me = self.backobj.borrow(); + let str2symtab = me.str2symtab.as_ref().unwrap(); + let strtab = me.strtab.as_ref().unwrap(); + let re = Regex::new(pattern).unwrap(); + let mut syms = vec![]; + for (str_off, sym_i) in str2symtab { + let sname = unsafe { + CStr::from_ptr(&strtab[*str_off] as *const u8 as *const i8) + .to_str() + .unwrap() + }; + if re.is_match(sname) { + let sym_ref = &me.symtab.as_ref().unwrap()[*sym_i]; + if sym_ref.st_shndx != SHN_UNDEF { + syms.push(SymbolInfo { + name: sname.to_string(), + address: sym_ref.st_value, + size: sym_ref.st_size, + sym_type: SymbolType::Function, + ..Default::default() + }); + } + } + } + Ok(syms) + } + + #[cfg(test)] + fn get_symbol(&self, idx: usize) -> Result<&Elf64_Sym, Error> { + self.ensure_symtab()?; + + let me = self.backobj.as_ptr(); + Ok(unsafe { &(*me).symtab.as_mut().unwrap()[idx] }) + } + + #[cfg(test)] + fn get_symbol_name(&self, idx: usize) -> Result<&str, Error> { + let sym = self.get_symbol(idx)?; + + let me = self.backobj.as_ptr(); + let sym_name = match extract_string( + unsafe { (*me).strtab.as_ref().unwrap().as_slice() }, + sym.st_name as usize, + ) { + Some(name) => name, + None => { + return Err(Error::new( + ErrorKind::InvalidData, + "invalid symb name string/offset", + )); + } + }; + + Ok(sym_name) + } + + pub fn get_all_program_headers(&self) -> Result<&[Elf64_Phdr], Error> { + self.ensure_phdrs()?; + + let phdrs = unsafe { + let me = self.backobj.as_ptr(); + let phdrs_ref = (*me).phdrs.as_mut().unwrap(); + phdrs_ref + }; + Ok(phdrs) + } + + #[cfg(test)] + fn pick_symtab_addr(&self) -> (&str, u64) { + self.ensure_symtab().unwrap(); + self.ensure_strtab().unwrap(); + + let me = self.backobj.borrow(); + let symtab = me.symtab.as_ref().unwrap(); + let mut idx = symtab.len() / 2; + while symtab[idx].st_info & 0xf != STT_FUNC || symtab[idx].st_shndx == SHN_UNDEF { + idx += 1; + } + let sym = &symtab[idx]; + let addr = sym.st_value; + drop(me); + + let sym_name = self.get_symbol_name(idx).unwrap(); + (sym_name, addr) + } + + /// Read raw data from the file at the current position. + /// + /// The caller can use section_seek() to move the current position + /// of the backed file. However, this function doesn't promise to + /// not cross the boundary of the section. The caller should take + /// care about it. 
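+ ///
+ /// A typical sequence is to call [`ElfParser::section_seek`] for the wanted
+ /// section first, then issue `read_raw()` calls for no more bytes than
+ /// [`ElfParser::get_section_size`] reports for that section.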
+ pub unsafe fn read_raw(&self, buf: &mut [u8]) -> Result<(), Error> { + self.file.borrow_mut().read_exact(buf)?; + Ok(()) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + + #[test] + fn test_elf_header_sections() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-no-debug.bin"); + + let mut bin_file = File::open(bin_name).unwrap(); + let ehdr = read_elf_header(&mut bin_file); + assert!(ehdr.is_ok()); + let ehdr = ehdr.unwrap(); + assert_eq!( + ehdr.e_ident, + [ + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 + ] + ); + assert_eq!(ehdr.e_version, 0x1); + assert_eq!(ehdr.e_shentsize as usize, mem::size_of::()); + + let shdrs = read_elf_sections(&mut bin_file, &ehdr); + assert!(shdrs.is_ok()); + let shdrs = shdrs.unwrap(); + let shstrndx = ehdr.e_shstrndx as usize; + + let shstrtab_sec = &shdrs[shstrndx]; + let shstrtab = read_elf_section_raw(&mut bin_file, shstrtab_sec); + assert!(shstrtab.is_ok()); + let shstrtab = shstrtab.unwrap(); + + let sec_name = get_elf_section_name(shstrtab_sec, &shstrtab); + assert!(sec_name.is_some()); + assert_eq!(sec_name.unwrap(), ".shstrtab"); + } + + #[test] + fn test_elf64_parser() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-no-debug.bin"); + + let parser = ElfParser::open(bin_name.as_ref()).unwrap(); + assert!(parser.find_section(".shstrtab").is_ok()); + } + + #[test] + fn test_elf64_symtab() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-no-debug.bin"); + + let parser = ElfParser::open(bin_name.as_ref()).unwrap(); + assert!(parser.find_section(".shstrtab").is_ok()); + + let (sym_name, addr) = parser.pick_symtab_addr(); + + let sym_r = parser.find_symbol(addr, STT_FUNC); + assert!(sym_r.is_ok()); + let (sym_name_ret, addr_ret) = sym_r.unwrap(); + assert_eq!(addr_ret, addr); + assert_eq!(sym_name_ret, sym_name); + } + + #[test] + fn test_elf64_find_address() { + let bin_name = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test-no-debug.bin"); + + let parser = ElfParser::open(bin_name.as_ref()).unwrap(); + assert!(parser.find_section(".shstrtab").is_ok()); + + let (sym_name, addr) = parser.pick_symtab_addr(); + + println!("{sym_name}"); + let opts = FindAddrOpts { + offset_in_file: false, + obj_file_name: false, + sym_type: SymbolType::Unknown, + }; + let addr_r = parser.find_address(sym_name, &opts).unwrap(); + assert_eq!(addr_r.len(), 1); + assert!(addr_r.iter().any(|x| x.address == addr)); + } +} diff --git a/third_party/blazesym/src/elf/resolver.rs b/third_party/blazesym/src/elf/resolver.rs new file mode 100644 index 0000000..a1215d4 --- /dev/null +++ b/third_party/blazesym/src/elf/resolver.rs @@ -0,0 +1,186 @@ +use std::io::ErrorKind; +use std::path::Path; +use std::path::PathBuf; + +use crate::AddressLineInfo; +use crate::CacheHolder; +use crate::Error; +use crate::FindAddrOpts; +use crate::SymResolver; +use crate::SymbolInfo; + +use super::cache::ElfBackend; +use super::types::ET_DYN; +use super::types::ET_EXEC; +use super::types::PF_X; +use super::types::PT_LOAD; +use super::types::STT_FUNC; +use super::ElfParser; + +/// The symbol resolver for a single ELF file. +/// +/// An ELF file may be loaded into an address space with a relocation. +/// The callers should provide the path of an ELF file and where it's +/// executable segment(s) is loaded. 
+/// +/// For some ELF files, they are located at a specific address +/// determined during compile-time. For these cases, just pass `0` as +/// it's loaded address. +pub struct ElfResolver { + backend: ElfBackend, + loaded_address: u64, + loaded_to_virt: u64, + foff_to_virt: u64, + size: u64, + file_name: PathBuf, +} + +impl ElfResolver { + pub(crate) fn new( + file_name: &Path, + loaded_address: u64, + cache_holder: &CacheHolder, + ) -> Result { + let backend = cache_holder.get_elf_cache().find(file_name)?; + let parser = match &backend { + ElfBackend::Dwarf(dwarf) => dwarf.get_parser(), + ElfBackend::Elf(parser) => parser, + }; + let e_type = parser.get_elf_file_type()?; + let phdrs = parser.get_all_program_headers()?; + + // Find the size of the block where the ELF file is/was + // mapped. + let mut max_addr = 0; + let mut low_addr = 0xffffffffffffffff; + let mut low_off = 0xffffffffffffffff; + if e_type == ET_DYN || e_type == ET_EXEC { + for phdr in phdrs { + if phdr.p_type != PT_LOAD { + continue; + } + if (phdr.p_flags & PF_X) != PF_X { + continue; + } + let end_at = phdr.p_vaddr + phdr.p_memsz; + if max_addr < end_at { + max_addr = end_at; + } + if phdr.p_vaddr < low_addr { + low_addr = phdr.p_vaddr; + low_off = phdr.p_offset; + } + } + } else { + return Err(Error::new(ErrorKind::InvalidData, "unknown e_type")); + } + + let loaded_address = if e_type == ET_EXEC { + low_addr + } else { + loaded_address + }; + let loaded_to_virt = low_addr; + let foff_to_virt = low_addr - low_off; + let size = max_addr - low_addr; + + Ok(ElfResolver { + backend, + loaded_address, + loaded_to_virt, + foff_to_virt, + size, + file_name: file_name.to_path_buf(), + }) + } + + fn get_parser(&self) -> Option<&ElfParser> { + match &self.backend { + ElfBackend::Dwarf(dwarf) => Some(dwarf.get_parser()), + ElfBackend::Elf(parser) => Some(parser), + } + } +} + +impl SymResolver for ElfResolver { + fn get_address_range(&self) -> (u64, u64) { + (self.loaded_address, self.loaded_address + self.size) + } + + fn find_symbols(&self, addr: u64) -> Vec<(&str, u64)> { + let off = addr - self.loaded_address + self.loaded_to_virt; + let parser = if let Some(parser) = self.get_parser() { + parser + } else { + return vec![]; + }; + + match parser.find_symbol(off, STT_FUNC) { + Ok((name, start_addr)) => { + vec![(name, start_addr - self.loaded_to_virt + self.loaded_address)] + } + Err(_) => vec![], + } + } + + fn find_address(&self, name: &str, opts: &FindAddrOpts) -> Option> { + let mut addr_res = match &self.backend { + ElfBackend::Dwarf(dwarf) => dwarf.find_address(name, opts), + ElfBackend::Elf(parser) => parser.find_address(name, opts), + } + .ok()?; + for x in &mut addr_res { + x.address = x.address - self.loaded_to_virt + self.loaded_address; + } + Some(addr_res) + } + + fn find_address_regex(&self, pattern: &str, opts: &FindAddrOpts) -> Option> { + let syms = match &self.backend { + ElfBackend::Dwarf(dwarf) => dwarf.find_address_regex(pattern, opts), + ElfBackend::Elf(parser) => parser.find_address_regex(pattern, opts), + }; + if syms.is_err() { + return None; + } + let mut syms = syms.unwrap(); + for sym in &mut syms { + sym.address = sym.address - self.loaded_to_virt + self.loaded_address; + } + Some(syms) + } + + fn find_line_info(&self, addr: u64) -> Option { + let off = addr - self.loaded_address + self.loaded_to_virt; + if let ElfBackend::Dwarf(dwarf) = &self.backend { + let (directory, file, line_no) = dwarf.find_line_as_ref(off)?; + let mut path = String::from(directory); + if !path.is_empty() && 
&path[(path.len() - 1)..] != "/" { + path.push('/'); + } + path.push_str(file); + Some(AddressLineInfo { + path, + line_no, + column: 0, + }) + } else { + None + } + } + + fn addr_file_off(&self, addr: u64) -> Option { + Some(addr - self.loaded_address + self.loaded_to_virt - self.foff_to_virt) + } + + fn get_obj_file_name(&self) -> &Path { + &self.file_name + } + + fn repr(&self) -> String { + match self.backend { + ElfBackend::Dwarf(_) => format!("DWARF {}", self.file_name.display()), + ElfBackend::Elf(_) => format!("ELF {}", self.file_name.display()), + } + } +} diff --git a/third_party/blazesym/src/elf/types.rs b/third_party/blazesym/src/elf/types.rs new file mode 100644 index 0000000..4e33282 --- /dev/null +++ b/third_party/blazesym/src/elf/types.rs @@ -0,0 +1,41 @@ +pub use libc::Elf64_Addr; +pub use libc::Elf64_Half; +pub use libc::Elf64_Off; +pub use libc::Elf64_Phdr; +pub use libc::Elf64_Shdr; +pub use libc::Elf64_Sxword; +pub use libc::Elf64_Sym; +pub use libc::Elf64_Word; +pub use libc::Elf64_Xword; + +pub use libc::Elf64_Ehdr; +pub use libc::ET_CORE; +pub use libc::ET_DYN; +pub use libc::ET_EXEC; +pub use libc::ET_HIPROC; +pub use libc::ET_LOPROC; +pub use libc::ET_NONE; +pub use libc::ET_REL; + +pub use libc::PF_R; +pub use libc::PF_W; +pub use libc::PF_X; + +pub use libc::PT_DYNAMIC; +pub use libc::PT_GNU_EH_FRAME; +pub use libc::PT_GNU_STACK; +pub use libc::PT_HIOS; +pub use libc::PT_HIPROC; +pub use libc::PT_INTERP; +pub use libc::PT_LOAD; +pub use libc::PT_LOOS; +pub use libc::PT_LOPROC; +pub use libc::PT_NOTE; +pub use libc::PT_NULL; +pub use libc::PT_PHDR; +pub use libc::PT_SHLIB; +pub use libc::PT_TLS; + +pub const SHN_UNDEF: u16 = 0; + +pub const STT_FUNC: u8 = 2; diff --git a/third_party/blazesym/src/gsym/linetab.rs b/third_party/blazesym/src/gsym/linetab.rs new file mode 100644 index 0000000..71d5d6c --- /dev/null +++ b/third_party/blazesym/src/gsym/linetab.rs @@ -0,0 +1,147 @@ +//! Opcode runner of GSYM line table. + +use crate::util::decode_leb128; +use crate::util::decode_leb128_s; + +/// End of the line table +const END_SEQUENCE: u8 = 0x00; +/// Set [`LineTableRow.file_idx`], don't push a row. +const SET_FILE: u8 = 0x01; +/// Increment [`LineTableRow.address`], and push a row. +const ADVANCE_PC: u8 = 0x02; +/// Set [`LineTableRow.file_line`], don't push a row. +const ADVANCE_LINE: u8 = 0x03; +/// All special opcodes push a row. +const FIRST_SPECIAL: u8 = 0x04; + + +#[derive(Debug)] +pub enum RunResult { + /// Run the operator successfully. + Ok(usize), + /// This operator creates a new row. + NewRow(usize), + /// The end of the program (the operator stream.) + End, + /// Fails to run the operator at the position. + Err, +} + +#[derive(Debug)] +pub struct LineTableHeader { + /// `min_data` & `max_delta` together is used to set the range and encoding + /// of line delta in special operator. Line delta is the number of lines + /// that a line table row is different from the previous row. + pub min_delta: i64, + pub max_delta: i64, + pub first_line: u32, +} + +#[derive(Clone, Debug)] +pub struct LineTableRow { + pub address: u64, + pub file_idx: u32, + pub file_line: u32, +} + +impl LineTableRow { + /// Create a `LineTableRow` to use as the states of a line table virtual + /// machine. + /// + /// The returned `LineTableRow` can be passed to [`run_op`] as `ctx`. + /// + /// # Arguments + /// + /// * `header` - is a [`LineTableHeader`] returned by [`parse_line_table_header()`]. + /// * `symaddr` - the address of the symbol that `header` belongs to. 
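+ ///
+ /// # Example (sketch)
+ ///
+ /// ```ignore
+ /// // `header`, `ops` and `symaddr` are assumed to come from
+ /// // parse_line_table_header() and the enclosing AddressData; the loop
+ /// // mirrors how GsymResolver::find_line_info() drives the line table VM.
+ /// let mut row = LineTableRow::line_table_row_from(&header, symaddr);
+ /// let mut pc = 0;
+ /// while pc < ops.len() {
+ ///     match run_op(&mut row, &header, ops, pc) {
+ ///         RunResult::Ok(bytes) => pc += bytes,
+ ///         RunResult::NewRow(bytes) => {
+ ///             // `row` now holds a complete (address, file_idx, file_line) entry.
+ ///             pc += bytes;
+ ///         }
+ ///         RunResult::End | RunResult::Err => break,
+ ///     }
+ /// }
+ /// ```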
+ pub fn line_table_row_from(header: &LineTableHeader, symaddr: u64) -> LineTableRow { + Self { + address: symaddr, + file_idx: 1, + file_line: header.first_line, + } + } +} + + +/// Run a GSYM line table operator/instruction in the buffer. +/// +/// # Arguments +/// +/// * `ctx` - a line table row to present the current states of the virtual +/// machine. [`line_table_row_from()`] can create a `LineTableRow` to +/// keep the states of a virtual machine. +/// * `header` - is a `LineTableHeader`. +/// * `ops` - is the buffer of the operators following the `LineTableHeader` in +/// a GSYM file. +/// * `pc` - is the program counter of the virtual machine. +/// +/// Returns a [`RunResult`]. `Ok` and `NewRow` will return the size of this +/// instruction. The caller should adjust the value of `pc` according to the +/// value returned. +pub fn run_op( + ctx: &mut LineTableRow, + header: &LineTableHeader, + ops: &[u8], + pc: usize, +) -> RunResult { + let mut off = pc; + let op = ops[off]; + off += 1; + match op { + END_SEQUENCE => RunResult::End, + SET_FILE => { + if let Some((f, bytes)) = decode_leb128(&ops[off..]) { + off += bytes as usize; + ctx.file_idx = f as u32; + RunResult::Ok(off - pc) + } else { + RunResult::Err + } + } + ADVANCE_PC => { + if let Some((adv, bytes)) = decode_leb128(&ops[off..]) { + off += bytes as usize; + ctx.address += adv; + RunResult::NewRow(off - pc) + } else { + RunResult::Err + } + } + ADVANCE_LINE => { + if let Some((adv, bytes)) = decode_leb128_s(&ops[off..]) { + off += bytes as usize; + ctx.file_line = (ctx.file_line as i64 + adv) as u32; + RunResult::Ok(off - pc) + } else { + RunResult::Err + } + } + // Special operators. + // + // All operators that have a value greater than or equal to + // FIRST_SPECIAL are considered special operators. These operators + // change both the line number and address of the virtual machine and + // emit a new row. + _ => { + let adjusted = (op - FIRST_SPECIAL) as i64; + // The range of line number delta is from min_delta to max_delta, + // including max_delta. + let range = header.max_delta - header.min_delta + 1; + if range == 0 { + return RunResult::Err; + } + let line_delta = header.min_delta + (adjusted % range); + let addr_delta = adjusted / range; + + let file_line = ctx.file_line as i32 + line_delta as i32; + if file_line < 1 { + return RunResult::Err; + } + + ctx.file_line = file_line as u32; + ctx.address = (ctx.address as i64 + addr_delta) as u64; + RunResult::NewRow(off - pc) + } + } +} diff --git a/third_party/blazesym/src/gsym/mod.rs b/third_party/blazesym/src/gsym/mod.rs new file mode 100644 index 0000000..5dc54f1 --- /dev/null +++ b/third_party/blazesym/src/gsym/mod.rs @@ -0,0 +1,6 @@ +mod linetab; +mod parser; +mod resolver; +mod types; + +pub use resolver::GsymResolver; diff --git a/third_party/blazesym/src/gsym/parser.rs b/third_party/blazesym/src/gsym/parser.rs new file mode 100644 index 0000000..e070463 --- /dev/null +++ b/third_party/blazesym/src/gsym/parser.rs @@ -0,0 +1,469 @@ +//! Parser of GSYM format. +//! +//! The layout of a standalone GSYM contains following sections in the order. +//! +//! * Header +//! * Address Table +//! * Address Data Offset Table +//! * File Table +//! * String Table +//! * Address Data +//! +//! The standalone GSYM starts with a Header, which describes the +//! size of an entry in the address table, the number of entries in +//! the address table, and the location and the size of the string +//! table. +//! +//! 
Since the Address Table is immediately after the Header, the +//! Header describes only the size of an entry and number of entries +//! in the table but not where it is. The Address Table comprises +//! addresses of symbols in the ascending order, so we can find the +//! symbol an address belonging to by doing a binary search to find +//! the most close address but smaller or equal. +//! +//! The Address Data Offset Table has the same number of entries as +//! the Address Table. Every entry in one table will has +//! corresponding entry at the same offset in the other table. The +//! entries in the Address Data Offset Table are always 32bits +//! (4bytes.) It is the file offset to the respective Address +//! Data. (AddressInfo actually) +//! +//! An AddressInfo comprises the size and name of a symbol. The name +//! is an offset in the string table. You will find a null terminated +//! C string at the give offset. The size is the number of bytes of +//! the respective object; ex, a function or variable. +//! +//! See + +use std::ffi::CStr; +use std::io::{Error, ErrorKind}; + +use crate::util::decode_leb128; +use crate::util::decode_leb128_s; +use crate::util::decode_udword; +use crate::util::decode_uhalf; +use crate::util::decode_uword; + +use super::linetab::LineTableHeader; +use super::types::AddressData; +use super::types::AddressInfo; +use super::types::FileInfo; +use super::types::Header; +use super::types::InfoTypeEndOfList; +use super::types::InfoTypeInlineInfo; +use super::types::InfoTypeLineTableInfo; +use super::types::ADDR_DATA_OFFSET_SIZE; +use super::types::FILE_INFO_SIZE; +use super::types::GSYM_MAGIC; +use super::types::GSYM_VERSION; + +/// Hold the major parts of a standalone GSYM file. +/// +/// GsymContext provides functions to access major entities in GSYM. +/// GsymContext can find respective AddressInfo for an address. But, +/// it doesn't parse AddressData to get line numbers. +/// +/// The developers should use [`parse_address_data()`], +/// [`parse_line_table_header()`], and [`linetab::run_op()`] to get +/// line number information from [`AddressInfo`]. +pub struct GsymContext<'a> { + header: Header, + addr_tab: &'a [u8], + addr_data_off_tab: &'a [u8], + file_tab: &'a [u8], + str_tab: &'a [u8], + raw_data: &'a [u8], +} + +impl<'a> GsymContext<'a> { + /// Parse the Header of a standalone GSYM file. + /// + /// # Arguments + /// + /// * `data` - is the content of a standalone GSYM. + /// + /// Returns a GsymContext, which includes the Header and other important tables. + pub fn parse_header(data: &[u8]) -> Result { + let mut off = 0; + // Parse Header + let magic = decode_uword(data); + if magic != GSYM_MAGIC { + return Err(Error::new(ErrorKind::InvalidData, "invalid magic number")); + } + off += 4; + let version = decode_uhalf(&data[off..]); + if version != GSYM_VERSION { + return Err(Error::new(ErrorKind::InvalidData, "unknown version number")); + } + off += 2; + let addr_off_size = data[off]; + off += 1; + let uuid_size = data[off]; + off += 1; + let base_address = decode_udword(&data[off..]); + off += 8; + let num_addrs = decode_uword(&data[off..]); + off += 4; + let strtab_offset = decode_uword(&data[off..]); + off += 4; + let strtab_size = decode_uword(&data[off..]); + off += 4; + let uuid: [u8; 20] = (&data[off..(off + 20)]) + .try_into() + .expect("input data is too short"); + off += 20; + + // Get the slices of the Address Table, Address Data Offset Table, + // and String table. 
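+ // Each table is carved out of `data` by size alone: the Address Table
+ // holds `num_addrs` entries of `addr_off_size` bytes each, and the
+ // Address Data Offset Table that follows is aligned up to a 4-byte
+ // boundary (`(end_off + 0x3) & !0x3`) before its `num_addrs` 4-byte
+ // entries are taken.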
+ let end_off = off + num_addrs as usize * addr_off_size as usize; + if end_off > data.len() { + return Err(Error::new( + ErrorKind::InvalidData, + "the size of the file is smaller than expectation (address table)", + )); + } + let addr_tab = &data[off..end_off]; + off = (end_off + 0x3) & !0x3; + let end_off = off + num_addrs as usize * ADDR_DATA_OFFSET_SIZE; + if end_off > data.len() { + return Err(Error::new( + ErrorKind::InvalidData, + "the size of the file is smaller than expectation (address data offset table)", + )); + } + let addr_data_off_tab = &data[off..end_off]; + off += num_addrs as usize * ADDR_DATA_OFFSET_SIZE; + let file_num = decode_uword(&data[off..]); + off += 4; + let end_off = off + file_num as usize * FILE_INFO_SIZE; + if end_off > data.len() { + return Err(Error::new( + ErrorKind::InvalidData, + "the size of the file is smaller than expectation (file table)", + )); + } + let file_tab = &data[off..end_off]; + let end_off = strtab_offset as usize + strtab_size as usize; + if end_off > data.len() { + return Err(Error::new( + ErrorKind::InvalidData, + "the size of the file is smaller than expectation (string table)", + )); + } + let str_tab = &data[strtab_offset as usize..end_off]; + + Ok(GsymContext { + header: Header { + magic, + version, + addr_off_size, + uuid_size, + base_address, + num_addrs, + strtab_offset, + strtab_size, + uuid, + }, + addr_tab, + addr_data_off_tab, + file_tab, + str_tab, + raw_data: data, + }) + } + + pub fn num_addresses(&self) -> usize { + self.header.num_addrs as usize + } + + /// Get the address of an entry in the Address Table. + pub fn addr_at(&self, idx: usize) -> Option { + if idx >= self.header.num_addrs as usize { + return None; + } + + let off = idx * self.header.addr_off_size as usize; + let mut addr = 0u64; + let mut shift = 0; + for d in &self.addr_tab[off..(off + self.header.addr_off_size as usize)] { + addr |= (*d as u64) << shift; + shift += 8; + } + addr += self.header.base_address; + Some(addr) + } + + /// Get the AddressInfo of an address given by an index. + pub fn addr_info(&self, idx: usize) -> Option { + if idx >= self.header.num_addrs as usize { + return None; + } + + let off = idx * ADDR_DATA_OFFSET_SIZE; + let ad_off = decode_uword(&self.addr_data_off_tab[off..]) as usize; + let size = decode_uword(&self.raw_data[ad_off..]); + let name = decode_uword(&self.raw_data[ad_off + 4..]); + let info = AddressInfo { + size, + name, + data: &self.raw_data[ad_off + 8..], + }; + + Some(info) + } + + /// Get the string at the given offset from the String Table. + pub fn get_str(&self, off: usize) -> Option<&str> { + if off >= self.str_tab.len() { + return None; + } + + // Ensure there is a null byte. + let mut null_off = self.str_tab.len() - 1; + while null_off > off && self.str_tab[null_off] != 0 { + null_off -= 1; + } + if null_off == off { + return Some(""); + } + + // SAFETY: the lifetime of `CStr` can live as long as `self`. + // The returned reference can also live as long as `self`. 
+ unsafe { + CStr::from_ptr(self.str_tab[off..].as_ptr() as *const i8) + .to_str() + .ok() + } + } + + pub fn file_info(&self, idx: usize) -> Option { + if idx >= self.file_tab.len() / FILE_INFO_SIZE { + return None; + } + let mut off = idx * FILE_INFO_SIZE; + let directory = decode_uword(&self.file_tab[off..(off + 4)]); + off += 4; + let filename = decode_uword(&self.file_tab[off..(off + 4)]); + let info = FileInfo { + directory, + filename, + }; + Some(info) + } +} + +/// Find the index of an entry in the address table most likely +/// containing the given address. +/// +/// The callers should check the respective `AddressInfo` to make sure +/// it is what they request for. +pub fn find_address(ctx: &GsymContext, addr: u64) -> Option { + let mut left = 0; + let mut right = ctx.num_addresses(); + + if right == 0 { + return None; + } + if addr < ctx.addr_at(0)? { + return None; + } + + while (left + 1) < right { + let v = (left + right) / 2; + let cur_addr = ctx.addr_at(v)?; + + if addr == cur_addr { + return Some(v); + } + if addr < cur_addr { + right = v; + } else { + left = v; + } + } + Some(left) +} + +/// Parse AddressData. +/// +/// AddressDatas are items following AndressInfo. +/// [`GsymContext::addr_info()`] returns the raw data of AddressDatas as a +/// slice at [`AddressInfo::data`]. +/// +/// # Arguments +/// +/// * `data` - is the slice from AddressInfo::data. +/// +/// Returns a vector of [`AddressData`]. +pub fn parse_address_data(data: &[u8]) -> Vec { + let mut data_objs = vec![]; + + let mut off = 0; + while off < data.len() { + let typ = decode_uword(&data[off..]); + off += 4; + let length = decode_uword(&data[off..]); + off += 4; + let d = &data[off..(off + length as usize)]; + data_objs.push(AddressData { + typ, + length, + data: d, + }); + off += length as usize; + + #[allow(non_upper_case_globals)] + match typ { + InfoTypeEndOfList => { + break; + } + InfoTypeLineTableInfo | InfoTypeInlineInfo => {} + _ => { + #[cfg(debug_assertions)] + eprintln!("unknown info type"); + } + } + } + + data_objs +} + +/// Parse AddressData of InfoTypeLineTableInfo. +/// +/// An `AddressData` of `InfoTypeLineTableInfo` type is a table of line numbers +/// for a symbol. `AddressData` is the payload of `AddressInfo`. One +/// `AddressInfo` may have several `AddressData` entries in its payload. Each +/// `AddressData` entry stores a type of data relates to the symbol the +/// `AddressInfo` presents. +/// +/// # Arguments +/// +/// * `data` - is what [`AddressData::data`] is. +/// +/// Returns the `LineTableHeader` and the size of the header of a +/// `AddressData` entry of `InfoTypeLineTableInfo` type in the payload +/// of an `Addressinfo`. 
+pub fn parse_line_table_header(data: &[u8]) -> Option<(LineTableHeader, usize)> { + let mut off = 0; + let (min_delta, bytes) = decode_leb128_s(&data[off..])?; + off += bytes as usize; + let (max_delta, bytes) = decode_leb128_s(&data[off..])?; + off += bytes as usize; + let (first_line, bytes) = decode_leb128(&data[off..])?; + off += bytes as usize; + + let header = LineTableHeader { + min_delta, + max_delta, + first_line: first_line as u32, + }; + Some((header, off)) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::env; + use std::fs::File; + use std::io::{Read, Write}; + use std::path::Path; + + + #[test] + fn test_parse_context() { + let test_gsym = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test.gsym"); + let mut gsym_fo = File::open(test_gsym).unwrap(); + let mut data = vec![]; + + gsym_fo.read_to_end(&mut data).unwrap(); + let ctx = GsymContext::parse_header(&data).unwrap(); + + let idx = find_address(&ctx, 0x0000000002000000).unwrap(); + let addrinfo = ctx.addr_info(idx).unwrap(); + assert_eq!(ctx.get_str(addrinfo.name as usize).unwrap(), "main"); + + let idx = find_address(&ctx, 0x0000000002000100).unwrap(); + let addrinfo = ctx.addr_info(idx).unwrap(); + assert_eq!(ctx.get_str(addrinfo.name as usize).unwrap(), "factorial"); + } + + #[test] + fn test_find_address() { + let test_gsym = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test.gsym"); + let mut gsym_fo = File::open(test_gsym).unwrap(); + let mut data = vec![]; + + const TEST_SIZE: usize = 6; + + gsym_fo.read_to_end(&mut data).unwrap(); + + let mut addr_tab = Vec::::new(); + addr_tab.resize(TEST_SIZE * 4, 0); + + let mut values: Vec = (0_u32..(TEST_SIZE as u32)).collect(); + + let copy_to_addr_tab = |values: &[u32], addr_tab: &mut Vec| { + addr_tab.clear(); + for v in values { + let r = addr_tab.write(&v.to_ne_bytes()); + assert!(r.is_ok()); + } + }; + // Generate all possible sequences that values are in strictly + // ascending order and `< TEST_SIZE * 2`. + let gen_values = |values: &mut [u32]| { + let mut carry_out = TEST_SIZE as u32 * 2; + for i in (0..values.len()).rev() { + values[i] += 1; + if values[i] >= carry_out { + carry_out -= 1; + continue; + } + // Make all values at right side minimal and strictly + // ascending. + for j in (i + 1)..values.len() { + values[j] = values[j - 1] + 1; + } + break; + } + }; + + while values[0] <= TEST_SIZE as u32 { + copy_to_addr_tab(&values, &mut addr_tab); + + for addr in 0..(TEST_SIZE * 2) { + let addr_tab = addr_tab.clone(); + let mut ctx = GsymContext::parse_header(&data).unwrap(); + ctx.header.num_addrs = TEST_SIZE as u32; + ctx.header.addr_off_size = 4; + ctx.header.base_address = 0; + ctx.addr_tab = addr_tab.as_slice(); + + let idx = find_address(&ctx, addr as u64).unwrap_or(0); + let addr_u32 = addr as u32; + let idx1 = match values.binary_search(&addr_u32) { + Ok(idx) => idx, + Err(idx) => { + // When the searching value is falling in + // between two values, it will return the + // index of the later one. But we want the + // earlier one. 
+ if idx > 0 { + idx - 1 + } else { + 0 + } + } + }; + assert_eq!(idx, idx1); + } + + gen_values(&mut values); + } + } +} diff --git a/third_party/blazesym/src/gsym/resolver.rs b/third_party/blazesym/src/gsym/resolver.rs new file mode 100644 index 0000000..a271d50 --- /dev/null +++ b/third_party/blazesym/src/gsym/resolver.rs @@ -0,0 +1,224 @@ +use std::fs::File; +use std::io::{Error, Read}; +use std::mem; +use std::path::{Path, PathBuf}; + +use crate::{AddressLineInfo, FindAddrOpts, SymResolver, SymbolInfo}; + +use super::linetab::run_op; +use super::linetab::LineTableRow; +use super::linetab::RunResult; +use super::parser::find_address; +use super::parser::parse_address_data; +use super::parser::parse_line_table_header; +use super::parser::GsymContext; +use super::types::InfoTypeLineTableInfo; + +/// The symbol resolver for the GSYM format. +pub struct GsymResolver { + file_name: PathBuf, + ctx: GsymContext<'static>, + _data: Vec, + loaded_address: u64, +} + +impl GsymResolver { + pub fn new(file_name: PathBuf, loaded_address: u64) -> Result { + let mut fo = File::open(&file_name)?; + let mut data = vec![]; + fo.read_to_end(&mut data)?; + let ctx = GsymContext::parse_header(&data)?; + + Ok(GsymResolver { + file_name, + // SAFETY: the lifetime of ctx depends on data, which is + // owned by the object. So, it is safe to strip the + // lifetime of ctx. + ctx: unsafe { mem::transmute(ctx) }, + _data: data, + loaded_address, + }) + } +} + +impl SymResolver for GsymResolver { + fn get_address_range(&self) -> (u64, u64) { + let sz = self.ctx.num_addresses(); + if sz == 0 { + return (0, 0); + } + + // TODO: Must not unwrap. + let start = self.ctx.addr_at(0).unwrap() + self.loaded_address; + // TODO: Must not unwrap. + let end = self.ctx.addr_at(sz - 1).unwrap() + + self.ctx.addr_info(sz - 1).unwrap().size as u64 + + self.loaded_address; + (start, end) + } + + fn find_symbols(&self, addr: u64) -> Vec<(&str, u64)> { + let addr = addr - self.loaded_address; + let idx = if let Some(idx) = find_address(&self.ctx, addr) { + idx + } else { + return vec![]; + }; + + let found = if let Some(addr) = self.ctx.addr_at(idx) { + addr + } else { + return vec![]; + }; + + if addr < found { + return vec![]; + } + + let info = if let Some(info) = self.ctx.addr_info(idx) { + info + } else { + return Vec::new(); + }; + + let name = if let Some(name) = self.ctx.get_str(info.name as usize) { + name + } else { + return Vec::new(); + }; + + vec![(name, found + self.loaded_address)] + } + + fn find_address(&self, _name: &str, _opts: &FindAddrOpts) -> Option> { + // It is inefficient to find the address of a symbol with + // GSYM. We may support it in the future if needed. + None + } + + fn find_address_regex(&self, _pattern: &str, _opts: &FindAddrOpts) -> Option> { + None + } + + /// Finds the source code location for a given address. + /// + /// This function takes in an address and returns the file path, + /// line number and column of the line in the source code that + /// the address corresponds to. If it doesn't find any match it + /// returns `None`. + /// + /// # Arguments + /// + /// * `addr` - The address to find the source code location for. + /// + /// # Returns + /// + /// The `AddressLineInfo` corresponding to the address or `None`. 
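+ ///
+ /// # Example (sketch; the GSYM path and address are hypothetical)
+ ///
+ /// ```ignore
+ /// let resolver = GsymResolver::new(PathBuf::from("a.out.gsym"), 0).unwrap();
+ /// if let Some(info) = resolver.find_line_info(0x2000000) {
+ ///     println!("{}:{}", info.path, info.line_no);
+ /// }
+ /// ```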
+ fn find_line_info(&self, addr: u64) -> Option { + let addr = addr.checked_sub(self.loaded_address)?; + let idx = find_address(&self.ctx, addr)?; + let symaddr = self.ctx.addr_at(idx)?; + if addr < symaddr { + return None; + } + let addrinfo = self.ctx.addr_info(idx)?; + if addr >= (symaddr + addrinfo.size as u64) { + return None; + } + + let addrdatas = parse_address_data(addrinfo.data); + for adr_ent in addrdatas { + if adr_ent.typ != InfoTypeLineTableInfo { + continue; + } + // Continue to execute all GSYM line table operations + // until the end of the buffer is reached or a row + // containing addr is located. + let (lntab_hdr, hdr_bytes) = parse_line_table_header(adr_ent.data)?; + let ops = &adr_ent.data[hdr_bytes..]; + let mut lntab_row = LineTableRow::line_table_row_from(&lntab_hdr, symaddr); + let mut last_lntab_row = lntab_row.clone(); + let mut row_cnt = 0; + let mut pc = 0; + while pc < ops.len() { + match run_op(&mut lntab_row, &lntab_hdr, ops, pc) { + RunResult::Ok(bytes) => { + pc += bytes; + } + RunResult::NewRow(bytes) => { + pc += bytes; + row_cnt += 1; + if addr < lntab_row.address { + if row_cnt == 1 { + // The address is lower than the first row. + return None; + } + // Rollback to the last row. + lntab_row = last_lntab_row; + break; + } + last_lntab_row = lntab_row.clone(); + } + RunResult::End | RunResult::Err => { + break; + } + } + } + + if row_cnt == 0 { + continue; + } + + let finfo = self.ctx.file_info(lntab_row.file_idx as usize)?; + let dirname = self.ctx.get_str(finfo.directory as usize)?; + let filename = self.ctx.get_str(finfo.filename as usize)?; + let path = Path::new(dirname).join(filename).to_str()?.to_string(); + return Some(AddressLineInfo { + path, + line_no: lntab_row.file_line as usize, + column: 0, + }); + } + None + } + + fn addr_file_off(&self, _addr: u64) -> Option { + // Unavailable + None + } + + fn get_obj_file_name(&self) -> &Path { + &self.file_name + } + + fn repr(&self) -> String { + format!("GSYM {:?}", self.file_name) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::env; + + + /// Make sure that we can find file line information for a function, if available. + #[test] + fn test_find_line_info() { + let test_gsym = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test.gsym"); + let resolver = GsymResolver::new(test_gsym, 0).unwrap(); + + // `main` resides at address 0x2000000, and it's located at line 19. + let info = resolver.find_line_info(0x2000000).unwrap(); + assert_eq!(info.line_no, 19); + assert!(info.path.ends_with("test-gsym.c")); + + // `factorial` resides at address 0x2000100, and it's located at line 7. + let info = resolver.find_line_info(0x2000100).unwrap(); + assert_eq!(info.line_no, 7); + assert!(info.path.ends_with("test-gsym.c")); + } +} diff --git a/third_party/blazesym/src/gsym/types.rs b/third_party/blazesym/src/gsym/types.rs new file mode 100644 index 0000000..b40b817 --- /dev/null +++ b/third_party/blazesym/src/gsym/types.rs @@ -0,0 +1,46 @@ +pub const GSYM_MAGIC: u32 = 0x4753594d; +pub const GSYM_VERSION: u16 = 1; + +/// The size of address data offsets in GSYM. +pub const ADDR_DATA_OFFSET_SIZE: usize = 4; +/// The size of a GSYM `FileInfo` object. 
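+/// (Two `u32` string-table offsets: `directory` and `filename`.)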
+pub const FILE_INFO_SIZE: usize = 8; + +/// GSYM File Header +pub struct Header { + pub magic: u32, + pub version: u16, + pub addr_off_size: u8, + pub uuid_size: u8, + pub base_address: u64, + pub num_addrs: u32, + pub strtab_offset: u32, + pub strtab_size: u32, + pub uuid: [u8; 20], +} + +pub struct FileInfo { + pub directory: u32, + pub filename: u32, +} + +pub struct AddressInfo<'a> { + pub size: u32, + pub name: u32, + /// The raw data comprises a list of [`AddressData`]. + pub data: &'a [u8], +} + +pub struct AddressData<'a> { + /// The data type. Its value should be one of InfoType*. + pub typ: u32, + pub length: u32, + pub data: &'a [u8], +} + +#[allow(non_upper_case_globals)] +pub const InfoTypeEndOfList: u32 = 0; +#[allow(non_upper_case_globals)] +pub const InfoTypeLineTableInfo: u32 = 1; +#[allow(non_upper_case_globals)] +pub const InfoTypeInlineInfo: u32 = 2; diff --git a/third_party/blazesym/src/ksym.rs b/third_party/blazesym/src/ksym.rs new file mode 100644 index 0000000..40f86fb --- /dev/null +++ b/third_party/blazesym/src/ksym.rs @@ -0,0 +1,415 @@ +use super::{FindAddrOpts, SymbolInfo, SymbolType}; + +use std::cell::RefCell; +use std::collections::HashMap; +use std::default::Default; +use std::fs::File; +use std::io::{BufRead, BufReader, Error}; +use std::path::{Path, PathBuf}; +use std::rc::Rc; +use std::u64; + +use crate::SymResolver; + +use regex::Regex; + +const KALLSYMS: &str = "/proc/kallsyms"; +const DFL_KSYM_CAP: usize = 200000; + +pub struct Ksym { + pub addr: u64, + pub name: String, +} + +/// The symbol resolver for /proc/kallsyms. +/// +/// The users should provide the path of kallsyms, so you can provide +/// a copy from other devices. +pub struct KSymResolver { + syms: Vec, + sym_to_addr: RefCell>, + file_name: PathBuf, +} + +impl KSymResolver { + pub fn new() -> KSymResolver { + Default::default() + } + + pub fn load_file_name(&mut self, filename: PathBuf) -> Result<(), std::io::Error> { + let f = File::open(&filename)?; + let mut reader = BufReader::new(f); + let mut line = String::new(); + + while let Ok(sz) = reader.read_line(&mut line) { + if sz == 0 { + break; + } + let tokens: Vec<&str> = line.split_whitespace().collect(); + if tokens.len() < 3 { + break; + } + let (addr, _symbol, func) = (tokens[0], tokens[1], tokens[2]); + if let Ok(addr) = u64::from_str_radix(addr, 16) { + if addr == 0 { + line.truncate(0); + continue; + } + let name = String::from(func); + self.syms.push(Ksym { addr, name }); + } + + line.truncate(0); + } + + self.syms.sort_by(|a, b| a.addr.cmp(&b.addr)); + + self.file_name = filename; + + Ok(()) + } + + pub fn load(&mut self) -> Result<(), std::io::Error> { + self.load_file_name(PathBuf::from(KALLSYMS)) + } + + fn ensure_sym_to_addr(&self) { + if self.sym_to_addr.borrow().len() > 0 { + return; + } + let mut sym_to_addr = self.sym_to_addr.borrow_mut(); + for Ksym { name, addr } in self.syms.iter() { + // Performance & lifetime hacking + let name_static = unsafe { &*(name as *const String) }; + sym_to_addr.insert(name_static, *addr); + } + } + + pub fn find_addresses_ksym(&self, addr: u64) -> impl Iterator { + let mut l = 0; + let mut r = self.syms.len(); + + while l < r { + let m = (l + r) / 2; + let sym = &self.syms[m]; + + if addr < sym.addr { + r = m; + } else { + l = m + 1; + } + } + debug_assert!( + (l == 0 || l >= self.syms.len()) + || (self.syms[l - 1].addr <= addr && addr < self.syms[l].addr) + ); + self.syms[0..l] + .iter() + .rev() + .take_while(move |sym| sym.addr == self.syms[l - 1].addr) + } + + #[cfg(test)] + pub fn 
find_addresses_ksym_simple(&self, addr: u64) -> impl Iterator { + let mut i = 0; + while i < self.syms.len() && addr >= self.syms[i].addr { + i += 1; + } + self.syms[..i] + .iter() + .rev() + .take_while(move |x| x.addr == self.syms[i - 1].addr) + } +} + +impl Default for KSymResolver { + fn default() -> Self { + KSymResolver { + syms: Vec::with_capacity(DFL_KSYM_CAP), + sym_to_addr: RefCell::new(HashMap::new()), + file_name: PathBuf::from(""), + } + } +} + +impl SymResolver for KSymResolver { + fn get_address_range(&self) -> (u64, u64) { + (0xffffffff80000000, 0xffffffffffffffff) + } + + fn find_symbols(&self, addr: u64) -> Vec<(&str, u64)> { + self.find_addresses_ksym(addr) + .map(|sym| (sym.name.as_str(), sym.addr)) + .collect() + } + + fn find_address(&self, name: &str, opts: &FindAddrOpts) -> Option> { + if let SymbolType::Variable = opts.sym_type { + return None; + } + self.ensure_sym_to_addr(); + + if let Some(addr) = self.sym_to_addr.borrow().get(name) { + return Some(vec![SymbolInfo { + name: name.to_string(), + address: *addr, + size: 0, + sym_type: SymbolType::Function, + ..Default::default() + }]); + } + None + } + + fn find_address_regex(&self, pattern: &str, opts: &FindAddrOpts) -> Option> { + if let SymbolType::Variable = opts.sym_type { + return None; + } + self.ensure_sym_to_addr(); + + let re = Regex::new(pattern).unwrap(); + let mut syms = vec![]; + for (name, addr) in self.sym_to_addr.borrow().iter() { + if re.is_match(name) { + syms.push(SymbolInfo { + name: name.to_string(), + address: *addr, + size: 0, + sym_type: SymbolType::Function, + ..Default::default() + }); + } + } + Some(syms) + } + + fn find_line_info(&self, _addr: u64) -> Option { + None + } + + fn addr_file_off(&self, _addr: u64) -> Option { + None + } + + fn get_obj_file_name(&self) -> &Path { + &self.file_name + } + + fn repr(&self) -> String { + String::from("KSymResolver") + } +} + +/// Cache of KSymResolver. +/// +/// It returns the same isntance if path is the same. +pub struct KSymCache { + resolvers: RefCell>>, +} + +impl KSymCache { + pub fn new() -> KSymCache { + KSymCache { + resolvers: RefCell::new(HashMap::new()), + } + } + + /// Find an instance of KSymResolver from the cache or create a new one. + pub fn get_resolver(&self, path: &Path) -> Result, Error> { + let mut resolvers = self.resolvers.borrow_mut(); + if let Some(resolver) = resolvers.get(path) { + return Ok(resolver.clone()); + } + + let mut resolver = Rc::new(KSymResolver::new()); + Rc::get_mut(&mut resolver) + .unwrap() + .load_file_name(path.to_path_buf())?; + resolvers.insert(path.to_path_buf(), resolver.clone()); + Ok(resolver) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cmp::Ordering; + + // This test case is skipped by default for /proc/kallsyms may + // not available in some environment. + #[test] + #[ignore = "system-dependent; may fail"] + fn ksym_resolver_load_find() { + let mut resolver = KSymResolver::new(); + assert!(resolver.load().is_ok()); + + assert!( + resolver.syms.len() > 10000, + "kallsyms seems to be unavailable or with all 0 addresses. 
(Check /proc/kallsyms)" + ); + + // Find the address of the symbol placed at the middle + let sym = &resolver.syms[resolver.syms.len() / 2]; + let addr = sym.addr; + let name = sym.name.clone(); + let found = resolver.find_symbols(addr); + assert!(!found.is_empty()); + assert!(found.iter().any(|x| x.0 == name)); + let addr = addr + 1; + let found = resolver.find_symbols(addr); + assert!(!found.is_empty()); + assert!(found.iter().any(|x| x.0 == name)); + + // 0 is an invalid address. We remove all symbols with 0 as + // thier address from the list. + let found = resolver.find_symbols(0); + assert!(found.is_empty()); + + // Find the address of the last symbol + let sym = &resolver.syms.last().unwrap(); + let addr = sym.addr; + let name = sym.name.clone(); + let found = resolver.find_symbols(addr); + assert!(!found.is_empty()); + assert!(found.iter().any(|x| x.0 == name)); + let found = resolver.find_symbols(addr + 1); + assert!(!found.is_empty()); + assert!(found.iter().any(|x| x.0 == name)); + + // Find the symbol placed at the one third + let sym = &resolver.syms[resolver.syms.len() / 3]; + let addr = sym.addr; + let name = sym.name.clone(); + let opts = FindAddrOpts { + offset_in_file: false, + obj_file_name: false, + sym_type: SymbolType::Function, + }; + let found = resolver.find_address(&name, &opts); + assert!(found.is_some()); + assert!(found.unwrap().iter().any(|x| x.address == addr)); + } + + #[test] + fn ksym_cache() { + let cache = KSymCache::new(); + let resolver = cache.get_resolver(Path::new(KALLSYMS)); + let resolver1 = cache.get_resolver(Path::new(KALLSYMS)); + assert!(resolver.is_ok()); + assert!(resolver1.is_ok()); + } + + #[test] + fn find_addresses_ksym() { + let mut resolver = KSymResolver::new(); + resolver.syms = vec![ + Ksym { + addr: 0x123, + name: "1".to_string(), + }, + Ksym { + addr: 0x123, + name: "1.5".to_string(), + }, + Ksym { + addr: 0x1234, + name: "2".to_string(), + }, + Ksym { + addr: 0x12345, + name: "3".to_string(), + }, + ]; + + // The address is less than the smallest address of all symbols. + assert!(resolver.find_addresses_ksym(1).next().is_none()); + + // The address match symbols exactly (the first address.) + let syms = resolver.find_addresses_ksym(0x123).collect::>(); + assert_eq!(syms.len(), 2); + assert_eq!(syms[0].addr, 0x123); + assert_eq!(syms[0].name, "1.5"); + assert_eq!(syms[1].addr, 0x123); + assert_eq!(syms[1].name, "1"); + + // The address is in between two symbols (the first address.) + let syms = resolver.find_addresses_ksym(0x124).collect::>(); + assert_eq!(syms.len(), 2); + assert_eq!(syms[0].addr, 0x123); + assert_eq!(syms[0].name, "1.5"); + assert_eq!(syms[1].addr, 0x123); + assert_eq!(syms[1].name, "1"); + + // The address match symbols exactly. + let syms = resolver.find_addresses_ksym(0x1234).collect::>(); + assert_eq!(syms.len(), 1); + assert_eq!(syms[0].addr, 0x1234); + assert_eq!(syms[0].name, "2"); + + // The address is in between two symbols. + let syms = resolver.find_addresses_ksym(0x1235).collect::>(); + assert_eq!(syms.len(), 1); + assert_eq!(syms[0].addr, 0x1234); + assert_eq!(syms[0].name, "2"); + + // The address match symbols exactly (the biggest address.) + let syms = resolver.find_addresses_ksym(0x12345).collect::>(); + assert_eq!(syms.len(), 1); + assert_eq!(syms[0].addr, 0x12345); + assert_eq!(syms[0].name, "3"); + + // The address is bigger than the biggest address of all symbols. 
+ let syms = resolver.find_addresses_ksym(0x1234568).collect::>(); + assert_eq!(syms.len(), 1); + assert_eq!(syms[0].addr, 0x12345); + assert_eq!(syms[0].name, "3"); + } + + #[test] + fn find_addresses_ksym_exhaust() { + let syms_sz = 10; + let mut resolver = KSymResolver::new(); + resolver.syms = (0..syms_sz) + .map(|x| Ksym { + addr: 1, + name: x.to_string(), + }) + .collect(); + // A full-adder has a carry-out signal, right? + // Yes! Here it is. + let raised_carry_out = |addr| addr > syms_sz as u64; + + while !raised_carry_out(resolver.syms[0].addr) { + // Test find_addresses_ksym() against every address in the + // range [0..syms_sz+1]. + for i in 0..=(syms_sz + 1) { + let result: Vec<_> = resolver.find_addresses_ksym(i as u64).collect(); + let result_s: Vec<_> = resolver.find_addresses_ksym_simple(i as u64).collect(); + assert_eq!(result.len(), result_s.len()); + assert_eq!( + result + .iter() + .map(|x| x.name.as_str()) + .cmp(result_s.iter().map(|x| x.name.as_str())), + Ordering::Equal + ); + } + + let mut i = syms_sz - 1; + // Increase the address of the last symbol. + resolver.syms[i].addr += 1; + while i > 0 && raised_carry_out(resolver.syms[i].addr) { + // Bring the raised carry-out it to the left. + i -= 1; + resolver.syms[i].addr += 1; + } + // Every symbol on the right side have a raised carry-out. + // Reset their addresses. + let low_addr = resolver.syms[i].addr; + while i < (syms_sz - 1) { + i += 1; + resolver.syms[i].addr = low_addr; + } + } + } +} diff --git a/third_party/blazesym/src/lib.rs b/third_party/blazesym/src/lib.rs new file mode 100644 index 0000000..446ed88 --- /dev/null +++ b/third_party/blazesym/src/lib.rs @@ -0,0 +1,1058 @@ +// A library symbolizes addresses to symbols, filenames, and line numbers. +// +// BlazeSym is a library to symbolize addresses to get symbol names, file +// names of source files, and line numbers. It can translate a stack +// trace to function names and their locations in the +// source code. +#![doc = include_str!("../README.md")] +#![allow(clippy::let_and_return, clippy::let_unit_value)] +#![deny(unsafe_op_in_unsafe_fn)] +#![cfg_attr(feature = "nightly", feature(test))] + +#[cfg(feature = "nightly")] +extern crate test; + +use std::io::{Error, ErrorKind}; +use std::path::{Component, Path, PathBuf}; +use std::ptr; +use std::rc::Rc; +use std::u64; + +use nix::sys::stat::stat; +use nix::sys::utsname; + +mod c_api; +mod dwarf; +mod elf; +mod gsym; +mod ksym; +mod util; + +use elf::ElfCache; +use elf::ElfResolver; +use gsym::GsymResolver; +use ksym::{KSymCache, KSymResolver}; + +#[cfg(doc)] +pub use c_api::*; + +struct CacheHolder { + ksym: KSymCache, + elf: ElfCache, +} + +struct CacheHolderOpts { + line_number_info: bool, + debug_info_symbols: bool, +} + +impl CacheHolder { + fn new(opts: CacheHolderOpts) -> CacheHolder { + CacheHolder { + ksym: ksym::KSymCache::new(), + elf: ElfCache::new(opts.line_number_info, opts.debug_info_symbols), + } + } + + fn get_ksym_cache(&self) -> &KSymCache { + &self.ksym + } + + fn get_elf_cache(&self) -> &ElfCache { + &self.elf + } +} + +trait StackFrame { + fn get_ip(&self) -> u64; + fn get_frame_pointer(&self) -> u64; +} + +trait StackSession { + fn next_frame(&mut self) -> Option<&dyn StackFrame>; + fn prev_frame(&mut self) -> Option<&dyn StackFrame>; + fn go_top(&mut self); +} + +struct AddressLineInfo { + pub path: String, + pub line_no: usize, + pub column: usize, +} + +/// Types of symbols.. 
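+///
+/// When used as a filter in [`FindAddrOpts`], `Unknown` matches symbols of
+/// any type.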
+#[derive(Clone, Copy)] +pub enum SymbolType { + Unknown, + Function, + Variable, +} + +/// The context of an address finding request. +/// +/// This type passes additional parameters to resolvers. +#[doc(hidden)] +pub struct FindAddrOpts { + /// Return the offset of the symbol from the first byte of the + /// object file if it is true. (False by default) + offset_in_file: bool, + /// Return the name of the object file if it is true. (False by default) + obj_file_name: bool, + /// Return the symbol(s) matching a given type. Unknown, by default, mean all types. + sym_type: SymbolType, +} + +/// Information of a symbol. +pub struct SymbolInfo { + /// The name of the symbol; for example, a function name. + pub name: String, + /// Start address (the first byte) of the symbol + pub address: u64, + /// The size of the symbol. The size of a function for example. + pub size: u64, + /// A function or a variable. + pub sym_type: SymbolType, + /// The offset in the object file. + pub file_offset: u64, + /// The file name of the shared object. + pub obj_file_name: Option, +} + +impl Default for SymbolInfo { + fn default() -> Self { + SymbolInfo { + name: "".to_string(), + address: 0, + size: 0, + sym_type: SymbolType::Unknown, + file_offset: 0, + obj_file_name: None, + } + } +} + +/// The trait of symbol resolvers. +/// +/// An symbol resolver usually provides information from one symbol +/// source; e., a symbol file. +trait SymResolver { + /// Return the range that this resolver serve in an address space. + fn get_address_range(&self) -> (u64, u64); + /// Find the names and the start addresses of a symbol found for + /// the given address. + fn find_symbols(&self, addr: u64) -> Vec<(&str, u64)>; + /// Find the address and size of a symbol name. + fn find_address(&self, name: &str, opts: &FindAddrOpts) -> Option>; + /// Find the addresses and sizes of the symbols matching a given pattern. + fn find_address_regex(&self, pattern: &str, opts: &FindAddrOpts) -> Option>; + /// Find the file name and the line number of an address. + fn find_line_info(&self, addr: u64) -> Option; + /// Translate an address (virtual) in a process to the file offset + /// in the object file. + fn addr_file_off(&self, addr: u64) -> Option; + /// Get the file name of the shared object. + fn get_obj_file_name(&self) -> &Path; + + fn repr(&self) -> String; +} + +const REG_RBP: usize = 7; +const REG_RIP: usize = 16; + +struct X86_64StackFrame { + rip: u64, + rbp: u64, +} + +impl StackFrame for X86_64StackFrame { + fn get_ip(&self) -> u64 { + self.rip + } + fn get_frame_pointer(&self) -> u64 { + self.rbp + } +} + +/// Do stacking unwind for x86_64 +/// +/// Parse a block of memory that is a copy of stack of thread to get frames. 
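+///
+/// The unwinder assumes classic frame-pointer chains: at every frame the
+/// saved caller RBP lives at `[rbp]` and the return address at `[rbp + 8]`,
+/// which is what `next_frame()` reads from the stack copy.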
+/// +struct X86_64StackSession { + frames: Vec, + stack: Vec, + stack_base: u64, // The base address of the stack + registers: [u64; 17], + current_rbp: u64, + current_rip: u64, + current_frame_idx: usize, +} + +impl X86_64StackSession { + fn _get_rbp_rel(&self) -> usize { + (self.current_rbp - self.stack_base) as usize + } + + fn _mark_at_bottom(&mut self) { + self.current_rbp = 0; + } + + fn _is_at_bottom(&self) -> bool { + self.current_rbp == 0 + } + + fn _get_u64(&self, off: usize) -> u64 { + let stack = &self.stack; + (stack[off] as u64) + | ((stack[off + 1] as u64) << 8) + | ((stack[off + 2] as u64) << 16) + | ((stack[off + 3] as u64) << 24) + | ((stack[off + 4] as u64) << 32) + | ((stack[off + 5] as u64) << 40) + | ((stack[off + 6] as u64) << 48) + | ((stack[off + 7] as u64) << 56) + } + + #[cfg(test)] + pub fn new(stack: Vec, stack_base: u64, registers: [u64; 17]) -> X86_64StackSession { + X86_64StackSession { + frames: Vec::new(), + stack, + stack_base, + registers, + current_rbp: registers[REG_RBP], + current_rip: registers[REG_RIP], + current_frame_idx: 0, + } + } +} + +impl StackSession for X86_64StackSession { + fn next_frame(&mut self) -> Option<&dyn StackFrame> { + if self._is_at_bottom() { + return None; + } + + if self.frames.len() > self.current_frame_idx { + let frame = &self.frames[self.current_frame_idx]; + self.current_frame_idx += 1; + return Some(frame); + } + + let frame = X86_64StackFrame { + rip: self.current_rip, + rbp: self.current_rbp, + }; + self.frames.push(frame); + + if self._get_rbp_rel() <= (self.stack.len() - 16) { + let new_rbp = self._get_u64(self._get_rbp_rel()); + let new_rip = self._get_u64(self._get_rbp_rel() + 8); + self.current_rbp = new_rbp; + self.current_rip = new_rip; + } else { + self._mark_at_bottom(); + } + + self.current_frame_idx += 1; + Some(self.frames.last().unwrap() as &dyn StackFrame) + } + + fn prev_frame(&mut self) -> Option<&dyn StackFrame> { + if self.current_frame_idx == 0 { + return None; + } + + self.current_frame_idx -= 1; + Some(&self.frames[self.current_frame_idx] as &dyn StackFrame) + } + + fn go_top(&mut self) { + self.current_rip = self.registers[REG_RIP]; + self.current_rbp = self.registers[REG_RBP]; + self.current_frame_idx = 0; + } +} + +/// Create a KSymResolver +/// +/// # Safety +/// +/// This function is supposed to be used by C code. The pointer +/// returned should be free with `sym_resolver_free()`. 
+/// +#[no_mangle] +#[doc(hidden)] +pub unsafe extern "C" fn sym_resolver_create() -> *mut KSymResolver { + let mut resolver = Box::new(KSymResolver::new()); + if resolver.load().is_err() { + ptr::null_mut() + } else { + Box::leak(resolver) + } +} + + +struct KernelResolver { + ksymresolver: Option>, + kernelresolver: Option, + kallsyms: PathBuf, + kernel_image: PathBuf, +} + +impl KernelResolver { + fn new( + kallsyms: &Path, + kernel_image: &Path, + cache_holder: &CacheHolder, + ) -> Result { + let ksymresolver = cache_holder.get_ksym_cache().get_resolver(kallsyms); + let kernelresolver = ElfResolver::new(kernel_image, 0, cache_holder); + + if ksymresolver.is_err() && kernelresolver.is_err() { + return Err(Error::new( + ErrorKind::NotFound, + format!( + "can not load {} and {}", + kallsyms.display(), + kernel_image.display() + ), + )); + } + + Ok(KernelResolver { + ksymresolver: ksymresolver.ok(), + kernelresolver: kernelresolver.ok(), + kallsyms: kallsyms.to_path_buf(), + kernel_image: kernel_image.to_path_buf(), + }) + } +} + +impl SymResolver for KernelResolver { + fn get_address_range(&self) -> (u64, u64) { + (0xffffffff80000000, 0xffffffffffffffff) + } + + fn find_symbols(&self, addr: u64) -> Vec<(&str, u64)> { + if self.ksymresolver.is_some() { + self.ksymresolver.as_ref().unwrap().find_symbols(addr) + } else { + self.kernelresolver.as_ref().unwrap().find_symbols(addr) + } + } + fn find_address(&self, _name: &str, _opts: &FindAddrOpts) -> Option> { + None + } + fn find_address_regex(&self, _name: &str, _opts: &FindAddrOpts) -> Option> { + None + } + fn find_line_info(&self, addr: u64) -> Option { + self.kernelresolver.as_ref()?; + self.kernelresolver.as_ref().unwrap().find_line_info(addr) + } + + fn addr_file_off(&self, _addr: u64) -> Option { + None + } + + fn get_obj_file_name(&self) -> &Path { + &self.kernel_image + } + + fn repr(&self) -> String { + format!( + "KernelResolver {} {}", + self.kallsyms.display(), + self.kernel_image.display() + ) + } +} + +/// The description of a source of symbols and debug information. +/// +/// The source of symbols and debug information can be an ELF file, kernel +/// image, or process. +#[derive(Clone)] +pub enum SymbolSrcCfg { + /// A single ELF file + /// + /// You should provide the name of an ELF file and its base address. + /// + Elf { + /// The name of ELF files. + /// + /// It can be an executable or shared object. + /// For example, passing `"/bin/sh"` will load symbols and debug information from `sh`. + /// Whereas passing `"/lib/libc.so.xxx"` will load symbols and debug information from the libc. + file_name: PathBuf, + /// The address where the executable segment loaded. + /// + /// The address in the process should be the executable segment's + /// first byte. For example, in `/proc//maps`. + /// + /// ```text + /// 7fe1b2dc4000-7fe1b2f80000 r-xp 00000000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b2f80000-7fe1b3180000 ---p 001bc000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b3180000-7fe1b3184000 r--p 001bc000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// 7fe1b3184000-7fe1b3186000 rw-p 001c0000 00:1d 71695032 /usr/lib64/libc-2.28.so + /// ``` + /// + /// It reveals that the executable segment of libc-2.28.so was + /// loaded at 0x7fe1b2dc4000. This base address is used to + /// translate an address in the segment to the corresponding + /// address in the ELF file. + /// + /// A loader would load an executable segment with the permission of + /// `x`. For example, the first block is with the permission of + /// `r-xp`. 
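+        ///
+        /// As an illustrative sketch (reusing the libc mapping above), the
+        /// corresponding configuration would be:
+        ///
+        /// ```no_run
+        /// use blazesym::SymbolSrcCfg;
+        /// use std::path::PathBuf;
+        ///
+        /// let cfg = SymbolSrcCfg::Elf {
+        ///     file_name: PathBuf::from("/usr/lib64/libc-2.28.so"),
+        ///     base_address: 0x7fe1b2dc4000,
+        /// };
+        /// ```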
+ base_address: u64, + }, + /// Linux Kernel's binary image and a copy of /proc/kallsyms + Kernel { + /// The path of a kallsyms copy. + /// + /// For the running kernel on the device, it can be + /// "/proc/kallsyms". However, you can make a copy for later. + /// In that situation, you should give the path of the + /// copy. Passing `None`, by default, will be + /// `"/proc/kallsyms"`. + kallsyms: Option, + /// The path of a kernel image. + /// + /// This should be the path of a kernel image. For example, + /// `"/boot/vmlinux-xxxx"`. A `None` value will find the + /// kernel image of the running kernel in `"/boot/"` or + /// `"/usr/lib/debug/boot/"`. + kernel_image: Option, + }, + /// This one will be expended into all ELF files in a process. + /// + /// With a `None` value, it would means a process calling BlazeSym. + Process { pid: Option }, + Gsym { + file_name: PathBuf, + base_address: u64, + }, +} + +/// The result of symbolization by BlazeSymbolizer. +/// +/// [`BlazeSymbolizer::symbolize()`] returns a list of lists of +/// `SymbolizedResult`. It appears as `[[SymbolizedResult {...}, +/// SymbolizedResult {...}, ...], [SymbolizedResult {...}, ...], +/// ...]`. At the first level, each entry is a list of +/// `SymbolizedResult`. [`BlazeSymbolizer::symbolize()`] can return +/// multiple results of an address due to compiler optimizations. +#[derive(Clone)] +pub struct SymbolizedResult { + /// The symbol name that an address may belong to. + pub symbol: String, + /// The address where the symbol is located within the process. + /// + /// The address is in the target process, not the offset from the + /// shared object file. + pub start_address: u64, + /// The source path that defines the symbol. + pub path: String, + /// The line number of the symbolized instruction in the source code. + /// + /// This is the line number of the instruction of the address being + /// symbolized, not the line number that defines the symbol + /// (function). 
+ pub line_no: usize, + pub column: usize, +} + +type ResolverList = Vec<((u64, u64), Box)>; + +struct ResolverMap { + resolvers: ResolverList, +} + +impl ResolverMap { + fn build_resolvers_proc_maps( + pid: u32, + resolvers: &mut ResolverList, + cache_holder: &CacheHolder, + ) -> Result<(), Error> { + let entries = util::parse_maps(pid)?; + + for entry in entries.iter() { + if entry.path.as_path().components().next() != Some(Component::RootDir) { + continue; + } + if (entry.mode & 0xa) != 0xa { + // r-x- + continue; + } + + if let Ok(filestat) = stat(&entry.path) { + if (filestat.st_mode & 0o170000) != 0o100000 { + // Not a regular file + continue; + } + } else { + continue; + } + if let Ok(resolver) = ElfResolver::new(&entry.path, entry.loaded_address, cache_holder) + { + resolvers.push((resolver.get_address_range(), Box::new(resolver))); + } else { + #[cfg(debug_assertions)] + eprintln!("Fail to create ElfResolver for {}", entry.path.display()); + } + } + + Ok(()) + } + + pub fn new( + sym_srcs: &[SymbolSrcCfg], + cache_holder: &CacheHolder, + ) -> Result { + let mut resolvers = ResolverList::new(); + for cfg in sym_srcs { + match cfg { + SymbolSrcCfg::Elf { + file_name, + base_address, + } => { + let resolver = ElfResolver::new(file_name, *base_address, cache_holder)?; + resolvers.push((resolver.get_address_range(), Box::new(resolver))); + } + SymbolSrcCfg::Kernel { + kallsyms, + kernel_image, + } => { + let kallsyms = kallsyms + .as_deref() + .unwrap_or_else(|| Path::new("/proc/kallsyms")); + let kernel_image = if let Some(img) = kernel_image { + img.clone() + } else { + let release = utsname::uname()?.release().to_str().unwrap().to_string(); + let basename = "vmlinux-"; + let dirs = [Path::new("/boot/"), Path::new("/usr/lib/debug/boot/")]; + let mut i = 0; + let kernel_image = loop { + let path = dirs[i].join(format!("{basename}{release}")); + if stat(&path).is_ok() { + break path; + } + i += 1; + if i >= dirs.len() { + break path; + } + }; + kernel_image + }; + if let Ok(resolver) = KernelResolver::new(kallsyms, &kernel_image, cache_holder) + { + resolvers.push((resolver.get_address_range(), Box::new(resolver))); + } else { + #[cfg(debug_assertions)] + eprintln!("fail to load the kernel image {}", kernel_image.display()); + } + } + SymbolSrcCfg::Process { pid } => { + let pid = if let Some(p) = pid { *p } else { 0 }; + + if let Err(_e) = + Self::build_resolvers_proc_maps(pid, &mut resolvers, cache_holder) + { + #[cfg(debug_assertions)] + eprintln!("Fail to load symbols for the process {pid}: {_e:?}"); + } + } + SymbolSrcCfg::Gsym { + file_name, + base_address, + } => { + let resolver = GsymResolver::new(file_name.clone(), *base_address)?; + resolvers.push((resolver.get_address_range(), Box::new(resolver))); + } + }; + } + resolvers.sort_by_key(|x| x.0 .0); // sorted by the loaded addresses + + Ok(ResolverMap { resolvers }) + } + + pub fn find_resolver(&self, address: u64) -> Option<&dyn SymResolver> { + let idx = + util::search_address_key(&self.resolvers, address, &|map: &( + (u64, u64), + Box, + )| + -> u64 { map.0 .0 })?; + let (loaded_begin, loaded_end) = self.resolvers[idx].0; + if loaded_begin != loaded_end && address >= loaded_end { + // `begin == end` means this ELF file may have only + // symbols and debug information. For this case, we + // always use this resolver if the given address is just + // above its loaded address. + None + } else { + Some(self.resolvers[idx].1.as_ref()) + } + } +} + +/// Switches in the features of BlazeSymbolizer. 
+/// +/// Passing variants of this `enum` to [`BlazeSymbolizer::new_opt()`] +/// will enable (true) or disable (false) respective features +/// of a symbolizer. +pub enum SymbolizerFeature { + /// Switch on or off the feature of returning file names and line numbers of addresses. + /// + /// By default, it is true. However, if it is false, + /// the symbolizer will not return the line number information. + LineNumberInfo(bool), // default is true. + /// Switch on or off the feature of parsing symbols (subprogram) from DWARF. + /// + /// By default, it is false. BlazeSym parses symbols from DWARF + /// only if the user of BlazeSym enables it. + DebugInfoSymbols(bool), +} + +/// Switches and settings of features to modify the way looking up addresses of +/// symbols or the returned information. +pub enum FindAddrFeature { + /// Return the offset in the file. + /// + /// The offset will be returned as the value of `SymbolInfo::file_offset`. + /// (Off by default) + OffsetInFile(bool), + /// Return the file name of the shared object. + /// + /// The name of the executiable or object file will be returned as + /// the value of `SymbolInfo::obj_file_name`. + /// (Off by default) + ObjFileName(bool), + /// Return symbols having the given type. + /// + /// With `SymbolType::Function`, BlazeSym will return only the + /// symbols that are functions. With `SymbolType::Variable`, + /// BlazeSym will return only the symbols that are variables. + /// With `SymbolType::Unknown`, BlazeSym will return symbols of + /// any type. + SymbolType(SymbolType), + /// Return symbols from the compile unit (source) of the given name. + CommpileUnit(String), +} + +/// BlazeSymbolizer provides an interface to symbolize addresses with +/// a list of symbol sources. +/// +/// Users should present BlazeSymbolizer with a list of symbol sources +/// (`SymbolSrcCfg`); for example, an ELF file and its base address +/// (`SymbolSrcCfg::Elf`), or a Linux kernel image and a copy of its +/// kallsyms (`SymbolSrcCfg::Kernel`). Additionally, BlazeSymbolizer +/// uses information from these sources to symbolize addresses. +pub struct BlazeSymbolizer { + cache_holder: CacheHolder, + + line_number_info: bool, +} + +impl BlazeSymbolizer { + /// Create and return an instance of BlazeSymbolizer. + pub fn new() -> Result { + let opts = CacheHolderOpts { + line_number_info: true, + debug_info_symbols: false, + }; + let cache_holder = CacheHolder::new(opts); + + Ok(BlazeSymbolizer { + cache_holder, + line_number_info: true, + }) + } + + /// Create and return an instance of BlazeSymbolizer. + /// + /// `new_opt()` works like [`BlazeSymbolizer::new()`] except it receives a list of + /// [`SymbolizerFeature`] to turn on or off some features. 
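+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch that disables line-number lookup:
+    ///
+    /// ```no_run
+    /// use blazesym::{BlazeSymbolizer, SymbolizerFeature};
+    ///
+    /// let features = [SymbolizerFeature::LineNumberInfo(false)];
+    /// let symbolizer = BlazeSymbolizer::new_opt(&features).unwrap();
+    /// ```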
+ pub fn new_opt(features: &[SymbolizerFeature]) -> Result { + let mut line_number_info = true; + let mut debug_info_symbols = false; + + for feature in features { + match feature { + SymbolizerFeature::LineNumberInfo(enabled) => { + line_number_info = *enabled; + } + SymbolizerFeature::DebugInfoSymbols(enabled) => { + debug_info_symbols = *enabled; + } + } + } + + let cache_holder = CacheHolder::new(CacheHolderOpts { + line_number_info, + debug_info_symbols, + }); + + Ok(BlazeSymbolizer { + cache_holder, + line_number_info, + }) + } + + fn find_addr_features_context(features: Vec) -> FindAddrOpts { + let mut opts = FindAddrOpts { + offset_in_file: false, + obj_file_name: false, + sym_type: SymbolType::Unknown, + }; + for f in features { + match f { + FindAddrFeature::OffsetInFile(enable) => { + opts.offset_in_file = enable; + } + FindAddrFeature::ObjFileName(enable) => { + opts.obj_file_name = enable; + } + FindAddrFeature::SymbolType(sym_type) => { + opts.sym_type = sym_type; + } + _ => { + todo!(); + } + } + } + opts + } + + /// Find the addresses of the symbols matching a pattern. + /// + /// Find the addresses of the symbols matching a pattern from the sources + /// of symbols and debug info described by `sym_srcs`. + /// `find_address_regex_opt()` works just like `find_address_regex()` with + /// additional controls on features. + /// + /// # Arguments + /// + /// * `sym_srcs` - A list of symbol and debug sources. + /// * `pattern` - A regex pattern. + /// * `features` - a list of `FindAddrFeature` to enable, disable, or specify parameters. + pub fn find_address_regex_opt( + &self, + sym_srcs: &[SymbolSrcCfg], + pattern: &str, + features: Vec, + ) -> Option> { + let ctx = Self::find_addr_features_context(features); + + let resolver_map = match ResolverMap::new(sym_srcs, &self.cache_holder) { + Ok(map) => map, + _ => { + return None; + } + }; + let mut syms = vec![]; + for (_, resolver) in &resolver_map.resolvers { + for mut sym in resolver + .find_address_regex(pattern, &ctx) + .unwrap_or_default() + { + if ctx.offset_in_file { + if let Some(off) = resolver.addr_file_off(sym.address) { + sym.file_offset = off; + } + } + if ctx.obj_file_name { + sym.obj_file_name = Some(resolver.get_obj_file_name().to_path_buf()); + } + syms.push(sym); + } + } + Some(syms) + } + + /// Find the addresses of the symbols matching a pattern. + /// + /// Find the addresses of the symbols matching a pattern from the sources + /// of symbols and debug info described by `sym_srcs`. + /// + /// # Arguments + /// + /// * `sym_srcs` - A list of symbol and debug sources. + /// * `pattern` - A regex pattern. + pub fn find_address_regex( + &self, + sym_srcs: &[SymbolSrcCfg], + pattern: &str, + ) -> Option> { + self.find_address_regex_opt(sym_srcs, pattern, vec![]) + } + + /// Find the addresses of a list of symbol names. + /// + /// Find the addresses of a list of symbol names from the sources + /// of symbols and debug info described by `sym_srcs`. + /// `find_addresses_opt()` works just like `find_addresses()` with + /// additional controls on features. + /// + /// # Arguments + /// + /// * `sym_srcs` - A list of symbol and debug sources. + /// * `names` - A list of symbol names. + /// * `features` - a list of `FindAddrFeature` to enable, disable, or specify parameters. 
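+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch; the binary path and symbol name are
+    /// placeholders, not taken from the original documentation:
+    ///
+    /// ```no_run
+    /// use blazesym::{BlazeSymbolizer, FindAddrFeature, SymbolSrcCfg};
+    /// use std::path::PathBuf;
+    ///
+    /// let srcs = [SymbolSrcCfg::Elf {
+    ///     file_name: PathBuf::from("/bin/sh"),
+    ///     base_address: 0,
+    /// }];
+    /// let symbolizer = BlazeSymbolizer::new().unwrap();
+    /// // Also ask for each symbol's offset within the object file.
+    /// let syms = symbolizer.find_addresses_opt(
+    ///     &srcs,
+    ///     &["main"],
+    ///     vec![FindAddrFeature::OffsetInFile(true)],
+    /// );
+    /// ```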
+ pub fn find_addresses_opt( + &self, + sym_srcs: &[SymbolSrcCfg], + names: &[&str], + features: Vec, + ) -> Vec> { + let ctx = Self::find_addr_features_context(features); + + let resolver_map = match ResolverMap::new(sym_srcs, &self.cache_holder) { + Ok(map) => map, + _ => { + return vec![]; + } + }; + let mut syms_list = vec![]; + for name in names { + let mut found = vec![]; + for (_, resolver) in &resolver_map.resolvers { + if let Some(mut syms) = resolver.find_address(name, &ctx) { + for sym in &mut syms { + if ctx.offset_in_file { + if let Some(off) = resolver.addr_file_off(sym.address) { + sym.file_offset = off; + } + } + if ctx.obj_file_name { + sym.obj_file_name = Some(resolver.get_obj_file_name().to_path_buf()); + } + } + found.append(&mut syms); + } + } + syms_list.push(found); + } + syms_list + } + + /// Find the addresses of a list of symbol names. + /// + /// Find the addresses of a list of symbol names from the sources + /// of symbols and debug info described by `sym_srcs`. + /// + /// # Arguments + /// + /// * `sym_srcs` - A list of symbol and debug sources. + /// * `names` - A list of symbol names. + pub fn find_addresses( + &self, + sym_srcs: &[SymbolSrcCfg], + names: &[&str], + ) -> Vec> { + self.find_addresses_opt(sym_srcs, names, vec![]) + } + + /// Symbolize a list of addresses. + /// + /// Symbolize a list of addresses with the information from the + /// sources of symbols and debug info described by `sym_srcs`. + /// + /// # Arguments + /// + /// * `sym_srcs` - A list of symbol and debug sources. + /// * `addresses` - A list of addresses to symbolize. + pub fn symbolize( + &self, + sym_srcs: &[SymbolSrcCfg], + addresses: &[u64], + ) -> Vec> { + let resolver_map = if let Ok(map) = ResolverMap::new(sym_srcs, &self.cache_holder) { + map + } else { + #[cfg(debug_assertions)] + eprintln!("Fail to build ResolverMap"); + return vec![]; + }; + + let info: Vec> = addresses + .iter() + .map(|addr| { + let resolver = if let Some(resolver) = resolver_map.find_resolver(*addr) { + resolver + } else { + return vec![]; + }; + + let res_syms = resolver.find_symbols(*addr); + let linfo = if self.line_number_info { + resolver.find_line_info(*addr) + } else { + None + }; + if res_syms.is_empty() { + if let Some(linfo) = linfo { + vec![SymbolizedResult { + symbol: "".to_string(), + start_address: 0, + path: linfo.path, + line_no: linfo.line_no, + column: linfo.column, + }] + } else { + vec![] + } + } else { + let mut results = vec![]; + for sym in res_syms { + if let Some(ref linfo) = linfo { + let (sym, start) = sym; + results.push(SymbolizedResult { + symbol: String::from(sym), + start_address: start, + path: linfo.path.clone(), + line_no: linfo.line_no, + column: linfo.column, + }); + } else { + let (sym, start) = sym; + results.push(SymbolizedResult { + symbol: String::from(sym), + start_address: start, + path: "".to_string(), + line_no: 0, + column: 0, + }); + } + } + results + } + }) + .collect(); + info + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::env; + use std::path::Path; + + #[test] + fn hello_world_stack() { + // A stack sample from a Hello World proram. 
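+        // Each 16-byte group is a (saved RBP, return address) pair in
+        // little-endian order: 0x7fffffffd5b0/0x4005af, then
+        // 0x7fffffffd5d0/0x4005cb.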
+ let stack = vec![ + 0xb0, 0xd5, 0xff, 0xff, 0xff, 0x7f, 0x0, 0x0, 0xaf, 0x5, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xd0, 0xd5, 0xff, 0xff, 0xff, 0x7f, 0x0, 0x0, 0xcb, 0x5, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, + ]; + let expected_rips = vec![0x000000000040058a, 0x00000000004005af, 0x00000000004005cb]; + let base = 0x7fffffffd5a0; + let mut registers: [u64; 17] = [0; 17]; + + registers[crate::REG_RIP] = expected_rips[0]; + registers[crate::REG_RBP] = 0x7fffffffd5a0; + + let mut session = crate::X86_64StackSession::new(stack, base, registers); + let frame = session.next_frame().unwrap(); + assert_eq!(frame.get_ip(), expected_rips[0]); + let frame = session.next_frame().unwrap(); + assert_eq!(frame.get_ip(), expected_rips[1]); + let frame = session.next_frame().unwrap(); + assert_eq!(frame.get_ip(), expected_rips[2]); + } + + #[test] + fn load_symbolfilecfg_process() { + // Check if SymbolSrcCfg::Process expands to ELFResolvers. + let cfg = vec![SymbolSrcCfg::Process { pid: None }]; + let cache_holder = CacheHolder::new(CacheHolderOpts { + line_number_info: true, + debug_info_symbols: false, + }); + let resolver_map = ResolverMap::new(&cfg, &cache_holder); + assert!(resolver_map.is_ok()); + let resolver_map = resolver_map.unwrap(); + + let signatures: Vec<_> = resolver_map.resolvers.iter().map(|x| x.1.repr()).collect(); + // ElfResolver for the binary itself. + assert!(signatures.iter().any(|x| x.contains("/blazesym"))); + // ElfResolver for libc. + assert!(signatures.iter().any(|x| x.contains("/libc"))); + } + + #[test] + fn load_symbolfilecfg_processkernel() { + // Check if SymbolSrcCfg::Process & SymbolSrcCfg::Kernel expands to + // ELFResolvers and a KernelResolver. + let srcs = vec![ + SymbolSrcCfg::Process { pid: None }, + SymbolSrcCfg::Kernel { + kallsyms: None, + kernel_image: None, + }, + ]; + let cache_holder = CacheHolder::new(CacheHolderOpts { + line_number_info: true, + debug_info_symbols: false, + }); + let resolver_map = ResolverMap::new(&srcs, &cache_holder); + assert!(resolver_map.is_ok()); + let resolver_map = resolver_map.unwrap(); + + let signatures: Vec<_> = resolver_map.resolvers.iter().map(|x| x.1.repr()).collect(); + // ElfResolver for the binary itself. + assert!(signatures.iter().any(|x| x.contains("/blazesym"))); + // ElfResolver for libc. + assert!(signatures.iter().any(|x| x.contains("/libc"))); + assert!(signatures.iter().any(|x| x.contains("KernelResolver"))); + } + + #[test] + fn load_symbolfilecfg_invalid_kernel() { + // Check if SymbolSrcCfg::Kernel expands to a KernelResolver + // even if kernel_image is invalid. 
+ let srcs = vec![SymbolSrcCfg::Kernel { + kallsyms: None, + kernel_image: Some(PathBuf::from("/dev/null")), + }]; + let cache_holder = CacheHolder::new(CacheHolderOpts { + line_number_info: true, + debug_info_symbols: false, + }); + let resolver_map = ResolverMap::new(&srcs, &cache_holder); + assert!(resolver_map.is_ok()); + let resolver_map = resolver_map.unwrap(); + + let signatures: Vec<_> = resolver_map.resolvers.iter().map(|x| x.1.repr()).collect(); + assert!(signatures.iter().any(|x| x.contains("KernelResolver"))); + + let kallsyms = Path::new("/proc/kallsyms"); + let kernel_image = Path::new("/dev/null"); + let kresolver = KernelResolver::new(kallsyms, kernel_image, &cache_holder).unwrap(); + assert!(kresolver.ksymresolver.is_some()); + assert!(kresolver.kernelresolver.is_none()); + } + + #[test] + fn load_gsym_resolver() { + let test_gsym = Path::new(&env!("CARGO_MANIFEST_DIR")) + .join("data") + .join("test.gsym"); + let features = vec![SymbolizerFeature::LineNumberInfo(true)]; + let srcs = vec![SymbolSrcCfg::Gsym { + file_name: test_gsym, + base_address: 0, + }]; + let symbolizer = BlazeSymbolizer::new_opt(&features).unwrap(); + let count = symbolizer + .symbolize(&srcs, &[0x2000100]) + .into_iter() + .flatten() + .filter(|result| result.symbol == "factorial") + .count(); + assert_eq!(count, 1); + } +} diff --git a/third_party/blazesym/src/util.rs b/third_party/blazesym/src/util.rs new file mode 100644 index 0000000..fcdf91f --- /dev/null +++ b/third_party/blazesym/src/util.rs @@ -0,0 +1,444 @@ +use std::ffi::CStr; +use std::fs; +use std::io::{BufRead, BufReader, Error, ErrorKind}; +use std::mem::size_of; +use std::path::PathBuf; + +use regex::Regex; + +pub fn search_address_key( + data: &[T], + address: V, + keyfn: &dyn Fn(&T) -> V, +) -> Option { + let mut left = 0; + let mut right = data.len(); + + if right == 0 { + return None; + } + if address < keyfn(&data[0]) { + return None; + } + + while (left + 1) < right { + let v = (left + right) / 2; + let key = keyfn(&data[v]); + + if key == address { + return Some(v); + } + if address < key { + right = v; + } else { + left = v; + } + } + + Some(left) +} + +/// Do binary search but skip entries not having a key. +pub fn search_address_opt_key( + data: &[T], + address: V, + keyfn: &dyn Fn(&T) -> Option, +) -> Option { + let mut left = 0; + let mut right = data.len(); + + while left < right { + let left_key = keyfn(&data[left]); + if left_key.is_some() { + break; + } + left += 1; + } + + if left == right { + return None; + } + + if address < keyfn(&data[left]).unwrap() { + return None; + } + + while (left + 1) < right { + let mut v = (left + right) / 2; + + let v_saved = v; + // Skip entries not having a key + while v < right { + let key = keyfn(&data[v]); + if key.is_some() { + break; + } + v += 1; + } + // All entries at the right side haven't keys. + // Shrink to the left side. + if v == right { + right = v_saved; + continue; + } + + let key = keyfn(&data[v]).unwrap(); + + if key == address { + return Some(v); + } + if address < key { + right = v; + } else { + left = v; + } + } + + Some(left) +} + +pub fn extract_string(raw: &[u8], off: usize) -> Option<&str> { + let mut end = off; + + if off >= raw.len() { + return None; + } + while end < raw.len() && raw[end] != 0 { + end += 1; + } + if end >= raw.len() { + return None; + } + CStr::from_bytes_with_nul(&raw[off..=end]) + .ok()? 
+ .to_str() + .ok() +} + +pub struct LinuxMapsEntry { + pub loaded_address: u64, + pub end_address: u64, + pub mode: u8, + pub offset: u64, + pub path: PathBuf, +} + +pub fn parse_maps(pid: u32) -> Result, Error> { + let mut entries = Vec::::new(); + let file_name = if pid == 0 { + String::from("/proc/self/maps") + } else { + format!("/proc/{pid}/maps") + }; + let file = fs::File::open(file_name)?; + let mut reader = BufReader::new(file); + let mut line = String::new(); + let re_ptn = Regex::new( + r"^([0-9a-f]+)-([0-9a-f]+) ([rwxp\\-]+) ([0-9a-f]+) [0-9a-f]+:[0-9a-f]+ [0-9]+ *((/[^/]+)+)$", + ); + if re_ptn.is_err() { + println!("{re_ptn:?}"); + return Err(Error::new(ErrorKind::InvalidData, "Failed to build regex")); + } + let re_ptn = re_ptn.unwrap(); + + while reader.read_line(&mut line)? > 0 { + if let Some(caps) = re_ptn.captures(&line) { + let loaded_address_str = caps.get(1).unwrap().as_str(); + let loaded_address = u64::from_str_radix(loaded_address_str, 16).unwrap(); + + let end_address_str = caps.get(2).unwrap().as_str(); + let end_address = u64::from_str_radix(end_address_str, 16).unwrap(); + + let mode_str = caps.get(3).unwrap().as_str(); + let mut mode = 0; + for c in mode_str.chars() { + mode = (mode << 1) | u8::from(c != '-'); + } + + let offset = u64::from_str_radix(caps.get(4).unwrap().as_str(), 16).unwrap(); + let path = caps.get(5).unwrap().as_str().strip_suffix('\n').unwrap(); + let mut path_str = path.to_string(); + if let Some(pos) = path.rfind(" (deleted)") { + if pos == path.len() - " (deleted)".len() { + path_str = format!("/proc/{pid}/map_files/{loaded_address:x}-{end_address:x}"); + } + } + + let entry = LinuxMapsEntry { + loaded_address, + end_address, + mode, + offset, + path: PathBuf::from(path_str), + }; + entries.push(entry); + } + line.clear(); + } + + Ok(entries) +} + +#[inline] +pub fn decode_leb128_128(mut data: &[u8]) -> Option<(u128, u8)> { + data.read_u128_leb128() +} + +#[inline] +pub fn decode_leb128(mut data: &[u8]) -> Option<(u64, u8)> { + data.read_u128_leb128().map(|(v, s)| (v as u64, s)) +} + +#[inline] +pub fn decode_leb128_s(mut data: &[u8]) -> Option<(i64, u8)> { + data.read_i128_leb128().map(|(v, s)| (v as i64, s)) +} + +#[inline] +pub fn decode_uhalf(mut data: &[u8]) -> u16 { + // TODO: Need to handle errors more gracefully. + data.read_u16().unwrap() +} + +#[inline] +pub fn decode_uword(mut data: &[u8]) -> u32 { + // TODO: Need to handle errors more gracefully. + data.read_u32().unwrap() +} + +#[inline] +pub fn decode_udword(mut data: &[u8]) -> u64 { + // TODO: Need to handle errors more gracefully. + data.read_u64().unwrap() +} + +mod sealed { + /// A marker trait for "plain old data" data types. + /// + /// # Safety + /// Only safe to implement for types that are valid for any bit pattern. + pub unsafe trait Pod {} + + unsafe impl Pod for i8 {} + unsafe impl Pod for u8 {} + unsafe impl Pod for i16 {} + unsafe impl Pod for u16 {} + unsafe impl Pod for i32 {} + unsafe impl Pod for u32 {} + unsafe impl Pod for i64 {} + unsafe impl Pod for u64 {} + unsafe impl Pod for i128 {} + unsafe impl Pod for u128 {} +} + +/// An trait providing utility functions for reading data from a byte buffer. +pub trait ReadRaw<'data> { + /// Ensure that `len` bytes are available for consumption. + fn ensure(&self, len: usize) -> Option<()>; + + /// Consume and return `len` bytes. + fn read_slice(&mut self, len: usize) -> Option<&'data [u8]>; + + /// Read a NUL terminated string. 
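+    ///
+    /// Returns `None` if the buffer contains no terminating NUL byte; the
+    /// NUL itself is consumed as part of the read.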
+ fn read_cstr(&mut self) -> Option<&'data CStr>; + + /// Read anything implementing `Pod`. + #[inline] + fn read_pod(&mut self) -> Option + where + T: sealed::Pod, + { + let data = self.read_slice(size_of::())?; + // SAFETY: `T` is `Pod` and hence valid for any bit pattern. The pointer + // is guaranteed to be valid and to point to memory of at least + // `sizeof(T)` bytes. + let value = unsafe { data.as_ptr().cast::().read_unaligned() }; + Some(value) + } + + /// Read a `u8` value. + #[inline] + fn read_u8(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `i16` value. + #[inline] + fn read_i16(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `u16` value. + #[inline] + fn read_u16(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `i32` value. + #[inline] + fn read_i32(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `u32` value. + #[inline] + fn read_u32(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `u64` value. + #[inline] + fn read_u64(&mut self) -> Option { + self.read_pod::() + } + + /// Read a `u128` encoded as unsigned variable length little endian base 128 + /// value. + /// + /// The function returns the value read along with the number of bytes + /// consumed. + fn read_u128_leb128(&mut self) -> Option<(u128, u8)> { + let mut shift = 0; + let mut value = 0u128; + while let Some(bytes) = self.read_slice(1) { + if let [byte] = bytes { + value |= ((byte & 0b0111_1111) as u128) << shift; + shift += 7; + if (byte & 0b1000_0000) == 0 { + return Some((value, shift / 7)); + } + } else { + unreachable!() + } + } + None + } + + /// Read a `u128` encoded as signed variable length little endian base 128 + /// value. + /// + /// The function returns the value read along with the number of bytes + /// consumed. + fn read_i128_leb128(&mut self) -> Option<(i128, u8)> { + let (value, shift) = self.read_u128_leb128()?; + let sign_bits = 128 - shift * 7; + let value = ((value as i128) << sign_bits) >> sign_bits; + Some((value, shift)) + } +} + +impl<'data> ReadRaw<'data> for &'data [u8] { + #[inline] + fn ensure(&self, len: usize) -> Option<()> { + if len > self.len() { + return None; + } + Some(()) + } + + #[inline] + fn read_slice(&mut self, len: usize) -> Option<&'data [u8]> { + self.ensure(len)?; + let (a, b) = self.split_at(len); + *self = b; + Some(a) + } + + #[inline] + fn read_cstr(&mut self) -> Option<&'data CStr> { + let idx = self.iter().position(|byte| *byte == b'\0')?; + CStr::from_bytes_with_nul(self.read_slice(idx + 1)?).ok() + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + + /// Make sure that `[u8]::ensure` works as expected. + #[test] + fn u8_slice_len_ensurance() { + let slice = [0u8; 0].as_slice(); + assert_eq!(slice.ensure(0), Some(())); + assert_eq!(slice.ensure(1), None); + + let slice = [1u8].as_slice(); + assert_eq!(slice.ensure(0), Some(())); + assert_eq!(slice.ensure(1), Some(())); + assert_eq!(slice.ensure(2), None); + } + + /// Check that we can read various integers from a slice. + #[test] + fn pod_reading() { + macro_rules! 
test { + ($type:ty) => {{ + let max = <$type>::MAX.to_ne_bytes(); + let one = (1 as $type).to_ne_bytes(); + + let mut data = Vec::new(); + let () = data.extend_from_slice(&max); + let () = data.extend_from_slice(&one); + let () = data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8]); + + let mut raw = data.as_slice(); + let uword = raw.read_pod::<$type>().unwrap(); + assert_eq!(uword, <$type>::MAX); + + let uword = raw.read_pod::<$type>().unwrap(); + assert_eq!(uword, 1); + }}; + } + + test!(i8); + test!(u8); + test!(i16); + test!(u16); + test!(i32); + test!(u32); + test!(i64); + test!(u64); + test!(i128); + test!(u128); + } + + /// Test reading of signed and unsigned 16 and 32 bit values against known + /// results. + #[test] + fn word_reading() { + let data = 0xf936857fu32.to_ne_bytes(); + assert_eq!(data.as_slice().read_u16().unwrap(), 0x857f); + assert_eq!(data.as_slice().read_i16().unwrap(), -31361); + assert_eq!(data.as_slice().read_u32().unwrap(), 0xf936857f); + assert_eq!(data.as_slice().read_i32().unwrap(), -113867393); + } + + /// Make sure that we can read leb128 encoded values. + #[test] + fn leb128_reading() { + let data = [0xf4, 0xf3, 0x75]; + let (v, s) = data.as_slice().read_u128_leb128().unwrap(); + assert_eq!(v, 0x1d79f4); + assert_eq!(s, 3); + + let (v, s) = data.as_slice().read_i128_leb128().unwrap(); + assert_eq!(v, -165388); + assert_eq!(s, 3); + } + + /// Check that we can read a NUL terminated string from a slice. + #[test] + fn cstr_reading() { + let mut slice = b"abc\x001234".as_slice(); + + let cstr = slice.read_cstr().unwrap(); + assert_eq!(cstr, CStr::from_bytes_with_nul(b"abc\0").unwrap()); + + // No terminating NUL byte. + let mut slice = b"abc".as_slice(); + assert_eq!(slice.read_cstr(), None); + } +} diff --git a/third_party/bpftool/.gitattributes b/third_party/bpftool/.gitattributes new file mode 100644 index 0000000..05e03be --- /dev/null +++ b/third_party/bpftool/.gitattributes @@ -0,0 +1 @@ +src/Makefile.* linguist-language=Makefile diff --git a/third_party/bpftool/.gitmodules b/third_party/bpftool/.gitmodules new file mode 100644 index 0000000..1706b4a --- /dev/null +++ b/third_party/bpftool/.gitmodules @@ -0,0 +1,3 @@ +[submodule "libbpf"] + path = libbpf + url = https://github.com/libbpf/libbpf.git diff --git a/third_party/bpftool/BPF-CHECKPOINT-COMMIT b/third_party/bpftool/BPF-CHECKPOINT-COMMIT new file mode 100644 index 0000000..ec5e4be --- /dev/null +++ b/third_party/bpftool/BPF-CHECKPOINT-COMMIT @@ -0,0 +1 @@ +496720b7cfb6574a8f6f4d434f23e3d1e6cfaeb9 diff --git a/third_party/bpftool/CHECKPOINT-COMMIT b/third_party/bpftool/CHECKPOINT-COMMIT new file mode 100644 index 0000000..130cfa6 --- /dev/null +++ b/third_party/bpftool/CHECKPOINT-COMMIT @@ -0,0 +1 @@ +a3e7e6b17946f48badce98d7ac360678a0ea7393 diff --git a/third_party/bpftool/Dockerfile b/third_party/bpftool/Dockerfile new file mode 100644 index 0000000..730319c --- /dev/null +++ b/third_party/bpftool/Dockerfile @@ -0,0 +1,39 @@ +# With this Dockerfile, you can create a container image: +# $ docker build -f Dockerfile -t bpftool . +# And then use it: +# $ docker run --rm -ti --privileged --pid=host bpftool prog + +# hadolint global ignore=DL3008 + +FROM ubuntu:22.04 as builder + +RUN \ + export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && \ + apt-get -y install --no-install-recommends \ + build-essential \ + libelf-dev \ + libz-dev \ + libcap-dev \ + clang llvm llvm-dev lld \ + binutils-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + +COPY . 
/src +RUN \ + make -C /src/src clean && \ + make -C /src/src -j "$(nproc)" + +FROM ubuntu:22.04 +RUN \ + export DEBIAN_FRONTEND=noninteractive && \ + apt-get update && \ + apt-get -y install --no-install-recommends \ + libelf1 \ + llvm && \ + rm -rf /var/lib/apt/lists/* + +COPY --from=builder /src/src/bpftool /bin/bpftool + +ENTRYPOINT ["/bin/bpftool"] diff --git a/third_party/bpftool/LICENSE b/third_party/bpftool/LICENSE new file mode 100644 index 0000000..00c68fe --- /dev/null +++ b/third_party/bpftool/LICENSE @@ -0,0 +1 @@ +GPL-2.0-only OR BSD-2-Clause diff --git a/third_party/bpftool/LICENSE.BSD-2-Clause b/third_party/bpftool/LICENSE.BSD-2-Clause new file mode 100644 index 0000000..da366e2 --- /dev/null +++ b/third_party/bpftool/LICENSE.BSD-2-Clause @@ -0,0 +1,32 @@ +Valid-License-Identifier: BSD-2-Clause +SPDX-URL: https://spdx.org/licenses/BSD-2-Clause.html +Usage-Guide: + To use the BSD 2-clause "Simplified" License put the following SPDX + tag/value pair into a comment according to the placement guidelines in + the licensing rules documentation: + SPDX-License-Identifier: BSD-2-Clause +License-Text: + +Copyright (c) . All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/bpftool/LICENSE.GPL-2.0 b/third_party/bpftool/LICENSE.GPL-2.0 new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/third_party/bpftool/LICENSE.GPL-2.0 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. 
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/third_party/bpftool/bash-completion/bpftool b/third_party/bpftool/bash-completion/bpftool new file mode 100644 index 0000000..085bf18 --- /dev/null +++ b/third_party/bpftool/bash-completion/bpftool @@ -0,0 +1,1211 @@ +# bpftool(8) bash completion -*- shell-script -*- +# +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2017-2018 Netronome Systems, Inc. +# +# Author: Quentin Monnet + +# Takes a list of words in argument; each one of them is added to COMPREPLY if +# it is not already present on the command line. Returns no value. +_bpftool_once_attr() +{ + local w idx found + for w in $*; do + found=0 + for (( idx=3; idx < ${#words[@]}-1; idx++ )); do + if [[ $w == ${words[idx]} ]]; then + found=1 + break + fi + done + [[ $found -eq 0 ]] && \ + COMPREPLY+=( $( compgen -W "$w" -- "$cur" ) ) + done +} + +# Takes a list of words as argument; if any of those words is present on the +# command line, return 0. Otherwise, return 1. +_bpftool_search_list() +{ + local w idx + for w in $*; do + for (( idx=3; idx < ${#words[@]}-1; idx++ )); do + [[ $w == ${words[idx]} ]] && return 0 + done + done + return 1 +} + +# Takes a list of words in argument; adds them all to COMPREPLY if none of them +# is already present on the command line. Returns no value. +_bpftool_one_of_list() +{ + _bpftool_search_list $* && return 1 + COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) ) +} + +_bpftool_get_map_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +# Takes map type and adds matching map ids to the list of suggestions. +_bpftool_get_map_ids_for_type() +{ + local type="$1" + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 "$type" | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_map_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +# Takes map type and adds matching map names to the list of suggestions. +_bpftool_get_map_names_for_type() +{ + local type="$1" + COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ + command grep -C2 "$type" | \ + command sed -n 's/.*"name": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_prog_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_prog_tags() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ + command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_prog_names() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ + command sed -n 's/.*"name": "\(.*\)",$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_btf_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_link_ids() +{ + COMPREPLY+=( $( compgen -W "$( bpftool -jp link 2>&1 | \ + command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) +} + +_bpftool_get_obj_map_names() +{ + local obj + + obj=$1 + + maps=$(objdump -j maps -t $obj 2>/dev/null | \ + command awk '/g . maps/ {print $NF}') + + COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) ) +} + +_bpftool_get_obj_map_idxs() +{ + local obj + + obj=$1 + + nmaps=$(objdump -j maps -t $obj 2>/dev/null | grep -c 'g . 
maps') + + COMPREPLY+=( $( compgen -W "$(seq 0 $((nmaps - 1)))" -- "$cur" ) ) +} + +_sysfs_get_netdevs() +{ + COMPREPLY+=( $( compgen -W "$( ls /sys/class/net 2>/dev/null )" -- \ + "$cur" ) ) +} + +# Retrieve type of the map that we are operating on. +_bpftool_map_guess_map_type() +{ + local keyword ref + for (( idx=3; idx < ${#words[@]}-1; idx++ )); do + case "${words[$((idx-2))]}" in + lookup|update) + keyword=${words[$((idx-1))]} + ref=${words[$((idx))]} + ;; + push) + printf "stack" + return 0 + ;; + enqueue) + printf "queue" + return 0 + ;; + esac + done + [[ -z $ref ]] && return 0 + + local type + type=$(bpftool -jp map show $keyword $ref | \ + command sed -n 's/.*"type": "\(.*\)",$/\1/p') + [[ -n $type ]] && printf $type +} + +_bpftool_map_update_get_id() +{ + local command="$1" + + # Is it the map to update, or a map to insert into the map to update? + # Search for "value" keyword. + local idx value + for (( idx=7; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == "value" ]]; then + value=1 + break + fi + done + if [[ $value -eq 0 ]]; then + case "$command" in + push) + _bpftool_get_map_ids_for_type stack + ;; + enqueue) + _bpftool_get_map_ids_for_type queue + ;; + *) + _bpftool_get_map_ids + ;; + esac + return 0 + fi + + # Id to complete is for a value. It can be either prog id or map id. This + # depends on the type of the map to update. + local type=$(_bpftool_map_guess_map_type) + case $type in + array_of_maps|hash_of_maps) + _bpftool_get_map_ids + return 0 + ;; + prog_array) + _bpftool_get_prog_ids + return 0 + ;; + *) + return 0 + ;; + esac +} + +_bpftool_map_update_get_name() +{ + local command="$1" + + # Is it the map to update, or a map to insert into the map to update? + # Search for "value" keyword. + local idx value + for (( idx=7; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == "value" ]]; then + value=1 + break + fi + done + if [[ $value -eq 0 ]]; then + case "$command" in + push) + _bpftool_get_map_names_for_type stack + ;; + enqueue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + fi + + # Name to complete is for a value. It can be either prog name or map name. This + # depends on the type of the map to update. + local type=$(_bpftool_map_guess_map_type) + case $type in + array_of_maps|hash_of_maps) + _bpftool_get_map_names + return 0 + ;; + prog_array) + _bpftool_get_prog_names + return 0 + ;; + *) + return 0 + ;; + esac +} + +_bpftool() +{ + local cur prev words objword json=0 + _init_completion || return + + # Deal with options + if [[ ${words[cword]} == -* ]]; then + local c='--version --json --pretty --bpffs --mapcompat --debug \ + --use-loader --base-btf' + COMPREPLY=( $( compgen -W "$c" -- "$cur" ) ) + return 0 + fi + if _bpftool_search_list -j --json -p --pretty; then + json=1 + fi + + # Deal with simplest keywords + case $prev in + help|hex) + return 0 + ;; + tag) + _bpftool_get_prog_tags + return 0 + ;; + dev|offload_dev|xdpmeta_dev) + _sysfs_get_netdevs + return 0 + ;; + file|pinned|-B|--base-btf) + _filedir + return 0 + ;; + batch) + COMPREPLY=( $( compgen -W 'file' -- "$cur" ) ) + return 0 + ;; + esac + + # Remove all options so completions don't have to deal with them. 
+ local i + for (( i=1; i < ${#words[@]}; )); do + if [[ ${words[i]::1} == - ]] && + [[ ${words[i]} != "-B" ]] && [[ ${words[i]} != "--base-btf" ]]; then + words=( "${words[@]:0:i}" "${words[@]:i+1}" ) + [[ $i -le $cword ]] && cword=$(( cword - 1 )) + else + i=$(( ++i )) + fi + done + cur=${words[cword]} + prev=${words[cword - 1]} + pprev=${words[cword - 2]} + + local object=${words[1]} command=${words[2]} + + if [[ -z $object || $cword -eq 1 ]]; then + case $cur in + *) + COMPREPLY=( $( compgen -W "$( bpftool help 2>&1 | \ + command sed \ + -e '/OBJECT := /!d' \ + -e 's/.*{//' \ + -e 's/}.*//' \ + -e 's/|//g' )" -- "$cur" ) ) + COMPREPLY+=( $( compgen -W 'batch help' -- "$cur" ) ) + return 0 + ;; + esac + fi + + [[ $command == help ]] && return 0 + + # Completion depends on object and command in use + case $object in + prog) + # Complete id and name, only for subcommands that use prog (but no + # map) ids/names. + case $command in + show|list|dump|pin) + case $prev in + id) + _bpftool_get_prog_ids + return 0 + ;; + name) + _bpftool_get_prog_names + return 0 + ;; + esac + ;; + esac + + local PROG_TYPE='id pinned tag name' + local MAP_TYPE='id pinned name' + local METRIC_TYPE='cycles instructions l1d_loads llc_misses \ + itlb_misses dtlb_misses' + case $command in + show|list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + dump) + case $prev in + $command) + COMPREPLY+=( $( compgen -W "xlated jited" -- \ + "$cur" ) ) + return 0 + ;; + xlated|jited) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ + "$cur" ) ) + return 0 + ;; + *) + # "file" is not compatible with other keywords here + if _bpftool_search_list 'file'; then + return 0 + fi + if ! _bpftool_search_list 'linum opcodes visual'; then + _bpftool_once_attr 'file' + fi + _bpftool_once_attr 'linum opcodes' + if _bpftool_search_list 'xlated' && [[ "$json" == 0 ]]; then + _bpftool_once_attr 'visual' + fi + return 0 + ;; + esac + ;; + pin) + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + else + _filedir + fi + return 0 + ;; + attach|detach) + case $cword in + 3) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + 4) + case $prev in + id) + _bpftool_get_prog_ids + ;; + name) + _bpftool_get_prog_names + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + 5) + local BPFTOOL_PROG_ATTACH_TYPES='sk_msg_verdict \ + sk_skb_verdict sk_skb_stream_verdict sk_skb_stream_parser \ + flow_dissector' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_ATTACH_TYPES" -- "$cur" ) ) + return 0 + ;; + 6) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + 7) + case $prev in + id) + _bpftool_get_map_ids + ;; + name) + _bpftool_get_map_names + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + esac + ;; + load|loadall) + local obj + + # Propose "load/loadall" to complete "bpftool prog load", + # or bash tries to complete "load" as a filename below. 
+ if [[ ${#words[@]} -eq 3 ]]; then + COMPREPLY=( $( compgen -W "load loadall" -- "$cur" ) ) + return 0 + fi + + if [[ ${#words[@]} -lt 6 ]]; then + _filedir + return 0 + fi + + obj=${words[3]} + + if [[ ${words[-4]} == "map" ]]; then + COMPREPLY=( $( compgen -W "id pinned" -- "$cur" ) ) + return 0 + fi + if [[ ${words[-3]} == "map" ]]; then + if [[ ${words[-2]} == "idx" ]]; then + _bpftool_get_obj_map_idxs $obj + elif [[ ${words[-2]} == "name" ]]; then + _bpftool_get_obj_map_names $obj + fi + return 0 + fi + if [[ ${words[-2]} == "map" ]]; then + COMPREPLY=( $( compgen -W "idx name" -- "$cur" ) ) + return 0 + fi + + case $prev in + type) + local BPFTOOL_PROG_LOAD_TYPES='socket kprobe \ + kretprobe classifier flow_dissector \ + action tracepoint raw_tracepoint \ + xdp perf_event cgroup/skb cgroup/sock \ + cgroup/dev lwt_in lwt_out lwt_xmit \ + lwt_seg6local sockops sk_skb sk_msg \ + lirc_mode2 cgroup/bind4 cgroup/bind6 \ + cgroup/connect4 cgroup/connect6 \ + cgroup/getpeername4 cgroup/getpeername6 \ + cgroup/getsockname4 cgroup/getsockname6 \ + cgroup/sendmsg4 cgroup/sendmsg6 \ + cgroup/recvmsg4 cgroup/recvmsg6 \ + cgroup/post_bind4 cgroup/post_bind6 \ + cgroup/sysctl cgroup/getsockopt \ + cgroup/setsockopt cgroup/sock_release struct_ops \ + fentry fexit freplace sk_lookup' + COMPREPLY=( $( compgen -W "$BPFTOOL_PROG_LOAD_TYPES" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids + return 0 + ;; + name) + _bpftool_get_map_names + return 0 + ;; + pinned|pinmaps) + _filedir + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "map" -- "$cur" ) ) + _bpftool_once_attr 'type pinmaps autoattach' + _bpftool_one_of_list 'offload_dev xdpmeta_dev' + return 0 + ;; + esac + ;; + tracelog) + return 0 + ;; + profile) + case $cword in + 3) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + 4) + case $prev in + id) + _bpftool_get_prog_ids + ;; + name) + _bpftool_get_prog_names + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + 5) + COMPREPLY=( $( compgen -W "$METRIC_TYPE duration" -- "$cur" ) ) + return 0 + ;; + 6) + case $prev in + duration) + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + return 0 + ;; + esac + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$METRIC_TYPE" -- "$cur" ) ) + return 0 + ;; + esac + ;; + run) + if [[ ${#words[@]} -eq 4 ]]; then + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + fi + case $prev in + id) + _bpftool_get_prog_ids + return 0 + ;; + name) + _bpftool_get_prog_names + return 0 + ;; + data_in|data_out|ctx_in|ctx_out) + _filedir + return 0 + ;; + repeat|data_size_out|ctx_size_out) + return 0 + ;; + *) + _bpftool_once_attr 'data_in data_out data_size_out \ + ctx_in ctx_out ctx_size_out repeat' + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'dump help pin attach detach \ + load loadall show list tracelog run profile' -- "$cur" ) ) + ;; + esac + ;; + struct_ops) + local STRUCT_OPS_TYPE='id name' + case $command in + show|list|dump|unregister) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$STRUCT_OPS_TYPE" -- "$cur" ) ) + ;; + id) + _bpftool_get_map_ids_for_type struct_ops + ;; + name) + _bpftool_get_map_names_for_type struct_ops + ;; + esac + return 0 + ;; + register) + _filedir + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'register unregister show list dump help' \ + -- "$cur" ) ) + ;; + esac + ;; + iter) + case $command in + pin) + case $prev in + $command) + _filedir + ;; + id) + _bpftool_get_map_ids + ;; 
+ name) + _bpftool_get_map_names + ;; + pinned) + _filedir + ;; + *) + _bpftool_one_of_list $MAP_TYPE + ;; + esac + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'pin help' \ + -- "$cur" ) ) + ;; + esac + ;; + map) + local MAP_TYPE='id pinned name' + case $command in + show|list|dump|peek|pop|dequeue|freeze) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + case "$command" in + peek) + _bpftool_get_map_ids_for_type stack + _bpftool_get_map_ids_for_type queue + ;; + pop) + _bpftool_get_map_ids_for_type stack + ;; + dequeue) + _bpftool_get_map_ids_for_type queue + ;; + *) + _bpftool_get_map_ids + ;; + esac + return 0 + ;; + name) + case "$command" in + peek) + _bpftool_get_map_names_for_type stack + _bpftool_get_map_names_for_type queue + ;; + pop) + _bpftool_get_map_names_for_type stack + ;; + dequeue) + _bpftool_get_map_names_for_type queue + ;; + *) + _bpftool_get_map_names + ;; + esac + return 0 + ;; + *) + return 0 + ;; + esac + ;; + create) + case $prev in + $command) + _filedir + return 0 + ;; + type) + local BPFTOOL_MAP_CREATE_TYPES="$(bpftool feature list_builtins map_types 2>/dev/null | \ + grep -v '^unspec$')" + COMPREPLY=( $( compgen -W "$BPFTOOL_MAP_CREATE_TYPES" -- "$cur" ) ) + return 0 + ;; + key|value|flags|entries) + return 0 + ;; + inner_map) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids + ;; + name) + case $pprev in + inner_map) + _bpftool_get_map_names + ;; + *) + return 0 + ;; + esac + ;; + *) + _bpftool_once_attr 'type key value entries name flags offload_dev' + if _bpftool_search_list 'array_of_maps' 'hash_of_maps'; then + _bpftool_once_attr 'inner_map' + fi + return 0 + ;; + esac + ;; + lookup|getnext|delete) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids + return 0 + ;; + name) + _bpftool_get_map_names + return 0 + ;; + key) + COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) + ;; + *) + case $(_bpftool_map_guess_map_type) in + queue|stack) + return 0 + ;; + esac + + _bpftool_once_attr 'key' + return 0 + ;; + esac + ;; + update|push|enqueue) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_map_update_get_id $command + return 0 + ;; + name) + _bpftool_map_update_get_name $command + return 0 + ;; + key) + COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) + ;; + value) + # We can have bytes, or references to a prog or a + # map, depending on the type of the map to update. + case "$(_bpftool_map_guess_map_type)" in + array_of_maps|hash_of_maps) + local MAP_TYPE='id pinned name' + COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ + -- "$cur" ) ) + return 0 + ;; + prog_array) + local PROG_TYPE='id pinned tag name' + COMPREPLY+=( $( compgen -W "$PROG_TYPE" \ + -- "$cur" ) ) + return 0 + ;; + *) + COMPREPLY+=( $( compgen -W 'hex' \ + -- "$cur" ) ) + return 0 + ;; + esac + return 0 + ;; + *) + case $(_bpftool_map_guess_map_type) in + queue|stack) + _bpftool_once_attr 'value' + return 0; + ;; + esac + + _bpftool_once_attr 'key' + local UPDATE_FLAGS='any exist noexist' + for (( idx=3; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == 'value' ]]; then + # 'value' is present, but is not the last + # word i.e. we can now have UPDATE_FLAGS. 
+ _bpftool_one_of_list "$UPDATE_FLAGS" + return 0 + fi + done + for (( idx=3; idx < ${#words[@]}-1; idx++ )); do + if [[ ${words[idx]} == 'key' ]]; then + # 'key' is present, but is not the last + # word i.e. we can now have 'value'. + _bpftool_once_attr 'value' + return 0 + fi + done + + return 0 + ;; + esac + ;; + pin) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + ;; + id) + _bpftool_get_map_ids + ;; + name) + _bpftool_get_map_names + ;; + esac + return 0 + ;; + event_pipe) + case $prev in + $command) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + _bpftool_get_map_ids_for_type perf_event_array + return 0 + ;; + name) + _bpftool_get_map_names_for_type perf_event_array + return 0 + ;; + cpu) + return 0 + ;; + index) + return 0 + ;; + *) + _bpftool_once_attr 'cpu index' + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'delete dump getnext help \ + lookup pin event_pipe show list update create \ + peek push enqueue pop dequeue freeze' -- \ + "$cur" ) ) + ;; + esac + ;; + btf) + local PROG_TYPE='id pinned tag name' + local MAP_TYPE='id pinned name' + case $command in + dump) + case $prev in + $command) + COMPREPLY+=( $( compgen -W "id map prog file" -- \ + "$cur" ) ) + return 0 + ;; + prog) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + map) + COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) + return 0 + ;; + id) + case $pprev in + prog) + _bpftool_get_prog_ids + ;; + map) + _bpftool_get_map_ids + ;; + $command) + _bpftool_get_btf_ids + ;; + esac + return 0 + ;; + name) + case $pprev in + prog) + _bpftool_get_prog_names + ;; + map) + _bpftool_get_map_names + ;; + esac + return 0 + ;; + format) + COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) + ;; + *) + # emit extra options + case ${words[3]} in + id|file) + _bpftool_once_attr 'format' + ;; + map|prog) + if [[ ${words[3]} == "map" ]] && [[ $cword == 6 ]]; then + COMPREPLY+=( $( compgen -W "key value kv all" -- "$cur" ) ) + fi + _bpftool_once_attr 'format' + ;; + *) + ;; + esac + return 0 + ;; + esac + ;; + show|list) + case $prev in + $command) + COMPREPLY+=( $( compgen -W "id" -- "$cur" ) ) + ;; + id) + _bpftool_get_btf_ids + ;; + esac + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'dump help show list' \ + -- "$cur" ) ) + ;; + esac + ;; + gen) + case $command in + object) + _filedir + return 0 + ;; + skeleton) + case $prev in + $command) + _filedir + return 0 + ;; + *) + _bpftool_once_attr 'name' + return 0 + ;; + esac + ;; + subskeleton) + case $prev in + $command) + _filedir + return 0 + ;; + *) + _bpftool_once_attr 'name' + return 0 + ;; + esac + ;; + min_core_btf) + _filedir + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'object skeleton subskeleton help min_core_btf' -- "$cur" ) ) + ;; + esac + ;; + cgroup) + case $command in + show|list|tree) + case $cword in + 3) + _filedir + ;; + 4) + COMPREPLY=( $( compgen -W 'effective' -- "$cur" ) ) + ;; + esac + return 0 + ;; + attach|detach) + local BPFTOOL_CGROUP_ATTACH_TYPES="$(bpftool feature list_builtins attach_types 2>/dev/null | \ + grep '^cgroup_')" + local ATTACH_FLAGS='multi override' + local PROG_TYPE='id pinned tag name' + # Check for $prev = $command first + if [ $prev = $command ]; then + _filedir + return 0 + # Then check for attach type. 
This is done outside of the + # "case $prev in" to avoid writing the whole list of attach + # types again as pattern to match (where we cannot reuse + # our variable). + elif [[ $BPFTOOL_CGROUP_ATTACH_TYPES =~ $prev ]]; then + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \ + "$cur" ) ) + return 0 + fi + # case/esac for the other cases + case $prev in + id) + _bpftool_get_prog_ids + return 0 + ;; + *) + if ! _bpftool_search_list "$BPFTOOL_CGROUP_ATTACH_TYPES"; then + COMPREPLY=( $( compgen -W \ + "$BPFTOOL_CGROUP_ATTACH_TYPES" -- "$cur" ) ) + elif [[ "$command" == "attach" ]]; then + # We have an attach type on the command line, + # but it is not the previous word, or + # "id|pinned|tag|name" (we already checked for + # that). This should only leave the case when + # we need attach flags for "attach" commamnd. + _bpftool_one_of_list "$ATTACH_FLAGS" + fi + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help attach detach \ + show list tree' -- "$cur" ) ) + ;; + esac + ;; + perf) + case $command in + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list' -- "$cur" ) ) + ;; + esac + ;; + net) + local PROG_TYPE='id pinned tag name' + local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload' + case $command in + show|list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) ) + return 0 + ;; + attach) + case $cword in + 3) + COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) ) + return 0 + ;; + 4) + COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) ) + return 0 + ;; + 5) + case $prev in + id) + _bpftool_get_prog_ids + ;; + name) + _bpftool_get_prog_names + ;; + pinned) + _filedir + ;; + esac + return 0 + ;; + 6) + COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) ) + return 0 + ;; + 8) + _bpftool_once_attr 'overwrite' + return 0 + ;; + esac + ;; + detach) + case $cword in + 3) + COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) ) + return 0 + ;; + 4) + COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) ) + return 0 + ;; + esac + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list attach detach' -- "$cur" ) ) + ;; + esac + ;; + feature) + case $command in + probe) + [[ $prev == "prefix" ]] && return 0 + if _bpftool_search_list 'macros'; then + _bpftool_once_attr 'prefix' + else + COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) + fi + _bpftool_one_of_list 'kernel dev' + _bpftool_once_attr 'full unprivileged' + return 0 + ;; + list_builtins) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W 'prog_types map_types \ + attach_types link_types helpers' -- "$cur" ) ) + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help list_builtins probe' -- "$cur" ) ) + ;; + esac + ;; + link) + case $command in + show|list|pin|detach) + case $prev in + id) + _bpftool_get_link_ids + return 0 + ;; + esac + ;; + esac + + local LINK_TYPE='id pinned' + case $command in + show|list) + [[ $prev != "$command" ]] && return 0 + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + return 0 + ;; + pin|detach) + if [[ $prev == "$command" ]]; then + COMPREPLY=( $( compgen -W "$LINK_TYPE" -- "$cur" ) ) + else + _filedir + fi + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help pin show list' -- "$cur" ) ) + ;; + esac + ;; + esac +} && +complete -F _bpftool bpftool + +# ex: ts=4 sw=4 et filetype=sh diff --git a/third_party/bpftool/docs/.gitignore b/third_party/bpftool/docs/.gitignore new file mode 100644 index 0000000..3c99250 --- 
/dev/null +++ b/third_party/bpftool/docs/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +bpftool*.8 diff --git a/third_party/bpftool/docs/Makefile b/third_party/bpftool/docs/Makefile new file mode 100644 index 0000000..8355aea --- /dev/null +++ b/third_party/bpftool/docs/Makefile @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +include ../src/Makefile.include + +INSTALL ?= install +RM ?= rm -f +RMDIR ?= rmdir --ignore-fail-on-non-empty + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +prefix ?= /usr/local +mandir ?= $(prefix)/man +man8dir = $(mandir)/man8 + +MAN8_RST = $(wildcard bpftool*.rst) + +_DOC_MAN8 = $(patsubst %.rst,%.8,$(MAN8_RST)) +DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8)) + +man: man8 +man8: $(DOC_MAN8) + +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) +RST2MAN_OPTS += --verbose --strip-comments + +list_pages = $(sort $(basename $(filter-out $(1),$(MAN8_RST)))) +see_also = $(subst " ",, \ + "\n" \ + "SEE ALSO\n" \ + "========\n" \ + "\t**bpf**\ (2),\n" \ + "\t**bpf-helpers**\\ (7)" \ + $(foreach page,$(call list_pages,$(1)),",\n\t**$(page)**\\ (8)") \ + "\n") + +$(OUTPUT)%.8: %.rst +ifndef RST2MAN_DEP + $(error "rst2man not found, but required to generate man pages") +endif + $(QUIET_GEN)( cat $< ; printf "%b" $(call see_also,$<) ) | rst2man $(RST2MAN_OPTS) > $@ + +clean: + $(call QUIET_CLEAN, Documentation) + $(Q)$(RM) $(DOC_MAN8) + +install: man + $(call QUIET_INSTALL, Documentation-man) + $(Q)$(INSTALL) -d -m 755 $(DESTDIR)$(man8dir) + $(Q)$(INSTALL) -m 644 $(DOC_MAN8) $(DESTDIR)$(man8dir) + +uninstall: + $(call QUIET_UNINST, Documentation-man) + $(Q)$(RM) $(addprefix $(DESTDIR)$(man8dir)/,$(_DOC_MAN8)) + $(Q)$(RMDIR) $(DESTDIR)$(man8dir) + +.PHONY: man man8 clean install uninstall +.DEFAULT_GOAL := man diff --git a/third_party/bpftool/docs/bpftool-btf.rst b/third_party/bpftool/docs/bpftool-btf.rst new file mode 100644 index 0000000..342716f --- /dev/null +++ b/third_party/bpftool/docs/bpftool-btf.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-btf +================ +------------------------------------------------------------------------------- +tool for inspection of BTF data +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **btf** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | { **-B** | **--base-btf** } } + + *COMMANDS* := { **dump** | **help** } + +BTF COMMANDS +============= + +| **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*] +| **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*] +| **bpftool** **btf help** +| +| *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } +| *FORMAT* := { **raw** | **c** } +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } + +DESCRIPTION +=========== + **bpftool btf { show | list }** [**id** *BTF_ID*] + Show information about loaded BTF objects. If a BTF ID is + specified, show information only about given BTF object, + otherwise list all BTF objects currently loaded on the + system. + + Since Linux 5.8 bpftool is able to discover information about + processes that hold open file descriptors (FDs) against BTF + objects. On such kernels bpftool will automatically emit this + information as well. 
+ + **bpftool btf dump** *BTF_SRC* + Dump BTF entries from a given *BTF_SRC*. + + When **id** is specified, BTF object with that ID will be + loaded and all its BTF types emitted. + + When **map** is provided, it's expected that map has + associated BTF object with BTF types describing key and + value. It's possible to select whether to dump only BTF + type(s) associated with key (**key**), value (**value**), + both key and value (**kv**), or all BTF types present in + associated BTF object (**all**). If not specified, **kv** + is assumed. + + When **prog** is provided, it's expected that program has + associated BTF object with BTF types. + + When specifying *FILE*, an ELF file is expected, containing + .BTF section with well-defined BTF binary format data, + typically produced by clang or pahole. + + **format** option can be used to override default (raw) + output format. Raw (**raw**) or C-syntax (**c**) output + formats are supported. + + **bpftool btf help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + + -B, --base-btf *FILE* + Pass a base BTF object. Base BTF objects are typically used + with BTF objects for kernel modules. To avoid duplicating + all kernel symbols required by modules, BTF objects for + modules are "split", they are built incrementally on top of + the kernel (vmlinux) BTF object. So the base BTF reference + should usually point to the kernel BTF. + + When the main BTF object to process (for example, the + module BTF to dump) is passed as a *FILE*, bpftool attempts + to autodetect the path for the base object, and passing + this option is optional. When the main BTF object is passed + through other handles, this option becomes necessary. + +EXAMPLES +======== +**# bpftool btf dump id 1226** + +:: + + [1] PTR '(anon)' type_id=2 + [2] STRUCT 'dummy_tracepoint_args' size=16 vlen=2 + 'pad' type_id=3 bits_offset=0 + 'sock' type_id=4 bits_offset=64 + [3] INT 'long long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none) + [4] PTR '(anon)' type_id=5 + [5] FWD 'sock' fwd_kind=union + +This gives an example of default output for all supported BTF kinds. 
+ +**$ cat prog.c** + +:: + + struct fwd_struct; + + enum my_enum { + VAL1 = 3, + VAL2 = 7, + }; + + typedef struct my_struct my_struct_t; + + struct my_struct { + const unsigned int const_int_field; + int bitfield_field: 4; + char arr_field[16]; + const struct fwd_struct *restrict fwd_field; + enum my_enum enum_field; + volatile my_struct_t *typedef_ptr_field; + }; + + union my_union { + int a; + struct my_struct b; + }; + + struct my_struct struct_global_var __attribute__((section("data_sec"))) = { + .bitfield_field = 3, + .enum_field = VAL1, + }; + int global_var __attribute__((section("data_sec"))) = 7; + + __attribute__((noinline)) + int my_func(union my_union *arg1, int arg2) + { + static int static_var __attribute__((section("data_sec"))) = 123; + static_var++; + return static_var; + } + +**$ bpftool btf dump file prog.o** + +:: + + [1] PTR '(anon)' type_id=2 + [2] UNION 'my_union' size=48 vlen=2 + 'a' type_id=3 bits_offset=0 + 'b' type_id=4 bits_offset=0 + [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED + [4] STRUCT 'my_struct' size=48 vlen=6 + 'const_int_field' type_id=5 bits_offset=0 + 'bitfield_field' type_id=3 bits_offset=32 bitfield_size=4 + 'arr_field' type_id=8 bits_offset=40 + 'fwd_field' type_id=10 bits_offset=192 + 'enum_field' type_id=14 bits_offset=256 + 'typedef_ptr_field' type_id=15 bits_offset=320 + [5] CONST '(anon)' type_id=6 + [6] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none) + [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED + [8] ARRAY '(anon)' type_id=7 index_type_id=9 nr_elems=16 + [9] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none) + [10] RESTRICT '(anon)' type_id=11 + [11] PTR '(anon)' type_id=12 + [12] CONST '(anon)' type_id=13 + [13] FWD 'fwd_struct' fwd_kind=union + [14] ENUM 'my_enum' size=4 vlen=2 + 'VAL1' val=3 + 'VAL2' val=7 + [15] PTR '(anon)' type_id=16 + [16] VOLATILE '(anon)' type_id=17 + [17] TYPEDEF 'my_struct_t' type_id=4 + [18] FUNC_PROTO '(anon)' ret_type_id=3 vlen=2 + 'arg1' type_id=1 + 'arg2' type_id=3 + [19] FUNC 'my_func' type_id=18 + [20] VAR 'struct_global_var' type_id=4, linkage=global-alloc + [21] VAR 'global_var' type_id=3, linkage=global-alloc + [22] VAR 'my_func.static_var' type_id=3, linkage=static + [23] DATASEC 'data_sec' size=0 vlen=3 + type_id=20 offset=0 size=48 + type_id=21 offset=0 size=4 + type_id=22 offset=52 size=4 + +The following commands print BTF types associated with specified map's key, +value, both key and value, and all BTF types, respectively. By default, both +key and value types will be printed. + +**# bpftool btf dump map id 123 key** + +:: + + [39] TYPEDEF 'u32' type_id=37 + +**# bpftool btf dump map id 123 value** + +:: + + [86] PTR '(anon)' type_id=87 + +**# bpftool btf dump map id 123 kv** + +:: + + [39] TYPEDEF 'u32' type_id=37 + [86] PTR '(anon)' type_id=87 + +**# bpftool btf dump map id 123 all** + +:: + + [1] PTR '(anon)' type_id=0 + . + . + . 
+ [2866] ARRAY '(anon)' type_id=52 index_type_id=51 nr_elems=4 + +All the standard ways to specify map or program are supported: + +**# bpftool btf dump map id 123** + +**# bpftool btf dump map pinned /sys/fs/bpf/map_name** + +**# bpftool btf dump prog id 456** + +**# bpftool btf dump prog tag b88e0a09b1d9759d** + +**# bpftool btf dump prog pinned /sys/fs/bpf/prog_name** + +| +| **# bpftool btf dump file /sys/kernel/btf/i2c_smbus** +| (or) +| **# I2C_SMBUS_ID=$(bpftool btf show -p | jq '.[] | select(.name=="i2c_smbus").id')** +| **# bpftool btf dump id ${I2C_SMBUS_ID} -B /sys/kernel/btf/vmlinux** + +:: + + [104848] STRUCT 'i2c_smbus_alert' size=40 vlen=2 + 'alert' type_id=393 bits_offset=0 + 'ara' type_id=56050 bits_offset=256 + [104849] STRUCT 'alert_data' size=12 vlen=3 + 'addr' type_id=16 bits_offset=0 + 'type' type_id=56053 bits_offset=32 + 'data' type_id=7 bits_offset=64 + [104850] PTR '(anon)' type_id=104848 + [104851] PTR '(anon)' type_id=104849 + [104852] FUNC 'i2c_register_spd' type_id=84745 linkage=static + [104853] FUNC 'smbalert_driver_init' type_id=1213 linkage=static + [104854] FUNC_PROTO '(anon)' ret_type_id=18 vlen=1 + 'ara' type_id=56050 + [104855] FUNC 'i2c_handle_smbus_alert' type_id=104854 linkage=static + [104856] FUNC 'smbalert_remove' type_id=104854 linkage=static + [104857] FUNC_PROTO '(anon)' ret_type_id=18 vlen=2 + 'ara' type_id=56050 + 'id' type_id=56056 + [104858] FUNC 'smbalert_probe' type_id=104857 linkage=static + [104859] FUNC 'smbalert_work' type_id=9695 linkage=static + [104860] FUNC 'smbus_alert' type_id=71367 linkage=static + [104861] FUNC 'smbus_do_alert' type_id=84827 linkage=static diff --git a/third_party/bpftool/docs/bpftool-cgroup.rst b/third_party/bpftool/docs/bpftool-cgroup.rst new file mode 100644 index 0000000..bd015ec --- /dev/null +++ b/third_party/bpftool/docs/bpftool-cgroup.rst @@ -0,0 +1,157 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-cgroup +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF progs +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. 
include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **cgroup** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } } + + *COMMANDS* := + { **show** | **list** | **tree** | **attach** | **detach** | **help** } + +CGROUP COMMANDS +=============== + +| **bpftool** **cgroup** { **show** | **list** } *CGROUP* [**effective**] +| **bpftool** **cgroup tree** [*CGROUP_ROOT*] [**effective**] +| **bpftool** **cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] +| **bpftool** **cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* +| **bpftool** **cgroup help** +| +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **cgroup_inet_ingress** | **cgroup_inet_egress** | +| **cgroup_inet_sock_create** | **cgroup_sock_ops** | +| **cgroup_device** | **cgroup_inet4_bind** | **cgroup_inet6_bind** | +| **cgroup_inet4_post_bind** | **cgroup_inet6_post_bind** | +| **cgroup_inet4_connect** | **cgroup_inet6_connect** | +| **cgroup_inet4_getpeername** | **cgroup_inet6_getpeername** | +| **cgroup_inet4_getsockname** | **cgroup_inet6_getsockname** | +| **cgroup_udp4_sendmsg** | **cgroup_udp6_sendmsg** | +| **cgroup_udp4_recvmsg** | **cgroup_udp6_recvmsg** | +| **cgroup_sysctl** | **cgroup_getsockopt** | **cgroup_setsockopt** | +| **cgroup_inet_sock_release** } +| *ATTACH_FLAGS* := { **multi** | **override** } + +DESCRIPTION +=========== + **bpftool cgroup { show | list }** *CGROUP* [**effective**] + List all programs attached to the cgroup *CGROUP*. + + Output will start with program ID followed by attach type, + attach flags and program name. + + If **effective** is specified retrieve effective programs that + will execute for events within a cgroup. This includes + inherited along with attached ones. + + **bpftool cgroup tree** [*CGROUP_ROOT*] [**effective**] + Iterate over all cgroups in *CGROUP_ROOT* and list all + attached programs. If *CGROUP_ROOT* is not specified, + bpftool uses cgroup v2 mountpoint. + + The output is similar to the output of cgroup show/list + commands: it starts with absolute cgroup path, followed by + program ID, attach type, attach flags and program name. + + If **effective** is specified retrieve effective programs that + will execute for events within a cgroup. This includes + inherited along with attached ones. + + **bpftool cgroup attach** *CGROUP* *ATTACH_TYPE* *PROG* [*ATTACH_FLAGS*] + Attach program *PROG* to the cgroup *CGROUP* with attach type + *ATTACH_TYPE* and optional *ATTACH_FLAGS*. + + *ATTACH_FLAGS* can be one of: **override** if a sub-cgroup installs + some bpf program, the program in this cgroup yields to sub-cgroup + program; **multi** if a sub-cgroup installs some bpf program, + that cgroup program gets run in addition to the program in this + cgroup. + + Only one program is allowed to be attached to a cgroup with + no attach flags or the **override** flag. Attaching another + program will release old program and attach the new one. + + Multiple programs are allowed to be attached to a cgroup with + **multi**. They are executed in FIFO order (those that were + attached first, run first). + + Non-default *ATTACH_FLAGS* are supported by kernel version 4.14 + and later. 
+ + *ATTACH_TYPE* can be on of: + **ingress** ingress path of the inet socket (since 4.10); + **egress** egress path of the inet socket (since 4.10); + **sock_create** opening of an inet socket (since 4.10); + **sock_ops** various socket operations (since 4.12); + **device** device access (since 4.15); + **bind4** call to bind(2) for an inet4 socket (since 4.17); + **bind6** call to bind(2) for an inet6 socket (since 4.17); + **post_bind4** return from bind(2) for an inet4 socket (since 4.17); + **post_bind6** return from bind(2) for an inet6 socket (since 4.17); + **connect4** call to connect(2) for an inet4 socket (since 4.17); + **connect6** call to connect(2) for an inet6 socket (since 4.17); + **sendmsg4** call to sendto(2), sendmsg(2), sendmmsg(2) for an + unconnected udp4 socket (since 4.18); + **sendmsg6** call to sendto(2), sendmsg(2), sendmmsg(2) for an + unconnected udp6 socket (since 4.18); + **recvmsg4** call to recvfrom(2), recvmsg(2), recvmmsg(2) for + an unconnected udp4 socket (since 5.2); + **recvmsg6** call to recvfrom(2), recvmsg(2), recvmmsg(2) for + an unconnected udp6 socket (since 5.2); + **sysctl** sysctl access (since 5.2); + **getsockopt** call to getsockopt (since 5.3); + **setsockopt** call to setsockopt (since 5.3); + **getpeername4** call to getpeername(2) for an inet4 socket (since 5.8); + **getpeername6** call to getpeername(2) for an inet6 socket (since 5.8); + **getsockname4** call to getsockname(2) for an inet4 socket (since 5.8); + **getsockname6** call to getsockname(2) for an inet6 socket (since 5.8). + **sock_release** closing an userspace inet socket (since 5.9). + + **bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG* + Detach *PROG* from the cgroup *CGROUP* and attach type + *ATTACH_TYPE*. + + **bpftool prog help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + + -f, --bpffs + Show file names of pinned programs. + +EXAMPLES +======== +| +| **# mount -t bpf none /sys/fs/bpf/** +| **# mkdir /sys/fs/cgroup/test.slice** +| **# bpftool prog load ./device_cgroup.o /sys/fs/bpf/prog** +| **# bpftool cgroup attach /sys/fs/cgroup/test.slice/ device id 1 allow_multi** + +**# bpftool cgroup list /sys/fs/cgroup/test.slice/** + +:: + + ID AttachType AttachFlags Name + 1 device allow_multi bpf_prog1 + +| +| **# bpftool cgroup detach /sys/fs/cgroup/test.slice/ device id 1** +| **# bpftool cgroup list /sys/fs/cgroup/test.slice/** + +:: + + ID AttachType AttachFlags Name diff --git a/third_party/bpftool/docs/bpftool-feature.rst b/third_party/bpftool/docs/bpftool-feature.rst new file mode 100644 index 0000000..e44039f --- /dev/null +++ b/third_party/bpftool/docs/bpftool-feature.rst @@ -0,0 +1,90 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +=============== +bpftool-feature +=============== +------------------------------------------------------------------------------- +tool for inspection of eBPF-related parameters for Linux kernel or net device +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. 
include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **feature** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| } + + *COMMANDS* := { **probe** | **help** } + +FEATURE COMMANDS +================ + +| **bpftool** **feature probe** [*COMPONENT*] [**full**] [**unprivileged**] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature list_builtins** *GROUP* +| **bpftool** **feature help** +| +| *COMPONENT* := { **kernel** | **dev** *NAME* } +| *GROUP* := { **prog_types** | **map_types** | **attach_types** | **link_types** | **helpers** } + +DESCRIPTION +=========== + **bpftool feature probe** [**kernel**] [**full**] [**macros** [**prefix** *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related + parameters, such as availability of the **bpf**\ () system call, + JIT status, eBPF program types availability, eBPF helper + functions availability, and more. + + By default, bpftool **does not run probes** for + **bpf_probe_write_user**\ () and **bpf_trace_printk**\() + helpers which print warnings to kernel logs. To enable them + and run all probes, the **full** keyword should be used. + + If the **macros** keyword (but not the **-j** option) is + passed, a subset of the output is dumped as a list of + **#define** macros that are ready to be included in a C + header file, for example. If, additionally, **prefix** is + used to define a *PREFIX*, the provided string will be used + as a prefix to the names of the macros: this can be used to + avoid conflicts on macro names when including the output of + this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is + specified, probing the kernel is the default behaviour. + + When the **unprivileged** keyword is used, bpftool will dump + only the features available to a user who does not have the + **CAP_SYS_ADMIN** capability set. The features available in + that case usually represent a small subset of the parameters + supported by the system. Unprivileged users MUST use the + **unprivileged** keyword: This is to avoid misdetection if + bpftool is inadvertently run as non-root, for example. This + keyword is unavailable if bpftool was compiled without + libcap. + + **bpftool feature probe dev** *NAME* [**full**] [**macros** [**prefix** *PREFIX*]] + Probe network device for supported eBPF features and dump + results to the console. + + The keywords **full**, **macros** and **prefix** have the + same role as when probing the kernel. + + **bpftool feature list_builtins** *GROUP* + List items known to bpftool. These can be BPF program types + (**prog_types**), BPF map types (**map_types**), attach types + (**attach_types**), link types (**link_types**), or BPF helper + functions (**helpers**). The command does not probe the system, but + simply lists the elements that bpftool knows from compilation time, + as provided from libbpf (for all object types) or from the BPF UAPI + header (list of helpers). This can be used in scripts to iterate over + BPF types or helpers. + + **bpftool feature help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst diff --git a/third_party/bpftool/docs/bpftool-gen.rst b/third_party/bpftool/docs/bpftool-gen.rst new file mode 100644 index 0000000..5006e72 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-gen.rst @@ -0,0 +1,446 @@ +.. 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-gen +================ +------------------------------------------------------------------------------- +tool for BPF code-generation +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **gen** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | { **-L** | **--use-loader** } } + + *COMMAND* := { **object** | **skeleton** | **help** } + +GEN COMMANDS +============= + +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen subskeleton** *FILE* [**name** *OBJECT_NAME*] +| **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...] +| **bpftool** **gen help** + +DESCRIPTION +=========== + **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s + into a single resulting *OUTPUT_FILE*. All the files involved + are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for + user-space object files, but in addition to combining data + and instruction sections, .BTF and .BTF.ext (if present in + any of the input files) data are combined together. .BTF + data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting + BTF information. + + BPF static linking allows to partition BPF source code into + individually compiled files that are then linked into + a single resulting BPF object file, which can be used to + generated BPF skeleton (with **gen skeleton** command) or + passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + + **bpftool gen skeleton** *FILE* + Generate BPF skeleton C header file for a given *FILE*. + + BPF skeleton is an alternative interface to existing libbpf + APIs for working with BPF objects. Skeleton code is intended + to significantly shorten and simplify code to load and work + with BPF programs from userspace side. Generated code is + tailored to specific input BPF object *FILE*, reflecting its + structure by listing out available maps, program, variables, + etc. Skeleton eliminates the need to lookup mentioned + components by name. Instead, if skeleton instantiation + succeeds, they are populated in skeleton structure as valid + libbpf types (e.g., **struct bpf_map** pointer) and can be + passed to existing generic libbpf APIs. + + In addition to simple and reliable access to maps and + programs, skeleton provides a storage for BPF links (**struct + bpf_link**) for each BPF program within BPF object. When + requested, supported BPF programs will be automatically + attached and resulting BPF links stored for further use by + user in pre-allocated fields in skeleton struct. For BPF + programs that can't be automatically attached by libbpf, + user can attach them manually, but store resulting BPF link + in per-program link field. All such set up links will be + automatically destroyed on BPF skeleton destruction. This + eliminates the need for users to manage links manually and + rely on libbpf support to detach programs and free up + resources. + + Another facility provided by BPF skeleton is an interface to + global variables of all supported kinds: mutable, read-only, + as well as extern ones. 
This interface allows to pre-setup + initial values of variables before BPF object is loaded and + verified by kernel. For non-read-only variables, the same + interface can be used to fetch values of global variables on + userspace side, even if they are modified by BPF code. + + During skeleton generation, contents of source BPF object + *FILE* is embedded within generated code and is thus not + necessary to keep around. This ensures skeleton and BPF + object file are matching 1-to-1 and always stay in sync. + Generated code is dual-licensed under LGPL-2.1 and + BSD-2-Clause licenses. + + It is a design goal and guarantee that skeleton interfaces + are interoperable with generic libbpf APIs. User should + always be able to use skeleton API to create and load BPF + object, and later use libbpf APIs to keep working with + specific maps, programs, etc. + + As part of skeleton, few custom functions are generated. + Each of them is prefixed with object name. Object name can + either be derived from object file name, i.e., if BPF object + file name is **example.o**, BPF object name will be + **example**. Object name can be also specified explicitly + through **name** *OBJECT_NAME* parameter. The following + custom functions are provided (assuming **example** as + the object name): + + - **example__open** and **example__open_opts**. + These functions are used to instantiate skeleton. It + corresponds to libbpf's **bpf_object__open**\ () API. + **_opts** variants accepts extra **bpf_object_open_opts** + options. + + - **example__load**. + This function creates maps, loads and verifies BPF + programs, initializes global data maps. It corresponds to + libppf's **bpf_object__load**\ () API. + + - **example__open_and_load** combines **example__open** and + **example__load** invocations in one commonly used + operation. + + - **example__attach** and **example__detach** + This pair of functions allow to attach and detach, + correspondingly, already loaded BPF object. Only BPF + programs of types supported by libbpf for auto-attachment + will be auto-attached and their corresponding BPF links + instantiated. For other BPF programs, user can manually + create a BPF link and assign it to corresponding fields in + skeleton struct. **example__detach** will detach both + links created automatically, as well as those populated by + user manually. + + - **example__destroy** + Detach and unload BPF programs, free up all the resources + used by skeleton and BPF object. + + If BPF object has global variables, corresponding structs + with memory layout corresponding to global data data section + layout will be created. Currently supported ones are: *.data*, + *.bss*, *.rodata*, and *.kconfig* structs/data sections. + These data sections/structs can be used to set up initial + values of variables, if set before **example__load**. + Afterwards, if target kernel supports memory-mapped BPF + arrays, same structs can be used to fetch and update + (non-read-only) data from userspace, with same simplicity + as for BPF side. + + **bpftool gen subskeleton** *FILE* + Generate BPF subskeleton C header file for a given *FILE*. + + Subskeletons are similar to skeletons, except they do not own + the corresponding maps, programs, or global variables. They + require that the object file used to generate them is already + loaded into a *bpf_object* by some other means. + + This functionality is useful when a library is included into a + larger BPF program. 
A subskeleton for the library would have access to all objects and
          globals defined in it, without having to know about the larger
          program.

          Consequently, there are only two functions defined
          for subskeletons:

          - **example__open(bpf_object\*)**
            Instantiates a subskeleton from an already opened (but not
            necessarily loaded) **bpf_object**.

          - **example__destroy()**
            Frees the storage for the subskeleton but *does not* unload
            any BPF programs or maps.

    **bpftool** **gen min_core_btf** *INPUT* *OUTPUT* *OBJECT* [*OBJECT*...]
          Generate a minimum BTF file as *OUTPUT*, derived from a given
          *INPUT* BTF file, containing all the BTF types needed so that
          the CO-RE relocations of one or more given eBPF objects may be
          satisfied.

          When kernels aren't compiled with CONFIG_DEBUG_INFO_BTF,
          libbpf, when loading an eBPF object, has to rely on external
          BTF files to be able to calculate CO-RE relocations.

          Usually, an external BTF file is built from existing kernel
          DWARF data using pahole. It contains all the types used by
          its respective kernel image and, because of that, is big.

          The min_core_btf feature builds smaller BTF files, customized
          to one or multiple eBPF objects, so they can be distributed
          together with an eBPF CO-RE based application, making the
          application portable across different kernel versions.

          Check the examples below for more information on how to use it.

    **bpftool gen help**
          Print short help message.

OPTIONS
=======
    .. include:: common_options.rst

    -L, --use-loader
          For skeletons, generate a "light" skeleton (also known as "loader"
          skeleton). A light skeleton contains a loader eBPF program. It does
          not use the majority of the libbpf infrastructure, and does not need
          libelf.

EXAMPLES
========
**$ cat example1.bpf.c**

::

    #include <stdbool.h>
    #include <linux/ptrace.h>
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    const volatile int param1 = 42;
    bool global_flag = true;
    struct { int x; } data = {};

    SEC("raw_tp/sys_enter")
    int handle_sys_enter(struct pt_regs *ctx)
    {
        static long my_static_var;
        if (global_flag)
            my_static_var++;
        else
            data.x += param1;
        return 0;
    }

**$ cat example2.bpf.c**

::

    #include <linux/ptrace.h>
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 128);
        __type(key, int);
        __type(value, long);
    } my_map SEC(".maps");

    SEC("raw_tp/sys_exit")
    int handle_sys_exit(struct pt_regs *ctx)
    {
        int zero = 0;
        bpf_map_lookup_elem(&my_map, &zero);
        return 0;
    }

This is an example BPF application with two BPF programs and a mix of BPF maps
and global variables. Source code is split across two source code files.

**$ clang --target=bpf -g example1.bpf.c -o example1.bpf.o**

**$ clang --target=bpf -g example2.bpf.c -o example2.bpf.o**

**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o**

This set of commands compiles *example1.bpf.c* and *example2.bpf.c*
individually and then statically links respective object files into the final
BPF ELF object file *example.bpf.o*.

**$ bpftool gen skeleton example.bpf.o name example | tee example.skel.h**

::

    /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */

    /* THIS FILE IS AUTOGENERATED!
*/ + #ifndef __EXAMPLE_SKEL_H__ + #define __EXAMPLE_SKEL_H__ + + #include + #include + + struct example { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + struct bpf_map *data; + struct bpf_map *bss; + struct bpf_map *my_map; + } maps; + struct { + struct bpf_program *handle_sys_enter; + struct bpf_program *handle_sys_exit; + } progs; + struct { + struct bpf_link *handle_sys_enter; + struct bpf_link *handle_sys_exit; + } links; + struct example__bss { + struct { + int x; + } data; + } *bss; + struct example__data { + _Bool global_flag; + long int handle_sys_enter_my_static_var; + } *data; + struct example__rodata { + int param1; + } *rodata; + }; + + static void example__destroy(struct example *obj); + static inline struct example *example__open_opts( + const struct bpf_object_open_opts *opts); + static inline struct example *example__open(); + static inline int example__load(struct example *obj); + static inline struct example *example__open_and_load(); + static inline int example__attach(struct example *obj); + static inline void example__detach(struct example *obj); + + #endif /* __EXAMPLE_SKEL_H__ */ + +**$ cat example.c** + +:: + + #include "example.skel.h" + + int main() + { + struct example *skel; + int err = 0; + + skel = example__open(); + if (!skel) + goto cleanup; + + skel->rodata->param1 = 128; + + err = example__load(skel); + if (err) + goto cleanup; + + err = example__attach(skel); + if (err) + goto cleanup; + + /* all libbpf APIs are usable */ + printf("my_map name: %s\n", bpf_map__name(skel->maps.my_map)); + printf("sys_enter prog FD: %d\n", + bpf_program__fd(skel->progs.handle_sys_enter)); + + /* detach and re-attach sys_exit program */ + bpf_link__destroy(skel->links.handle_sys_exit); + skel->links.handle_sys_exit = + bpf_program__attach(skel->progs.handle_sys_exit); + + printf("my_static_var: %ld\n", + skel->bss->handle_sys_enter_my_static_var); + + cleanup: + example__destroy(skel); + return err; + } + +**# ./example** + +:: + + my_map name: my_map + sys_enter prog FD: 8 + my_static_var: 7 + +This is a stripped-out version of skeleton generated for above example code. 
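For the **gen subskeleton** variant described earlier, usage differs only in
that the generated code borrows an already opened object instead of owning it.
A minimal sketch, assuming the generated header was saved as
*example.subskel.h* (the header name and the surrounding loading code are
illustrative, not part of the generated output):

::

    #include <bpf/libbpf.h>
    #include "example.subskel.h"

    int inspect_library(struct bpf_object *obj)
    {
        struct example *sub;

        /* obj was opened (and possibly loaded) elsewhere, e.g. by the
         * skeleton of the larger program that links in this library */
        sub = example__open(obj);
        if (!sub)
            return -1;

        /* maps, programs and globals of the library are now reachable,
         * e.g. sub->maps.my_map or sub->progs.handle_sys_exit */

        /* frees the subskeleton storage only; obj stays loaded */
        example__destroy(sub);
        return 0;
    }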
+ +min_core_btf +------------ + +**$ bpftool btf dump file 5.4.0-example.btf format raw** + +:: + + [1] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none) + [2] CONST '(anon)' type_id=1 + [3] VOLATILE '(anon)' type_id=1 + [4] ARRAY '(anon)' type_id=1 index_type_id=21 nr_elems=2 + [5] PTR '(anon)' type_id=8 + [6] CONST '(anon)' type_id=5 + [7] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=(none) + [8] CONST '(anon)' type_id=7 + [9] INT 'unsigned int' size=4 bits_offset=0 nr_bits=32 encoding=(none) + + +**$ bpftool btf dump file one.bpf.o format raw** + +:: + + [1] PTR '(anon)' type_id=2 + [2] STRUCT 'trace_event_raw_sys_enter' size=64 vlen=4 + 'ent' type_id=3 bits_offset=0 + 'id' type_id=7 bits_offset=64 + 'args' type_id=9 bits_offset=128 + '__data' type_id=12 bits_offset=512 + [3] STRUCT 'trace_entry' size=8 vlen=4 + 'type' type_id=4 bits_offset=0 + 'flags' type_id=5 bits_offset=16 + 'preempt_count' type_id=5 bits_offset=24 + + +**$ bpftool gen min_core_btf 5.4.0-example.btf 5.4.0-smaller.btf one.bpf.o** + +**$ bpftool btf dump file 5.4.0-smaller.btf format raw** + +:: + + [1] TYPEDEF 'pid_t' type_id=6 + [2] STRUCT 'trace_event_raw_sys_enter' size=64 vlen=1 + 'args' type_id=4 bits_offset=128 + [3] STRUCT 'task_struct' size=9216 vlen=2 + 'pid' type_id=1 bits_offset=17920 + 'real_parent' type_id=7 bits_offset=18048 + [4] ARRAY '(anon)' type_id=5 index_type_id=8 nr_elems=6 + [5] INT 'long unsigned int' size=8 bits_offset=0 nr_bits=64 encoding=(none) + [6] TYPEDEF '__kernel_pid_t' type_id=8 + [7] PTR '(anon)' type_id=3 + [8] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED + + +Now, the "5.4.0-smaller.btf" file may be used by libbpf as an external BTF file +when loading the "one.bpf.o" object into the "5.4.0-example" kernel. Note that +the generated BTF file won't allow other eBPF objects to be loaded, just the +ones given to min_core_btf. + +:: + + LIBBPF_OPTS(bpf_object_open_opts, opts, .btf_custom_path = "5.4.0-smaller.btf"); + struct bpf_object *obj; + + obj = bpf_object__open_file("one.bpf.o", &opts); + + ... diff --git a/third_party/bpftool/docs/bpftool-iter.rst b/third_party/bpftool/docs/bpftool-iter.rst new file mode 100644 index 0000000..84839d4 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-iter.rst @@ -0,0 +1,76 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +============ +bpftool-iter +============ +------------------------------------------------------------------------------- +tool to create BPF iterators +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **iter** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| } + + *COMMANDS* := { **pin** | **help** } + +ITER COMMANDS +=================== + +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] +| **bpftool** **iter help** +| +| *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } + +DESCRIPTION +=========== + **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] + A bpf iterator combines a kernel iterating of + particular kernel data (e.g., tasks, bpf_maps, etc.) + and a bpf program called for each kernel data object + (e.g., one task, one bpf_map, etc.). User space can + *read* kernel iterator output through *read()* syscall. + + The *pin* command creates a bpf iterator from *OBJ*, + and pin it to *PATH*. The *PATH* should be located + in *bpffs* mount. 
It must not contain a dot + character ('.'), which is reserved for future extensions + of *bpffs*. + + Map element bpf iterator requires an additional parameter + *MAP* so bpf program can iterate over map elements for + that map. User can have a bpf program in kernel to run + with each map element, do checking, filtering, aggregation, + etc. without copying data to user space. + + User can then *cat PATH* to see the bpf iterator output. + + **bpftool iter help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + +EXAMPLES +======== +**# bpftool iter pin bpf_iter_netlink.o /sys/fs/bpf/my_netlink** + +:: + + Create a file-based bpf iterator from bpf_iter_netlink.o and pin it + to /sys/fs/bpf/my_netlink + +**# bpftool iter pin bpf_iter_hashmap.o /sys/fs/bpf/my_hashmap map id 20** + +:: + + Create a file-based bpf iterator from bpf_iter_hashmap.o and map with + id 20, and pin it to /sys/fs/bpf/my_hashmap diff --git a/third_party/bpftool/docs/bpftool-link.rst b/third_party/bpftool/docs/bpftool-link.rst new file mode 100644 index 0000000..52a4eee --- /dev/null +++ b/third_party/bpftool/docs/bpftool-link.rst @@ -0,0 +1,112 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-link +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF links +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **link** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } + + *COMMANDS* := { **show** | **list** | **pin** | **help** } + +LINK COMMANDS +============= + +| **bpftool** **link { show | list }** [*LINK*] +| **bpftool** **link pin** *LINK* *FILE* +| **bpftool** **link detach** *LINK* +| **bpftool** **link help** +| +| *LINK* := { **id** *LINK_ID* | **pinned** *FILE* } + + +DESCRIPTION +=========== + **bpftool link { show | list }** [*LINK*] + Show information about active links. If *LINK* is + specified show information only about given link, + otherwise list all links currently active on the system. + + Output will start with link ID followed by link type and + zero or more named attributes, some of which depend on type + of link. + + Since Linux 5.8 bpftool is able to discover information about + processes that hold open file descriptors (FDs) against BPF + links. On such kernels bpftool will automatically emit this + information as well. + + **bpftool link pin** *LINK* *FILE* + Pin link *LINK* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool link detach** *LINK* + Force-detach link *LINK*. BPF link and its underlying BPF + program will stay valid, but they will be detached from the + respective BPF hook and BPF link will transition into + a defunct state until last open file descriptor for that + link is closed. + + **bpftool link help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + + -f, --bpffs + When showing BPF links, show file names of pinned + links. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. 
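The **pin** and **detach** operations above have direct counterparts in
libbpf's link API. A minimal sketch (the attach step is abbreviated, error
handling is reduced, and the pin path is an arbitrary example):

::

    #include <stdio.h>
    #include <bpf/libbpf.h>

    void pin_then_detach(struct bpf_program *prog)
    {
        struct bpf_link *link = bpf_program__attach(prog);

        if (!link)
            return;

        /* roughly "bpftool link pin LINK /sys/fs/bpf/mylink" */
        if (bpf_link__pin(link, "/sys/fs/bpf/mylink"))
            fprintf(stderr, "failed to pin link\n");

        /* roughly "bpftool link detach LINK": the link becomes defunct
         * but stays around until its last FD and pin are gone */
        bpf_link__detach(link);

        bpf_link__destroy(link); /* close our FD on the link */
    }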
+ +EXAMPLES +======== +**# bpftool link show** + +:: + + 10: cgroup prog 25 + cgroup_id 614 attach_type egress + pids test_progs(223) + +**# bpftool --json --pretty link show** + +:: + + [{ + "type": "cgroup", + "prog_id": 25, + "cgroup_id": 614, + "attach_type": "egress", + "pids": [{ + "pid": 223, + "comm": "test_progs" + } + ] + } + ] + +| +| **# bpftool link pin id 10 /sys/fs/bpf/link** +| **# ls -l /sys/fs/bpf/** + +:: + + -rw------- 1 root root 0 Apr 23 21:39 link diff --git a/third_party/bpftool/docs/bpftool-map.rst b/third_party/bpftool/docs/bpftool-map.rst new file mode 100644 index 0000000..3b7ba03 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-map.rst @@ -0,0 +1,277 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-map +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF maps +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **map** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | { **-f** | **--bpffs** } | { **-n** | **--nomount** } } + + *COMMANDS* := + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | + **delete** | **pin** | **help** } + +MAP COMMANDS +============= + +| **bpftool** **map** { **show** | **list** } [*MAP*] +| **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ +| **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] \ +| [**offload_dev** *NAME*] +| **bpftool** **map dump** *MAP* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] +| **bpftool** **map getnext** *MAP* [**key** *DATA*] +| **bpftool** **map delete** *MAP* **key** *DATA* +| **bpftool** **map pin** *MAP* *FILE* +| **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* +| **bpftool** **map freeze** *MAP* +| **bpftool** **map help** +| +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* | **name** *MAP_NAME* } +| *DATA* := { [**hex**] *BYTES* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *VALUE* := { *DATA* | *MAP* | *PROG* } +| *UPDATE_FLAGS* := { **any** | **exist** | **noexist** } +| *TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash** +| | **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash** +| | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** +| | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** +| | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** +| | **task_storage** | **bloom_filter** | **user_ringbuf** | **cgrp_storage** } + +DESCRIPTION +=========== + **bpftool map { show | list }** [*MAP*] + Show information about loaded maps. If *MAP* is specified + show information only about given maps, otherwise list all + maps currently loaded on the system. 
In case of **name**, + *MAP* may match several maps which will all be shown. + + Output will start with map ID followed by map type and + zero or more named attributes (depending on kernel version). + + Since Linux 5.8 bpftool is able to discover information about + processes that hold open file descriptors (FDs) against BPF + maps. On such kernels bpftool will automatically emit this + information as well. + + **bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**inner_map** *MAP*] [**offload_dev** *NAME*] + Create a new map with given parameters and pin it to *bpffs* + as *FILE*. + + *FLAGS* should be an integer which is the combination of + desired flags, e.g. 1024 for **BPF_F_MMAPABLE** (see bpf.h + UAPI header for existing flags). + + To create maps of type array-of-maps or hash-of-maps, the + **inner_map** keyword must be used to pass an inner map. The + kernel needs it to collect metadata related to the inner maps + that the new map will work with. + + Keyword **offload_dev** expects a network interface name, + and is used to request hardware offload for the map. + + **bpftool map dump** *MAP* + Dump all entries in a given *MAP*. In case of **name**, + *MAP* may match several maps which will all be dumped. + + **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] + Update map entry for a given *KEY*. + + *UPDATE_FLAGS* can be one of: **any** update existing entry + or add if doesn't exit; **exist** update only if entry already + exists; **noexist** update only if entry doesn't exist. + + If the **hex** keyword is provided in front of the bytes + sequence, the bytes are parsed as hexadecimal values, even if + no "0x" prefix is added. If the keyword is not provided, then + the bytes are parsed as decimal values, unless a "0x" prefix + (for hexadecimal) or a "0" prefix (for octal) is provided. + + **bpftool map lookup** *MAP* [**key** *DATA*] + Lookup **key** in the map. + + **bpftool map getnext** *MAP* [**key** *DATA*] + Get next key. If *key* is not specified, get first key. + + **bpftool map delete** *MAP* **key** *DATA* + Remove entry from the map. + + **bpftool map pin** *MAP* *FILE* + Pin map *MAP* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] + Read events from a **BPF_MAP_TYPE_PERF_EVENT_ARRAY** map. + + Install perf rings into a perf event array map and dump + output of any **bpf_perf_event_output**\ () call in the kernel. + By default read the number of CPUs on the system and + install perf ring for each CPU in the corresponding index + in the array. + + If **cpu** and **index** are specified, install perf ring + for given **cpu** at **index** in the array (single ring). + + Note that installing a perf ring into an array will silently + replace any existing ring. Any other application will stop + receiving events if it installed its rings earlier. + + **bpftool map peek** *MAP* + Peek next value in the queue or stack. + + **bpftool map push** *MAP* **value** *VALUE* + Push *VALUE* onto the stack. + + **bpftool map pop** *MAP* + Pop and print value from the stack. + + **bpftool map enqueue** *MAP* **value** *VALUE* + Enqueue *VALUE* into the queue. + + **bpftool map dequeue** *MAP* + Dequeue and print value from the queue. 
+ + **bpftool map freeze** *MAP* + Freeze the map as read-only from user space. Entries from a + frozen map can not longer be updated or deleted with the + **bpf**\ () system call. This operation is not reversible, + and the map remains immutable from user space until its + destruction. However, read and write permissions for BPF + programs to the map remain unchanged. + + **bpftool map help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + + -f, --bpffs + Show file names of pinned maps. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. + +EXAMPLES +======== +**# bpftool map show** + +:: + + 10: hash name some_map flags 0x0 + key 4B value 8B max_entries 2048 memlock 167936B + pids systemd(1) + +The following three commands are equivalent: + +| +| **# bpftool map update id 10 key hex 20 c4 b7 00 value hex 0f ff ff ab 01 02 03 4c** +| **# bpftool map update id 10 key 0x20 0xc4 0xb7 0x00 value 0x0f 0xff 0xff 0xab 0x01 0x02 0x03 0x4c** +| **# bpftool map update id 10 key 32 196 183 0 value 15 255 255 171 1 2 3 76** + +**# bpftool map lookup id 10 key 0 1 2 3** + +:: + + key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 + + +**# bpftool map dump id 10** + +:: + + key: 00 01 02 03 value: 00 01 02 03 04 05 06 07 + key: 0d 00 07 00 value: 02 00 00 00 01 02 03 04 + Found 2 elements + +**# bpftool map getnext id 10 key 0 1 2 3** + +:: + + key: + 00 01 02 03 + next key: + 0d 00 07 00 + +| +| **# mount -t bpf none /sys/fs/bpf/** +| **# bpftool map pin id 10 /sys/fs/bpf/map** +| **# bpftool map del pinned /sys/fs/bpf/map key 13 00 07 00** + +Note that map update can also be used in order to change the program references +hold by a program array map. This can be used, for example, to change the +programs used for tail-call jumps at runtime, without having to reload the +entry-point program. Below is an example for this use case: we load a program +defining a prog array map, and with a main function that contains a tail call +to other programs that can be used either to "process" packets or to "debug" +processing. Note that the prog array map MUST be pinned into the BPF virtual +file system for the map update to work successfully, as kernel flushes prog +array maps when they have no more references from user space (and the update +would be lost as soon as bpftool exits). 
+ +| +| **# bpftool prog loadall tail_calls.o /sys/fs/bpf/foo type xdp** +| **# bpftool prog --bpffs** + +:: + + 545: xdp name main_func tag 674b4b5597193dc3 gpl + loaded_at 2018-12-12T15:02:58+0000 uid 0 + xlated 240B jited 257B memlock 4096B map_ids 294 + pinned /sys/fs/bpf/foo/xdp + 546: xdp name bpf_func_process tag e369a529024751fc gpl + loaded_at 2018-12-12T15:02:58+0000 uid 0 + xlated 200B jited 164B memlock 4096B + pinned /sys/fs/bpf/foo/process + 547: xdp name bpf_func_debug tag 0b597868bc7f0976 gpl + loaded_at 2018-12-12T15:02:58+0000 uid 0 + xlated 200B jited 164B memlock 4096B + pinned /sys/fs/bpf/foo/debug + +**# bpftool map** + +:: + + 294: prog_array name jmp_table flags 0x0 + key 4B value 4B max_entries 1 memlock 4096B + owner_prog_type xdp owner jited + +| +| **# bpftool map pin id 294 /sys/fs/bpf/bar** +| **# bpftool map dump pinned /sys/fs/bpf/bar** + +:: + + Found 0 elements + +| +| **# bpftool map update pinned /sys/fs/bpf/bar key 0 0 0 0 value pinned /sys/fs/bpf/foo/debug** +| **# bpftool map dump pinned /sys/fs/bpf/bar** + +:: + + key: 00 00 00 00 value: 22 02 00 00 + Found 1 element diff --git a/third_party/bpftool/docs/bpftool-net.rst b/third_party/bpftool/docs/bpftool-net.rst new file mode 100644 index 0000000..f4e0a51 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-net.rst @@ -0,0 +1,178 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-net +================ +------------------------------------------------------------------------------- +tool for inspection of netdev/tc related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **net** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| } + + *COMMANDS* := + { **show** | **list** | **attach** | **detach** | **help** } + +NET COMMANDS +============ + +| **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* +| **bpftool** **net help** +| +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* } +| *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** } + +DESCRIPTION +=========== + **bpftool net { show | list }** [ **dev** *NAME* ] + List bpf program attachments in the kernel networking subsystem. + + Currently, only device driver xdp attachments and tc filter + classification/action attachments are implemented, i.e., for + program types **BPF_PROG_TYPE_SCHED_CLS**, + **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**. + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + users can use **bpftool cgroup** to dump cgroup attachments. + For sk_{filter, skb, msg, reuseport} and lwt/seg6 + bpf programs, users should consult other tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by + all tc class/qdisc bpf program attachments. Both xdp programs and + tc programs are ordered based on ifindex number. 
If multiple bpf + programs attached to the same networking device through **tc filter**, + the order will be first all bpf programs attached to tc classes, then + all bpf programs attached to non clsact qdiscs, and finally all + bpf programs attached to root and clsact qdisc. + + **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] + Attach bpf program *PROG* to network interface *NAME* with + type specified by *ATTACH_TYPE*. Previously attached bpf program + can be replaced by the command used with **overwrite** option. + Currently, only XDP-related modes are supported for *ATTACH_TYPE*. + + *ATTACH_TYPE* can be of: + **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; + **xdpgeneric** - Generic XDP. runs at generic XDP hook when packet already enters receive path as skb; + **xdpdrv** - Native XDP. runs earliest point in driver's receive path; + **xdpoffload** - Offload XDP. runs directly on NIC on each packet reception; + + **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* + Detach bpf program attached to network interface *NAME* with + type specified by *ATTACH_TYPE*. To detach bpf program, same + *ATTACH_TYPE* previously used for attach must be specified. + Currently, only XDP-related modes are supported for *ATTACH_TYPE*. + + **bpftool net help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + +EXAMPLES +======== + +| **# bpftool net** + +:: + + xdp: + eth0(2) driver id 198 + + tc: + eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act [] + eth0(2) clsact/ingress fbflow_icmp id 130246 act [] + eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726 + eth0(2) clsact/egress cls_fg_dscp id 108619 act [] + eth0(2) clsact/egress fbflow_egress id 130245 + +| +| **# bpftool -jp net** + +:: + + [{ + "xdp": [{ + "devname": "eth0", + "ifindex": 2, + "mode": "driver", + "id": 198 + } + ], + "tc": [{ + "devname": "eth0", + "ifindex": 2, + "kind": "htb", + "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", + "id": 111727, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/ingress", + "name": "fbflow_icmp", + "id": 130246, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", + "id": 111726, + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "cls_fg_dscp", + "id": 108619, + "act": [] + },{ + "devname": "eth0", + "ifindex": 2, + "kind": "clsact/egress", + "name": "fbflow_egress", + "id": 130245, + } + ] + } + ] + +| +| **# bpftool net attach xdpdrv id 16 dev enp6s0np0** +| **# bpftool net** + +:: + + xdp: + enp6s0np0(4) driver id 16 + +| +| **# bpftool net attach xdpdrv id 16 dev enp6s0np0** +| **# bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite** +| **# bpftool net** + +:: + + xdp: + enp6s0np0(4) driver id 20 + +| +| **# bpftool net attach xdpdrv id 16 dev enp6s0np0** +| **# bpftool net detach xdpdrv dev enp6s0np0** +| **# bpftool net** + +:: + + xdp: diff --git a/third_party/bpftool/docs/bpftool-perf.rst b/third_party/bpftool/docs/bpftool-perf.rst new file mode 100644 index 0000000..5fea633 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-perf.rst @@ -0,0 +1,69 @@ +.. 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-perf +================ +------------------------------------------------------------------------------- +tool for inspection of perf related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **perf** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| } + + *COMMANDS* := + { **show** | **list** | **help** } + +PERF COMMANDS +============= + +| **bpftool** **perf** { **show** | **list** } +| **bpftool** **perf help** + +DESCRIPTION +=========== + **bpftool perf { show | list }** + List all raw_tracepoint, tracepoint, kprobe attachment in the system. + + Output will start with process id and file descriptor in that process, + followed by bpf program id, attachment information, and attachment point. + The attachment point for raw_tracepoint/tracepoint is the trace probe name. + The attachment point for k[ret]probe is either symbol name and offset, + or a kernel virtual address. + The attachment point for u[ret]probe is the file name and the file offset. + + **bpftool perf help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + +EXAMPLES +======== + +| **# bpftool perf** + +:: + + pid 21711 fd 5: prog_id 5 kprobe func __x64_sys_write offset 0 + pid 21765 fd 5: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0 + pid 21767 fd 5: prog_id 8 tracepoint sys_enter_nanosleep + pid 21800 fd 5: prog_id 9 uprobe filename /home/yhs/a.out offset 1159 + +| +| **# bpftool -j perf** + +:: + + [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \ + {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \ + {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \ + {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}] diff --git a/third_party/bpftool/docs/bpftool-prog.rst b/third_party/bpftool/docs/bpftool-prog.rst new file mode 100644 index 0000000..dcae81b --- /dev/null +++ b/third_party/bpftool/docs/bpftool-prog.rst @@ -0,0 +1,375 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +bpftool-prog +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF progs +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. 
include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **prog** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| | + { **-f** | **--bpffs** } | { **-m** | **--mapcompat** } | { **-n** | **--nomount** } | + { **-L** | **--use-loader** } } + + *COMMANDS* := + { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | + **loadall** | **help** } + +PROG COMMANDS +============= + +| **bpftool** **prog** { **show** | **list** } [*PROG*] +| **bpftool** **prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] +| **bpftool** **prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] +| **bpftool** **prog pin** *PROG* *FILE* +| **bpftool** **prog** { **load** | **loadall** } *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] +| **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*] +| **bpftool** **prog tracelog** +| **bpftool** **prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] +| **bpftool** **prog profile** *PROG* [**duration** *DURATION*] *METRICs* +| **bpftool** **prog help** +| +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } +| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } +| *TYPE* := { +| **socket** | **kprobe** | **kretprobe** | **classifier** | **action** | +| **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** | **cgroup/skb** | +| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** | +| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** | +| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** | +| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** | +| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | +| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** | +| **cgroup/getsockopt** | **cgroup/setsockopt** | **cgroup/sock_release** | +| **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup** +| } +| *ATTACH_TYPE* := { +| **sk_msg_verdict** | **sk_skb_verdict** | **sk_skb_stream_verdict** | +| **sk_skb_stream_parser** | **flow_dissector** +| } +| *METRICs* := { +| **cycles** | **instructions** | **l1d_loads** | **llc_misses** | +| **itlb_misses** | **dtlb_misses** +| } + + +DESCRIPTION +=========== + **bpftool prog { show | list }** [*PROG*] + Show information about loaded programs. If *PROG* is + specified show information only about given programs, + otherwise list all programs currently loaded on the system. + In case of **tag** or **name**, *PROG* may match several + programs which will all be shown. + + Output will start with program ID followed by program type and + zero or more named attributes (depending on kernel version). + + Since Linux 5.1 the kernel can collect statistics on BPF + programs (such as the total time spent running the program, + and the number of times it was run). If available, bpftool + shows such statistics. However, the kernel does not collect + them by defaults, as it slightly impacts performance on each + program run. Activation or deactivation of the feature is + performed via the **kernel.bpf_stats_enabled** sysctl knob. 
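        Statistics can also be toggled programmatically rather than through
        the sysctl; a minimal sketch using libbpf's **bpf_enable_stats**\ ()
        (Linux 5.8 and later; collection stops when the returned file
        descriptor is closed)::

            #include <unistd.h>
            #include <bpf/bpf.h>

            int run_with_stats(void)
            {
                /* equivalent to kernel.bpf_stats_enabled=1 while fd is open */
                int fd = bpf_enable_stats(BPF_STATS_RUN_TIME);

                if (fd < 0)
                    return fd;
                /* ... run workload, then check run_time_ns/run_cnt in
                 * "bpftool prog show" ... */
                close(fd);
                return 0;
            }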
+ + Since Linux 5.8 bpftool is able to discover information about + processes that hold open file descriptors (FDs) against BPF + programs. On such kernels bpftool will automatically emit this + information as well. + + **bpftool prog dump xlated** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] [**visual**] }] + Dump eBPF instructions of the programs from the kernel. By + default, eBPF will be disassembled and printed to standard + output in human-readable format. In this case, **opcodes** + controls if raw opcodes should be printed as well. + + In case of **tag** or **name**, *PROG* may match several + programs which will all be dumped. However, if **file** or + **visual** is specified, *PROG* must match a single program. + + If **file** is specified, the binary image will instead be + written to *FILE*. + + If **visual** is specified, control flow graph (CFG) will be + built instead, and eBPF instructions will be presented with + CFG in DOT format, on standard output. + + If the programs have line_info available, the source line will + be displayed. If **linum** is specified, the filename, line + number and line column will also be displayed. + + **bpftool prog dump jited** *PROG* [{ **file** *FILE* | [**opcodes**] [**linum**] }] + Dump jited image (host machine code) of the program. + + If *FILE* is specified image will be written to a file, + otherwise it will be disassembled and printed to stdout. + *PROG* must match a single program when **file** is specified. + + **opcodes** controls if raw opcodes will be printed. + + If the prog has line_info available, the source line will + be displayed. If **linum** is specified, the filename, line + number and line column will also be displayed. + + **bpftool prog pin** *PROG* *FILE* + Pin program *PROG* as *FILE*. + + Note: *FILE* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool prog { load | loadall }** *OBJ* *PATH* [**type** *TYPE*] [**map** { **idx** *IDX* | **name** *NAME* } *MAP*] [{ **offload_dev** | **xdpmeta_dev** } *NAME*] [**pinmaps** *MAP_DIR*] [**autoattach**] + Load bpf program(s) from binary *OBJ* and pin as *PATH*. + **bpftool prog load** pins only the first program from the + *OBJ* as *PATH*. **bpftool prog loadall** pins all programs + from the *OBJ* under *PATH* directory. + **type** is optional, if not specified program type will be + inferred from section names. + By default bpftool will create new maps as declared in the ELF + object being loaded. **map** parameter allows for the reuse + of existing maps. It can be specified multiple times, each + time for a different map. *IDX* refers to index of the map + to be replaced in the ELF file counting from 0, while *NAME* + allows to replace a map by name. *MAP* specifies the map to + use, referring to it by **id** or through a **pinned** file. + If **offload_dev** *NAME* is specified program will be loaded + onto given networking device (offload). + If **xdpmeta_dev** *NAME* is specified program will become + device-bound without offloading, this facilitates access + to XDP metadata. + Optional **pinmaps** argument can be provided to pin all + maps under *MAP_DIR* directory. + + If **autoattach** is specified program will be attached + before pin. In that case, only the link (representing the + program attached to its hook) is pinned, not the program as + such, so the path won't show in **bpftool prog show -f**, + only show in **bpftool link show -f**. 
Also, this only works + when bpftool (libbpf) is able to infer all necessary + information from the object file, in particular, it's not + supported for all program types. If a program does not + support autoattach, bpftool falls back to regular pinning + for that program instead. + + Note: *PATH* must be located in *bpffs* mount. It must not + contain a dot character ('.'), which is reserved for future + extensions of *bpffs*. + + **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*] + Attach bpf program *PROG* (with type specified by + *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* + parameter, with the exception of *flow_dissector* which is + attached to current networking name space. + + **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*] + Detach bpf program *PROG* (with type specified by + *ATTACH_TYPE*). Most *ATTACH_TYPEs* require a *MAP* + parameter, with the exception of *flow_dissector* which is + detached from the current networking name space. + + **bpftool prog tracelog** + Dump the trace pipe of the system to the console (stdout). + Hit to stop printing. BPF programs can write to this + trace pipe at runtime with the **bpf_trace_printk**\ () helper. + This should be used only for debugging purposes. For + streaming data from BPF programs to user space, one can use + perf events (see also **bpftool-map**\ (8)). + + **bpftool prog run** *PROG* **data_in** *FILE* [**data_out** *FILE* [**data_size_out** *L*]] [**ctx_in** *FILE* [**ctx_out** *FILE* [**ctx_size_out** *M*]]] [**repeat** *N*] + Run BPF program *PROG* in the kernel testing infrastructure + for BPF, meaning that the program works on the data and + context provided by the user, and not on actual packets or + monitored functions etc. Return value and duration for the + test run are printed out to the console. + + Input data is read from the *FILE* passed with **data_in**. + If this *FILE* is "**-**", input data is read from standard + input. Input context, if any, is read from *FILE* passed with + **ctx_in**. Again, "**-**" can be used to read from standard + input, but only if standard input is not already in use for + input data. If a *FILE* is passed with **data_out**, output + data is written to that file. Similarly, output context is + written to the *FILE* passed with **ctx_out**. For both + output flows, "**-**" can be used to print to the standard + output (as plain text, or JSON if relevant option was + passed). If output keywords are omitted, output data and + context are discarded. Keywords **data_size_out** and + **ctx_size_out** are used to pass the size (in bytes) for the + output buffers to the kernel, although the default of 32 kB + should be more than enough for most cases. + + Keyword **repeat** is used to indicate the number of + consecutive runs to perform. Note that output data and + context printed to files correspond to the last of those + runs. The duration printed out at the end of the runs is an + average over all runs performed by the command. + + Not all program types support test run. Among those which do, + not all of them can take the **ctx_in**/**ctx_out** + arguments. bpftool does not perform checks on program types. + + **bpftool prog profile** *PROG* [**duration** *DURATION*] *METRICs* + Profile *METRICs* for bpf program *PROG* for *DURATION* + seconds or until user hits . *DURATION* is optional. + If *DURATION* is not specified, the profiling will run up to + **UINT_MAX** seconds. + + **bpftool prog help** + Print short help message. + +OPTIONS +======= + .. 
include:: common_options.rst + + -f, --bpffs + When showing BPF programs, show file names of pinned + programs. + + -m, --mapcompat + Allow loading maps with unknown map definitions. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. + + -L, --use-loader + Load program as a "loader" program. This is useful to debug + the generation of such programs. When this option is in + use, bpftool attempts to load the programs from the object + file into the kernel, but does not pin them (therefore, the + *PATH* must not be provided). + + When combined with the **-d**\ \|\ **--debug** option, + additional debug messages are generated, and the execution + of the loader program will use the **bpf_trace_printk**\ () + helper to log each step of loading BTF, creating the maps, + and loading the programs (see **bpftool prog tracelog** as + a way to dump those messages). + +EXAMPLES +======== +**# bpftool prog show** + +:: + + 10: xdp name some_prog tag 005a3d2123620c8b gpl run_time_ns 81632 run_cnt 10 + loaded_at 2017-09-29T20:11:00+0000 uid 0 + xlated 528B jited 370B memlock 4096B map_ids 10 + pids systemd(1) + +**# bpftool --json --pretty prog show** + +:: + + [{ + "id": 10, + "type": "xdp", + "tag": "005a3d2123620c8b", + "gpl_compatible": true, + "run_time_ns": 81632, + "run_cnt": 10, + "loaded_at": 1506715860, + "uid": 0, + "bytes_xlated": 528, + "jited": true, + "bytes_jited": 370, + "bytes_memlock": 4096, + "map_ids": [10 + ], + "pids": [{ + "pid": 1, + "comm": "systemd" + } + ] + } + ] + +| +| **# bpftool prog dump xlated id 10 file /tmp/t** +| **$ ls -l /tmp/t** + +:: + + -rw------- 1 root root 560 Jul 22 01:42 /tmp/t + +**# bpftool prog dump jited tag 005a3d2123620c8b** + +:: + + 0: push %rbp + 1: mov %rsp,%rbp + 2: sub $0x228,%rsp + 3: sub $0x28,%rbp + 4: mov %rbx,0x0(%rbp) + +| +| **# mount -t bpf none /sys/fs/bpf/** +| **# bpftool prog pin id 10 /sys/fs/bpf/prog** +| **# bpftool prog load ./my_prog.o /sys/fs/bpf/prog2** +| **# ls -l /sys/fs/bpf/** + +:: + + -rw------- 1 root root 0 Jul 22 01:43 prog + -rw------- 1 root root 0 Jul 22 01:44 prog2 + +**# bpftool prog dump jited pinned /sys/fs/bpf/prog opcodes** + +:: + + 0: push %rbp + 55 + 1: mov %rsp,%rbp + 48 89 e5 + 4: sub $0x228,%rsp + 48 81 ec 28 02 00 00 + b: sub $0x28,%rbp + 48 83 ed 28 + f: mov %rbx,0x0(%rbp) + 48 89 5d 00 + +| +| **# bpftool prog load xdp1_kern.o /sys/fs/bpf/xdp1 type xdp map name rxcnt id 7** +| **# bpftool prog show pinned /sys/fs/bpf/xdp1** + +:: + + 9: xdp name xdp_prog1 tag 539ec6ce11b52f98 gpl + loaded_at 2018-06-25T16:17:31-0700 uid 0 + xlated 488B jited 336B memlock 4096B map_ids 7 + +**# rm /sys/fs/bpf/xdp1** + +| +| **# bpftool prog profile id 337 duration 10 cycles instructions llc_misses** + +:: + + 51397 run_cnt + 40176203 cycles (83.05%) + 42518139 instructions # 1.06 insns per cycle (83.39%) + 123 llc_misses # 2.89 LLC misses per million insns (83.15%) + +| +| Output below is for the trace logs. +| Run in separate terminals: +| **# bpftool prog tracelog** +| **# bpftool prog load -L -d file.o** + +:: + + bpftool-620059 [004] d... 2634685.517903: bpf_trace_printk: btf_load size 665 r=5 + bpftool-620059 [004] d... 2634685.517912: bpf_trace_printk: map_create sample_map idx 0 type 2 value_size 4 value_btf_id 0 r=6 + bpftool-620059 [004] d... 2634685.517997: bpf_trace_printk: prog_load sample insn_cnt 13 r=7 + bpftool-620059 [004] d... 
2634685.517999: bpf_trace_printk: close(5) = 0 diff --git a/third_party/bpftool/docs/bpftool-struct_ops.rst b/third_party/bpftool/docs/bpftool-struct_ops.rst new file mode 100644 index 0000000..8022b53 --- /dev/null +++ b/third_party/bpftool/docs/bpftool-struct_ops.rst @@ -0,0 +1,92 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================== +bpftool-struct_ops +================== +------------------------------------------------------------------------------- +tool to register/unregister/introspect BPF struct_ops +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **struct_ops** *COMMAND* + + *OPTIONS* := { |COMMON_OPTIONS| } + + *COMMANDS* := + { **show** | **list** | **dump** | **register** | **unregister** | **help** } + +STRUCT_OPS COMMANDS +=================== + +| **bpftool** **struct_ops { show | list }** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops dump** [*STRUCT_OPS_MAP*] +| **bpftool** **struct_ops register** *OBJ* [*LINK_DIR*] +| **bpftool** **struct_ops unregister** *STRUCT_OPS_MAP* +| **bpftool** **struct_ops help** +| +| *STRUCT_OPS_MAP* := { **id** *STRUCT_OPS_MAP_ID* | **name** *STRUCT_OPS_MAP_NAME* } +| *OBJ* := /a/file/of/bpf_struct_ops.o + + +DESCRIPTION +=========== + **bpftool struct_ops { show | list }** [*STRUCT_OPS_MAP*] + Show brief information about the struct_ops in the system. + If *STRUCT_OPS_MAP* is specified, it shows information only + for the given struct_ops. Otherwise, it lists all struct_ops + currently existing in the system. + + Output will start with struct_ops map ID, followed by its map + name and its struct_ops's kernel type. + + **bpftool struct_ops dump** [*STRUCT_OPS_MAP*] + Dump details information about the struct_ops in the system. + If *STRUCT_OPS_MAP* is specified, it dumps information only + for the given struct_ops. Otherwise, it dumps all struct_ops + currently existing in the system. + + **bpftool struct_ops register** *OBJ* [*LINK_DIR*] + Register bpf struct_ops from *OBJ*. All struct_ops under + the ELF section ".struct_ops" and ".struct_ops.link" will + be registered to its kernel subsystem. For each + struct_ops in the ".struct_ops.link" section, a link + will be created. You can give *LINK_DIR* to provide a + directory path where these links will be pinned with the + same name as their corresponding map name. + + **bpftool struct_ops unregister** *STRUCT_OPS_MAP* + Unregister the *STRUCT_OPS_MAP* from the kernel subsystem. + + **bpftool struct_ops help** + Print short help message. + +OPTIONS +======= + .. include:: common_options.rst + +EXAMPLES +======== +**# bpftool struct_ops show** + +:: + + 100: dctcp tcp_congestion_ops + 105: cubic tcp_congestion_ops + +**# bpftool struct_ops unregister id 105** + +:: + + Unregistered tcp_congestion_ops cubic id 105 + +**# bpftool struct_ops register bpf_cubic.o** + +:: + + Registered tcp_congestion_ops cubic id 110 diff --git a/third_party/bpftool/docs/bpftool.rst b/third_party/bpftool/docs/bpftool.rst new file mode 100644 index 0000000..6965c94 --- /dev/null +++ b/third_party/bpftool/docs/bpftool.rst @@ -0,0 +1,70 @@ +.. 
SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +================ +BPFTOOL +================ +------------------------------------------------------------------------------- +tool for inspection and simple manipulation of eBPF programs and maps +------------------------------------------------------------------------------- + +:Manual section: 8 + +.. include:: substitutions.rst + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] *OBJECT* { *COMMAND* | **help** } + + **bpftool** **batch file** *FILE* + + **bpftool** **version** + + *OBJECT* := { **map** | **program** | **link** | **cgroup** | **perf** | **net** | **feature** | + **btf** | **gen** | **struct_ops** | **iter** } + + *OPTIONS* := { { **-V** | **--version** } | |COMMON_OPTIONS| } + + *MAP-COMMANDS* := + { **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext** | + **delete** | **pin** | **event_pipe** | **help** } + + *PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin** | + **load** | **attach** | **detach** | **help** } + + *LINK-COMMANDS* := { **show** | **list** | **pin** | **detach** | **help** } + + *CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** } + + *PERF-COMMANDS* := { **show** | **list** | **help** } + + *NET-COMMANDS* := { **show** | **list** | **help** } + + *FEATURE-COMMANDS* := { **probe** | **help** } + + *BTF-COMMANDS* := { **show** | **list** | **dump** | **help** } + + *GEN-COMMANDS* := { **object** | **skeleton** | **min_core_btf** | **help** } + + *STRUCT-OPS-COMMANDS* := { **show** | **list** | **dump** | **register** | **unregister** | **help** } + + *ITER-COMMANDS* := { **pin** | **help** } + +DESCRIPTION +=========== + *bpftool* allows for inspection and simple modification of BPF objects + on the system. + + Note that format of the output of all tools is not guaranteed to be + stable and should not be depended upon. + +OPTIONS +======= + .. include:: common_options.rst + + -m, --mapcompat + Allow loading maps with unknown map definitions. + + -n, --nomount + Do not automatically attempt to mount any virtual file system + (such as tracefs or BPF virtual file system) when necessary. diff --git a/third_party/bpftool/docs/common_options.rst b/third_party/bpftool/docs/common_options.rst new file mode 100644 index 0000000..30df7a7 --- /dev/null +++ b/third_party/bpftool/docs/common_options.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +-h, --help + Print short help message (similar to **bpftool help**). + +-V, --version + Print bpftool's version number (similar to **bpftool version**), the + number of the libbpf version in use, and optional features that were + included when bpftool was compiled. Optional features include linking + against LLVM or libbfd to provide the disassembler for JIT-ted + programs (**bpftool prog dump jited**) and usage of BPF skeletons + (some features like **bpftool prog profile** or showing pids + associated to BPF objects may rely on it). + +-j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + +-p, --pretty + Generate human-readable JSON output. Implies **-j**. + +-d, --debug + Print all logs available, even debug-level information. This includes + logs from libbpf as well as from the verifier, when attempting to + load programs. 
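For reference, applications that use libbpf directly can obtain the same
libbpf debug output that **-d** enables in bpftool by installing a print
callback. A minimal sketch (the callback name is arbitrary):

::

    #include <stdarg.h>
    #include <stdio.h>
    #include <bpf/libbpf.h>

    static int print_all_levels(enum libbpf_print_level level,
                                const char *format, va_list args)
    {
        /* forward everything, including LIBBPF_DEBUG messages */
        return vfprintf(stderr, format, args);
    }

    /* call once, early in main(): libbpf_set_print(print_all_levels); */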
diff --git a/third_party/bpftool/docs/substitutions.rst b/third_party/bpftool/docs/substitutions.rst new file mode 100644 index 0000000..827e3ff --- /dev/null +++ b/third_party/bpftool/docs/substitutions.rst @@ -0,0 +1,3 @@ +.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +.. |COMMON_OPTIONS| replace:: { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-d** | **--debug** } diff --git a/third_party/bpftool/include/linux/bitops.h b/third_party/bpftool/include/linux/bitops.h new file mode 100644 index 0000000..b4d3926 --- /dev/null +++ b/third_party/bpftool/include/linux/bitops.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _LINUX_BITOPS_H_ +#define _LINUX_BITOPS_H_ + +#ifndef BITS_PER_LONG +# define BITS_PER_LONG __WORDSIZE +#endif + +#define BITS_PER_BYTE 8 + +#endif diff --git a/third_party/bpftool/include/linux/build_bug.h b/third_party/bpftool/include/linux/build_bug.h new file mode 100644 index 0000000..f26967c --- /dev/null +++ b/third_party/bpftool/include/linux/build_bug.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _LINUX_BUILD_BUG_H +#define _LINUX_BUILD_BUG_H + +#include + +/* + * Force a compilation error if condition is true, but also produce a + * result (of value 0 and type int), so the expression can be used + * e.g. in a structure initializer (or where-ever else comma expressions + * aren't permitted). + */ +#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); }))) + +/** + * BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied + * error message. + * @condition: the condition which the compiler should know is false. + * + * See BUILD_BUG_ON for description. + */ +#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) + +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @condition: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * some other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + */ +#define BUILD_BUG_ON(condition) \ + BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. + */ +#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") + +#endif diff --git a/third_party/bpftool/include/linux/compiler-gcc.h b/third_party/bpftool/include/linux/compiler-gcc.h new file mode 100644 index 0000000..6524675 --- /dev/null +++ b/third_party/bpftool/include/linux/compiler-gcc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _LINUX_COMPILER_H_ +#error "Please don't include directly, include instead." +#endif + +/* + * Common definitions for all gcc versions go here. 
+ */ +#ifndef GCC_VERSION +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#endif + +#if GCC_VERSION >= 70000 && !defined(__CHECKER__) +# define __fallthrough __attribute__ ((fallthrough)) +#endif + +#if __has_attribute(__error__) +# define __compiletime_error(message) __attribute__((error(message))) +#endif + +/* &a[0] degrades to a pointer: a different type from an array */ +#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +#ifndef __noreturn +#define __noreturn __attribute__((noreturn)) +#endif +#define __printf(a, b) __attribute__((format(printf, a, b))) diff --git a/third_party/bpftool/include/linux/compiler.h b/third_party/bpftool/include/linux/compiler.h new file mode 100644 index 0000000..8a9ac85 --- /dev/null +++ b/third_party/bpftool/include/linux/compiler.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _LINUX_COMPILER_H_ +#define _LINUX_COMPILER_H_ + +#include + +#if defined(__OPTIMIZE__) && __has_attribute(__error__) +# define __compiletime_assert(condition, msg, prefix, suffix) \ + do { \ + extern void prefix ## suffix(void) __compiletime_error(msg); \ + if (!(condition)) \ + prefix ## suffix(); \ + } while (0) +#else +# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +#endif + +#define _compiletime_assert(condition, msg, prefix, suffix) \ + __compiletime_assert(condition, msg, prefix, suffix) + +/** + * compiletime_assert - break build and emit msg if condition is false + * @condition: a compile-time constant condition to check + * @msg: a message to emit if condition is false + * + * In tradition of POSIX assert, this macro will break the build if the + * supplied condition is *false*, emitting the supplied error message if the + * compiler has support to do so. + */ +#define compiletime_assert(condition, msg) \ + _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) + +/* Are two types/vars the same type (ignoring qualifiers)? */ +#ifndef __same_type +# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#endif + +#ifndef __maybe_unused +# define __maybe_unused __attribute__((unused)) +#endif + +#ifndef __weak +# define __weak __attribute__((weak)) +#endif + +#ifndef __fallthrough +# define __fallthrough +#endif + +#endif diff --git a/third_party/bpftool/include/linux/compiler_types.h b/third_party/bpftool/include/linux/compiler_types.h new file mode 100644 index 0000000..0655f12 --- /dev/null +++ b/third_party/bpftool/include/linux/compiler_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_COMPILER_TYPES_H +#define __LINUX_COMPILER_TYPES_H + +/* Compiler specific macros. 
*/ +#ifdef __GNUC__ +#include +#endif + +#endif diff --git a/third_party/bpftool/include/linux/err.h b/third_party/bpftool/include/linux/err.h new file mode 100644 index 0000000..d5b7211 --- /dev/null +++ b/third_party/bpftool/include/linux/err.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_ERR_H +#define __LINUX_ERR_H + +#include +#include + +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * ERR_PTR(long error_) +{ + return (void *) error_; +} + +static inline long PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline bool IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline bool IS_ERR_OR_NULL(const void *ptr) +{ + return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); +} + +#endif diff --git a/third_party/bpftool/include/linux/filter.h b/third_party/bpftool/include/linux/filter.h new file mode 100644 index 0000000..f256caf --- /dev/null +++ b/third_party/bpftool/include/linux/filter.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_FILTER_H +#define __LINUX_FILTER_H + +#include + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((FUNC) - BPF_FUNC_unspec) }) + +#endif diff --git a/third_party/bpftool/include/linux/kernel.h b/third_party/bpftool/include/linux/kernel.h new file mode 100644 index 0000000..48d23ea --- /dev/null +++ b/third_party/bpftool/include/linux/kernel.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_KERNEL_H +#define __LINUX_KERNEL_H + +#include + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#ifndef max +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? 
_max1 : _max2; }) +#endif + +#ifndef roundup +#define roundup(x, y) ( \ +{ \ + const typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#endif + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) + +#endif diff --git a/third_party/bpftool/include/linux/list.h b/third_party/bpftool/include/linux/list.h new file mode 100644 index 0000000..1113376 --- /dev/null +++ b/third_party/bpftool/include/linux/list.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_LIST_H +#define __LINUX_LIST_H + +#include + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) + + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. 
+ */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#define list_prev_entry(pos, member) \ + list_entry((pos)->member.prev, typeof(*(pos)), member) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) + +#endif diff --git a/third_party/bpftool/include/linux/sizes.h b/third_party/bpftool/include/linux/sizes.h new file mode 100644 index 0000000..95e51ac --- /dev/null +++ b/third_party/bpftool/include/linux/sizes.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_SIZES_H__ +#define __LINUX_SIZES_H__ + +#define SZ_32K 0x00008000 + +#endif diff --git a/third_party/bpftool/include/linux/stringify.h b/third_party/bpftool/include/linux/stringify.h new file mode 100644 index 0000000..d7cb093 --- /dev/null +++ b/third_party/bpftool/include/linux/stringify.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_STRINGIFY_H +#define __LINUX_STRINGIFY_H + +/* Indirect stringification. Doing two levels allows the parameter to be a + * macro itself. For example, compile with -DFOO=bar, __stringify(FOO) + * converts to "bar". + */ + +#define __stringify_1(x...) #x +#define __stringify(x...) 
__stringify_1(x) + +#endif diff --git a/third_party/bpftool/include/linux/types.h b/third_party/bpftool/include/linux/types.h new file mode 100644 index 0000000..f991d11 --- /dev/null +++ b/third_party/bpftool/include/linux/types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef __LINUX_TYPES_H +#define __LINUX_TYPES_H + +#include +#include +#include + +#include +#include + +typedef uint64_t u64; +typedef __u32 u32; +typedef __u8 u8; + +#define __bitwise__ +#define __bitwise __bitwise__ + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +#ifndef __aligned_u64 +# define __aligned_u64 __u64 __attribute__((aligned(8))) +#endif + +struct list_head { + struct list_head *next, *prev; +}; + +#endif diff --git a/third_party/bpftool/include/tools/dis-asm-compat.h b/third_party/bpftool/include/tools/dis-asm-compat.h new file mode 100644 index 0000000..70f331e --- /dev/null +++ b/third_party/bpftool/include/tools/dis-asm-compat.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */ +#ifndef _TOOLS_DIS_ASM_COMPAT_H +#define _TOOLS_DIS_ASM_COMPAT_H + +#include +#include + +/* define types for older binutils version, to centralize ifdef'ery a bit */ +#ifndef DISASM_INIT_STYLED +enum disassembler_style {DISASSEMBLER_STYLE_NOT_EMPTY}; +typedef int (*fprintf_styled_ftype) (void *, enum disassembler_style, const char*, ...); +#endif + +/* + * Trivial fprintf wrapper to be used as the fprintf_styled_func argument to + * init_disassemble_info_compat() when normal fprintf suffices. + */ +static inline int fprintf_styled(void *out, + enum disassembler_style style, + const char *fmt, ...) +{ + va_list args; + int r; + + (void)style; + + va_start(args, fmt); + r = vfprintf(out, fmt, args); + va_end(args); + + return r; +} + +/* + * Wrapper for init_disassemble_info() that hides version + * differences. Depending on binutils version and architecture either + * fprintf_func or fprintf_styled_func will be called. + */ +static inline void init_disassemble_info_compat(struct disassemble_info *info, + void *stream, + fprintf_ftype unstyled_func, + fprintf_styled_ftype styled_func) +{ +#ifdef DISASM_INIT_STYLED + init_disassemble_info(info, stream, + unstyled_func, + styled_func); +#else + (void)styled_func; + init_disassemble_info(info, stream, + unstyled_func); +#endif +} + +#endif /* _TOOLS_DIS_ASM_COMPAT_H */ diff --git a/third_party/bpftool/include/uapi/asm-generic/bitsperlong.h b/third_party/bpftool/include/uapi/asm-generic/bitsperlong.h new file mode 100644 index 0000000..23e6c41 --- /dev/null +++ b/third_party/bpftool/include/uapi/asm-generic/bitsperlong.h @@ -0,0 +1,15 @@ +#ifndef _UAPI__ASM_GENERIC_BITS_PER_LONG +#define _UAPI__ASM_GENERIC_BITS_PER_LONG + +/* + * There seems to be no way of detecting this automatically from user + * space, so 64 bit architectures should override this in their + * bitsperlong.h. In particular, an architecture that supports + * both 32 and 64 bit user space must not rely on CONFIG_64BIT + * to decide it, but rather check a compiler provided macro. 
+ */ +#ifndef __BITS_PER_LONG +#define __BITS_PER_LONG 32 +#endif + +#endif /* _UAPI__ASM_GENERIC_BITS_PER_LONG */ diff --git a/third_party/bpftool/include/uapi/linux/bpf.h b/third_party/bpftool/include/uapi/linux/bpf.h new file mode 100644 index 0000000..60a9d59 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/bpf.h @@ -0,0 +1,7198 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +/* jmp encodings */ +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ +}; + +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. 
*/ +}; + +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; +}; + +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. 
+ * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. 
+ * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map + * types, the *flags* argument needs to be set to 0, but for other + * map types, it may be specified as: + * + * **BPF_F_LOCK** + * Look up and delete the value of a spin-locked map + * without returning the lock. This must be specified if + * the elements contain a spinlock. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * * **BPF_MAP_TYPE_HASH** + * * **BPF_MAP_TYPE_PERCPU_HASH** + * * **BPF_MAP_TYPE_LRU_HASH** + * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. 
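Editor's note for tutorial readers (not part of the vendored header): the *_GET_NEXT_ID, *_GET_FD_BY_ID and BPF_OBJ_GET_INFO_BY_FD commands documented above are normally combined to enumerate loaded objects. A minimal sketch using libbpf's thin syscall wrappers might look like this:

/* Sketch: walk all loaded BPF programs via the ID-iteration pattern. */
#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 id = 0;

	/* bpf_prog_get_next_id() returns non-zero (ENOENT) once no
	 * program with a higher ID exists. */
	while (!bpf_prog_get_next_id(id, &id)) {
		struct bpf_prog_info info = {};
		__u32 info_len = sizeof(info);
		int fd;

		fd = bpf_prog_get_fd_by_id(id);
		if (fd < 0)
			continue;	/* program may have been unloaded meanwhile */

		if (!bpf_obj_get_info_by_fd(fd, &info, &info_len))
			printf("prog id %u type %u name %s\n",
			       info.id, info.type, info.name);
		close(fd);
	}
	return 0;
}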
+ * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). 
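Editor's note for tutorial readers (not part of the vendored header): to make the command descriptions above concrete, the following self-contained sketch drives BPF_MAP_CREATE, BPF_MAP_UPDATE_ELEM and BPF_MAP_LOOKUP_ELEM directly through the bpf(2) syscall with union bpf_attr. In practice libbpf's bpf_map_create()/bpf_map_update_elem() wrappers are preferred; this is only to illustrate the raw interface.

/* Sketch: create a small hash map, insert one element, read it back. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 1, value = 42, out = 0;
	int map_fd;

	/* BPF_MAP_CREATE: map_type/key_size/value_size/max_entries as documented above. */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM with BPF_ANY: create or update the element. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr))) {
		perror("BPF_MAP_UPDATE_ELEM");
		return 1;
	}

	/* BPF_MAP_LOOKUP_ELEM: on success the value is copied into 'out'. */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr))) {
		perror("BPF_MAP_LOOKUP_ELEM");
		return 1;
	}

	printf("key %u -> value %u\n", key, out);
	close(map_fd);
	return 0;
}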
+ */ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, + BPF_PROG_RUN = BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, +}; + +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. 
+ */ +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, + BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + BPF_STRUCT_OPS, + BPF_NETFILTER, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, + + MAX_BPF_LINK_TYPE, +}; + +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. 
+ * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. 
+ * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX] + * insn[0].imm: map fd or fd_idx + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP + */ +#define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_IDX 5 + +/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE + * insn[0].imm: map fd or fd_idx + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ +#define BPF_PSEUDO_MAP_VALUE 2 +#define BPF_PSEUDO_MAP_IDX_VALUE 6 + +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 + +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 + +/* flags for BPF_MAP_UPDATE_ELEM command */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; + +/* flags for BPF_MAP_CREATE command */ +enum { + BPF_F_NO_PREALLOC = (1U << 0), +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ + BPF_F_NO_COMMON_LRU = (1U << 1), +/* Specify numa node during map creation */ + BPF_F_NUMA_NODE = (1U << 2), + +/* Flags for accessing BPF object from syscall side. */ + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), + +/* Flag for stack_map, store build_id+offset instead of pointer */ + BPF_F_STACK_BUILD_ID = (1U << 5), + +/* Zero-initialize hash function seed. This should only be used for testing. */ + BPF_F_ZERO_SEED = (1U << 6), + +/* Flags for accessing BPF object from program side. 
*/ + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), + +/* Clone map from listener for newly accepted socket */ + BPF_F_CLONE = (1U << 9), + +/* Enable memory-mapping BPF map */ + BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), +}; + +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are always returned 0. + */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) + +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + +#define BPF_OBJ_NAME_LEN 16U + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). 
+ */ + __u64 map_extra; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. 
+ */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; + }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. 
*/ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; + __u32 flags; /* extra flags */ + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; + } link_update; + + struct { + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + +} __attribute__((aligned(8))); + +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. 
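+ *
+ * A minimal sketch (editor's illustration, not part of the original
+ * header) of the recommended replacement, **bpf_probe_read_kernel**\ (),
+ * copying the current task's **comm** out of kernel memory. It assumes
+ * **struct task_struct** is available, e.g. from *vmlinux.h*:
+ *
+ * ::
+ *
+ * char comm[16] = {};
+ * struct task_struct *task;
+ *
+ * task = (struct task_struct *)bpf_get_current_task();
+ * bpf_probe_read_kernel(comm, sizeof(comm), &task->comm);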
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * Description
+ * Return the time elapsed since system boot, in nanoseconds.
+ * Does not include time the system was suspended.
+ * See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
+ * Return
+ * Current *ktime*.
+ *
+ * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * Description
+ * This helper is a "printk()-like" facility for debugging. It
+ * prints a message defined by format *fmt* (of size *fmt_size*)
+ * to file *\/sys/kernel/tracing/trace* from TraceFS, if
+ * available. It can take up to three additional **u64**
+ * arguments (as for eBPF helpers, the total number of arguments is
+ * limited to five).
+ *
+ * Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * open; use *\/sys/kernel/tracing/trace_pipe* to avoid this.
+ * The format of the trace is customizable, and the exact output
+ * one will get depends on the options set in
+ * *\/sys/kernel/tracing/trace_options* (see also the
+ * *README* file under the same directory). However, it usually
+ * defaults to something like:
+ *
+ * ::
+ *
+ * telnet-470 [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * In the above:
+ *
+ * * ``telnet`` is the name of the current task.
+ * * ``470`` is the PID of the current task.
+ * * ``001`` is the CPU number on which the task is
+ * running.
+ * * In ``.N..``, each character refers to a set of
+ * options (whether irqs are enabled, scheduling
+ * options, whether hard/softirqs are running, level of
+ * preempt_disabled respectively). **N** means that
+ * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * are set.
+ * * ``419421.045894`` is a timestamp.
+ * * ``0x00000001`` is a fake value used by BPF for the
+ * instruction pointer register.
+ * * ``<formatted msg>`` is the message formatted with
+ * *fmt*.
+ *
+ * The conversion specifiers supported by *fmt* are similar, but
+ * more limited than for printk(). They are **%d**, **%i**,
+ * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * of field, padding with zeroes, etc.) is available, and the
+ * helper will return **-EINVAL** (but print nothing) if it
+ * encounters an unknown specifier.
+ *
+ * Also, note that **bpf_trace_printk**\ () is slow, and should
+ * only be used for debugging purposes. For this reason, a notice
+ * block (spanning several lines) is printed to kernel logs and
+ * states that the helper should not be used "for production use"
+ * the first time this helper is used (or more precisely, when
+ * **trace_printk**\ () buffers are allocated). For passing values
+ * to user space, perf events should be preferred.
+ * Return
+ * The number of bytes written to the buffer, or a negative error
+ * in case of failure.
+ *
+ * u32 bpf_get_prandom_u32(void)
+ * Description
+ * Get a pseudo-random number.
+ *
+ * From a security point of view, this helper uses its own
+ * pseudo-random internal state, and cannot be used to infer the
+ * seed of other random functions in the kernel. However, it is
+ * essential to note that the generator used by the helper is not
+ * cryptographically secure.
+ * Return
+ * A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * Description
+ * Get the SMP (symmetric multiprocessing) processor id. 
Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. 
+ * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. 
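+ *
+ * For instance (editor's sketch, not part of the original header), the
+ * two halves are commonly split like this:
+ *
+ * ::
+ *
+ * __u64 id = bpf_get_current_pid_tgid();
+ * __u32 tgid = id >> 32; /* process (thread group) id */
+ * __u32 pid = (__u32)id; /* thread id */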
+ * Return
+ * A 64-bit integer containing the current tgid and pid, and
+ * created as such:
+ * *current_task*\ **->tgid << 32 \|**
+ * *current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * Description
+ * Get the current uid and gid.
+ * Return
+ * A 64-bit integer containing the current GID and UID, and
+ * created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * long bpf_get_current_comm(void *buf, u32 size_of_buf)
+ * Description
+ * Copy the **comm** attribute of the current task into *buf* of
+ * *size_of_buf*. The **comm** attribute contains the name of
+ * the executable (excluding the path) for the current task. The
+ * *size_of_buf* must be strictly positive. On success, the
+ * helper makes sure that the *buf* is NUL-terminated. On failure,
+ * it is filled with zeroes.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * Description
+ * Retrieve the classid for the current task, i.e. for the net_cls
+ * cgroup to which *skb* belongs.
+ *
+ * This helper can be used on TC egress path, but not on ingress.
+ *
+ * The net_cls cgroup provides an interface to tag network packets
+ * based on a user-provided identifier for all traffic coming from
+ * the tasks belonging to the related cgroup. See also the related
+ * kernel documentation, available from the Linux sources in file
+ * *Documentation/admin-guide/cgroup-v1/net_cls.rst*.
+ *
+ * The Linux kernel has two versions for cgroups: there are
+ * cgroups v1 and cgroups v2. Both are available to users, who can
+ * use a mixture of them, but note that the net_cls cgroup is for
+ * cgroup v1 only. This makes it incompatible with BPF programs
+ * run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * only hold data for one version of cgroups at a time).
+ *
+ * This helper is only available if the kernel was compiled with
+ * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * "**y**" or to "**m**".
+ * Return
+ * The classid, or 0 for the default unconfigured classid.
+ *
+ * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * Description
+ * Push a *vlan_tci* (VLAN tag control information) of protocol
+ * *vlan_proto* to the packet associated to *skb*, then update
+ * the checksum. Note that if *vlan_proto* is different from
+ * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * be **ETH_P_8021Q**.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_vlan_pop(struct sk_buff *skb)
+ * Description
+ * Pop a VLAN header from the packet associated to *skb*.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * Description
+ * Get tunnel metadata. 
This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. 
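+ *
+ * As a concrete variant of the typical usage sketch above (editor's
+ * illustration, not part of the original header), the "populate key"
+ * step for a VXLAN-style device could look like:
+ *
+ * ::
+ *
+ * struct bpf_tunnel_key key = {};
+ *
+ * key.remote_ipv4 = 0x0a000001; /* 10.0.0.1 */
+ * key.tunnel_id = 42; /* VNI */
+ * key.tunnel_ttl = 64;
+ * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
+ * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);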
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * Description
+ * Read the value of a perf event counter. This helper relies on a
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * the perf event counter is selected when *map* is updated with
+ * perf event file descriptors. The *map* is an array whose size
+ * is the number of available CPUs, and each cell contains a value
+ * relative to one CPU. The value to retrieve is indicated by
+ * *flags*, which contains the index of the CPU to look up, masked
+ * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * **BPF_F_CURRENT_CPU** to indicate that the value for the
+ * current CPU should be retrieved.
+ *
+ * Note that before Linux 4.13, only hardware perf events can be
+ * retrieved.
+ *
+ * Also, be aware that the newer helper
+ * **bpf_perf_event_read_value**\ () is recommended over
+ * **bpf_perf_event_read**\ () in general. The latter has some ABI
+ * quirks where error and counter value are used as a return code
+ * (which is wrong to do since ranges may overlap). This issue is
+ * fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * time provides more features over the **bpf_perf_event_read**\
+ * () interface. Please refer to the description of
+ * **bpf_perf_event_read_value**\ () for details.
+ * Return
+ * The value of the perf event counter read from the map, or a
+ * negative error code in case of failure.
+ *
+ * long bpf_redirect(u32 ifindex, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*.
+ * This helper is somewhat similar to **bpf_clone_redirect**\
+ * (), except that the packet is not cloned, which provides
+ * increased performance.
+ *
+ * Except for XDP, both ingress and egress interfaces can be used
+ * for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * to make the distinction (ingress path is selected if the flag
+ * is present, egress path otherwise). Currently, XDP only
+ * supports redirection to the egress interface, and accepts no
+ * flag at all.
+ *
+ * The same effect can also be attained with the more generic
+ * **bpf_redirect_map**\ (), which uses a BPF map to store the
+ * redirect target instead of providing it directly to the helper.
+ * Return
+ * For XDP, the helper returns **XDP_REDIRECT** on success or
+ * **XDP_ABORTED** on error. For other program types, the values
+ * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * Description
+ * Retrieve the realm of the route, that is to say the
+ * **tclassid** field of the destination for the *skb*. The
+ * identifier retrieved is a user-provided tag, similar to the
+ * one used with the net_cls cgroup (see description for
+ * **bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * held by a route (a destination entry), not by a task.
+ *
+ * Retrieving this identifier works with the clsact TC egress hook
+ * (see also **tc-bpf(8)**), or alternatively on conventional
+ * classful egress qdiscs, but not on TC ingress path. In case of
+ * clsact TC egress hook, this has the advantage that, internally,
+ * the destination entry has not been dropped yet in the transmit
+ * path. Therefore, the destination entry does not need to be
+ * artificially held via **netif_keep_dst**\ () for a classful
+ * qdisc until the *skb* is freed. 
+ * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. 
The next bits can be used to set
+ * a combination of the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_FAST_STACK_CMP**
+ * Compare stacks by hash only.
+ * **BPF_F_REUSE_STACKID**
+ * If two different stacks hash into the same *stackid*,
+ * discard the old one.
+ *
+ * The stack id retrieved is a 32 bit long integer handle which
+ * can be further combined with other data (including other stack
+ * ids) and used as a key into maps. This can be useful for
+ * generating a variety of graphs (such as flame graphs or off-cpu
+ * graphs).
+ *
+ * For walking a stack, this helper is an improvement over
+ * **bpf_probe_read**\ (), which can be used with unrolled loops
+ * but is not efficient and consumes a lot of eBPF instructions.
+ * Instead, **bpf_get_stackid**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * The positive or null stack id on success, or a negative error
+ * in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * Description
+ * Compute a checksum difference, from the raw buffer pointed by
+ * *from*, of length *from_size* (that must be a multiple of 4),
+ * towards the raw buffer pointed by *to*, of size *to_size*
+ * (same remark). An optional *seed* can be added to the value
+ * (this can be cascaded, the seed may come from a previous call
+ * to the helper).
+ *
+ * This is flexible enough to be used in several ways:
+ *
+ * * With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * checksum, it can be used when pushing new data.
+ * * With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * checksum, it can be used when removing data from a packet.
+ * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * can be used to compute a diff. Note that *from_size* and
+ * *to_size* do not need to be equal.
+ *
+ * This helper can be used in combination with
+ * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * which one can feed in the difference computed with
+ * **bpf_csum_diff**\ ().
+ * Return
+ * The checksum result, or a negative error code in case of
+ * failure.
+ *
+ * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
+ * Description
+ * Retrieve tunnel options metadata for the packet associated to
+ * *skb*, and store the raw tunnel option data to the buffer *opt*
+ * of *size*.
+ *
+ * This helper can be used with encapsulation devices that can
+ * operate in "collect metadata" mode (please refer to the related
+ * note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * more details). A particular example where this can be used is
+ * in combination with the Geneve encapsulation protocol, where it
+ * allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper)
+ * and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * the eBPF program. This allows for full customization of these
+ * headers.
+ * Return
+ * The size of the option data retrieved. 
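+ *
+ * A short usage sketch for this helper (editor's illustration, not
+ * part of the original header), reading raw option data on TC ingress
+ * from a device in "collect metadata" mode:
+ *
+ * ::
+ *
+ * __u8 opt[64];
+ * int len;
+ *
+ * len = bpf_skb_get_tunnel_opt(skb, opt, sizeof(opt));
+ * if (len < 0)
+ * return TC_ACT_OK; /* no tunnel options present */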
+ * + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. 
+ *
+ * Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * protocol with **bpf_skb_change_proto**\ (), or calling
+ * **bpf_skb_store_bytes**\ () with the
+ * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * the hash and to trigger a new computation for the next call to
+ * **bpf_get_hash_recalc**\ ().
+ * Return
+ * The 32-bit hash.
+ *
+ * u64 bpf_get_current_task(void)
+ * Description
+ * Get the current task.
+ * Return
+ * A pointer to the current task struct.
+ *
+ * long bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * Description
+ * Attempt in a safe way to write *len* bytes from the buffer
+ * *src* to *dst* in memory. It only works for threads that are in
+ * user context, and *dst* must be a valid user space address.
+ *
+ * This helper should not be used to implement any kind of
+ * security mechanism because of TOC-TOU attacks, but rather to
+ * debug, divert, and manipulate execution of semi-cooperative
+ * processes.
+ *
+ * Keep in mind that this feature is meant for experiments, and it
+ * has a risk of crashing the system and running programs.
+ * Therefore, when an eBPF program using this helper is attached,
+ * a warning including PID and process name is printed to kernel
+ * logs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * Description
+ * Check whether the probe is being run in the context of a given
+ * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return
+ * The return value depends on the result of the test, and can be:
+ *
+ * * 1, if current task belongs to the cgroup2.
+ * * 0, if current task does not belong to the cgroup2.
+ * * A negative error code, if an error occurred.
+ *
+ * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * Description
+ * Resize (trim or grow) the packet associated to *skb* to the
+ * new *len*. The *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * The basic idea is that the helper performs the needed work to
+ * change the size of the packet, then the eBPF program rewrites
+ * the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * and others. This helper is a slow path utility intended for
+ * replies with control messages. Because it is targeted for the
+ * slow path, the helper itself can afford to be slow: it
+ * implicitly linearizes, unclones and drops offloads from the
+ * *skb*.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * Description
+ * Pull in non-linear data in case the *skb* is non-linear and not
+ * all of *len* are part of the linear section. Make *len* bytes
+ * from *skb* readable and writable. If a zero value is passed for
+ * *len*, then all bytes in the linear part of *skb* will be made
+ * readable and writable.
+ *
+ * This helper is only needed for reading and writing with direct
+ * packet access. 
+ * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. + * + * long bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. 
Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * Description
+ * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * it is possible to use a negative value for *delta*. This helper
+ * can be used to prepare the packet for pushing or popping
+ * headers.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe kernel address
+ * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
+ * more details.
+ *
+ * Generally, use **bpf_probe_read_user_str**\ () or
+ * **bpf_probe_read_kernel_str**\ () instead.
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * Description
+ * If the **struct sk_buff** pointed by *skb* has a known socket,
+ * retrieve the cookie (generated by the kernel) of this socket.
+ * If no cookie has been set yet, generate a new cookie. Once
+ * generated, the socket cookie remains stable for the life of the
+ * socket. This helper can be useful for monitoring per socket
+ * networking traffic statistics as it provides a global socket
+ * identifier that can be assumed unique.
+ * Return
+ * An 8-byte long unique number on success, or 0 if the socket
+ * field is missing inside *skb*.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
+ * Description
+ * Equivalent to bpf_get_socket_cookie() helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock_addr** context.
+ * Return
+ * An 8-byte long unique number.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock_ops** context.
+ * Return
+ * An 8-byte long unique number.
+ *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *sk*, but gets socket from a BTF **struct sock**. This helper
+ * also works for sleepable programs.
+ * Return
+ * An 8-byte long unique number or 0 if *sk* is NULL.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * Description
+ * Get the owner UID of the socket associated to *skb*.
+ * Return
+ * The owner UID of the socket associated to *skb*. If the socket
+ * is **NULL**, or if it is not a full socket (i.e. if it is a
+ * time-wait or a request socket instead), **overflowuid** value
+ * is returned (note that **overflowuid** might also be the actual
+ * UID value for the socket).
+ *
+ * long bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * Description
+ * Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * to value *hash*. 
+ * Return + * 0 + * + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. 
+ * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * Return + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + * + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. 
+ * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. 
To achieve a scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *optval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_***, which is **bpf_setsockopt**\ () only, and + * **TCP_SAVED_SYN**, which is **bpf_getsockopt**\ () only. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established.
This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. 
+ * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * Return + * 0 on success, or a negative error in case of failure. 
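To make the **bpf_bind**\ () entry above concrete: the following is a minimal sketch, not part of the vendored header or of this patch, of a `cgroup/connect4` program that pins the source address of outgoing IPv4 connections. It assumes libbpf's `bpf_helpers.h`/`bpf_endian.h` plus a generated `vmlinux.h`; the program name and the 10.0.0.1 address are purely illustrative.

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("cgroup/connect4")
int bind_src_addr(struct bpf_sock_addr *ctx)
{
	/* Illustrative source address; sin_port = 0 lets the kernel pick a
	 * free port (the IP_BIND_ADDRESS_NO_PORT-like behavior noted above). */
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = 0,
		.sin_addr.s_addr = bpf_htonl(0x0a000001), /* 10.0.0.1 */
	};

	/* Ignore the return value: on failure the connect() simply falls
	 * back to the default source address selection. */
	bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
	return 1; /* allow the connect() to proceed */
}
```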
+ * + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. 
The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. 
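To illustrate **bpf_sock_hash_update**\ () above (again, not part of the vendored header): a minimal `sockops` sketch that inserts newly established TCP sockets into a **BPF_MAP_TYPE_SOCKHASH**. Keying by local port and the map sizing are assumptions made only for this example.

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

struct {
	__uint(type, BPF_MAP_TYPE_SOCKHASH);
	__uint(max_entries, 1024);
	__type(key, __u32);   /* local port (illustrative key choice) */
	__type(value, __u64); /* socket slot */
} sock_hash SEC(".maps");

SEC("sockops")
int track_established(struct bpf_sock_ops *skops)
{
	__u32 key = skops->local_port;

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* BPF_ANY: insert or update, no existence precondition. */
		bpf_sock_hash_update(skops, &sock_hash, &key, BPF_ANY);
		break;
	}
	return 0;
}
```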
+ * + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. 
+ * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). 
+ * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + * + * void *bpf_get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * Return + * A pointer to the local storage area. + * + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
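As a small usage illustration of the cgroup-id helpers above (not part of this patch): a tracing sketch that tags events with **bpf_get_current_cgroup_id**\ (). The tracepoint and the log message are arbitrary examples.

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("tracepoint/syscalls/sys_enter_openat")
int trace_openat(void *ctx)
{
	/* 64-bit cgroup v2 id of the currently running task. */
	__u64 cgid = bpf_get_current_cgroup_id();

	bpf_printk("openat() from cgroup id %llu", cgid);
	return 0;
}
```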
+ * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_sk_release(void *sock) + * Description + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. 
+ * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * long bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. 
+ * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + * + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into the + * program-provided buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of characters copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into the + * program-provided buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of characters copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * the truncated value in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into the + * program-provided buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of characters copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * the truncated value in this case). + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value, file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read.
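To illustrate the sysctl helpers above (not part of the vendored header): a minimal `cgroup/sysctl` sketch that logs which sysctl is being accessed via **bpf_sysctl_get_name**\ (). The buffer size and the allow-everything policy are assumptions for the example.

```c
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("cgroup/sysctl")
int sysctl_logger(struct bpf_sysctl *ctx)
{
	char name[64];

	/* BPF_F_SYSCTL_BASE_NAME copies e.g. "tcp_mem" rather than
	 * the full "net/ipv4/tcp_mem" path. */
	if (bpf_sysctl_get_name(ctx, name, sizeof(name), BPF_F_SYSCTL_BASE_NAME) < 0)
		return 1; /* on error, let the access through */

	bpf_printk("sysctl %s (write=%d)", name, ctx->write);
	return 1; /* 1 = allow, 0 = reject the access */
}
```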
+ * + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + * + * long bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the process of the current task. 
+ * The signal may be delivered to any of this process's threads. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. 
This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + * + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies + * + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. 
+ * Return + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. 
+ * Return + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * Return + * Current *ktime*. + * + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. 
Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(void *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * Return + * 0 on success, or a negative error in case of failure. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. 
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. 
+ * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. 
+ * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. 
the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. 
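A minimal usage sketch for **bpf_d_path**\ () described above (an editorial illustration, not part of the upstream header): it resolves the path of a file as it is closed from an fentry program. It assumes filp_close is on the kernel's d_path allowlist and that vmlinux.h plus the libbpf helper headers are available, as elsewhere in this repository: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char LICENSE[] SEC("license") = "GPL";	/* bpf_d_path is a GPL-only helper */

	/* Illustrative sketch: print the full path of every file being closed.
	 * Assumes filp_close is covered by the kernel's bpf_d_path allowlist. */
	SEC("fentry/filp_close")
	int BPF_PROG(trace_close, struct file *filp)
	{
		char path[256];
		long len = bpf_d_path(&filp->f_path, path, sizeof(path));

		if (len > 0)
			bpf_printk("closed: %s", path);
		return 0;
	}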
+ * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
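To make the percpu-ksym semantics above concrete, here is a hedged sketch (not from the upstream header) that reads the kernel's percpu runqueues variable both for the current CPU and for CPU 0. It assumes the running kernel exposes struct rq and runqueues in its BTF and that these helpers are usable from a raw tracepoint program: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "GPL";

	/* Assumed percpu ksym: 'runqueues' is the kernel's percpu struct rq,
	 * resolved through BTF at load time. */
	extern const struct rq runqueues __ksym;

	SEC("raw_tp/sys_enter")	/* illustrative attach point */
	int show_nr_running(void *ctx)
	{
		u32 cpu = bpf_get_smp_processor_id();
		struct rq *rq;

		/* Never NULL: pointer into this CPU's copy of the variable. */
		rq = bpf_this_cpu_ptr(&runqueues);
		bpf_printk("cpu%u nr_running=%u", cpu, rq->nr_running);

		/* May be NULL if the requested CPU id is out of range. */
		rq = bpf_per_cpu_ptr(&runqueues, 0);
		if (rq)
			bpf_printk("cpu0 nr_running=%u", rq->nr_running);
		return 0;
	}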
+ * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. 
Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. 
+ * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. + * + * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) + * Description + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * Return + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + * + * long bpf_sys_close(u32 fd) + * Description + * Execute close syscall for given FD. + * Return + * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. 
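A hedged sketch of the **bpf_for_each_map_elem**\ () pattern described above (the map name, callback, and attach point are illustrative): a static callback sums the values of a hash map into a context struct that lives on the caller's stack: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "Dual BSD/GPL";

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 1024);
		__type(key, u32);
		__type(value, u64);
	} counters SEC(".maps");	/* illustrative map */

	struct sum_ctx {
		u64 total;
	};

	/* Invoked once per element; returning 0 continues the walk,
	 * returning 1 would stop it early. */
	static long sum_values(struct bpf_map *map, const void *key, void *value, void *ctx)
	{
		struct sum_ctx *c = ctx;

		c->total += *(u64 *)value;
		return 0;
	}

	SEC("tp/syscalls/sys_enter_getpid")	/* illustrative attach point */
	int report_total(void *ctx)
	{
		struct sum_ctx c = {};
		long n = bpf_for_each_map_elem(&counters, sum_values, &c, 0);

		bpf_printk("visited %ld elements, total=%llu", n, c.total);
		return 0;
	}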
+ * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. 
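Putting the timer helpers together, a hedged end-to-end sketch (not from the upstream header; map and function names are illustrative, and it assumes the timer helpers are available to the chosen program type): the timer lives inside an array map value, is given a clock and a callback, and is armed for one second: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	#define CLOCK_MONOTONIC 1	/* UAPI clockid; not provided by vmlinux.h */

	char LICENSE[] SEC("license") = "GPL";

	struct timer_elem {
		struct bpf_timer t;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, struct timer_elem);
	} timers SEC(".maps");	/* illustrative map */

	/* Runs in soft-irq context when the timer expires. */
	static int timer_cb(void *map, int *key, struct timer_elem *val)
	{
		bpf_printk("timer fired");
		return 0;
	}

	SEC("tp/syscalls/sys_enter_getpid")	/* illustrative attach point */
	int arm_timer(void *ctx)
	{
		int key = 0;
		struct timer_elem *val = bpf_map_lookup_elem(&timers, &key);

		if (!val)
			return 0;
		bpf_timer_init(&val->t, &timers, CLOCK_MONOTONIC);
		bpf_timer_set_callback(&val->t, timer_cb);
		bpf_timer_start(&val->t, 1000000000ULL /* 1 s */, 0);
		return 0;
	}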
+ * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + * + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * Return + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. 
If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * Return + * The number of argument registers of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * Return + * The BPF program's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. 
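A hedged sketch of **bpf_loop**\ () as described above (names and attach point are illustrative): the callback runs once per iteration with a stack-resident context, and returning 1 from it would terminate the loop early: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "Dual BSD/GPL";

	struct loop_ctx {
		u64 sum;
	};

	/* Called once per iteration; 0 continues, 1 breaks out of the loop. */
	static long add_index(u32 index, void *ctx)
	{
		struct loop_ctx *c = ctx;

		c->sum += index;
		return 0;
	}

	SEC("tp/syscalls/sys_enter_getpid")	/* illustrative attach point */
	int sum_first_hundred(void *ctx)
	{
		struct loop_ctx c = {};
		long iters = bpf_loop(100, add_index, &c, 0);

		bpf_printk("iterations=%ld sum=%llu", iters, c.sum);
		return 0;
	}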
+ * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + * + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) + * Description + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. 
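A hedged sketch contrasting **bpf_map_lookup_percpu_elem**\ () with a plain lookup on a percpu array (map and program names are illustrative, and it assumes the helper is available to the chosen program type): the plain lookup touches the current CPU's slot, while the percpu variant inspects a specific CPU's slot: ::

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "Dual BSD/GPL";

	struct {
		__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
		__uint(max_entries, 1);
		__type(key, u32);
		__type(value, u64);
	} percpu_hits SEC(".maps");	/* illustrative map */

	SEC("tp/syscalls/sys_enter_getpid")	/* illustrative attach point */
	int peek_cpu0(void *ctx)
	{
		u32 key = 0;
		u64 *cur, *cpu0;

		/* Plain lookup returns this CPU's slot of the percpu value... */
		cur = bpf_map_lookup_elem(&percpu_hits, &key);
		if (cur)
			*cur += 1;

		/* ...while the percpu variant can read any CPU's slot, here CPU 0. */
		cpu0 = bpf_map_lookup_percpu_elem(&percpu_hits, &key, 0);
		if (cpu0)
			bpf_printk("cpu0 hits=%llu", *cpu0);
		return 0;
	}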
+ * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + * + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + * + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). 
+ * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. 
If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +#define ___BPF_FUNC_MAPPER(FN, ctx...) 
\ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, ##ctx) \ + FN(sysctl_get_current_value, 102, 
##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, 
##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ + /* */ + +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, +enum bpf_func_id { + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; + +/* BPF_FUNC_l4_csum_replace flags. */ +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +enum { + BPF_F_INGRESS = (1ULL << 0), +}; + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; + +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), +/* flags used by BPF_FUNC_get_stackid only. */ + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), +/* flags used by BPF_FUNC_get_stack only. */ + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), +}; + +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, +/* BPF_FUNC_perf_event_output for sk_buff input context. */ + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; + +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + +/* BPF_FUNC_skb_adjust_room flags. 
*/ +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), +}; + +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; + +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; + +/* BPF_FUNC__storage_get flags */ +enum { + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; + +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + +/* Mode for BPF_FUNC_skb_adjust_room helper. */ +enum bpf_adj_room_mode { + BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, +}; + +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, +}; + +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + +enum { + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ +}; + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; + + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... 
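The BPF_LOCAL_STORAGE_GET_F_CREATE flag defined above is typically used as in the sketch below (map name and hook are assumptions, and calling bpf_sk_storage_get() from a cgroup/sock_create program assumes a reasonably recent kernel).

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, __u64);
} sk_created_ns SEC(".maps");

SEC("cgroup/sock_create")
int record_creation(struct bpf_sock *sk)
{
	__u64 *ts;

	/* Create the per-socket slot on first access. */
	ts = bpf_sk_storage_get(&sk_created_ns, sk, 0,
				BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (ts)
		*ts = bpf_ktime_get_ns();
	return 1; /* allow the socket */
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";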
*/ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; + __u8 tstamp_type; + __u32 :24; /* Padding, future use. */ + __u64 hwtstamp; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; + __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; +}; + +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + __u16 ext; /* Padding, future use. */ + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; + __u32 mark; + __u32 priority; + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; + __s32 rx_queue_mapping; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. 
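Tying the __sk_buff mirror and the bpf_ret_code values above together, a trivial classifier could look like the sketch below; the printed fields are illustrative only.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int inspect_skb(struct __sk_buff *skb)
{
	/* Read a few context fields; BPF_OK (== TC_ACT_OK) lets the
	 * packet continue through the stack. */
	bpf_printk("ifindex=%u len=%u proto=0x%x",
		   skb->ifindex, skb->len, bpf_ntohs(skb->protocol));
	return BPF_OK;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";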
*/ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +}; + +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + +struct bpf_xdp_sock { + __u32 queue_id; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, + XDP_REDIRECT, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; + __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ +}; + +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
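A minimal XDP sketch using struct xdp_md and enum xdp_action as defined above (illustrative; it only validates the Ethernet header before touching packet data).

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_observe(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	/* The verifier requires an explicit bounds check before any
	 * access to packet bytes. */
	if ((void *)(eth + 1) > data_end)
		return XDP_DROP;

	bpf_printk("xdp: ifindex=%u frame_len=%ld",
		   ctx->ingress_ifindex, (long)(data_end - data));
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";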
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +enum sk_action { + SK_DROP = 0, + SK_PASS, +}; + +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ +}; + +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. 
+ */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); +}; + +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ + __u64 netns_dev; + __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; + __u64 recursion_misses; + __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 btf_vmlinux_value_type_id; + __u64 netns_dev; + __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 :32; /* alignment pad */ + __u64 map_extra; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; +} __attribute__((aligned(8))); + +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ + union { + struct { + __u32 map_id; + } map; + }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; + }; + } iter; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; + struct { + __u32 ifindex; + } xdp; + struct { + __u32 map_id; + } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; +} __attribute__((aligned(8))); + +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
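struct bpf_prog_info above is what user space receives from the BPF_OBJ_GET_INFO_BY_FD command; a libbpf-based sketch follows, where prog_fd is assumed to reference an already-loaded program.

#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

static void print_prog_info(int prog_fd)
{
	struct bpf_prog_info info;
	__u32 len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (bpf_obj_get_info_by_fd(prog_fd, &info, &len))
		return;
	printf("id=%u name=%s verified_insns=%u run_cnt=%llu\n",
	       info.id, info.name, info.verified_insns,
	       (unsigned long long)info.run_cnt);
}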
+ * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __bpf_md_ptr(struct bpf_sock *, sk); +}; + +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ + __u64 skb_hwtstamp; +}; + +/* Definitions for bpf_sock_ops_cb_flags */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. 
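As the field comments above indicate, user_ip4 and user_port are writable from a cgroup connect4 program; the sketch below redirects matching connections (the address and ports are illustrative).

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/connect4")
int redirect_connect4(struct bpf_sock_addr *ctx)
{
	/* Send connect()s aimed at port 80 to 127.0.0.1:8080 instead. */
	if (ctx->user_port == bpf_htons(80)) {
		ctx->user_ip4 = bpf_htonl(0x7f000001);
		ctx->user_port = bpf_htons(8080);
	}
	return 1; /* 1 = let the syscall proceed */
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";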
The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 
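A sockops sketch that uses bpf_sock_ops_cb_flags_set() to opt into the per-connection callbacks described above; the chosen flags and the printk are illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("sockops")
int tune_callbacks(struct bpf_sock_ops *skops)
{
	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* From now on, call back on every RTT sample and on
		 * retransmissions for this connection. */
		bpf_sock_ops_cb_flags_set(skops,
					  BPF_SOCK_OPS_RTT_CB_FLAG |
					  BPF_SOCK_OPS_RETRANS_CB_FLAG);
		break;
	case BPF_SOCK_OPS_RTT_CB:
		bpf_printk("srtt_us=%u", skops->srtt_us);
		break;
	}
	return 1;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";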
+ * Arg1: old_state + * Arg2: new_state + */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ +}; + +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. 
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ +}; + +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; + +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; + +struct bpf_cgroup_dev_ctx { + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; + __u32 major; + __u32 minor; +}; + +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), +}; + +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + +struct bpf_fib_lookup { + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + + /* output: MTU value */ + __u16 mtu_result; + }; + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ + + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; + }; + + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route + */ + union { + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. 
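struct bpf_cgroup_dev_ctx above packs the access type as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_*; the device-cgroup sketch below unpacks it (the policy itself is illustrative).

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("cgroup/dev")
int devcg_filter(struct bpf_cgroup_dev_ctx *ctx)
{
	__u32 dev_type = ctx->access_type & 0xFFFF; /* BPF_DEVCG_DEV_* */
	__u32 access   = ctx->access_type >> 16;    /* BPF_DEVCG_ACC_* */

	/* Example policy: forbid mknod of character devices. */
	if ((dev_type & BPF_DEVCG_DEV_CHAR) && (access & BPF_DEVCG_ACC_MKNOD))
		return 0; /* deny */
	return 1;         /* allow */
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";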
+ */ + __u32 tbid; + }; + + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; + +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; + __u32 flags; + __be32 flow_label; +}; + +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + +struct bpf_spin_lock { + __u32 val; +}; + +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_root { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ +}; + +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
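With struct bpf_fib_lookup now complete, an XDP forwarding sketch built on bpf_fib_lookup() and bpf_redirect() follows (IPv4 only, no TTL handling, purely illustrative).

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int xdp_fib_forward(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = (void *)(eth + 1);
	struct bpf_fib_lookup fib = {};

	if ((void *)(iph + 1) > data_end ||
	    eth->h_proto != bpf_htons(0x0800 /* ETH_P_IP */))
		return XDP_PASS;

	fib.family      = 2; /* AF_INET */
	fib.tos         = iph->tos;
	fib.l4_protocol = iph->protocol;
	fib.tot_len     = bpf_ntohs(iph->tot_len);
	fib.ipv4_src    = iph->saddr;
	fib.ipv4_dst    = iph->daddr;
	fib.ifindex     = ctx->ingress_ifindex;

	if (bpf_fib_lookup(ctx, &fib, sizeof(fib), 0) !=
	    BPF_FIB_LKUP_RET_SUCCESS)
		return XDP_PASS; /* fall back to the normal stack */

	/* Rewrite L2 addresses from the lookup result and redirect. */
	__builtin_memcpy(eth->h_dest, fib.dmac, 6);
	__builtin_memcpy(eth->h_source, fib.smac, 6);
	return bpf_redirect(fib.ifindex, 0);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";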
*/ +struct bpf_sk_lookup { + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ + __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ +}; + +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. + */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ +}; + +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). 
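struct btf_ptr and the BTF_F_* flags above feed bpf_snprintf_btf(); the tracing-side sketch below pretty-prints the current task_struct. The buffer size and attach point are illustrative, and bpf_core_type_id_kernel() comes from libbpf's bpf_core_read.h.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

SEC("kprobe/do_unlinkat")
int dump_current_task(void *ctx)
{
	char buf[256];
	struct btf_ptr bp = {
		.ptr = (void *)(long)bpf_get_current_task(),
		.type_id = bpf_core_type_id_kernel(struct task_struct),
	};

	/* BTF_F_COMPACT drops the pretty-printing whitespace; output is
	 * truncated to the buffer size. */
	if (bpf_snprintf_btf(buf, sizeof(buf), &bp, sizeof(bp),
			     BTF_F_COMPACT) > 0)
		bpf_printk("%s", buf);
	return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";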
It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/third_party/bpftool/include/uapi/linux/bpf_common.h b/third_party/bpftool/include/uapi/linux/bpf_common.h new file mode 100644 index 0000000..ee97668 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/bpf_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_BPF_COMMON_H__ +#define _UAPI__LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git 
a/third_party/bpftool/include/uapi/linux/btf.h b/third_party/bpftool/include/uapi/linux/btf.h new file mode 100644 index 0000000..ec1798b --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/btf.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x000fffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x00ffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union, enum, fwd and enum64 + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) + +enum { + BTF_KIND_UNKN = 0, /* Unknown */ + BTF_KIND_INT = 1, /* Integer */ + BTF_KIND_PTR = 2, /* Pointer */ + BTF_KIND_ARRAY = 3, /* Array */ + BTF_KIND_STRUCT = 4, /* Struct */ + BTF_KIND_UNION = 5, /* Union */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ + BTF_KIND_FWD = 7, /* Forward */ + BTF_KIND_TYPEDEF = 8, /* Typedef */ + BTF_KIND_VOLATILE = 9, /* Volatile */ + BTF_KIND_CONST = 10, /* Const */ + BTF_KIND_RESTRICT = 11, /* Restrict */ + BTF_KIND_FUNC = 12, /* Function */ + BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ + BTF_KIND_VAR = 14, /* Variable */ + BTF_KIND_DATASEC = 15, /* Section */ + BTF_KIND_FLOAT = 16, /* Floating point */ + BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ + + NR_BTF_KINDS, + BTF_KIND_MAX = NR_BTF_KINDS - 1, +}; + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). 
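The BTF_INFO_* macros above unpack the bit-packed info word of struct btf_type; a small user-space sketch (the printf layout is illustrative).

#include <stdio.h>
#include <linux/btf.h>

static void describe_btf_type(const struct btf_type *t)
{
	printf("kind=%u vlen=%u kind_flag=%u size_or_type=%u\n",
	       BTF_INFO_KIND(t->info),  /* bits 24-28 */
	       BTF_INFO_VLEN(t->info),  /* bits 0-15  */
	       BTF_INFO_KFLAG(t->info), /* bit 31     */
	       t->size);                /* union with t->type */
}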
+ */ +struct btf_enum { + __u32 name_off; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name_off; + __u32 type; + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; +}; + +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, +}; + +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + +/* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe + * additional information related to the tag applied location. + * If component_idx == -1, the tag is applied to a struct, union, + * variable or function. Otherwise, it is applied to a struct/union + * member or a func argument, and component_idx indicates which member + * or argument (0 ... vlen-1). + */ +struct btf_decl_tag { + __s32 component_idx; +}; + +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/third_party/bpftool/include/uapi/linux/const.h b/third_party/bpftool/include/uapi/linux/const.h new file mode 100644 index 0000000..a429381 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/const.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* const.h: Macros for dealing with constants. */ + +#ifndef _UAPI_LINUX_CONST_H +#define _UAPI_LINUX_CONST_H + +/* Some constant macros are used in both assembler and + * C code. Therefore we cannot annotate them always with + * 'UL' and other type specifiers unilaterally. We + * use the following macros to deal with this. + * + * Similarly, _AT() will cast an expression with a type in C, but + * leave it unchanged in asm. 
+ */ + +#ifdef __ASSEMBLY__ +#define _AC(X,Y) X +#define _AT(T,X) X +#else +#define __AC(X,Y) (X##Y) +#define _AC(X,Y) __AC(X,Y) +#define _AT(T,X) ((T)(X)) +#endif + +#define _UL(x) (_AC(x, UL)) +#define _ULL(x) (_AC(x, ULL)) + +#define _BITUL(x) (_UL(1) << (x)) +#define _BITULL(x) (_ULL(1) << (x)) + +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +#endif /* _UAPI_LINUX_CONST_H */ diff --git a/third_party/bpftool/include/uapi/linux/if_link.h b/third_party/bpftool/include/uapi/linux/if_link.h new file mode 100644 index 0000000..39e659c --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/if_link.h @@ -0,0 +1,1284 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include +#include + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; + __u32 collisions; + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; +}; + +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. 
+ * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. 
+ * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + */ +struct rtnl_link_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + __u64 rx_nohandler; +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. 
+ * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + __IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. 
tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? */ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority */ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + 
IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; +}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + +/* 
IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* VXLAN section */ +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, + IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + 
IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * [IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] 
+ * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... + * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + IFLA_HSR_SLAVE2, + IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. 
+ * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/third_party/bpftool/include/uapi/linux/netlink.h b/third_party/bpftool/include/uapi/linux/netlink.h new file mode 100644 index 0000000..0a4d733 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/netlink.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include +#include /* for __kernel_sa_family_t */ +#include + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Sending process port ID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. 
*/ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. 
*/ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. + */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. 
+ * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/third_party/bpftool/include/uapi/linux/perf_event.h b/third_party/bpftool/include/uapi/linux/perf_event.h new file mode 100644 index 0000000..39c6a25 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/perf_event.h @@ -0,0 +1,1449 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _UAPI_LINUX_PERF_EVENT_H +#define _UAPI_LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. 
+ */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, + PERF_COUNT_HW_REF_CPU_CYCLES = 9, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + PERF_COUNT_HW_CACHE_NODE = 6, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, + PERF_COUNT_SW_EMULATION_FAULTS = 8, + PERF_COUNT_SW_DUMMY = 9, + PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. 
+ */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, + PERF_SAMPLE_REGS_USER = 1U << 12, + PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, + PERF_SAMPLE_TRANSACTION = 1U << 17, + PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ +}; + +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ + PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ + + PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ + PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << 
PERF_SAMPLE_BRANCH_COND_SHIFT, + + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_MAX, +}; + +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + +/* + * Values to determine ABI of the registers dump. + */ +enum perf_sample_regs_abi { + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, +}; + +/* + * Values for the memory transaction event qualifier, mostly for + * abort events. Multiple bits can be set. 
+ */ +enum { + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + + PERF_TXN_MAX = (1 << 8), /* non-ABI */ + + /* bits 32..63 are reserved for the abort code */ + + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, + + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ +#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ + /* add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + * + * @sample_max_stack: Max number of frame pointers in a callchain, + * should be < /proc/sys/kernel/perf_event_max_stack + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. 
+ */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + /* + * precise_ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ + + exclude_host : 1, /* don't count in host */ + exclude_guest : 1, /* don't count in guest */ + + exclude_callchain_kernel : 1, /* exclude kernel callchains */ + exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ + comm_exec : 1, /* flag comm events that are due to an exec */ + use_clockid : 1, /* use @clockid for time fields */ + context_switch : 1, /* context switch data */ + write_backward : 1, /* Write ring buffer from end to beginning */ + namespaces : 1, /* include namespaces data */ + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + aux_output : 1, /* generate AUX records instead of events */ + cgroup : 1, /* include cgroup events */ + text_poke : 1, /* include text poke events */ + build_id : 1, /* use build id in mmap2 events */ + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + + __u32 bp_type; + union { + __u64 bp_addr; + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ + __u64 config2; /* extension of config1 */ + }; + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + + /* + * Defines set of user regs to dump on samples. + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_user; + + /* + * Defines size of the user stack to dump on samples. + */ + __u32 sample_stack_user; + + __s32 clockid; + /* + * Defines set of regs to dump for each sample + * state captured on: + * - precise = 0: PMU interrupt + * - precise > 0: sampled instruction + * + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u16 sample_max_stack; + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. 
+ * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. + */ + __u64 sig_data; + + __u64 config3; /* extension of config2 */ +}; + +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. + */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[]; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq, time_mult, time_shift, index, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; + * + * do { + * seq = pc->lock; + * barrier() + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * index = pc->index; + * count = pc->offset; + * if (pc->cap_user_rdpmc && index) { + * width = pc->pmc_width; + * pmc = rdpmc(index - 1); + * } + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + union { + __u64 capabilities; + struct { + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; + }; + }; + + /* + * If cap_user_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. 
This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & (((u64)1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if index), improving the scaling: + * + * enabled += delta; + * if (index) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; + __u64 time_offset; + /* + * If cap_usr_time_zero, the hardware clock (e.g. TSC) can be calculated + * from sample timestamps. + * + * time = timestamp - time_zero; + * quot = time / time_mult; + * rem = time % time_mult; + * cyc = (quot << time_shift) + (rem << time_shift) / time_mult; + * + * And vice versa: + * + * quot = cyc >> time_shift; + * rem = cyc & (((u64)1 << time_shift) - 1); + * timestamp = time_zero + quot * time_mult + + * ((rem * time_mult) >> time_shift); + */ + __u64 time_zero; + + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; + + /* + * Hole for extension of the self monitor capabilities + */ + + __u8 __reserved[116*8]; /* align to 1k. */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. 
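In user space the data_head/data_tail contract described above usually turns into a loop of roughly this shape. A hedged sketch only: the __atomic builtins stand in for the smp_rmb()/smp_mb() barriers mentioned in the comment, and drain_ring()/handle_record() are hypothetical names, not a libbpf or kernel API:

#include <linux/perf_event.h>
#include <stdint.h>

/* Drain all complete records currently in the ring. 'base' is the mmap()ed
 * area: one metadata page followed by the data pages, 'data_size' bytes of
 * data in total. handle_record() is a hypothetical callback. */
static void drain_ring(struct perf_event_mmap_page *meta, void *base,
                       uint64_t data_size,
                       void (*handle_record)(struct perf_event_header *))
{
    uint8_t *data = (uint8_t *)base + meta->data_offset;
    uint64_t tail = meta->data_tail;
    /* Acquire load pairs with the kernel's store to data_head (smp_rmb). */
    uint64_t head = __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE);

    while (tail < head) {
        struct perf_event_header *hdr =
            (struct perf_event_header *)(data + (tail % data_size));
        /* Records may wrap past the end of the buffer; a real reader copies
         * a wrapped record into a contiguous bounce buffer first. */
        handle_record(hdr);
        tail += hdr->size;
    }
    /* Publish the consumed position so the kernel can reuse the space. */
    __atomic_store_n(&meta->data_tail, tail, __ATOMIC_RELEASE);
}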
+ */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; +}; + +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) + +/* + * Indicates that /proc/PID/maps parsing are truncated by time out. + */ +#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12) +/* + * Following PERF_RECORD_MISC_* are used on different + * events, so can reuse the same bit position: + * + * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events + * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) + * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events + */ +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) +/* + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. + */ +#define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) +/* + * Reserve the last bit to indicate some extended misc field + */ +#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +}; + +enum perf_event_type { + + /* + * If perf_event_attr.sample_id_all is set then all event types will + * have the sample_type selected fields related to where/when + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. 
+ * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. + */ + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 nr; + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK + * + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 transaction; } && PERF_SAMPLE_TRANSACTION + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE + * }; + */ + PERF_RECORD_SAMPLE = 9, + + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; + * u32 prot, flags; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + + /* + * Records the dropped/lost sample number. + * + * struct { + * struct perf_event_header header; + * + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST_SAMPLES = 13, + + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. 
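A reader dispatches on perf_event_header::type using the PERF_RECORD_* constants enumerated here. A sketch of the usual shape, assuming the event was opened with sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID so that the PERF_RECORD_SAMPLE body is just { ip } { pid, tid } as documented above; handle_record() is the same hypothetical callback as in the ring-buffer sketch earlier:

#include <linux/perf_event.h>
#include <stdio.h>

/* Only the leading fixed fields are declared; layouts mirror the comments above. */
struct lost_event   { struct perf_event_header header; __u64 id, lost; };
struct sample_event { struct perf_event_header header; __u64 ip; __u32 pid, tid; };

static void handle_record(struct perf_event_header *hdr)
{
    switch (hdr->type) {
    case PERF_RECORD_SAMPLE: {
        struct sample_event *s = (void *)hdr;
        printf("sample: ip=0x%llx pid=%u tid=%u\n",
               (unsigned long long)s->ip, s->pid, s->tid);
        break;
    }
    case PERF_RECORD_LOST: {
        struct lost_event *l = (void *)hdr;
        fprintf(stderr, "lost %llu records\n", (unsigned long long)l->lost);
        break;
    }
    default:
        /* MMAP2, COMM, SWITCH, ... ignored in this sketch. */
        break;
    }
}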
+ * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + + /* + * Data written to the AUX area by hardware due to aux_output, may need + * to be matched to the event by an architecture-specific hardware ID. + * This records the hardware ID, but requires sample_id to provide the + * event ID. e.g. Intel PT uses this record to disambiguate PEBS-via-PT + * records from multiple events. + * + * struct { + * struct perf_event_header header; + * u64 hw_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. 
+ */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ + +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ + +#if defined(__LITTLE_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_blk:3, /* access blocked */ + mem_hops:3, /* hop level */ + mem_rsvd:18; + }; +}; +#elif defined(__BIG_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ + mem_blk:3, /* access blocked */ + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_dtlb:7, /* tlb access */ + mem_lock:2, /* lock instr */ + mem_snoop:5, /* snoop mode */ + mem_lvl:14, /* memory hierarchy level */ + mem_op:5; /* type of opcode */ + }; +}; +#else +#error "Unknown endianness" +#endif + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* + * PERF_MEM_LVL_* namespace being depricated to some extent in the + * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace inorder to not break defined ABIs. 
+ * + * memory hierarchy (memory level, hit or miss) + */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 */ +#define PERF_MEM_LVL_L3 0x40 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ +/* 5-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + +#define PERF_MEM_S(a, s) \ + (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. 
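The PERF_MEM_S() helper defined above composes these per-field flags into a perf_mem_data_src value, and decoding a PERF_SAMPLE_DATA_SRC payload goes through the union's bitfields; is_load_miss() below is an illustrative helper, not kernel API:

#include <linux/perf_event.h>
#include <stdbool.h>

/* Illustrative helper (not kernel API): does a decoded PERF_SAMPLE_DATA_SRC
 * value describe a load that missed in the cache hierarchy? */
static bool is_load_miss(__u64 data_src)
{
    union perf_mem_data_src src = { .val = data_src };

    return (src.mem_op & PERF_MEM_OP_LOAD) &&
           (src.mem_lvl & PERF_MEM_LVL_MISS);
}

/* Composing a value goes the other way, e.g. a load that hit local DRAM:
 *   __u64 v = PERF_MEM_S(OP, LOAD) | PERF_MEM_S(LVL, HIT) |
 *             PERF_MEM_S(LVL, LOC_RAM) | PERF_MEM_S(SNOOP, NONE);
 */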
+ * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) + * type: branch type + * spec: branch speculation info (or 0 if not supported) + */ +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + cycles:16, /* cycle count to last branch */ + type:4, /* branch type */ + spec:2, /* branch speculation info */ + new_type:4, /* additional branch type */ + priv:3, /* privilege level */ + reserved:31; +}; + +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + +#endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/third_party/bpftool/include/uapi/linux/pkt_cls.h b/third_party/bpftool/include/uapi/linux/pkt_cls.h new file mode 100644 index 0000000..3faee01 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/pkt_cls.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include +#include + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. 
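For the extended actions described above, the packing of opcode and value is done with the __TC_ACT_EXT()/TC_ACT_EXT_* helpers defined immediately below the comment; a hedged sketch of how a "goto chain 7" style verdict is built and recognised (both inline helpers are illustrative, not kernel API):

#include <linux/pkt_cls.h>

/* Illustrative helper (not kernel API): build a verdict that jumps to a
 * given chain; the low 28 bits carry the value, the top nibble the opcode. */
static inline int goto_chain_verdict(unsigned int chain)
{
    return TC_ACT_GOTO_CHAIN | (chain & TC_ACT_EXT_VAL_MASK);
}

/* ... and recognise such a verdict on the receiving side. */
static inline int is_goto_chain(int verdict)
{
    return TC_ACT_EXT_CMP(verdict, TC_ACT_GOTO_CHAIN);
}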
+ */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum { + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, + TCA_RSVP_ACT, + __TCA_RSVP_MAX +}; + +#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) + +struct tc_rsvp_gpi { + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo { + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; + __u8 pad; +}; + +/* ROUTE filter */ + +enum { + 
TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* TC index filter */ + +enum { + TCA_TCINDEX_UNSPEC, + TCA_TCINDEX_HASH, + TCA_TCINDEX_MASK, + TCA_TCINDEX_SHIFT, + TCA_TCINDEX_FALL_THROUGH, + TCA_TCINDEX_CLASSID, + TCA_TCINDEX_POLICE, + TCA_TCINDEX_ACT, + __TCA_TCINDEX_MAX +}; + +#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct 
in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX 
(__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/third_party/bpftool/include/uapi/linux/pkt_sched.h b/third_party/bpftool/include/uapi/linux/pkt_sched.h new file mode 100644 index 0000000..5c903ab --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/pkt_sched.h @@ -0,0 +1,1164 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. 
qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). 
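The TC_H_* handle macros above are how user space composes and splits qdisc/class ids; for example, the handle that the tc tool prints as "1:10" (both halves hexadecimal) can be built and taken apart like this (illustrative sketch):

#include <linux/types.h>
#include <linux/pkt_sched.h>
#include <stdio.h>

int main(void)
{
    /* "1:10": major 1 identifies the qdisc, minor 0x10 a class inside it. */
    __u32 classid = TC_H_MAKE(1U << 16, 0x10);

    printf("handle %x:%x, root? %s\n",
           TC_H_MAJ(classid) >> 16, TC_H_MIN(classid),
           classid == TC_H_ROOT ? "yes" : "no");
    return 0;
}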
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + 
TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + TCA_HTB_OFFLOAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' 
*/ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* 
delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. 
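Q0.16 here means 16 fractional bits, so a probability p in [0, 1) is encoded as p * 2^16; a tiny illustrative helper (q0_16() is not kernel API):

#include <linux/types.h>

/* Encode a probability in [0, 1) as a Q0.16 fixed-point value, as used by
 * tc_sfb_qopt.increment / .decrement below; e.g. 0.00050 -> 32, 0.001 -> 65. */
static inline __u32 q0_16(double p)
{
    return (__u32)(p * 65536.0);
}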
+ */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* 
obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u32 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + 
__TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif diff --git a/third_party/bpftool/include/uapi/linux/tc_act/tc_bpf.h b/third_party/bpftool/include/uapi/linux/tc_act/tc_bpf.h new file mode 100644 index 0000000..fe6c8f8 --- /dev/null +++ b/third_party/bpftool/include/uapi/linux/tc_act/tc_bpf.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (c) 2015 Jiri Pirko + */ + +#ifndef __LINUX_TC_BPF_H +#define __LINUX_TC_BPF_H + +#include <linux/pkt_cls.h>
+ +struct tc_act_bpf { + tc_gen; +}; + +enum { + TCA_ACT_BPF_UNSPEC, + TCA_ACT_BPF_TM, + TCA_ACT_BPF_PARMS, + TCA_ACT_BPF_OPS_LEN, + TCA_ACT_BPF_OPS, + TCA_ACT_BPF_FD, + TCA_ACT_BPF_NAME, + TCA_ACT_BPF_PAD, + TCA_ACT_BPF_TAG, + TCA_ACT_BPF_ID, + __TCA_ACT_BPF_MAX, +}; +#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) + +#endif diff --git a/third_party/bpftool/libbpf/.gitattributes b/third_party/bpftool/libbpf/.gitattributes new file mode 100644 index 0000000..c899af4 --- /dev/null +++ b/third_party/bpftool/libbpf/.gitattributes @@ -0,0 +1 @@ +assets/** export-ignore diff --git a/third_party/bpftool/libbpf/.readthedocs.yaml b/third_party/bpftool/libbpf/.readthedocs.yaml new file mode 100644 index 0000000..803dfa2 --- /dev/null +++ b/third_party/bpftool/libbpf/.readthedocs.yaml @@ -0,0 +1,22 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: docs/conf.py + +formats: + - htmlzip + - pdf + - epub + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: docs/sphinx/requirements.txt \ No newline at end of file diff --git a/third_party/bpftool/libbpf/BPF-CHECKPOINT-COMMIT b/third_party/bpftool/libbpf/BPF-CHECKPOINT-COMMIT new file mode 100644 index 0000000..ec5e4be --- /dev/null +++ b/third_party/bpftool/libbpf/BPF-CHECKPOINT-COMMIT @@ -0,0 +1 @@ +496720b7cfb6574a8f6f4d434f23e3d1e6cfaeb9 diff --git a/third_party/bpftool/libbpf/CHECKPOINT-COMMIT b/third_party/bpftool/libbpf/CHECKPOINT-COMMIT new file mode 100644 index 0000000..130cfa6 --- /dev/null +++ b/third_party/bpftool/libbpf/CHECKPOINT-COMMIT @@ -0,0 +1 @@ +a3e7e6b17946f48badce98d7ac360678a0ea7393 diff --git a/third_party/bpftool/libbpf/LICENSE b/third_party/bpftool/libbpf/LICENSE new file mode 100644 index 0000000..d38fed3 --- /dev/null +++ b/third_party/bpftool/libbpf/LICENSE @@ -0,0 +1 @@ +LGPL-2.1 OR BSD-2-Clause diff --git a/third_party/bpftool/libbpf/LICENSE.BSD-2-Clause b/third_party/bpftool/libbpf/LICENSE.BSD-2-Clause new file mode 100644 index 0000000..bce40aa --- /dev/null +++ b/third_party/bpftool/libbpf/LICENSE.BSD-2-Clause @@ -0,0 +1,32 @@ +Valid-License-Identifier: BSD-2-Clause +SPDX-URL: https://spdx.org/licenses/BSD-2-Clause.html +Usage-Guide: + To use the BSD 2-clause "Simplified" License put the following SPDX + tag/value pair into a comment according to the placement guidelines in + the licensing rules documentation: + SPDX-License-Identifier: BSD-2-Clause +License-Text: + +Copyright (c) 2015 The Libbpf Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/bpftool/libbpf/LICENSE.LGPL-2.1 b/third_party/bpftool/libbpf/LICENSE.LGPL-2.1 new file mode 100644 index 0000000..27bb434 --- /dev/null +++ b/third_party/bpftool/libbpf/LICENSE.LGPL-2.1 @@ -0,0 +1,503 @@ +Valid-License-Identifier: LGPL-2.1 +Valid-License-Identifier: LGPL-2.1+ +SPDX-URL: https://spdx.org/licenses/LGPL-2.1.html +Usage-Guide: + To use this license in source code, put one of the following SPDX + tag/value pairs into a comment according to the placement + guidelines in the licensing rules documentation. + For 'GNU Lesser General Public License (LGPL) version 2.1 only' use: + SPDX-License-Identifier: LGPL-2.1 + For 'GNU Lesser General Public License (LGPL) version 2.1 or any later + version' use: + SPDX-License-Identifier: LGPL-2.1+ +License-Text: + +GNU LESSER GENERAL PUBLIC LICENSE +Version 2.1, February 1999 + +Copyright (C) 1991, 1999 Free Software Foundation, Inc. +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts as +the successor of the GNU Library Public License, version 2, hence the +version number 2.1.] + +Preamble + +The licenses for most software are designed to take away your freedom to +share and change it. By contrast, the GNU General Public Licenses are +intended to guarantee your freedom to share and change free software--to +make sure the software is free for all its users. + +This license, the Lesser General Public License, applies to some specially +designated software packages--typically libraries--of the Free Software +Foundation and other authors who decide to use it. You can use it too, but +we suggest you first think carefully about whether this license or the +ordinary General Public License is the better strategy to use in any +particular case, based on the explanations below. + +When we speak of free software, we are referring to freedom of use, not +price. Our General Public Licenses are designed to make sure that you have +the freedom to distribute copies of free software (and charge for this +service if you wish); that you receive source code or can get it if you +want it; that you can change the software and use pieces of it in new free +programs; and that you are informed that you can do these things. + +To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for you if +you distribute copies of the library or if you modify it. + +For example, if you distribute copies of the library, whether gratis or for +a fee, you must give the recipients all the rights that we gave you. You +must make sure that they, too, receive or can get the source code. 
If you +link other code with the library, you must provide complete object files to +the recipients, so that they can relink them with the library after making +changes to the library and recompiling it. And you must show them these +terms so they know their rights. + +We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + +To protect each distributor, we want to make it very clear that there is no +warranty for the free library. Also, if the library is modified by someone +else and passed on, the recipients should know that what they have is not +the original version, so that the original author's reputation will not be +affected by problems that might be introduced by others. + +Finally, software patents pose a constant threat to the existence of any +free program. We wish to make sure that a company cannot effectively +restrict the users of a free program by obtaining a restrictive license +from a patent holder. Therefore, we insist that any patent license obtained +for a version of the library must be consistent with the full freedom of +use specified in this license. + +Most GNU software, including some libraries, is covered by the ordinary GNU +General Public License. This license, the GNU Lesser General Public +License, applies to certain designated libraries, and is quite different +from the ordinary General Public License. We use this license for certain +libraries in order to permit linking those libraries into non-free +programs. + +When a program is linked with a library, whether statically or using a +shared library, the combination of the two is legally speaking a combined +work, a derivative of the original library. The ordinary General Public +License therefore permits such linking only if the entire combination fits +its criteria of freedom. The Lesser General Public License permits more lax +criteria for linking other code with the library. + +We call this license the "Lesser" General Public License because it does +Less to protect the user's freedom than the ordinary General Public +License. It also provides other free software developers Less of an +advantage over competing non-free programs. These disadvantages are the +reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + +For example, on rare occasions, there may be a special need to encourage +the widest possible use of a certain library, so that it becomes a de-facto +standard. To achieve this, non-free programs must be allowed to use the +library. A more frequent case is that a free library does the same job as +widely used non-free libraries. In this case, there is little to gain by +limiting the free library to free software only, so we use the Lesser +General Public License. + +In other cases, permission to use a particular library in non-free programs +enables a greater number of people to use a large body of free +software. For example, permission to use the GNU C Library in non-free +programs enables many more people to use the whole GNU operating system, as +well as its variant, the GNU/Linux operating system. + +Although the Lesser General Public License is Less protective of the users' +freedom, it does ensure that the user of a program that is linked with the +Library has the freedom and the wherewithal to run that program using a +modified version of the Library. 
+ +The precise terms and conditions for copying, distribution and modification +follow. Pay close attention to the difference between a "work based on the +library" and a "work that uses the library". The former contains code +derived from the library, whereas the latter must be combined with the +library in order to run. + +TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +0. This License Agreement applies to any software library or other program + which contains a notice placed by the copyright holder or other + authorized party saying it may be distributed under the terms of this + Lesser General Public License (also called "this License"). Each + licensee is addressed as "you". + + A "library" means a collection of software functions and/or data + prepared so as to be conveniently linked with application programs + (which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work which + has been distributed under these terms. A "work based on the Library" + means either the Library or any derivative work under copyright law: + that is to say, a work containing the Library or a portion of it, either + verbatim or with modifications and/or translated straightforwardly into + another language. (Hereinafter, translation is included without + limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for making + modifications to it. For a library, complete source code means all the + source code for all modules it contains, plus any associated interface + definition files, plus the scripts used to control compilation and + installation of the library. + + Activities other than copying, distribution and modification are not + covered by this License; they are outside its scope. The act of running + a program using the Library is not restricted, and output from such a + program is covered only if its contents constitute a work based on the + Library (independent of the use of the Library in a tool for writing + it). Whether that is true depends on what the Library does and what the + program that uses the Library does. + +1. You may copy and distribute verbatim copies of the Library's complete + source code as you receive it, in any medium, provided that you + conspicuously and appropriately publish on each copy an appropriate + copyright notice and disclaimer of warranty; keep intact all the notices + that refer to this License and to the absence of any warranty; and + distribute a copy of this License along with the Library. + + You may charge a fee for the physical act of transferring a copy, and + you may at your option offer warranty protection in exchange for a fee. + +2. You may modify your copy or copies of the Library or any portion of it, + thus forming a work based on the Library, and copy and distribute such + modifications or work under the terms of Section 1 above, provided that + you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices stating + that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no charge to + all third parties under the terms of this License. 
+ + d) If a facility in the modified Library refers to a function or a table + of data to be supplied by an application program that uses the + facility, other than as an argument passed when the facility is + invoked, then you must make a good faith effort to ensure that, in + the event an application does not supply such function or table, the + facility still operates, and performs whatever part of its purpose + remains meaningful. + + (For example, a function in a library to compute square roots has a + purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must be + optional: if the application does not supply it, the square root + function must still compute square roots.) + + These requirements apply to the modified work as a whole. If + identifiable sections of that work are not derived from the Library, and + can be reasonably considered independent and separate works in + themselves, then this License, and its terms, do not apply to those + sections when you distribute them as separate works. But when you + distribute the same sections as part of a whole which is a work based on + the Library, the distribution of the whole must be on the terms of this + License, whose permissions for other licensees extend to the entire + whole, and thus to each and every part regardless of who wrote it. + + Thus, it is not the intent of this section to claim rights or contest + your rights to work written entirely by you; rather, the intent is to + exercise the right to control the distribution of derivative or + collective works based on the Library. + + In addition, mere aggregation of another work not based on the Library + with the Library (or with a work based on the Library) on a volume of a + storage or distribution medium does not bring the other work under the + scope of this License. + +3. You may opt to apply the terms of the ordinary GNU General Public + License instead of this License to a given copy of the Library. To do + this, you must alter all the notices that refer to this License, so that + they refer to the ordinary GNU General Public License, version 2, + instead of to this License. (If a newer version than version 2 of the + ordinary GNU General Public License has appeared, then you can specify + that version instead if you wish.) Do not make any other change in these + notices. + + Once this change is made in a given copy, it is irreversible for that + copy, so the ordinary GNU General Public License applies to all + subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of the + Library into a program that is not a library. + +4. You may copy and distribute the Library (or a portion or derivative of + it, under Section 2) in object code or executable form under the terms + of Sections 1 and 2 above provided that you accompany it with the + complete corresponding machine-readable source code, which must be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange. + + If distribution of object code is made by offering access to copy from a + designated place, then offering equivalent access to copy the source + code from the same place satisfies the requirement to distribute the + source code, even though third parties are not compelled to copy the + source along with the object code. + +5. 
A program that contains no derivative of any portion of the Library, but + is designed to work with the Library by being compiled or linked with + it, is called a "work that uses the Library". Such a work, in isolation, + is not a derivative work of the Library, and therefore falls outside the + scope of this License. + + However, linking a "work that uses the Library" with the Library creates + an executable that is a derivative of the Library (because it contains + portions of the Library), rather than a "work that uses the + library". The executable is therefore covered by this License. Section 6 + states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file + that is part of the Library, the object code for the work may be a + derivative work of the Library even though the source code is + not. Whether this is true is especially significant if the work can be + linked without the Library, or if the work is itself a library. The + threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data structure + layouts and accessors, and small macros and small inline functions (ten + lines or less in length), then the use of the object file is + unrestricted, regardless of whether it is legally a derivative + work. (Executables containing this object code plus portions of the + Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may + distribute the object code for the work under the terms of Section + 6. Any executables containing that work also fall under Section 6, + whether or not they are linked directly with the Library itself. + +6. As an exception to the Sections above, you may also combine or link a + "work that uses the Library" with the Library to produce a work + containing portions of the Library, and distribute that work under terms + of your choice, provided that the terms permit modification of the work + for the customer's own use and reverse engineering for debugging such + modifications. + + You must give prominent notice with each copy of the work that the + Library is used in it and that the Library and its use are covered by + this License. You must supply a copy of this License. If the work during + execution displays copyright notices, you must include the copyright + notice for the Library among them, as well as a reference directing the + user to the copy of this License. Also, you must do one of these things: + + a) Accompany the work with the complete corresponding machine-readable + source code for the Library including whatever changes were used in + the work (which must be distributed under Sections 1 and 2 above); + and, if the work is an executable linked with the Library, with the + complete machine-readable "work that uses the Library", as object + code and/or source code, so that the user can modify the Library and + then relink to produce a modified executable containing the modified + Library. (It is understood that the user who changes the contents of + definitions files in the Library will not necessarily be able to + recompile the application to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. 
A suitable mechanism is one that (1) uses at run time a copy + of the library already present on the user's computer system, rather + than copying library functions into the executable, and (2) will + operate properly with a modified version of the library, if the user + installs one, as long as the modified version is interface-compatible + with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least three + years, to give the same user the materials specified in Subsection + 6a, above, for a charge no more than the cost of performing this + distribution. + + d) If distribution of the work is made by offering access to copy from a + designated place, offer equivalent access to copy the above specified + materials from the same place. + + e) Verify that the user has already received a copy of these materials + or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the Library" + must include any data and utility programs needed for reproducing the + executable from it. However, as a special exception, the materials to be + distributed need not include anything that is normally distributed (in + either source or binary form) with the major components (compiler, + kernel, and so on) of the operating system on which the executable runs, + unless that component itself accompanies the executable. + + It may happen that this requirement contradicts the license restrictions + of other proprietary libraries that do not normally accompany the + operating system. Such a contradiction means you cannot use both them + and the Library together in an executable that you distribute. + +7. You may place library facilities that are a work based on the Library + side-by-side in a single library together with other library facilities + not covered by this License, and distribute such a combined library, + provided that the separate distribution of the work based on the Library + and of the other library facilities is otherwise permitted, and provided + that you do these two things: + + a) Accompany the combined library with a copy of the same work based on + the Library, uncombined with any other library facilities. This must + be distributed under the terms of the Sections above. + + b) Give prominent notice with the combined library of the fact that part + of it is a work based on the Library, and explaining where to find + the accompanying uncombined form of the same work. + +8. You may not copy, modify, sublicense, link with, or distribute the + Library except as expressly provided under this License. Any attempt + otherwise to copy, modify, sublicense, link with, or distribute the + Library is void, and will automatically terminate your rights under this + License. However, parties who have received copies, or rights, from you + under this License will not have their licenses terminated so long as + such parties remain in full compliance. + +9. You are not required to accept this License, since you have not signed + it. However, nothing else grants you permission to modify or distribute + the Library or its derivative works. These actions are prohibited by law + if you do not accept this License. Therefore, by modifying or + distributing the Library (or any work based on the Library), you + indicate your acceptance of this License to do so, and all its terms and + conditions for copying, distributing or modifying the Library or works + based on it. + +10. 
Each time you redistribute the Library (or any work based on the + Library), the recipient automatically receives a license from the + original licensor to copy, distribute, link with or modify the Library + subject to these terms and conditions. You may not impose any further + restrictions on the recipients' exercise of the rights granted + herein. You are not responsible for enforcing compliance by third + parties with this License. + +11. If, as a consequence of a court judgment or allegation of patent + infringement or for any other reason (not limited to patent issues), + conditions are imposed on you (whether by court order, agreement or + otherwise) that contradict the conditions of this License, they do not + excuse you from the conditions of this License. If you cannot + distribute so as to satisfy simultaneously your obligations under this + License and any other pertinent obligations, then as a consequence you + may not distribute the Library at all. For example, if a patent license + would not permit royalty-free redistribution of the Library by all + those who receive copies directly or indirectly through you, then the + only way you could satisfy both it and this License would be to refrain + entirely from distribution of the Library. + + If any portion of this section is held invalid or unenforceable under + any particular circumstance, the balance of the section is intended to + apply, and the section as a whole is intended to apply in other + circumstances. + + It is not the purpose of this section to induce you to infringe any + patents or other property right claims or to contest validity of any + such claims; this section has the sole purpose of protecting the + integrity of the free software distribution system which is implemented + by public license practices. Many people have made generous + contributions to the wide range of software distributed through that + system in reliance on consistent application of that system; it is up + to the author/donor to decide if he or she is willing to distribute + software through any other system and a licensee cannot impose that + choice. + + This section is intended to make thoroughly clear what is believed to + be a consequence of the rest of this License. + +12. If the distribution and/or use of the Library is restricted in certain + countries either by patents or by copyrighted interfaces, the original + copyright holder who places the Library under this License may add an + explicit geographical distribution limitation excluding those + countries, so that distribution is permitted only in or among countries + not thus excluded. In such case, this License incorporates the + limitation as if written in the body of this License. + +13. The Free Software Foundation may publish revised and/or new versions of + the Lesser General Public License from time to time. Such new versions + will be similar in spirit to the present version, but may differ in + detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the Library + specifies a version number of this License which applies to it and "any + later version", you have the option of following the terms and + conditions either of that version or of any later version published by + the Free Software Foundation. If the Library does not specify a license + version number, you may choose any version ever published by the Free + Software Foundation. + +14. 
If you wish to incorporate parts of the Library into other free + programs whose distribution conditions are incompatible with these, + write to the author to ask for permission. For software which is + copyrighted by the Free Software Foundation, write to the Free Software + Foundation; we sometimes make exceptions for this. Our decision will be + guided by the two goals of preserving the free status of all + derivatives of our free software and of promoting the sharing and reuse + of software generally. + +NO WARRANTY + +15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY + FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN + OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES + PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER + EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE + ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH + YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL + NECESSARY SERVICING, REPAIR OR CORRECTION. + +16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING + WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR + REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR + DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL + DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY + (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED + INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF + THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR + OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Libraries + +If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + +To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + +one line to give the library's name and an idea of what it does. +Copyright (C) year name of author + +This library is free software; you can redistribute it and/or modify it +under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at +your option) any later version. + +This library is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this library; if not, write to the Free Software Foundation, +Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add +information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. 
Here is a sample; alter the names: + +Yoyodyne, Inc., hereby disclaims all copyright interest in +the library `Frob' (a library for tweaking knobs) written +by James Random Hacker. + +signature of Ty Coon, 1 April 1990 +Ty Coon, President of Vice +That's all there is to it! diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-compact-darkbg.png b/third_party/bpftool/libbpf/assets/libbpf-logo-compact-darkbg.png new file mode 100644 index 0000000..aaef60e Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-compact-darkbg.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-compact-mono.png b/third_party/bpftool/libbpf/assets/libbpf-logo-compact-mono.png new file mode 100644 index 0000000..51daed1 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-compact-mono.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-compact.png b/third_party/bpftool/libbpf/assets/libbpf-logo-compact.png new file mode 100644 index 0000000..99ae3ab Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-compact.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-darkbg.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-darkbg.png new file mode 100644 index 0000000..00d6d83 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-darkbg.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-mono.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-mono.png new file mode 100644 index 0000000..82f1ce5 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways-mono.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sideways.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways.png new file mode 100644 index 0000000..48186c3 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sideways.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-darkbg.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-darkbg.png new file mode 100644 index 0000000..5e57430 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-darkbg.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-mono.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-mono.png new file mode 100644 index 0000000..b02d432 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse-mono.png differ diff --git a/third_party/bpftool/libbpf/assets/libbpf-logo-sparse.png b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse.png new file mode 100644 index 0000000..a306588 Binary files /dev/null and b/third_party/bpftool/libbpf/assets/libbpf-logo-sparse.png differ diff --git a/third_party/bpftool/libbpf/ci/diffs/.keep b/third_party/bpftool/libbpf/ci/diffs/.keep new file mode 100644 index 0000000..e69de29 diff --git a/third_party/bpftool/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch b/third_party/bpftool/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch new file mode 100644 index 0000000..2f8d84f --- /dev/null +++ b/third_party/bpftool/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch @@ -0,0 +1,70 @@ +From 6fba14e2ed9d159f76b23fa5c16f3ea99acbc003 Mon Sep 17 00:00:00 2001 +From: Masahiro Yamada +Date: Thu, 5 Jan 2023 12:13:06 +0900 +Subject: [PATCH] s390: define RUNTIME_DISCARD_EXIT to fix link error with GNU 
+ ld < 2.36 + +Nathan Chancellor reports that the s390 vmlinux fails to link with +GNU ld < 2.36 since commit 99cb0d917ffa ("arch: fix broken BuildID +for arm64 and riscv"). + +It happens for defconfig, or more specifically for CONFIG_EXPOLINE=y. + + $ s390x-linux-gnu-ld --version | head -n1 + GNU ld (GNU Binutils for Debian) 2.35.2 + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- allnoconfig + $ ./scripts/config -e CONFIG_EXPOLINE + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- olddefconfig + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- + `.exit.text' referenced in section `.s390_return_reg' of drivers/base/dd.o: defined in discarded section `.exit.text' of drivers/base/dd.o + make[1]: *** [scripts/Makefile.vmlinux:34: vmlinux] Error 1 + make: *** [Makefile:1252: vmlinux] Error 2 + +arch/s390/kernel/vmlinux.lds.S wants to keep EXIT_TEXT: + + .exit.text : { + EXIT_TEXT + } + +But, at the same time, EXIT_TEXT is thrown away by DISCARD because +s390 does not define RUNTIME_DISCARD_EXIT. + +I still do not understand why the latter wins after 99cb0d917ffa, +but defining RUNTIME_DISCARD_EXIT seems correct because the comment +line in arch/s390/kernel/vmlinux.lds.S says: + + /* + * .exit.text is discarded at runtime, not link time, + * to deal with references from __bug_table + */ + +Nathan also found that binutils commit 21401fc7bf67 ("Duplicate output +sections in scripts") cured this issue, so we cannot reproduce it with +binutils 2.36+, but it is better to not rely on it. + +Fixes: 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") +Link: https://lore.kernel.org/all/Y7Jal56f6UBh1abE@dev-arch.thelio-3990X/ +Reported-by: Nathan Chancellor +Signed-off-by: Masahiro Yamada +Link: https://lore.kernel.org/r/20230105031306.1455409-1-masahiroy@kernel.org +Signed-off-by: Heiko Carstens +--- + arch/s390/kernel/vmlinux.lds.S | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S +index 5ea3830af0cc..6e101e6f499d 100644 +--- a/arch/s390/kernel/vmlinux.lds.S ++++ b/arch/s390/kernel/vmlinux.lds.S +@@ -17,6 +17,8 @@ + /* Handle ro_after_init data on our own. */ + #define RO_AFTER_INIT_DATA + ++#define RUNTIME_DISCARD_EXIT ++ + #define EMITS_PT_NOTE + + #include <asm-generic/vmlinux.lds.h> +-- +2.30.2 + diff --git a/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch b/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch new file mode 100644 index 0000000..64bf9ff --- /dev/null +++ b/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch @@ -0,0 +1,37 @@ +From ff8be5401b359e23ec2b74184034082564bac7c5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20M=C3=BCller?= +Date: Thu, 25 May 2023 16:04:20 -0700 +Subject: [PATCH] selftests/bpf: Check whether to run selftest +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The sockopt test invokes test__start_subtest and then unconditionally +asserts the success. That means that even if deny-listed, any test will +still run and potentially fail. +Evaluate the return value of test__start_subtest() to achieve the +desired behavior, as other tests do. 
+ +Signed-off-by: Daniel Müller +--- + tools/testing/selftests/bpf/prog_tests/sockopt.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c +index 33dd45..9e6a5e 100644 +--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c ++++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c +@@ -1060,7 +1060,9 @@ void test_sockopt(void) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { +- test__start_subtest(tests[i].descr); ++ if (!test__start_subtest(tests[i].descr)) ++ continue; ++ + ASSERT_OK(run_test(cgroup_fd, &tests[i]), tests[i].descr); + } + +-- +2.34.1 + diff --git a/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch b/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch new file mode 100644 index 0000000..3ee42c6 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch @@ -0,0 +1,46 @@ +From a8dfde09c90109e3a98af54847e91bde7dc2d5c2 Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Tue, 13 Dec 2022 14:05:00 -0800 +Subject: [PATCH] selftests/bpf: Select CONFIG_FUNCTION_ERROR_INJECTION +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +BPF selftests require CONFIG_FUNCTION_ERROR_INJECTION to work. However, +CONFIG_FUNCTION_ERROR_INJECTION is no longer 'y' by default after recent +changes. As a result, we are seeing errors like the following from BPF CI: + + bpf_testmod_test_read() is not modifiable + __x64_sys_setdomainname is not sleepable + __x64_sys_getpgid is not sleepable + +Fix this by explicitly selecting CONFIG_FUNCTION_ERROR_INJECTION in the +selftest config. + +Fixes: a4412fdd49dc ("error-injection: Add prompt for function error injection") +Reported-by: Daniel Müller +Signed-off-by: Song Liu +Signed-off-by: Andrii Nakryiko +Signed-off-by: Daniel Borkmann +Acked-by: Daniel Müller +Link: https://lore.kernel.org/bpf/20221213220500.3427947-1-song@kernel.org +Signed-off-by: Daniel Müller +--- + tools/testing/selftests/bpf/config | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config +index 612f69..63cd4a 100644 +--- a/tools/testing/selftests/bpf/config ++++ b/tools/testing/selftests/bpf/config +@@ -16,6 +16,7 @@ CONFIG_CRYPTO_USER_API_HASH=y + CONFIG_DYNAMIC_FTRACE=y + CONFIG_FPROBE=y + CONFIG_FTRACE_SYSCALLS=y ++CONFIG_FUNCTION_ERROR_INJECTION=y + CONFIG_FUNCTION_TRACER=y + CONFIG_GENEVE=y + CONFIG_IKCONFIG=y +-- +2.30.2 + diff --git a/third_party/bpftool/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch b/third_party/bpftool/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch new file mode 100644 index 0000000..9547c62 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch @@ -0,0 +1,68 @@ +From d3484f640bc82cff459beb85a00f7ebab20f0a41 Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Sun, 9 Apr 2023 11:28:31 +0900 +Subject: [PATCH] tracing: fprobe: Initialize ret valiable to fix smatch error + +The commit 39d954200bf6 ("fprobe: Skip exit_handler if entry_handler returns +!0") introduced a hidden dependency of 'ret' local variable in the +fprobe_handler(), Smatch warns the `ret` can be accessed without +initialization. 
+ + kernel/trace/fprobe.c:59 fprobe_handler() + error: uninitialized symbol 'ret'. + +kernel/trace/fprobe.c + 49 fpr->entry_ip = ip; + 50 if (fp->entry_data_size) + 51 entry_data = fpr->data; + 52 } + 53 + 54 if (fp->entry_handler) + 55 ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + +ret is only initialized if there is an ->entry_handler + + 56 + 57 /* If entry_handler returns !0, nmissed is not counted. */ + 58 if (rh) { + +rh is only true if there is an ->exit_handler. Presumably if you have +and ->exit_handler that means you also have a ->entry_handler but Smatch +is not smart enough to figure it out. + +--> 59 if (ret) + ^^^ +Warning here. + + 60 rethook_recycle(rh); + 61 else + 62 rethook_hook(rh, ftrace_get_regs(fregs), true); + 63 } + 64 out: + 65 ftrace_test_recursion_unlock(bit); + 66 } + +Reported-by: Dan Carpenter +Link: https://lore.kernel.org/all/85429a5c-a4b9-499e-b6c0-cbd313291c49@kili.mountain +Fixes: 39d954200bf6 ("fprobe: Skip exit_handler if entry_handler returns !0") +Signed-off-by: Masami Hiramatsu (Google) +--- + kernel/trace/fprobe.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c +index 9abb3905bc8e..293184227394 100644 +--- a/kernel/trace/fprobe.c ++++ b/kernel/trace/fprobe.c +@@ -27,7 +27,7 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct rethook_node *rh = NULL; + struct fprobe *fp; + void *entry_data = NULL; +- int bit, ret; ++ int bit, ret = 0; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) +-- +2.34.1 + diff --git a/third_party/bpftool/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch b/third_party/bpftool/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch new file mode 100644 index 0000000..b97dba0 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch @@ -0,0 +1,83 @@ +From 8267fc71abb2dc47338570e56dd3473a58313fce Mon Sep 17 00:00:00 2001 +From: Lorenzo Bianconi +Date: Mon, 17 Apr 2023 23:53:22 +0200 +Subject: [PATCH] veth: take into account peer device for + NETDEV_XDP_ACT_NDO_XMIT xdp_features flag + +For veth pairs, NETDEV_XDP_ACT_NDO_XMIT is supported by the current +device if the peer one is running a XDP program or if it has GRO enabled. +Fix the xdp_features flags reporting considering peer device and not +current one for NETDEV_XDP_ACT_NDO_XMIT. 
+ +Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") +Signed-off-by: Lorenzo Bianconi +Link: https://lore.kernel.org/r/4f1ca6f6f6b42ae125bfdb5c7782217c83968b2e.1681767806.git.lorenzo@kernel.org +Signed-off-by: Alexei Starovoitov +--- + drivers/net/veth.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/veth.c b/drivers/net/veth.c +index e1b38fbf1dd9..4b3c6647edc6 100644 +--- a/drivers/net/veth.c ++++ b/drivers/net/veth.c +@@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev) + + peer = rtnl_dereference(priv->peer); + if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { ++ struct veth_priv *priv_peer = netdev_priv(peer); + xdp_features_t val = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG; + +- if (priv->_xdp_prog || veth_gro_requested(dev)) ++ if (priv_peer->_xdp_prog || veth_gro_requested(peer)) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); +@@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev, + { + netdev_features_t changed = features ^ dev->features; + struct veth_priv *priv = netdev_priv(dev); ++ struct net_device *peer; + int err; + + if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) + return 0; + ++ peer = rtnl_dereference(priv->peer); + if (features & NETIF_F_GRO) { + err = veth_napi_enable(dev); + if (err) + return err; + +- xdp_features_set_redirect_target(dev, true); ++ if (peer) ++ xdp_features_set_redirect_target(peer, true); + } else { +- xdp_features_clear_redirect_target(dev); ++ if (peer) ++ xdp_features_clear_redirect_target(peer); + veth_napi_del(dev); + } + return 0; +@@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, + peer->max_mtu = max_mtu; + } + +- xdp_features_set_redirect_target(dev, true); ++ xdp_features_set_redirect_target(peer, true); + } + + if (old_prog) { + if (!prog) { +- if (!veth_gro_requested(dev)) +- xdp_features_clear_redirect_target(dev); ++ if (peer && !veth_gro_requested(dev)) ++ xdp_features_clear_redirect_target(peer); + + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); +-- +2.34.1 + diff --git a/third_party/bpftool/libbpf/ci/managers/debian.sh b/third_party/bpftool/libbpf/ci/managers/debian.sh new file mode 100755 index 0000000..3d3f860 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/managers/debian.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +PHASES=(${@:-SETUP RUN RUN_ASAN CLEANUP}) +DEBIAN_RELEASE="${DEBIAN_RELEASE:-testing}" +CONT_NAME="${CONT_NAME:-libbpf-debian-$DEBIAN_RELEASE}" +ENV_VARS="${ENV_VARS:-}" +DOCKER_RUN="${DOCKER_RUN:-docker run}" +REPO_ROOT="${REPO_ROOT:-$PWD}" +ADDITIONAL_DEPS=(pkgconf) +EXTRA_CFLAGS="" +EXTRA_LDFLAGS="" + +function info() { + echo -e "\033[33;1m$1\033[0m" +} + +function error() { + echo -e "\033[31;1m$1\033[0m" +} + +function docker_exec() { + docker exec $ENV_VARS $CONT_NAME "$@" +} + +set -eu + +source "$(dirname $0)/travis_wait.bash" + +for phase in "${PHASES[@]}"; do + case $phase in + SETUP) + info "Setup phase" + info "Using Debian $DEBIAN_RELEASE" + + docker --version + + docker pull debian:$DEBIAN_RELEASE + info "Starting container $CONT_NAME" + $DOCKER_RUN -v $REPO_ROOT:/build:rw \ + -w /build --privileged=true --name $CONT_NAME \ + -dit --net=host debian:$DEBIAN_RELEASE /bin/bash + echo -e "::group::Build Env Setup" + docker_exec bash -c "echo deb-src http://deb.debian.org/debian $DEBIAN_RELEASE 
main >>/etc/apt/sources.list" + docker_exec apt-get -y update + docker_exec apt-get -y install aptitude + docker_exec aptitude -y install make libz-dev libelf-dev + docker_exec aptitude -y install "${ADDITIONAL_DEPS[@]}" + echo -e "::endgroup::" + ;; + RUN|RUN_CLANG|RUN_CLANG14|RUN_CLANG15|RUN_CLANG16|RUN_GCC10|RUN_GCC11|RUN_GCC12|RUN_ASAN|RUN_CLANG_ASAN|RUN_GCC10_ASAN) + CC="cc" + if [[ "$phase" =~ "RUN_CLANG(\d+)(_ASAN)?" ]]; then + ENV_VARS="-e CC=clang-${BASH_REMATCH[1]} -e CXX=clang++-${BASH_REMATCH[1]}" + CC="clang-${BASH_REMATCH[1]}" + elif [[ "$phase" = *"CLANG"* ]]; then + ENV_VARS="-e CC=clang -e CXX=clang++" + CC="clang" + elif [[ "$phase" =~ "RUN_GCC(\d+)(_ASAN)?" ]]; then + ENV_VARS="-e CC=gcc-${BASH_REMATCH[1]} -e CXX=g++-${BASH_REMATCH[1]}" + CC="gcc-${BASH_REMATCH[1]}" + fi + if [[ "$phase" = *"ASAN"* ]]; then + EXTRA_CFLAGS="${EXTRA_CFLAGS} -fsanitize=address,undefined" + EXTRA_LDFLAGS="${EXTRA_LDFLAGS} -fsanitize=address,undefined" + fi + if [[ "$CC" != "cc" ]]; then + docker_exec aptitude -y install "$CC" + else + docker_exec aptitude -y install gcc + fi + docker_exec mkdir build install + docker_exec ${CC} --version + info "build" + docker_exec make -j$((4*$(nproc))) EXTRA_CFLAGS="${EXTRA_CFLAGS}" EXTRA_LDFLAGS="${EXTRA_LDFLAGS}" -C ./src -B OBJDIR=../build + info "ldd build/libbpf.so:" + docker_exec ldd build/libbpf.so + if ! docker_exec ldd build/libbpf.so | grep -q libelf; then + error "No reference to libelf.so in libbpf.so!" + exit 1 + fi + info "install" + docker_exec make -j$((4*$(nproc))) -C src OBJDIR=../build DESTDIR=../install install + info "link binary" + docker_exec bash -c "EXTRA_CFLAGS=\"${EXTRA_CFLAGS}\" EXTRA_LDFLAGS=\"${EXTRA_LDFLAGS}\" ./ci/managers/test_compile.sh" + ;; + CLEANUP) + info "Cleanup phase" + docker stop $CONT_NAME + docker rm -f $CONT_NAME + ;; + *) + echo >&2 "Unknown phase '$phase'" + exit 1 + esac +done diff --git a/third_party/bpftool/libbpf/ci/managers/test_compile.sh b/third_party/bpftool/libbpf/ci/managers/test_compile.sh new file mode 100755 index 0000000..094ba3e --- /dev/null +++ b/third_party/bpftool/libbpf/ci/managers/test_compile.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euox pipefail + +EXTRA_CFLAGS=${EXTRA_CFLAGS:-} +EXTRA_LDFLAGS=${EXTRA_LDFLAGS:-} + +cat << EOF > main.c +#include <bpf/libbpf.h> +int main() { + return bpf_object__open(0) < 0; +} +EOF + +# static linking
${CC:-cc} ${EXTRA_CFLAGS} ${EXTRA_LDFLAGS} -o main -I./include/uapi -I./install/usr/include main.c ./build/libbpf.a -lelf -lz diff --git a/third_party/bpftool/libbpf/ci/managers/travis_wait.bash b/third_party/bpftool/libbpf/ci/managers/travis_wait.bash new file mode 100644 index 0000000..acf6ad1 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/managers/travis_wait.bash @@ -0,0 +1,61 @@ +# This was borrowed from https://github.com/travis-ci/travis-build/tree/master/lib/travis/build/bash +# to get around https://github.com/travis-ci/travis-ci/issues/9979. It should probably be removed +# as soon as Travis CI has started to provide an easy way to export the functions to bash scripts. + +travis_jigger() { + local cmd_pid="${1}" + shift + local timeout="${1}" + shift + local count=0 + + echo -e "\\n" + + while [[ "${count}" -lt "${timeout}" ]]; do + count="$((count + 1))" + echo -ne "Still running (${count} of ${timeout}): ${*}\\r" + sleep 60 + done + + echo -e "\\n${ANSI_RED}Timeout (${timeout} minutes) reached. 
Terminating \"${*}\"${ANSI_RESET}\\n" + kill -9 "${cmd_pid}" +} + +travis_wait() { + local timeout="${1}" + + if [[ "${timeout}" =~ ^[0-9]+$ ]]; then + shift + else + timeout=20 + fi + + local cmd=("${@}") + local log_file="travis_wait_${$}.log" + + "${cmd[@]}" &>"${log_file}" & + local cmd_pid="${!}" + + travis_jigger "${!}" "${timeout}" "${cmd[@]}" & + local jigger_pid="${!}" + local result + + { + set +e + wait "${cmd_pid}" 2>/dev/null + result="${?}" + ps -p"${jigger_pid}" &>/dev/null && kill "${jigger_pid}" + set -e + } + + if [[ "${result}" -eq 0 ]]; then + echo -e "\\n${ANSI_GREEN}The command ${cmd[*]} exited with ${result}.${ANSI_RESET}" + else + echo -e "\\n${ANSI_RED}The command ${cmd[*]} exited with ${result}.${ANSI_RESET}" + fi + + echo -e "\\n${ANSI_GREEN}Log:${ANSI_RESET}\\n" + cat "${log_file}" + + return "${result}" +} diff --git a/third_party/bpftool/libbpf/ci/managers/ubuntu.sh b/third_party/bpftool/libbpf/ci/managers/ubuntu.sh new file mode 100755 index 0000000..7fe1b3f --- /dev/null +++ b/third_party/bpftool/libbpf/ci/managers/ubuntu.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -eux + +RELEASE="focal" + +apt-get update +apt-get install -y pkg-config + +source "$(dirname $0)/travis_wait.bash" + +cd $REPO_ROOT + +EXTRA_CFLAGS="-Werror -Wall -fsanitize=address,undefined" +EXTRA_LDFLAGS="-Werror -Wall -fsanitize=address,undefined" +mkdir build install +cc --version +make -j$((4*$(nproc))) EXTRA_CFLAGS="${EXTRA_CFLAGS}" EXTRA_LDFLAGS="${EXTRA_LDFLAGS}" -C ./src -B OBJDIR=../build +ldd build/libbpf.so +if ! ldd build/libbpf.so | grep -q libelf; then + echo "FAIL: No reference to libelf.so in libbpf.so!" + exit 1 +fi +make -j$((4*$(nproc))) -C src OBJDIR=../build DESTDIR=../install install +EXTRA_CFLAGS=${EXTRA_CFLAGS} EXTRA_LDFLAGS=${EXTRA_LDFLAGS} $(dirname $0)/test_compile.sh diff --git a/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 b/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 new file mode 100644 index 0000000..ee0d3db --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 @@ -0,0 +1,8 @@ +# btf_dump -- need to disable data dump sub-tests +core_retro +cpu_mask +hashmap +legacy_printk +perf_buffer +section_names + diff --git a/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 b/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 new file mode 100644 index 0000000..998f036 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 @@ -0,0 +1,51 @@ +# attach_probe +autoload +bpf_verif_scale +cgroup_attach_autodetach +cgroup_attach_override +core_autosize +core_extern +core_read_macros +core_reloc +core_retro +cpu_mask +endian +get_branch_snapshot +get_stackid_cannot_attach +global_data +global_data_init +global_func_args +hashmap +legacy_printk +linked_funcs +linked_maps +map_lock +obj_name +perf_buffer +perf_event_stackmap +pinning +pkt_md_access +probe_user +queue_stack_map +raw_tp_writable_reject_nbd_invalid +raw_tp_writable_test_run +rdonly_maps +section_names +signal_pending +skeleton +sockmap_ktls +sockopt +spinlock +stacktrace_map +stacktrace_map_raw_tp +static_linked +task_fd_query_rawtp +task_fd_query_tp +tc_bpf +tcp_estats +tcp_rtt +tp_attach_query +usdt/urand_pid_attach +xdp +xdp_noinline +xdp_perf diff --git a/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 new file mode 100644 index 0000000..2688771 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 @@ -0,0 +1,120 @@ 
+# This file is not used and is there for historic purposes only. +# See ALLOWLIST-5.5.0 instead. + +# PERMANENTLY DISABLED +align # verifier output format changed +atomics # new atomic operations (v5.12+) +atomic_bounds # new atomic operations (v5.12+) +bind_perm # changed semantics of return values (v5.12+) +bpf_cookie # 5.15+ +bpf_iter # bpf_iter support is missing +bpf_obj_id # bpf_link support missing for GET_OBJ_INFO, GET_FD_BY_ID, etc +bpf_tcp_ca # STRUCT_OPS is missing +btf_map_in_map # inner map leak fixed in 5.8 +btf_skc_cls_ingress # v5.10+ functionality +cg_storage_multi # v5.9+ functionality +cgroup_attach_multi # BPF_F_REPLACE_PROG missing +cgroup_link # LINK_CREATE is missing +cgroup_skb_sk_lookup # bpf_sk_lookup_tcp() helper is missing +check_mtu # missing BPF helper (v5.12+) +cls_redirect # bpf_csum_level() helper is missing +connect_force_port # cgroup/get{peer,sock}name{4,6} support is missing +d_path # v5.10+ feature +enable_stats # BPF_ENABLE_STATS support is missing +fentry_fexit # bpf_prog_test_tracing missing +fentry_test # bpf_prog_test_tracing missing +fexit_bpf2bpf # freplace is missing +fexit_sleep # relies on bpf_trampoline fix in 5.12+ +fexit_test # bpf_prog_test_tracing missing +flow_dissector # bpf_link-based flow dissector is in 5.8+ +flow_dissector_reattach +for_each # v5.12+ +get_func_ip_test # v5.15+ +get_stack_raw_tp # exercising BPF verifier bug causing infinite loop +hash_large_key # v5.11+ +ima # v5.11+ +kfree_skb # 32-bit pointer arith in test_pkt_access +ksyms # __start_BTF has different name +kfunc_call # v5.13+ +link_pinning # bpf_link is missing +linked_vars # v5.13+ +load_bytes_relative # new functionality in 5.8 +lookup_and_delete # v5.14+ +map_init # per-CPU LRU missing +map_ptr # test uses BPF_MAP_TYPE_RINGBUF, added in 5.8 +metadata # v5.10+ +migrate_reuseport # v5.14+ +mmap # 5.5 kernel is too permissive with re-mmaping +modify_return # fmod_ret support is missing +module_attach # module BTF support missing (v5.11+) +netcnt +netns_cookie # v5.15+ +ns_current_pid_tgid # bpf_get_ns_current_pid_tgid() helper is missing +pe_preserve_elems # v5.10+ +perf_branches # bpf_read_branch_records() helper is missing +perf_link # v5.15+ +pkt_access # 32-bit pointer arith in test_pkt_access +probe_read_user_str # kernel bug with garbage bytes at the end +prog_run_xattr # 32-bit pointer arith in test_pkt_access +raw_tp_test_run # v5.10+ +recursion # v5.12+ +ringbuf # BPF_MAP_TYPE_RINGBUF is supported in 5.8+ + +# bug in verifier w/ tracking references +#reference_tracking/classifier/sk_lookup_success +reference_tracking + +select_reuseport # UDP support is missing +send_signal # bpf_send_signal_thread() helper is missing +sk_assign # bpf_sk_assign helper missing +sk_lookup # v5.9+ +sk_storage_tracing # missing bpf_sk_storage_get() helper +skb_ctx # ctx_{size, }_{in, out} in BPF_PROG_TEST_RUN is missing +skb_helpers # helpers added in 5.8+ +skeleton # creates too big ARRAY map +snprintf # v5.13+ +snprintf_btf # v5.10+ +sock_fields # v5.10+ +socket_cookie # v5.12+ +sockmap_basic # uses new socket fields, 5.8+ +sockmap_listen # no listen socket supportin SOCKMAP +sockopt/getsockopt: ignore >PAGE_SIZE optlen +sockopt/setsockopt: ignore >PAGE_SIZE optlen +sockopt_sk +sockopt_qos_to_cc # v5.15+ +stacktrace_build_id # v5.9+ +stack_var_off # v5.12+ +syscall # v5.14+ +task_local_storage # v5.12+ +task_pt_regs # v5.15+ +tcp_hdr_options # v5.10+, new TCP header options feature in BPF +tcpbpf_user # LINK_CREATE is missing +tc_redirect # v5.14+ +test_bpffs # v5.10+, 
new CONFIG_BPF_PRELOAD=y and CONFIG_BPF_PRELOAD_UMG=y|m +test_bprm_opts # v5.11+ +test_global_funcs # kernel doesn't support BTF linkage=global on FUNCs +test_local_storage # v5.10+ feature +test_lsm # no BPF_LSM support +test_overhead # no fmod_ret support +test_profiler # needs verifier logic improvements from v5.10+ +test_skb_pkt_end # v5.11+ +timer # v5.15+ +timer_mim # v5.15+ +trace_ext # v5.10+ +trace_printk # v5.14+ +trampoline_count # v5.12+ have lower allowed limits +udp_limit # no cgroup/sock_release BPF program type (5.9+) +varlen # verifier bug fixed in later kernels +vmlinux # hrtimer_nanosleep() signature changed incompatibly +xdp_adjust_tail # new XDP functionality added in 5.8 +xdp_attach # IFLA_XDP_EXPECTED_FD support is missing +xdp_bonding # v5.15+ +xdp_bpf2bpf # freplace is missing +xdp_context_test_run # v5.15+ +xdp_cpumap_attach # v5.9+ +xdp_devmap_attach # new feature in 5.8 +xdp_link # v5.9+ + +# SUBTESTS FAILING (block entire test until blocking subtests works properly) +btf # "size check test", "func (Non zero vlen)" +tailcalls # tailcall_bpf2bpf_1, tailcall_bpf2bpf_2, tailcall_bpf2bpf_3 diff --git a/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest new file mode 100644 index 0000000..1984883 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest @@ -0,0 +1,4 @@ +decap_sanity # weird failure with decap_sanity_ns netns already existing, TBD +bpf_nf/tc-bpf-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +bpf_nf/xdp-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +kprobe_multi_bench_attach # suspected to cause crashes in CI diff --git a/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x new file mode 100644 index 0000000..e97d7ba --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x @@ -0,0 +1,4 @@ +# TEMPORARY +sockmap_listen/sockhash VSOCK test_vsock_redir +usdt/basic # failing verifier due to bounds check after LLVM update +usdt/multispec # same as above diff --git a/third_party/bpftool/libbpf/ci/vmtest/helpers.sh b/third_party/bpftool/libbpf/ci/vmtest/helpers.sh new file mode 100755 index 0000000..c44d098 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/helpers.sh @@ -0,0 +1,38 @@ +# shellcheck shell=bash + +# $1 - start or end +# $2 - fold identifier, no spaces +# $3 - fold section description +foldable() { + local YELLOW='\033[1;33m' + local NOCOLOR='\033[0m' + if [ $1 = "start" ]; then + line="::group::$2" + if [ ! 
-z "${3:-}" ]; then + line="$line - ${YELLOW}$3${NOCOLOR}" + fi + else + line="::endgroup::" + fi + echo -e "$line" +} + +__print() { + local TITLE="" + if [[ -n $2 ]]; then + TITLE=" title=$2" + fi + echo "::$1${TITLE}::$3" +} + +# $1 - title +# $2 - message +print_error() { + __print error $1 $2 +} + +# $1 - title +# $2 - message +print_notice() { + __print notice $1 $2 +} diff --git a/third_party/bpftool/libbpf/ci/vmtest/run_selftests.sh b/third_party/bpftool/libbpf/ci/vmtest/run_selftests.sh new file mode 100755 index 0000000..0e5ac17 --- /dev/null +++ b/third_party/bpftool/libbpf/ci/vmtest/run_selftests.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +source $(cd $(dirname $0) && pwd)/helpers.sh + +ARCH=$(uname -m) + +STATUS_FILE=/exitstatus + +read_lists() { + (for path in "$@"; do + if [[ -s "$path" ]]; then + cat "$path" + fi; + done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ',' +} + +test_progs() { + if [[ "${KERNEL}" != '4.9.0' ]]; then + foldable start test_progs "Testing test_progs" + # "&& true" does not change the return code (it is not executed + # if the Python script fails), but it prevents exiting on a + # failure due to the "set -e". + ./test_progs ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true + echo "test_progs:$?" >> "${STATUS_FILE}" + foldable end test_progs + fi +} + +test_progs_no_alu32() { + foldable start test_progs-no_alu32 "Testing test_progs-no_alu32" + ./test_progs-no_alu32 ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true + echo "test_progs-no_alu32:$?" >> "${STATUS_FILE}" + foldable end test_progs-no_alu32 +} + +test_maps() { + if [[ "${KERNEL}" == 'latest' ]]; then + foldable start test_maps "Testing test_maps" + ./test_maps && true + echo "test_maps:$?" >> "${STATUS_FILE}" + foldable end test_maps + fi +} + +test_verifier() { + if [[ "${KERNEL}" == 'latest' ]]; then + foldable start test_verifier "Testing test_verifier" + ./test_verifier && true + echo "test_verifier:$?" >> "${STATUS_FILE}" + foldable end test_verifier + fi +} + +foldable end vm_init + +foldable start kernel_config "Kconfig" + +zcat /proc/config.gz + +foldable end kernel_config + + +configs_path=/${PROJECT_NAME}/selftests/bpf +local_configs_path=${PROJECT_NAME}/vmtest/configs +DENYLIST=$(read_lists \ + "$configs_path/DENYLIST" \ + "$configs_path/DENYLIST.${ARCH}" \ + "$local_configs_path/DENYLIST-${KERNEL}" \ + "$local_configs_path/DENYLIST-${KERNEL}.${ARCH}" \ +) +ALLOWLIST=$(read_lists \ + "$configs_path/ALLOWLIST" \ + "$configs_path/ALLOWLIST.${ARCH}" \ + "$local_configs_path/ALLOWLIST-${KERNEL}" \ + "$local_configs_path/ALLOWLIST-${KERNEL}.${ARCH}" \ +) + +echo "DENYLIST: ${DENYLIST}" +echo "ALLOWLIST: ${ALLOWLIST}" + +cd ${PROJECT_NAME}/selftests/bpf + +if [ $# -eq 0 ]; then + test_progs + test_progs_no_alu32 + # test_maps + test_verifier +else + for test_name in "$@"; do + "${test_name}" + done +fi diff --git a/third_party/bpftool/libbpf/docs/.gitignore b/third_party/bpftool/libbpf/docs/.gitignore new file mode 100644 index 0000000..dd08fae --- /dev/null +++ b/third_party/bpftool/libbpf/docs/.gitignore @@ -0,0 +1,2 @@ +sphinx/build +sphinx/doxygen/build \ No newline at end of file diff --git a/third_party/bpftool/libbpf/docs/api.rst b/third_party/bpftool/libbpf/docs/api.rst new file mode 100644 index 0000000..7a8e709 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/api.rst @@ -0,0 +1,93 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. _api: + +.. 
toctree:: Table of Contents + + +LIBBPF API +========== + +Error Handling +-------------- + +When libbpf is used in "libbpf 1.0 mode", API functions can return errors in one of two ways. + +You can set "libbpf 1.0" mode with the following line: + +.. code-block:: + + libbpf_set_strict_mode(LIBBPF_STRICT_DIRECT_ERRS | LIBBPF_STRICT_CLEAN_PTRS); + +If the function returns an error code directly, it uses 0 to indicate success +and a negative error code to indicate what caused the error. In this case the +error code should be checked directly from the return, you do not need to check +errno. + +For example: + +.. code-block:: + + err = some_libbpf_api_with_error_return(...); + if (err < 0) { + /* Handle error accordingly */ + } + +If the function returns a pointer, it will return NULL to indicate there was +an error. In this case errno should be checked for the error code. + +For example: + +.. code-block:: + + ptr = some_libbpf_api_returning_ptr(); + if (!ptr) { + /* note no minus sign for EINVAL and E2BIG below */ + if (errno == EINVAL) { + /* handle EINVAL error */ + } else if (errno == E2BIG) { + /* handle E2BIG error */ + } + } + +libbpf.h +-------- +.. doxygenfile:: libbpf.h + :project: libbpf + :sections: func define public-type enum + +bpf.h +----- +.. doxygenfile:: bpf.h + :project: libbpf + :sections: func define public-type enum + +btf.h +----- +.. doxygenfile:: btf.h + :project: libbpf + :sections: func define public-type enum + +xsk.h +----- +.. doxygenfile:: xsk.h + :project: libbpf + :sections: func define public-type enum + +bpf_tracing.h +------------- +.. doxygenfile:: bpf_tracing.h + :project: libbpf + :sections: func define public-type enum + +bpf_core_read.h +--------------- +.. doxygenfile:: bpf_core_read.h + :project: libbpf + :sections: func define public-type enum + +bpf_endian.h +------------ +.. doxygenfile:: bpf_endian.h + :project: libbpf + :sections: func define public-type enum diff --git a/third_party/bpftool/libbpf/docs/conf.py b/third_party/bpftool/libbpf/docs/conf.py new file mode 100644 index 0000000..1d8714e --- /dev/null +++ b/third_party/bpftool/libbpf/docs/conf.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import subprocess + +project = "libbpf" + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.imgmath', + 'sphinx.ext.todo', + 'breathe', +] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = [] + +read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' + +if read_the_docs_build: + subprocess.call('cd sphinx ; make clean', shell=True) + subprocess.call('cd sphinx/doxygen ; doxygen', shell=True) + +html_theme = 'sphinx_rtd_theme' + +breathe_projects = { "libbpf": "./sphinx/doxygen/build/xml/" } +breathe_default_project = "libbpf" +breathe_show_define_initializer = True +breathe_show_enumvalue_initializer = True diff --git a/third_party/bpftool/libbpf/docs/index.rst b/third_party/bpftool/libbpf/docs/index.rst new file mode 100644 index 0000000..7545a20 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/index.rst @@ -0,0 +1,33 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. _libbpf: + +====== +libbpf +====== + +If you are looking to develop BPF applications using the libbpf library, this +directory contains important documentation that you should read. + +To get started, it is recommended to begin with the :doc:`libbpf Overview +` document, which provides a high-level understanding of the +libbpf APIs and their usage. This will give you a solid foundation to start +exploring and utilizing the various features of libbpf to develop your BPF +applications. + +.. toctree:: + :maxdepth: 1 + + libbpf_overview + API Documentation + program_types + libbpf_naming_convention + libbpf_build + + +All general BPF questions, including kernel functionality, libbpf APIs and their +application, should be sent to bpf@vger.kernel.org mailing list. You can +`subscribe `_ to the mailing list +search its `archive `_. Please search the archive +before asking new questions. It may be that this was already addressed or +answered before. diff --git a/third_party/bpftool/libbpf/docs/libbpf_build.rst b/third_party/bpftool/libbpf/docs/libbpf_build.rst new file mode 100644 index 0000000..8e8c23e --- /dev/null +++ b/third_party/bpftool/libbpf/docs/libbpf_build.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +Building libbpf +=============== + +libelf and zlib are internal dependencies of libbpf and thus are required to link +against and must be installed on the system for applications to work. +pkg-config is used by default to find libelf, and the program called +can be overridden with PKG_CONFIG. + +If using pkg-config at build time is not desired, it can be disabled by +setting NO_PKG_CONFIG=1 when calling make. + +To build both static libbpf.a and shared libbpf.so: + +.. code-block:: bash + + $ cd src + $ make + +To build only static libbpf.a library in directory build/ and install them +together with libbpf headers in a staging directory root/: + +.. code-block:: bash + + $ cd src + $ mkdir build root + $ BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install + +To build both static libbpf.a and shared libbpf.so against a custom libelf +dependency installed in /build/root/ and install them together with libbpf +headers in a build directory /build/root/: + +.. code-block:: bash + + $ cd src + $ PKG_CONFIG_PATH=/build/root/lib64/pkgconfig DESTDIR=/build/root make \ No newline at end of file diff --git a/third_party/bpftool/libbpf/docs/libbpf_naming_convention.rst b/third_party/bpftool/libbpf/docs/libbpf_naming_convention.rst new file mode 100644 index 0000000..b5b41b6 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/libbpf_naming_convention.rst @@ -0,0 +1,193 @@ +.. 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +API naming convention +===================== + +libbpf API provides access to a few logically separated groups of +functions and types. Every group has its own naming convention +described here. It's recommended to follow these conventions whenever a +new function or type is added to keep libbpf API clean and consistent. + +All types and functions provided by libbpf API should have one of the +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``btf_dump_``, +``ring_buffer_``, ``perf_buffer_``. + +System call wrappers +-------------------- + +System call wrappers are simple wrappers for commands supported by +sys_bpf system call. These wrappers should go to ``bpf.h`` header file +and map one to one to corresponding commands. + +For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM`` +command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc. + +Objects +------- + +Another class of types and functions provided by libbpf API is "objects" +and functions to work with them. Objects are high-level abstractions +such as BPF program or BPF map. They're represented by corresponding +structures such as ``struct bpf_object``, ``struct bpf_program``, +``struct bpf_map``, etc. + +Structures are forward declared and access to their fields should be +provided via corresponding getters and setters rather than directly. + +These objects are associated with corresponding parts of ELF object that +contains compiled BPF programs. + +For example ``struct bpf_object`` represents ELF object itself created +from an ELF file or from a buffer, ``struct bpf_program`` represents a +program in ELF object and ``struct bpf_map`` is a map. + +Functions that work with an object have names built from object name, +double underscore and part that describes function purpose. + +For example ``bpf_object__open`` consists of the name of corresponding +object, ``bpf_object``, double underscore and ``open`` that defines the +purpose of the function to open ELF file and create ``bpf_object`` from +it. + +All objects and corresponding functions other than BTF related should go +to ``libbpf.h``. BTF types and functions should go to ``btf.h``. + +Auxiliary functions +------------------- + +Auxiliary functions and types that don't fit well in any of categories +described above should have ``libbpf_`` prefix, e.g. +``libbpf_get_error`` or ``libbpf_prog_type_by_name``. + +ABI +--- + +libbpf can be both linked statically or used as DSO. To avoid possible +conflicts with other libraries an application is linked with, all +non-static libbpf symbols should have one of the prefixes mentioned in +API documentation above. See API naming convention to choose the right +name for a new symbol. + +Symbol visibility +----------------- + +libbpf follow the model when all global symbols have visibility "hidden" +by default and to make a symbol visible it has to be explicitly +attributed with ``LIBBPF_API`` macro. For example: + +.. code-block:: c + + LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); + +This prevents from accidentally exporting a symbol, that is not supposed +to be a part of ABI what, in turn, improves both libbpf developer- and +user-experiences. + +ABI versioning +-------------- + +To make future ABI extensions possible libbpf ABI is versioned. +Versioning is implemented by ``libbpf.map`` version script that is +passed to linker. + +Version name is ``LIBBPF_`` prefix + three-component numeric version, +starting from ``0.0.1``. 
+ +Every time ABI is being changed, e.g. because a new symbol is added or +semantic of existing symbol is changed, ABI version should be bumped. +This bump in ABI version is at most once per kernel development cycle. + +For example, if current state of ``libbpf.map`` is: + +.. code-block:: none + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + +, and a new symbol ``bpf_func_c`` is being introduced, then +``libbpf.map`` should be changed like this: + +.. code-block:: none + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + LIBBPF_0.0.2 { + global: + bpf_func_c; + } LIBBPF_0.0.1; + +, where new version ``LIBBPF_0.0.2`` depends on the previous +``LIBBPF_0.0.1``. + +Format of version script and ways to handle ABI changes, including +incompatible ones, described in details in [1]. + +Stand-alone build +------------------- + +Under https://github.com/libbpf/libbpf there is a (semi-)automated +mirror of the mainline's version of libbpf for a stand-alone build. + +However, all changes to libbpf's code base must be upstreamed through +the mainline kernel tree. + + +API documentation convention +============================ + +The libbpf API is documented via comments above definitions in +header files. These comments can be rendered by doxygen and sphinx +for well organized html output. This section describes the +convention in which these comments should be formatted. + +Here is an example from btf.h: + +.. code-block:: c + + /** + * @brief **btf__new()** creates a new instance of a BTF object from the raw + * bytes of an ELF's BTF section + * @param data raw bytes + * @param size number of bytes passed in `data` + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ + +The comment must start with a block comment of the form '/\*\*'. + +The documentation always starts with a @brief directive. This line is a short +description about this API. It starts with the name of the API, denoted in bold +like so: **api_name**. Please include an open and close parenthesis if this is a +function. Follow with the short description of the API. A longer form description +can be added below the last directive, at the bottom of the comment. + +Parameters are denoted with the @param directive, there should be one for each +parameter. If this is a function with a non-void return, use the @return directive +to document it. + +License +------------------- + +libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. + +Links +------------------- + +[1] https://www.akkadia.org/drepper/dsohowto.pdf + (Chapter 3. Maintaining APIs and ABIs). diff --git a/third_party/bpftool/libbpf/docs/libbpf_overview.rst b/third_party/bpftool/libbpf/docs/libbpf_overview.rst new file mode 100644 index 0000000..f36a2d4 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/libbpf_overview.rst @@ -0,0 +1,228 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +libbpf Overview +=============== + +libbpf is a C-based library containing a BPF loader that takes compiled BPF +object files and prepares and loads them into the Linux kernel. 
libbpf takes the +heavy lifting of loading, verifying, and attaching BPF programs to various +kernel hooks, allowing BPF application developers to focus only on BPF program +correctness and performance. + +The following are the high-level features supported by libbpf: + +* Provides high-level and low-level APIs for user space programs to interact + with BPF programs. The low-level APIs wrap all the bpf system call + functionality, which is useful when users need more fine-grained control + over the interactions between user space and BPF programs. +* Provides overall support for the BPF object skeleton generated by bpftool. + The skeleton file simplifies the process for the user space programs to access + global variables and work with BPF programs. +* Provides BPF-side APIS, including BPF helper definitions, BPF maps support, + and tracing helpers, allowing developers to simplify BPF code writing. +* Supports BPF CO-RE mechanism, enabling BPF developers to write portable + BPF programs that can be compiled once and run across different kernel + versions. + +This document will delve into the above concepts in detail, providing a deeper +understanding of the capabilities and advantages of libbpf and how it can help +you develop BPF applications efficiently. + +BPF App Lifecycle and libbpf APIs +================================== + +A BPF application consists of one or more BPF programs (either cooperating or +completely independent), BPF maps, and global variables. The global +variables are shared between all BPF programs, which allows them to cooperate on +a common set of data. libbpf provides APIs that user space programs can use to +manipulate the BPF programs by triggering different phases of a BPF application +lifecycle. + +The following section provides a brief overview of each phase in the BPF life +cycle: + +* **Open phase**: In this phase, libbpf parses the BPF + object file and discovers BPF maps, BPF programs, and global variables. After + a BPF app is opened, user space apps can make additional adjustments + (setting BPF program types, if necessary; pre-setting initial values for + global variables, etc.) before all the entities are created and loaded. + +* **Load phase**: In the load phase, libbpf creates BPF + maps, resolves various relocations, and verifies and loads BPF programs into + the kernel. At this point, libbpf validates all the parts of a BPF application + and loads the BPF program into the kernel, but no BPF program has yet been + executed. After the load phase, it’s possible to set up the initial BPF map + state without racing with the BPF program code execution. + +* **Attachment phase**: In this phase, libbpf + attaches BPF programs to various BPF hook points (e.g., tracepoints, kprobes, + cgroup hooks, network packet processing pipeline, etc.). During this + phase, BPF programs perform useful work such as processing + packets, or updating BPF maps and global variables that can be read from user + space. + +* **Tear down phase**: In the tear down phase, + libbpf detaches BPF programs and unloads them from the kernel. BPF maps are + destroyed, and all the resources used by the BPF app are freed. + +BPF Object Skeleton File +======================== + +BPF skeleton is an alternative interface to libbpf APIs for working with BPF +objects. Skeleton code abstract away generic libbpf APIs to significantly +simplify code for manipulating BPF programs from user space. 
Skeleton code +includes a bytecode representation of the BPF object file, simplifying the +process of distributing your BPF code. With BPF bytecode embedded, there are no +extra files to deploy along with your application binary. + +You can generate the skeleton header file ``(.skel.h)`` for a specific object +file by passing the BPF object to the bpftool. The generated BPF skeleton +provides the following custom functions that correspond to the BPF lifecycle, +each of them prefixed with the specific object name: + +* ``__open()`` – creates and opens BPF application (```` stands for + the specific bpf object name) +* ``__load()`` – instantiates, loads,and verifies BPF application parts +* ``__attach()`` – attaches all auto-attachable BPF programs (it’s + optional, you can have more control by using libbpf APIs directly) +* ``__destroy()`` – detaches all BPF programs and + frees up all used resources + +Using the skeleton code is the recommended way to work with bpf programs. Keep +in mind, BPF skeleton provides access to the underlying BPF object, so whatever +was possible to do with generic libbpf APIs is still possible even when the BPF +skeleton is used. It's an additive convenience feature, with no syscalls, and no +cumbersome code. + +Other Advantages of Using Skeleton File +--------------------------------------- + +* BPF skeleton provides an interface for user space programs to work with BPF + global variables. The skeleton code memory maps global variables as a struct + into user space. The struct interface allows user space programs to initialize + BPF programs before the BPF load phase and fetch and update data from user + space afterward. + +* The ``skel.h`` file reflects the object file structure by listing out the + available maps, programs, etc. BPF skeleton provides direct access to all the + BPF maps and BPF programs as struct fields. This eliminates the need for + string-based lookups with ``bpf_object_find_map_by_name()`` and + ``bpf_object_find_program_by_name()`` APIs, reducing errors due to BPF source + code and user-space code getting out of sync. + +* The embedded bytecode representation of the object file ensures that the + skeleton and the BPF object file are always in sync. + +BPF Helpers +=========== + +libbpf provides BPF-side APIs that BPF programs can use to interact with the +system. The BPF helpers definition allows developers to use them in BPF code as +any other plain C function. For example, there are helper functions to print +debugging messages, get the time since the system was booted, interact with BPF +maps, manipulate network packets, etc. + +For a complete description of what the helpers do, the arguments they take, and +the return value, see the `bpf-helpers +`_ man page. + +BPF CO-RE (Compile Once – Run Everywhere) +========================================= + +BPF programs work in the kernel space and have access to kernel memory and data +structures. One limitation that BPF applications come across is the lack of +portability across different kernel versions and configurations. `BCC +`_ is one of the solutions for BPF +portability. However, it comes with runtime overhead and a large binary size +from embedding the compiler with the application. + +libbpf steps up the BPF program portability by supporting the BPF CO-RE concept. +BPF CO-RE brings together BTF type information, libbpf, and the compiler to +produce a single executable binary that you can run on multiple kernel versions +and configurations. 
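+
+As a quick illustration of the helpers described in the previous section, the
+sketch below (an example written for this overview, not code shipped with
+libbpf) calls ``bpf_get_current_pid_tgid()``, ``bpf_ktime_get_ns()`` and
+``bpf_printk()`` from a tracepoint program:
+
+.. code-block:: C
+
+    #include <linux/bpf.h>
+    #include <bpf/bpf_helpers.h>
+
+    char LICENSE[] SEC("license") = "GPL";
+
+    /* Attach to the sys_enter_write tracepoint; any tracepoint would do. */
+    SEC("tracepoint/syscalls/sys_enter_write")
+    int handle_write(void *ctx)
+    {
+        __u64 pid_tgid = bpf_get_current_pid_tgid(); /* current PID/TGID */
+        __u64 now = bpf_ktime_get_ns();              /* ns since boot */
+
+        bpf_printk("write() from pid %d at %llu ns",
+                   (int)(pid_tgid >> 32), now);
+        return 0;
+    }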
+ +To make BPF programs portable libbpf relies on the BTF type information of the +running kernel. Kernel also exposes this self-describing authoritative BTF +information through ``sysfs`` at ``/sys/kernel/btf/vmlinux``. + +You can generate the BTF information for the running kernel with the following +command: + +:: + + $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h + +The command generates a ``vmlinux.h`` header file with all kernel types +(:doc:`BTF types <../btf>`) that the running kernel uses. Including +``vmlinux.h`` in your BPF program eliminates dependency on system-wide kernel +headers. + +libbpf enables portability of BPF programs by looking at the BPF program’s +recorded BTF type and relocation information and matching them to BTF +information (vmlinux) provided by the running kernel. libbpf then resolves and +matches all the types and fields, and updates necessary offsets and other +relocatable data to ensure that BPF program’s logic functions correctly for a +specific kernel on the host. BPF CO-RE concept thus eliminates overhead +associated with BPF development and allows developers to write portable BPF +applications without modifications and runtime source code compilation on the +target machine. + +The following code snippet shows how to read the parent field of a kernel +``task_struct`` using BPF CO-RE and libbf. The basic helper to read a field in a +CO-RE relocatable manner is ``bpf_core_read(dst, sz, src)``, which will read +``sz`` bytes from the field referenced by ``src`` into the memory pointed to by +``dst``. + +.. code-block:: C + :emphasize-lines: 6 + + //... + struct task_struct *task = (void *)bpf_get_current_task(); + struct task_struct *parent_task; + int err; + + err = bpf_core_read(&parent_task, sizeof(void *), &task->parent); + if (err) { + /* handle error */ + } + + /* parent_task contains the value of task->parent pointer */ + +In the code snippet, we first get a pointer to the current ``task_struct`` using +``bpf_get_current_task()``. We then use ``bpf_core_read()`` to read the parent +field of task struct into the ``parent_task`` variable. ``bpf_core_read()`` is +just like ``bpf_probe_read_kernel()`` BPF helper, except it records information +about the field that should be relocated on the target kernel. i.e, if the +``parent`` field gets shifted to a different offset within +``struct task_struct`` due to some new field added in front of it, libbpf will +automatically adjust the actual offset to the proper value. + +Getting Started with libbpf +=========================== + +Check out the `libbpf-bootstrap `_ +repository with simple examples of using libbpf to build various BPF +applications. + +See also `libbpf API documentation +`_. + +libbpf and Rust +=============== + +If you are building BPF applications in Rust, it is recommended to use the +`Libbpf-rs `_ library instead of bindgen +bindings directly to libbpf. Libbpf-rs wraps libbpf functionality in +Rust-idiomatic interfaces and provides libbpf-cargo plugin to handle BPF code +compilation and skeleton generation. Using Libbpf-rs will make building user +space part of the BPF application easier. Note that the BPF program themselves +must still be written in plain C. 
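+
+To tie the phases together, here is a minimal user-space sketch of the
+skeleton-based lifecycle described earlier. It is an illustrative example only
+(not shipped with libbpf) and assumes a hypothetical BPF object named
+``minimal.bpf.o`` whose skeleton header ``minimal.skel.h`` was generated with
+``bpftool gen skeleton``:
+
+.. code-block:: C
+
+    #include <stdio.h>
+    #include "minimal.skel.h"   /* hypothetical skeleton generated by bpftool */
+
+    int main(void)
+    {
+        struct minimal_bpf *skel;
+        int err;
+
+        /* Open phase: parse the embedded object, adjust options if needed */
+        skel = minimal_bpf__open();
+        if (!skel)
+            return 1;
+
+        /* Load phase: create maps, resolve relocations, verify and load */
+        err = minimal_bpf__load(skel);
+        if (err)
+            goto cleanup;
+
+        /* Attach phase: hook all auto-attachable programs to their hooks */
+        err = minimal_bpf__attach(skel);
+        if (err)
+            goto cleanup;
+
+        fprintf(stderr, "BPF programs attached; hit Ctrl-C to exit\n");
+        /* ... read maps / global variables while the programs run ... */
+
+    cleanup:
+        /* Tear down phase: detach programs and free all resources */
+        minimal_bpf__destroy(skel);
+        return err != 0;
+    }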
+ +Additional Documentation +======================== + +* `Program types and ELF Sections `_ +* `API naming convention `_ +* `Building libbpf `_ +* `API documentation Convention `_ diff --git a/third_party/bpftool/libbpf/docs/program_types.rst b/third_party/bpftool/libbpf/docs/program_types.rst new file mode 100644 index 0000000..ad4d4d5 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/program_types.rst @@ -0,0 +1,203 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. _program_types_and_elf: + +Program Types and ELF Sections +============================== + +The table below lists the program types, their attach types where relevant and the ELF section +names supported by libbpf for them. The ELF section names follow these rules: + +- ``type`` is an exact match, e.g. ``SEC("socket")`` +- ``type+`` means it can be either exact ``SEC("type")`` or well-formed ``SEC("type/extras")`` + with a '``/``' separator between ``type`` and ``extras``. + +When ``extras`` are specified, they provide details of how to auto-attach the BPF program. The +format of ``extras`` depends on the program type, e.g. ``SEC("tracepoint//")`` +for tracepoints or ``SEC("usdt/::")`` for USDT probes. The extras are +described in more detail in the footnotes. + + ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| Program Type | Attach Type | ELF Section Name | Sleepable | ++===========================================+========================================+==================================+===========+ +| ``BPF_PROG_TYPE_CGROUP_DEVICE`` | ``BPF_CGROUP_DEVICE`` | ``cgroup/dev`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SKB`` | | ``cgroup/skb`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_EGRESS`` | ``cgroup_skb/egress`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_INGRESS`` | ``cgroup_skb/ingress`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_GETSOCKOPT`` | ``cgroup/getsockopt`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_SETSOCKOPT`` | ``cgroup/setsockopt`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_BIND`` | ``cgroup/bind4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_CONNECT`` | ``cgroup/connect4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_GETPEERNAME`` | ``cgroup/getpeername4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_GETSOCKNAME`` | ``cgroup/getsockname4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_BIND`` | ``cgroup/bind6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_CONNECT`` | ``cgroup/connect6`` | 
| ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_GETPEERNAME`` | ``cgroup/getpeername6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_GETSOCKNAME`` | ``cgroup/getsockname6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP4_RECVMSG`` | ``cgroup/recvmsg4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP4_SENDMSG`` | ``cgroup/sendmsg4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_POST_BIND`` | ``cgroup/post_bind6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock_create`` | | ++ + +----------------------------------+-----------+ +| | | ``cgroup/sock`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_SOCK_RELEASE`` | ``cgroup/sock_release`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SYSCTL`` | ``BPF_CGROUP_SYSCTL`` | ``cgroup/sysctl`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_EXT`` | | ``freplace+`` [#fentry]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_FLOW_DISSECTOR`` | ``BPF_FLOW_DISSECTOR`` | ``flow_dissector`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_KPROBE`` | | ``kprobe+`` [#kprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretprobe+`` [#kprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``ksyscall+`` [#ksyscall]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretsyscall+`` [#ksyscall]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe+`` [#uprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe.s+`` [#uprobe]_ | Yes | ++ + +----------------------------------+-----------+ +| | | ``uretprobe+`` [#uprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``uretprobe.s+`` [#uprobe]_ | Yes | ++ + +----------------------------------+-----------+ +| | | ``usdt+`` [#usdt]_ | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretprobe.multi+`` [#kpmulti]_ | | 
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_CGROUP`` | ``lsm_cgroup+`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_LSM_MAC`` | ``lsm+`` [#lsm]_ | | ++ + +----------------------------------+-----------+ +| | | ``lsm.s+`` [#lsm]_ | Yes | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_IN`` | | ``lwt_in`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_OUT`` | | ``lwt_out`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` | | ``lwt_seg6local`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | | ++ + +----------------------------------+-----------+ +| | | ``raw_tracepoint.w+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tp+`` [#rawtp]_ | | ++ + +----------------------------------+-----------+ +| | | ``raw_tracepoint+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | | ++ + +----------------------------------+-----------+ +| | | ``tc`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_MSG`` | ``BPF_SK_MSG_VERDICT`` | ``sk_msg`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT_OR_MIGRATE`` | ``sk_reuseport/migrate`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_REUSEPORT_SELECT`` | ``sk_reuseport`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| 
``BPF_PROG_TYPE_SK_SKB`` | | ``sk_skb`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_SKB_STREAM_PARSER`` | ``sk_skb/stream_parser`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_SKB_STREAM_VERDICT`` | ``sk_skb/stream_verdict`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SOCKET_FILTER`` | | ``socket`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tp+`` [#tp]_ | | ++ + +----------------------------------+-----------+ +| | | ``tracepoint+`` [#tp]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fmod_ret.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_FENTRY`` | ``fentry+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fentry.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_FEXIT`` | ``fexit+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fexit.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_ITER`` | ``iter+`` [#iter]_ | | ++ + +----------------------------------+-----------+ +| | | ``iter.s+`` [#iter]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_RAW_TP`` | ``tp_btf+`` [#fentry]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp.frags/cpumap`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp/cpumap`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_XDP_DEVMAP`` | ``xdp.frags/devmap`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp/devmap`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_XDP`` | ``xdp.frags`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ + + +.. rubric:: Footnotes + +.. [#fentry] The ``fentry`` attach format is ``fentry[.s]/``. +.. 
[#kprobe] The ``kprobe`` attach format is ``kprobe/[+]``. Valid + characters for ``function`` are ``a-zA-Z0-9_.`` and ``offset`` must be a valid + non-negative integer. +.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/``. +.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/:[+]``. +.. [#usdt] The ``usdt`` attach format is ``usdt/::``. +.. [#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/`` where ``pattern`` + supports ``*`` and ``?`` wildcards. Valid characters for pattern are + ``a-zA-Z0-9_.*?``. +.. [#lsm] The ``lsm`` attachment format is ``lsm[.s]/``. +.. [#rawtp] The ``raw_tp`` attach format is ``raw_tracepoint[.w]/``. +.. [#tp] The ``tracepoint`` attach format is ``tracepoint//``. +.. [#iter] The ``iter`` attach format is ``iter[.s]/``. diff --git a/third_party/bpftool/libbpf/docs/sphinx/Makefile b/third_party/bpftool/libbpf/docs/sphinx/Makefile new file mode 100644 index 0000000..5dc39c5 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/sphinx/Makefile @@ -0,0 +1,9 @@ +SPHINXBUILD ?= sphinx-build +SOURCEDIR = ../src +BUILDDIR = build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" + +%: + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" diff --git a/third_party/bpftool/libbpf/docs/sphinx/doxygen/Doxyfile b/third_party/bpftool/libbpf/docs/sphinx/doxygen/Doxyfile new file mode 100644 index 0000000..b04c115 --- /dev/null +++ b/third_party/bpftool/libbpf/docs/sphinx/doxygen/Doxyfile @@ -0,0 +1,277 @@ +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "libbpf" +PROJECT_NUMBER = +PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = ./build +CREATE_SUBDIRS = NO +ALLOW_UNICODE_NAMES = NO +OUTPUT_LANGUAGE = English +OUTPUT_TEXT_DIRECTION = None +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +JAVADOC_BANNER = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +PYTHON_DOCSTRING = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 4 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +OPTIMIZE_OUTPUT_SLICE = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +TOC_INCLUDE_HEADINGS = 5 +AUTOLINK_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +GROUP_NESTED_COMPOUNDS = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +LOOKUP_CACHE_SIZE = 0 +NUM_PROC_THREADS = 1 +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PRIV_VIRTUAL = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +RESOLVE_UNNAMED_PARAMS = YES +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +HIDE_COMPOUND_REFERENCE= NO +SHOW_INCLUDE_FILES = YES +SHOW_GROUPED_MEMB_INC = NO +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO +SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = 
+CITE_BIB_FILES = +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_AS_ERROR = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +INPUT = ../../../src +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.c \ + *.h +RECURSIVE = NO +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = ___* +EXAMPLE_PATH = +EXAMPLE_PATTERNS = * +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +USE_MDFILE_AS_MAINPAGE = YES +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +SOURCE_TOOLTIPS = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +ALPHABETICAL_INDEX = YES +IGNORE_PREFIX = +GENERATE_HTML = NO +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_EXTRA_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = NO +HTML_DYNAMIC_MENUS = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +HTML_FORMULA_FORMAT = png +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +FORMULA_MACROFILE = +USE_MATHJAX = NO +MATHJAX_FORMAT = HTML-CSS +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 +MATHJAX_EXTENSIONS = +MATHJAX_CODEFILE = +SEARCHENGINE = YES +SERVER_BASED_SEARCH = NO +EXTERNAL_SEARCH = NO +SEARCHENGINE_URL = +SEARCHDATA_FILE = searchdata.xml +EXTERNAL_SEARCH_ID = +EXTRA_SEARCH_MAPPINGS = +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = +MAKEINDEX_CMD_NAME = makeindex +LATEX_MAKEINDEX_CMD = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4 +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_FILES = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain +LATEX_TIMESTAMP = NO +LATEX_EMOJI_DIRECTORY = +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +RTF_SOURCE_CODE = NO +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_SUBDIR = +MAN_LINKS = NO +GENERATE_XML = YES +XML_OUTPUT = xml +XML_PROGRAMLISTING = YES +XML_NS_MEMB_FILE_SCOPE = NO +GENERATE_DOCBOOK = NO +DOCBOOK_OUTPUT = docbook +DOCBOOK_PROGRAMLISTING = NO +GENERATE_AUTOGEN_DEF = NO +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = YES +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = NO +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +EXTERNAL_PAGES = YES +CLASS_DIAGRAMS = YES +DIA_PATH = 
+HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = Helvetica +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +DOT_UML_DETAILS = NO +DOT_WRAP_THRESHOLD = 17 +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DIAFILE_DIRS = +PLANTUML_JAR_PATH = +PLANTUML_CFG_FILE = +PLANTUML_INCLUDE_PATH = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/third_party/bpftool/libbpf/docs/sphinx/requirements.txt b/third_party/bpftool/libbpf/docs/sphinx/requirements.txt new file mode 100644 index 0000000..188f51e --- /dev/null +++ b/third_party/bpftool/libbpf/docs/sphinx/requirements.txt @@ -0,0 +1 @@ +breathe \ No newline at end of file diff --git a/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer.c b/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer.c new file mode 100644 index 0000000..89286e2 --- /dev/null +++ b/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer.c @@ -0,0 +1,23 @@ +#include "libbpf.h" + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + return 0; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + struct bpf_object *obj = NULL; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + int err; + + libbpf_set_print(libbpf_print_fn); + + opts.object_name = "fuzz-object"; + obj = bpf_object__open_mem(data, size, &opts); + err = libbpf_get_error(obj); + if (err) + return 0; + + bpf_object__close(obj); + return 0; +} diff --git a/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip b/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip new file mode 100644 index 0000000..602b381 Binary files /dev/null and b/third_party/bpftool/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip differ diff --git a/third_party/bpftool/libbpf/include/asm/barrier.h b/third_party/bpftool/libbpf/include/asm/barrier.h new file mode 100644 index 0000000..1fc6aee --- /dev/null +++ b/third_party/bpftool/libbpf/include/asm/barrier.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#include + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/compiler.h b/third_party/bpftool/libbpf/include/linux/compiler.h new file mode 100644 index 0000000..26336dc --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/compiler.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_COMPILER_H +#define __LINUX_COMPILER_H + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#define barrier() asm volatile("" ::: "memory") + +#if defined(__x86_64__) + +# define smp_rmb() barrier() +# define smp_wmb() barrier() +# define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc") + +# define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + barrier(); \ + ___p; \ +}) + +#elif defined(__aarch64__) + +# define smp_rmb() asm volatile("dmb 
ishld" ::: "memory") +# define smp_wmb() asm volatile("dmb ishst" ::: "memory") +# define smp_mb() asm volatile("dmb ish" ::: "memory") + +#endif + +#ifndef smp_mb +# define smp_mb() __sync_synchronize() +#endif + +#ifndef smp_rmb +# define smp_rmb() smp_mb() +#endif + +#ifndef smp_wmb +# define smp_wmb() smp_mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + smp_mb(); \ + ___p; \ +}) +#endif + +#endif /* __LINUX_COMPILER_H */ diff --git a/third_party/bpftool/libbpf/include/linux/err.h b/third_party/bpftool/libbpf/include/linux/err.h new file mode 100644 index 0000000..1b1dafb --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/err.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_ERR_H +#define __LINUX_ERR_H + +#include +#include + +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * ERR_PTR(long error_) +{ + return (void *) error_; +} + +static inline long PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline bool IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline bool IS_ERR_OR_NULL(const void *ptr) +{ + return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long PTR_ERR_OR_ZERO(const void *ptr) +{ + return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; +} + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/filter.h b/third_party/bpftool/libbpf/include/linux/filter.h new file mode 100644 index 0000000..e7e3373 --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/filter.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_FILTER_H +#define __LINUX_FILTER_H + +#include + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((FUNC) - BPF_FUNC_unspec) }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_MOV64_REG(DST, SRC) \ + 
((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_LD_IMM64_RAW_FULL(DST, SRC, OFF1, OFF2, IMM1, IMM2) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF1, \ + .imm = IMM1 }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF2, \ + .imm = IMM2 }) + +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_FD, 0, 0, \ + MAP_FD, 0) + +#define BPF_LD_MAP_VALUE(DST, MAP_FD, VALUE_OFF) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \ + MAP_FD, VALUE_OFF) + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/kernel.h b/third_party/bpftool/libbpf/include/linux/kernel.h new file mode 100644 index 0000000..a4a7a9d --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/kernel.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_KERNEL_H +#define __LINUX_KERNEL_H + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#ifndef max +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) +#endif + +#ifndef min +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? 
_min1 : _min2; }) +#endif + +#ifndef roundup +#define roundup(x, y) ( \ +{ \ + const typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#endif + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/list.h b/third_party/bpftool/libbpf/include/linux/list.h new file mode 100644 index 0000000..fc91c34 --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/list.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_LIST_H +#define __LINUX_LIST_H + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) + + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. 
+ */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/overflow.h b/third_party/bpftool/libbpf/include/linux/overflow.h new file mode 100644 index 0000000..53d7580 --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/overflow.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#ifdef __GNUC__ +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 50100 +#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 +#endif +#endif + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. 
+ */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (__u64)__a * (__u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +#endif diff --git a/third_party/bpftool/libbpf/include/linux/ring_buffer.h b/third_party/bpftool/libbpf/include/linux/ring_buffer.h new file mode 100644 index 0000000..fc4677b --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/ring_buffer.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +static inline __u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ + return smp_load_acquire(&base->data_head); +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + __u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ diff --git a/third_party/bpftool/libbpf/include/linux/types.h b/third_party/bpftool/libbpf/include/linux/types.h new file mode 100644 index 0000000..b15252a --- /dev/null +++ b/third_party/bpftool/libbpf/include/linux/types.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_TYPES_H +#define __LINUX_TYPES_H + +#include +#include +#include + +#include + +#include +#include + +#define __bitwise__ +#define __bitwise __bitwise__ + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +#ifndef __aligned_u64 +# define __aligned_u64 __u64 __attribute__((aligned(8))) +#endif + +struct list_head { + struct list_head *next, *prev; +}; + +#endif diff --git a/third_party/bpftool/libbpf/include/uapi/linux/bpf.h b/third_party/bpftool/libbpf/include/uapi/linux/bpf.h new file mode 100644 index 0000000..60a9d59 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/bpf.h @@ -0,0 +1,7198 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +/* jmp encodings */ +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ +}; + +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; +}; + +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. 
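/* Illustrative sketch (editorial, not part of the vendored bpf.h): a raw
 * bpf(2) invocation following the *cmd* / *attr* / size calling convention
 * described in the preamble above. glibc ships no bpf() wrapper, so the
 * helper name sys_bpf() is an assumption made only for these sketches; the
 * later examples in this file reuse it together with these includes. */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	/* The kernel dispatches on cmd and reads size bytes of *attr. */
	return syscall(__NR_bpf, cmd, attr, size);
}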
Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). 
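/* Illustrative sketch (editorial, not part of the vendored bpf.h), reusing
 * the sys_bpf() helper from the preamble sketch: BPF_MAP_CREATE followed by
 * BPF_MAP_UPDATE_ELEM and BPF_MAP_LOOKUP_ELEM as documented above. Key and
 * value pointers travel in the __aligned_u64 fields of union bpf_attr. */
static int map_create_update_lookup(void)
{
	union bpf_attr attr;
	__u32 key = 0;
	__u64 value = 42, readback = 0;
	int map_fd;

	memset(&attr, 0, sizeof(attr));		/* unused fields must stay zero */
	attr.map_type    = BPF_MAP_TYPE_ARRAY;
	attr.key_size    = sizeof(key);
	attr.value_size  = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&value;
	attr.flags  = BPF_ANY;			/* create or update */
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&readback;
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}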
+ * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
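/* Illustrative sketch (editorial, not part of the vendored bpf.h), reusing
 * the sys_bpf() helper from the preamble sketch: BPF_PROG_ATTACH as
 * documented above. cgroup_fd is assumed to be an open fd for a cgroup v2
 * directory and prog_fd a loaded program of a matching cgroup type. */
static int attach_cgroup_ingress(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd     = cgroup_fd;
	attr.attach_bpf_fd = prog_fd;
	attr.attach_type   = BPF_CGROUP_INET_INGRESS;
	attr.attach_flags  = BPF_F_ALLOW_MULTI;	/* append, do not replace */

	return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}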
+ * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. 
+ * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. 
+ * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map + * types, the *flags* argument needs to be set to 0, but for other + * map types, it may be specified as: + * + * **BPF_F_LOCK** + * Look up and delete the value of a spin-locked map + * without returning the lock. This must be specified if + * the elements contain a spinlock. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * * **BPF_MAP_TYPE_HASH** + * * **BPF_MAP_TYPE_PERCPU_HASH** + * * **BPF_MAP_TYPE_LRU_HASH** + * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. 
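/* Illustrative sketch (editorial, not part of the vendored bpf.h), reusing
 * the sys_bpf() helper from the preamble sketch: BPF_MAP_FREEZE as
 * documented above. After a successful call the map can no longer be
 * written through the syscall interface, only by BPF programs. */
static int freeze_map(int map_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;		/* flags stay zero for this command */

	return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
}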
The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. 
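/* Illustrative sketch (editorial, not part of the vendored bpf.h), reusing
 * the sys_bpf() helper from the preamble sketch: BPF_MAP_UPDATE_BATCH as
 * documented above. *count is both input (elements supplied in keys/values)
 * and output (elements actually processed by the kernel). */
static int map_update_batch(int map_fd, __u32 *keys, __u64 *values, __u32 *count)
{
	union bpf_attr attr;
	int err;

	memset(&attr, 0, sizeof(attr));
	attr.batch.map_fd     = map_fd;
	attr.batch.keys       = (__u64)(unsigned long)keys;
	attr.batch.values     = (__u64)(unsigned long)values;
	attr.batch.count      = *count;
	attr.batch.elem_flags = BPF_ANY;	/* create or update each element */

	err = sys_bpf(BPF_MAP_UPDATE_BATCH, &attr, sizeof(attr));
	*count = attr.batch.count;		/* number of elements processed */
	return err;
}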
+ * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. 
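/* Illustrative sketch (editorial, not part of the vendored bpf.h), reusing
 * the sys_bpf() helper and includes from the preamble sketch:
 * BPF_ITER_CREATE as documented above. link_fd is assumed to be an iterator
 * link previously set up with BPF_LINK_CREATE; each read(2) on the returned
 * fd runs the iterator program and yields its formatted output. */
static int drain_iterator(int link_fd)
{
	union bpf_attr attr;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;

	iter_fd = sys_bpf(BPF_ITER_CREATE, &attr, sizeof(attr));
	if (iter_fd < 0)
		return -1;

	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		;				/* output is discarded in this sketch */

	close(iter_fd);
	return n < 0 ? -1 : 0;
}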
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, + BPF_PROG_RUN = BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. 
+ */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, +}; + +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, + BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + BPF_STRUCT_OPS, + BPF_NETFILTER, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, + + MAX_BPF_LINK_TYPE, +}; + +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No 
further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. 
However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX] + * insn[0].imm: map fd or fd_idx + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP + */ +#define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_IDX 5 + +/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE + * insn[0].imm: map fd or fd_idx + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ +#define BPF_PSEUDO_MAP_VALUE 2 +#define BPF_PSEUDO_MAP_IDX_VALUE 6 + +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
+ */ +#define BPF_PSEUDO_FUNC 4 + +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 + +/* flags for BPF_MAP_UPDATE_ELEM command */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; + +/* flags for BPF_MAP_CREATE command */ +enum { + BPF_F_NO_PREALLOC = (1U << 0), +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ + BPF_F_NO_COMMON_LRU = (1U << 1), +/* Specify numa node during map creation */ + BPF_F_NUMA_NODE = (1U << 2), + +/* Flags for accessing BPF object from syscall side. */ + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), + +/* Flag for stack_map, store build_id+offset instead of pointer */ + BPF_F_STACK_BUILD_ID = (1U << 5), + +/* Zero-initialize hash function seed. This should only be used for testing. */ + BPF_F_ZERO_SEED = (1U << 6), + +/* Flags for accessing BPF object from program side. */ + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), + +/* Clone map from listener for newly accepted socket */ + BPF_F_CLONE = (1U << 9), + +/* Enable memory-mapping BPF map */ + BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), +}; + +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are always returned 0. 
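A small user-space sketch of the BPF_MAP_CREATE and BPF_MAP_UPDATE_ELEM flags listed above, using libbpf's low-level wrappers; the map name, sizes and values are arbitrary.

#include <bpf/bpf.h>
#include <linux/bpf.h>

static int demo_map_flags(void)
{
    LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
    __u32 key = 1;
    __u64 val = 42;
    int fd;

    fd = bpf_map_create(BPF_MAP_TYPE_HASH, "demo_hash",
                        sizeof(key), sizeof(val), 128, &opts);
    if (fd < 0)
        return fd;

    /* BPF_NOEXIST: create only, fails if the key is already present */
    bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST);

    /* BPF_EXIST: update only, fails if the key is missing */
    val = 43;
    return bpf_map_update_elem(fd, &key, &val, BPF_EXIST);
}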
+ */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) + +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + +#define BPF_OBJ_NAME_LEN 16U + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + */ + __u64 map_extra; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). 
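Since this part of the header is all about union bpf_attr, here is a sketch of driving BPF_MAP_CREATE through the raw bpf(2) syscall, which is exactly what libbpf's wrappers fill in for you; the function and map names are placeholders.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_array_map(void)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type    = BPF_MAP_TYPE_ARRAY;
    attr.key_size    = sizeof(__u32);
    attr.value_size  = sizeof(__u64);
    attr.max_entries = 64;
    strncpy(attr.map_name, "demo_array", BPF_OBJ_NAME_LEN - 1);

    /* returns a new map fd on success, -1 with errno set on failure */
    return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}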
+ */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). 
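To see the BPF_PROG_LOAD fields above (insns, license, log_buf, log_level) in action, here is a sketch that loads the smallest accepted program, "r0 = 0; exit", through libbpf's low-level bpf_prog_load() wrapper and prints the verifier log on failure; program name and buffer size are arbitrary.

#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <stdio.h>

static int load_trivial_prog(void)
{
    struct bpf_insn insns[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0 }, /* r0 = 0 */
        { .code = BPF_JMP | BPF_EXIT },                                /* exit   */
    };
    static char log_buf[64 * 1024];
    LIBBPF_OPTS(bpf_prog_load_opts, opts,
                .log_buf = log_buf,
                .log_size = sizeof(log_buf),
                .log_level = 1);                 /* request a verifier log */
    int fd;

    fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "trivial", "GPL",
                       insns, sizeof(insns) / sizeof(insns[0]), &opts);
    if (fd < 0)
        fprintf(stderr, "verifier log:\n%s\n", log_buf);
    return fd;
}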
+ * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; + }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; + __u32 flags; /* extra flags */ + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; + } link_update; + + struct { + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + +} __attribute__((aligned(8))); + +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. 
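The link_create/link_update structs above are what libbpf's bpf_link_create() fills in; the sketch below attaches a cgroup program as a kernel-managed link instead of the flag-based BPF_PROG_ATTACH path shown earlier (the function name and attach type are again placeholders).

#include <bpf/bpf.h>
#include <fcntl.h>
#include <unistd.h>

int attach_as_link(int prog_fd, const char *cgroup_path)
{
    int link_fd, cg_fd = open(cgroup_path, O_RDONLY);

    if (cg_fd < 0)
        return -1;
    /* opts (NULL here) is where bpf_cookie, iter_info, etc. would go */
    link_fd = bpf_link_create(prog_fd, cg_fd, BPF_CGROUP_INET_INGRESS, NULL);
    close(cg_fd);
    return link_fd;   /* closing an unpinned link fd detaches the program */
}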
Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * Return + * Current *ktime*. + * + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/tracing/trace* from TraceFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. 
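On the BPF side, the three map helpers just described are typically combined into a lookup-then-update counter; a minimal sketch, assuming this patch's vmlinux.h/bpf_helpers.h setup (map and section names are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 8192);
    __type(key, u32);
    __type(value, u64);
} hits SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_openat")
int count_openat(void *ctx)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u64 init = 1, *val;

    val = bpf_map_lookup_elem(&hits, &pid);
    if (val)
        __sync_fetch_and_add(val, 1);                         /* existing entry */
    else
        bpf_map_update_elem(&hits, &pid, &init, BPF_NOEXIST); /* first hit */
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";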
+ * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. 
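bpf_get_prandom_u32() and bpf_get_smp_processor_id() are often combined for cheap event sampling; a tiny sketch (the tracepoint choice and 1% rate are arbitrary):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("tracepoint/syscalls/sys_enter_write")
int sample_writes(void *ctx)
{
    if (bpf_get_prandom_u32() % 100)
        return 0;                       /* keep roughly 1 event in 100 */

    bpf_printk("sampled write on CPU %u", bpf_get_smp_processor_id());
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";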
+ * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. 
This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + * + * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * long bpf_get_current_comm(void *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. + * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". 
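The tail-call mechanism described above needs a BPF_MAP_TYPE_PROG_ARRAY plus a dispatcher; a minimal sketch (section names and slot index are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
    __uint(max_entries, 4);
    __type(key, u32);
    __type(value, u32);
} jump_table SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int handler_slot_0(void *ctx)
{
    bpf_printk("tail-called handler ran");
    return 0;
}

SEC("tracepoint/syscalls/sys_enter_execve")
int dispatcher(void *ctx)
{
    /* on success this never returns; on failure we fall through */
    bpf_tail_call(ctx, &jump_table, 0);
    bpf_printk("tail call failed (empty slot or MAX_TAIL_CALL_CNT hit)");
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";

User space still has to store handler_slot_0's fd at index 0 (e.g. with bpf_map_update_elem()) before the jump can succeed.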
+ * Return + * The classid, or 0 for the default unconfigured classid. + * + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. 
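For the VLAN helpers above, a TC classifier sketch; ETH_P_8021Q and TC_ACT_OK are re-defined locally because vmlinux.h does not carry UAPI #defines, and VLAN ID 10 is arbitrary.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define ETH_P_8021Q 0x8100
#define TC_ACT_OK   0

SEC("tc")
int tag_egress(struct __sk_buff *skb)
{
    /* push an 802.1Q tag; bpf_skb_vlan_pop(skb) would undo it */
    bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 10);
    return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";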
+ * + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * + * long bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). 
Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. 
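A BPF-side sketch of bpf_perf_event_output() with the BPF_MAP_TYPE_PERF_EVENT_ARRAY it requires; the event layout and the do_unlinkat attach point (mirroring this patch's kfunc example) are illustrative. User space would consume the events with libbpf's perf_buffer API.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct event {
    u32 pid;
    char comm[16];
};

struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(u32));
    __uint(value_size, sizeof(u32));
} events SEC(".maps");

SEC("kprobe/do_unlinkat")
int notify_unlink(struct pt_regs *ctx)
{
    struct event e = {};

    e.pid = bpf_get_current_pid_tgid() >> 32;
    bpf_get_current_comm(&e.comm, sizeof(e.comm));

    /* BPF_F_CURRENT_CPU: write into the ring buffer of the current CPU */
    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";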
+ * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. 
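bpf_get_stackid() needs a BPF_MAP_TYPE_STACK_TRACE map sized by stack depth; a sketch attached as a perf_event program (MAX_STACK_DEPTH mirrors the UAPI PERF_MAX_STACK_DEPTH value of 127, which vmlinux.h does not export):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define MAX_STACK_DEPTH 127

struct {
    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
    __uint(max_entries, 1024);
    __uint(key_size, sizeof(u32));
    __uint(value_size, MAX_STACK_DEPTH * sizeof(u64));
} stacks SEC(".maps");

SEC("perf_event")
int on_sample(void *ctx)
{
    long id = bpf_get_stackid(ctx, &stacks, BPF_F_USER_STACK);

    if (id >= 0)
        bpf_printk("user stack id %ld", id);
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";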
+ * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. 
+ * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. + * + * u64 bpf_get_current_task(void) + * Description + * Get the current task. + * Return + * A pointer to the current task struct. + * + * long bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + * + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. 
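bpf_get_current_task() returns a raw task_struct pointer that is usually dereferenced with CO-RE probe reads; a sketch, again reusing the do_unlinkat attach point from this patch:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

SEC("kprobe/do_unlinkat")
int show_parent(void *ctx)
{
    struct task_struct *task = (struct task_struct *)bpf_get_current_task();
    pid_t ppid = BPF_CORE_READ(task, real_parent, tgid);

    bpf_printk("parent tgid = %d", ppid);
    return 0;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";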
And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. + * + * long bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. 
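The bpf_skb_pull_data() pattern just described, check / pull / re-check, looks roughly like this in a TC program (the 64-byte window and the local TC_ACT_OK define are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define TC_ACT_OK 0

SEC("tc")
int inspect(struct __sk_buff *skb)
{
    void *data     = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;

    if (data + 64 > data_end) {
        /* the bytes may simply live in the non-linear part: pull them in */
        if (bpf_skb_pull_data(skb, 64))
            return TC_ACT_OK;
        /* every pointer check must be redone after the call */
        data     = (void *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
        if (data + 64 > data_end)
            return TC_ACT_OK;
    }
    /* the first 64 bytes are now readable through data */
    return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";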
The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * Return + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** context. + * Return + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** context. + * Return + * A 8-byte long unique number. 
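A sketch of the bpf_xdp_adjust_head() / re-validate dance; moving data forward by 4 bytes here is purely illustrative (a real program would strip a known encapsulation header), the point being that the bounds checks are repeated after the call.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("xdp")
int strip_four_bytes(struct xdp_md *ctx)
{
    void *data, *data_end;

    /* a positive delta moves data towards data_end, dropping front bytes */
    if (bpf_xdp_adjust_head(ctx, 4))
        return XDP_ABORTED;

    data     = (void *)(long)ctx->data;
    data_end = (void *)(long)ctx->data_end;
    if (data + sizeof(struct ethhdr) > data_end)
        return XDP_DROP;

    return XDP_PASS;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";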
+ * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Description + * Get the owner UID of the socked associated to *skb*. + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + * + * long bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * Return + * 0 + * + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). 
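A sock_ops sketch of bpf_setsockopt() switching the congestion control of newly established connections; SOL_TCP (== IPPROTO_TCP) and TCP_CONGESTION are re-defined locally since vmlinux.h omits UAPI #defines, and "bbr" assumes that algorithm is available on the host.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#define SOL_TCP        6
#define TCP_CONGESTION 13

SEC("sockops")
int set_cc(struct bpf_sock_ops *skops)
{
    char cc[] = "bbr";

    if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
        skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
        bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
    return 1;
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";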
+ * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * Return + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + * + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. 
+ * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. 
In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. 
As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. 
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. 
The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. 
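+ *
+ *    A short sketch for **bpf_get_stack**\ () above (the probed symbol,
+ *    buffer depth and the use of **bpf_printk**\ () are illustrative only;
+ *    a real tool would ship the addresses to user space for symbolization):
+ *
+ *    ::
+ *
+ *        #include <linux/bpf.h>
+ *        #include <bpf/bpf_helpers.h>
+ *
+ *        #define MAX_DEPTH 32
+ *
+ *        char LICENSE[] SEC("license") = "GPL";
+ *
+ *        SEC("kprobe/do_nanosleep")
+ *        int capture_stack(void *ctx)
+ *        {
+ *            __u64 ips[MAX_DEPTH];
+ *            long ret = bpf_get_stack(ctx, ips, sizeof(ips), 0);
+ *
+ *            if (ret < 0)
+ *                return 0; // collection failed
+ *            // ret is a byte count: each stored frame is a u64, so
+ *            // ret / 8 kernel addresses are now available in ips[].
+ *            bpf_printk("captured %ld frames", ret / 8);
+ *            return 0;
+ *        }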
+ * + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. 
+ * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. 
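+ *
+ *    A minimal verdict-program sketch for **bpf_sk_redirect_hash**\ ()
+ *    documented earlier in this block (map sizing, the fixed key and the
+ *    section name are illustrative; user space still has to attach the
+ *    program to the sockhash and populate it with sockets):
+ *
+ *    ::
+ *
+ *        #include <linux/bpf.h>
+ *        #include <bpf/bpf_helpers.h>
+ *
+ *        struct {
+ *            __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ *            __uint(max_entries, 64);
+ *            __type(key, __u32);
+ *            __type(value, __u64);
+ *        } sock_hash SEC(".maps");
+ *
+ *        SEC("sk_skb/stream_verdict")
+ *        int redirect_ingress(struct __sk_buff *skb)
+ *        {
+ *            __u32 key = 0; // real programs derive the key from skb fields
+ *
+ *            // Deliver the data to the ingress queue of the socket stored
+ *            // at sock_hash[key]; SK_DROP is returned (and the data is
+ *            // dropped) if the lookup or the redirect fails.
+ *            return bpf_sk_redirect_hash(skb, &sock_hash, &key, BPF_F_INGRESS);
+ *        }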
+ * + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". 
+ * Return + * 0 + * + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + * + * void *bpf_get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * Return + * A pointer to the local storage area. + * + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. 
If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. 
+ * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_sk_release(void *sock) + * Description + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails); in these cases BPF programs will get an appropriate + * error and will need to handle it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid, either due to the *start* byte not being a valid part of the *msg* + * payload and/or the *len* value being too large. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows + * safely updating the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future).
+ * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * long bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. 
The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + * + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. 
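+ *
+ *    A compact sketch using **bpf_sysctl_get_name**\ () documented above
+ *    (the guarded sysctl and buffer size are illustrative; it assumes the
+ *    usual **BPF_PROG_TYPE_CGROUP_SYSCTL** convention that returning 0
+ *    rejects the access and 1 allows it):
+ *
+ *    ::
+ *
+ *        #include <linux/bpf.h>
+ *        #include <bpf/bpf_helpers.h>
+ *
+ *        char LICENSE[] SEC("license") = "GPL";
+ *
+ *        SEC("cgroup/sysctl")
+ *        int forbid_tcp_mem_writes(struct bpf_sysctl *ctx)
+ *        {
+ *            const char target[] = "tcp_mem";
+ *            char name[16];
+ *            int i, len;
+ *
+ *            // Copy only the base name ("tcp_mem", not "net/ipv4/tcp_mem").
+ *            len = bpf_sysctl_get_name(ctx, name, sizeof(name),
+ *                                      BPF_F_SYSCTL_BASE_NAME);
+ *            if (len < 0)
+ *                return 1; // cannot tell, allow
+ *            for (i = 0; i < sizeof(target); i++)
+ *                if (name[i] != target[i])
+ *                    return 1; // some other sysctl, allow
+ *            return ctx->write ? 0 : 1; // reject writes, allow reads
+ *        }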
+ * + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. 
+ * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + * + * long bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. 
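+ *
+ *    A minimal sketch for **bpf_send_signal**\ () documented earlier in
+ *    this block (the **perf_event** attachment and the signal number are
+ *    illustrative; the monitoring process is expected to open the sampling
+ *    event and install its own SIGUSR1 handler):
+ *
+ *    ::
+ *
+ *        #include <linux/bpf.h>
+ *        #include <bpf/bpf_helpers.h>
+ *
+ *        #define SIGUSR1 10 // usual Linux value; no libc headers here
+ *
+ *        char LICENSE[] SEC("license") = "GPL";
+ *
+ *        // Attached to a sampling perf event: ask the kernel to deliver
+ *        // SIGUSR1 to whichever process was running when the sample fired.
+ *        SEC("perf_event")
+ *        int notify_on_sample(void *ctx)
+ *        {
+ *            long err = bpf_send_signal(SIGUSR1);
+ *
+ *            if (err)
+ *                bpf_printk("bpf_send_signal: %ld", err);
+ *            return 0;
+ *        }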
+ * + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + * + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies + * + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. 
If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * Return + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. 
When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * Return + * Current *ktime*. + * + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. 
For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(void *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. 
+ * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. 
It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. 
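+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) combining **bpf_reserve_hdr_opt**\ () and
+ *		**bpf_store_hdr_opt**\ () from a sockops program. The option bytes
+ *		are hypothetical, and enabling the header-option callbacks via
+ *		**bpf_sock_ops_cb_flags_set**\ () is omitted here:
+ *
+ *		::
+ *
+ *			SEC("sockops")
+ *			int add_exp_option(struct bpf_sock_ops *skops)
+ *			{
+ *				// experimental kind 254, kind-length 4, 2-byte magic 0xeB9F
+ *				__u8 opt[4] = { 254, 4, 0xeB, 0x9F };
+ *
+ *				switch (skops->op) {
+ *				case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+ *					bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
+ *					break;
+ *				case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+ *					bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
+ *					break;
+ *				}
+ *				return 1;
+ *			}
+ *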
+ * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. 
+ * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. 
+ * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
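+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) combining **bpf_get_current_task_btf**\ () and
+ *		**bpf_task_storage_get**\ (), documented above. The LSM hook, map
+ *		name, and counter semantics are assumptions; BPF_PROG comes from
+ *		bpf_tracing.h:
+ *
+ *		::
+ *
+ *			struct {
+ *				__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ *				__uint(map_flags, BPF_F_NO_PREALLOC);
+ *				__type(key, int);
+ *				__type(value, __u64);
+ *			} exec_cnt SEC(".maps");
+ *
+ *			SEC("lsm/bprm_committed_creds")
+ *			int BPF_PROG(count_exec, struct linux_binprm *bprm)
+ *			{
+ *				struct task_struct *task = bpf_get_current_task_btf();
+ *				__u64 *cnt;
+ *
+ *				cnt = bpf_task_storage_get(&exec_cnt, task, 0,
+ *							   BPF_LOCAL_STORAGE_GET_F_CREATE);
+ *				if (cnt)
+ *					*cnt += 1;
+ *				return 0;
+ *			}
+ *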
+ *
+ * struct socket *bpf_sock_from_file(struct file *file)
+ *	Description
+ *		If the given file represents a socket, returns the associated
+ *		socket.
+ *	Return
+ *		A pointer to a struct socket on success or NULL if the file is
+ *		not a socket.
+ *
+ * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
+ *	Description
+ *		Check packet size against the MTU of a net device (based
+ *		on *ifindex*). This helper will likely be used in combination
+ *		with helpers that adjust/change the packet size.
+ *
+ *		The argument *len_diff* can be used for querying with a planned
+ *		size change. This allows checking the MTU prior to changing the
+ *		packet ctx. Providing a *len_diff* adjustment that is larger
+ *		than the actual packet size (resulting in a negative packet
+ *		size) will in principle not exceed the MTU, which is why it is
+ *		not considered a failure. Other BPF helpers are needed for
+ *		performing the planned size change; therefore the responsibility
+ *		for catching a negative packet size belongs in those helpers.
+ *
+ *		Specifying *ifindex* zero means the MTU check is performed
+ *		against the current net device. This is practical if the helper
+ *		is not used prior to a redirect.
+ *
+ *		On input, *mtu_len* must be a valid pointer, otherwise the
+ *		verifier will reject the BPF program. If the value of *mtu_len*
+ *		is initialized to zero, then the ctx packet size is used. When
+ *		a value is provided in *mtu_len* as input, it specifies the L3
+ *		length that the MTU check is done against. Remember that XDP
+ *		and TC lengths operate at L2, but this value is L3, as it
+ *		corresponds to the MTU and the IP-header tot_len values, which
+ *		are also L3 (similar behavior to bpf_fib_lookup).
+ *
+ *		The Linux kernel route table can configure MTUs on a more
+ *		specific per-route level, which is not provided by this helper.
+ *		For route-level MTU checks use the **bpf_fib_lookup**\ ()
+ *		helper.
+ *
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** for tc cls_act programs.
+ *
+ *		The *flags* argument can be a combination of one or more of the
+ *		following values:
+ *
+ *		**BPF_MTU_CHK_SEGS**
+ *			This flag only works for a *ctx* of type
+ *			**struct sk_buff**. If the packet context contains
+ *			extra packet segment buffers (often known as a GSO
+ *			skb), then the MTU check is harder to perform at this
+ *			point, because in the transmit path it is possible for
+ *			the skb to get re-segmented (depending on net device
+ *			features). This could still be an MTU violation, so
+ *			this flag enables performing the MTU check against
+ *			segments, with a different violation return code to
+ *			tell it apart. This check cannot use *len_diff*.
+ *
+ *		On return, the *mtu_len* pointer contains the MTU value of the
+ *		net device. Remember that the net device's configured MTU is
+ *		the L3 size, which is what is returned here, while XDP and TC
+ *		lengths operate at L2. The helper takes this into account for
+ *		you, but remember this when using the MTU value in your BPF
+ *		code.
+ *
+ *	Return
+ *		* 0 on success, with the MTU value populated in the *mtu_len*
+ *		  pointer.
+ * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. + * + * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) + * Description + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * Return + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + * + * long bpf_sys_close(u32 fd) + * Description + * Execute close syscall for given FD. + * Return + * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. 
+ * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). 
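+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) of the **bpf_timer_init**\ (),
+ *		**bpf_timer_set_callback**\ () and **bpf_timer_start**\ ()
+ *		sequence documented above. The map layout, attach point, and
+ *		one-second delay are assumptions:
+ *
+ *		::
+ *
+ *			struct map_elem {
+ *				struct bpf_timer timer;
+ *			};
+ *
+ *			struct {
+ *				__uint(type, BPF_MAP_TYPE_ARRAY);
+ *				__uint(max_entries, 1);
+ *				__type(key, int);
+ *				__type(value, struct map_elem);
+ *			} timer_map SEC(".maps");
+ *
+ *			static int timer_cb(void *map, int *key, struct map_elem *elem)
+ *			{
+ *				// runs later in soft-irq context
+ *				return 0;
+ *			}
+ *
+ *			SEC("kprobe/do_unlinkat")
+ *			int arm_timer(void *ctx)
+ *			{
+ *				int key = 0;
+ *				struct map_elem *elem = bpf_map_lookup_elem(&timer_map, &key);
+ *
+ *				if (!elem)
+ *					return 0;
+ *				bpf_timer_init(&elem->timer, &timer_map, 1); // clockid 1 == CLOCK_MONOTONIC
+ *				bpf_timer_set_callback(&elem->timer, timer_cb);
+ *				bpf_timer_start(&elem->timer, 1000000000ULL, 0); // fire once in one second
+ *				return 0;
+ *			}
+ *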
+ * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. + * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + * + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * Return + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. 
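+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) of **bpf_find_vma**\ () from a uprobe, looking
+ *		up the VMA that contains the probed user-space instruction. The
+ *		attach point is an assumption; PT_REGS_IP comes from bpf_tracing.h:
+ *
+ *		::
+ *
+ *			static long vma_cb(struct task_struct *task,
+ *					   struct vm_area_struct *vma, void *data)
+ *			{
+ *				__u64 *vm_start = data;
+ *
+ *				*vm_start = vma->vm_start;
+ *				return 0;
+ *			}
+ *
+ *			SEC("uprobe")
+ *			int find_vma_example(struct pt_regs *ctx)
+ *			{
+ *				struct task_struct *task = bpf_get_current_task_btf();
+ *				__u64 vm_start = 0; // callback_ctx must point to the stack
+ *
+ *				bpf_find_vma(task, PT_REGS_IP(ctx), vma_cb, &vm_start, 0);
+ *				return 0;
+ *			}
+ *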
+ * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * Return + * The number of argument registers of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * Return + * The BPF program's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * Return + * 0 on success, or a negative error in case of failure. 
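+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) of **bpf_loop**\ (), documented above. The XDP
+ *		attach point and the summing callback are assumptions:
+ *
+ *		::
+ *
+ *			struct loop_ctx {
+ *				__u64 sum;
+ *			};
+ *
+ *			static long sum_cb(__u32 index, void *ctx)
+ *			{
+ *				struct loop_ctx *lc = ctx;
+ *
+ *				lc->sum += index;
+ *				return 0; // 0 = continue, 1 = stop early
+ *			}
+ *
+ *			SEC("xdp")
+ *			int loop_example(struct xdp_md *ctx)
+ *			{
+ *				struct loop_ctx lc = {};
+ *
+ *				bpf_loop(10, sum_cb, &lc, 0); // &lc must point to the stack
+ *				return XDP_PASS;
+ *			}
+ *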
+ * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + * + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) + * Description + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. 
+ * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + * + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + * + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. 
They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. 
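+ *
+ *		A minimal sketch (editorial illustration, not part of the upstream
+ *		helper description) of the ring buffer dynptr flow documented
+ *		above, together with **bpf_ktime_get_tai_ns**\ (). The map size,
+ *		attach point, and payload are assumptions:
+ *
+ *		::
+ *
+ *			struct {
+ *				__uint(type, BPF_MAP_TYPE_RINGBUF);
+ *				__uint(max_entries, 4096);
+ *			} events SEC(".maps");
+ *
+ *			SEC("tracepoint/syscalls/sys_enter_execve")
+ *			int dynptr_example(void *ctx)
+ *			{
+ *				struct bpf_dynptr ptr;
+ *				__u64 ts = bpf_ktime_get_tai_ns();
+ *
+ *				// a matching submit or discard is required even if
+ *				// the reservation fails (the dynptr is then null)
+ *				bpf_ringbuf_reserve_dynptr(&events, sizeof(ts), 0, &ptr);
+ *				bpf_dynptr_write(&ptr, 0, &ts, sizeof(ts), 0);
+ *				bpf_ringbuf_submit_dynptr(&ptr, 0);
+ *				return 0;
+ *			}
+ *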
+ * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. 
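+ *
+ *		As a brief illustration of the two cgroup-storage helpers above, a
+ *		minimal sketch (assuming a **BPF_MAP_TYPE_CGRP_STORAGE** map named
+ *		*cgrp_map* holding a __u64 value, and a *cgrp* pointer taken from
+ *		the program context) might count events per cgroup like this:
+ *
+ *			__u64 *val;
+ *
+ *			val = bpf_cgrp_storage_get(&cgrp_map, cgrp, NULL,
+ *						   BPF_LOCAL_STORAGE_GET_F_CREATE);
+ *			if (val)
+ *				__sync_fetch_and_add(val, 1);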
+ */ +#define ___BPF_FUNC_MAPPER(FN, ctx...) \ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, 
##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + 
FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ + /* */ + +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, +enum bpf_func_id { + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; + +/* BPF_FUNC_l4_csum_replace flags. */ +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +enum { + BPF_F_INGRESS = (1ULL << 0), +}; + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; + +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), +/* flags used by BPF_FUNC_get_stackid only. */ + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), +/* flags used by BPF_FUNC_get_stack only. */ + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), +}; + +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, +/* BPF_FUNC_perf_event_output for sk_buff input context. */ + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; + +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + +/* BPF_FUNC_skb_adjust_room flags. 
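+ *
+ * A minimal usage sketch for these flags, assuming a TC program that prepends
+ * *len* bytes of outer IPv4/GRE headers (*skb* is the __sk_buff context):
+ *
+ *	bpf_skb_adjust_room(skb, len, BPF_ADJ_ROOM_MAC,
+ *			    BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
+ *			    BPF_F_ADJ_ROOM_ENCAP_L4_GRE);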
*/ +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), +}; + +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; + +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; + +/* BPF_FUNC__storage_get flags */ +enum { + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; + +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + +/* Mode for BPF_FUNC_skb_adjust_room helper. */ +enum bpf_adj_room_mode { + BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, +}; + +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, +}; + +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + +enum { + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ +}; + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; + + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... 
*/ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; + __u8 tstamp_type; + __u32 :24; /* Padding, future use. */ + __u64 hwtstamp; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; + __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; +}; + +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + __u16 ext; /* Padding, future use. */ + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; + __u32 mark; + __u32 priority; + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; + __s32 rx_queue_mapping; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. 
*/ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +}; + +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + +struct bpf_xdp_sock { + __u32 queue_id; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, + XDP_REDIRECT, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; + __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ +}; + +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
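+ *
+ * A minimal user-space sketch of configuring one entry, assuming a CPUMAP
+ * file descriptor *map_fd*, a CPU index *cpu*, an optional XDP program fd
+ * *prog_fd*, and libbpf's bpf_map_update_elem():
+ *
+ *	struct bpf_cpumap_val val = {
+ *		.qsize = 192,
+ *		.bpf_prog.fd = prog_fd,
+ *	};
+ *	bpf_map_update_elem(map_fd, &cpu, &val, 0);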
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +enum sk_action { + SK_DROP = 0, + SK_PASS, +}; + +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ +}; + +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. 
+ */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); +}; + +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ + __u64 netns_dev; + __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; + __u64 recursion_misses; + __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 btf_vmlinux_value_type_id; + __u64 netns_dev; + __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 :32; /* alignment pad */ + __u64 map_extra; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; +} __attribute__((aligned(8))); + +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ + union { + struct { + __u32 map_id; + } map; + }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; + }; + } iter; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; + struct { + __u32 ifindex; + } xdp; + struct { + __u32 map_id; + } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; +} __attribute__((aligned(8))); + +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
+ * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __bpf_md_ptr(struct bpf_sock *, sk); +}; + +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ + __u64 skb_hwtstamp; +}; + +/* Definitions for bpf_sock_ops_cb_flags */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. 
The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 
+ * Arg1: old_state + * Arg2: new_state + */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ +}; + +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. 
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ +}; + +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; + +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; + +struct bpf_cgroup_dev_ctx { + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; + __u32 major; + __u32 minor; +}; + +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), +}; + +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + +struct bpf_fib_lookup { + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + + /* output: MTU value */ + __u16 mtu_result; + }; + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ + + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; + }; + + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route + */ + union { + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. 
+ */ + __u32 tbid; + }; + + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; + +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; + __u32 flags; + __be32 flow_label; +}; + +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + +struct bpf_spin_lock { + __u32 val; +}; + +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_root { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ +}; + +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
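+ *
+ * A minimal selection sketch for such a program, assuming *ctx* is the
+ * struct bpf_sk_lookup pointer and *sk* is a referenced socket obtained from
+ * a sockmap lookup:
+ *
+ *	long err;
+ *
+ *	if (sk && ctx->local_port == 8080) {
+ *		err = bpf_sk_assign(ctx, sk, 0);
+ *		bpf_sk_release(sk);
+ *		return err ? SK_DROP : SK_PASS;
+ *	}
+ *	return SK_PASS;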
*/ +struct bpf_sk_lookup { + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ + __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ +}; + +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. + */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ +}; + +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). 
It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/bpf_common.h b/third_party/bpftool/libbpf/include/uapi/linux/bpf_common.h new file mode 100644 index 0000000..ee97668 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/bpf_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_BPF_COMMON_H__ +#define _UAPI__LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git 
a/third_party/bpftool/libbpf/include/uapi/linux/btf.h b/third_party/bpftool/libbpf/include/uapi/linux/btf.h new file mode 100644 index 0000000..ec1798b --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/btf.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x000fffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x00ffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union, enum, fwd and enum64 + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) + +enum { + BTF_KIND_UNKN = 0, /* Unknown */ + BTF_KIND_INT = 1, /* Integer */ + BTF_KIND_PTR = 2, /* Pointer */ + BTF_KIND_ARRAY = 3, /* Array */ + BTF_KIND_STRUCT = 4, /* Struct */ + BTF_KIND_UNION = 5, /* Union */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ + BTF_KIND_FWD = 7, /* Forward */ + BTF_KIND_TYPEDEF = 8, /* Typedef */ + BTF_KIND_VOLATILE = 9, /* Volatile */ + BTF_KIND_CONST = 10, /* Const */ + BTF_KIND_RESTRICT = 11, /* Restrict */ + BTF_KIND_FUNC = 12, /* Function */ + BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ + BTF_KIND_VAR = 14, /* Variable */ + BTF_KIND_DATASEC = 15, /* Section */ + BTF_KIND_FLOAT = 16, /* Floating point */ + BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ + + NR_BTF_KINDS, + BTF_KIND_MAX = NR_BTF_KINDS - 1, +}; + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). 
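+ *
+ * For example, "enum { A = 1, B = 2 }" would typically be encoded as one
+ * struct btf_type with BTF_INFO_KIND(info) == BTF_KIND_ENUM and
+ * BTF_INFO_VLEN(info) == 2, followed by two struct btf_enum entries
+ * describing A and B.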
+ */ +struct btf_enum { + __u32 name_off; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name_off; + __u32 type; + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; +}; + +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, +}; + +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + +/* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe + * additional information related to the tag applied location. + * If component_idx == -1, the tag is applied to a struct, union, + * variable or function. Otherwise, it is applied to a struct/union + * member or a func argument, and component_idx indicates which member + * or argument (0 ... vlen-1). + */ +struct btf_decl_tag { + __s32 component_idx; +}; + +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/fcntl.h b/third_party/bpftool/libbpf/include/uapi/linux/fcntl.h new file mode 100644 index 0000000..e8c07da --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/fcntl.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FCNTL_H +#define _UAPI_LINUX_FCNTL_H + +#include +#include + +#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) +#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) + +/* + * Cancel a blocking posix lock; internal use only until we expose an + * asynchronous lock api to userspace: + */ +#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) + +/* Create a file descriptor with FD_CLOEXEC set. */ +#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) + +/* + * Request nofications on a directory. + * See below for events that may be notified. 
+ */ +#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) + +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ +/* (1U << 31) is reserved for signed error codes */ + +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + +/* + * Types of directory notifications that may be requested. + */ +#define DN_ACCESS 0x00000001 /* File accessed */ +#define DN_MODIFY 0x00000002 /* File modified */ +#define DN_CREATE 0x00000004 /* File created */ +#define DN_DELETE 0x00000008 /* File removed */ +#define DN_RENAME 0x00000010 /* File renamed */ +#define DN_ATTRIB 0x00000020 /* File changed attibutes */ +#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ + +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ +#define AT_FDCWD -100 /* Special value used to indicate + openat should use the current + working directory. */ +#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ +#define AT_REMOVEDIR 0x200 /* Remove directory instead of + unlinking file. */ +#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. 
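A small usage sketch for the AT_* constants above (not part of the header): unlinkat(2) takes AT_FDCWD to resolve a relative path against the current working directory, and AT_REMOVEDIR to remove a directory instead of a file. The paths are placeholders; removing a file this way is also a convenient trigger for the kernel's unlink path when testing tracing programs.

#include <fcntl.h>   /* AT_FDCWD, AT_REMOVEDIR */
#include <stdio.h>
#include <unistd.h>  /* unlinkat() */

int main(void)
{
        /* remove a regular file, resolved relative to the CWD */
        if (unlinkat(AT_FDCWD, "some_file", 0) != 0)            /* placeholder name */
                perror("unlinkat file");

        /* the same call with AT_REMOVEDIR removes a directory instead */
        if (unlinkat(AT_FDCWD, "some_dir", AT_REMOVEDIR) != 0)  /* placeholder name */
                perror("unlinkat dir");

        return 0;
}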
*/ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ + +#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ +#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ +#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ +#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ + +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + +#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/if_link.h b/third_party/bpftool/libbpf/include/uapi/linux/if_link.h new file mode 100644 index 0000000..39e659c --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/if_link.h @@ -0,0 +1,1284 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include +#include + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; + __u32 collisions; + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; +}; + +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. 
+ * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. 
+ * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + */ +struct rtnl_link_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + __u64 rx_nohandler; +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. 
+ * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + __IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. 
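An illustrative walk over the IFLA_* attributes of a link message, using the IFLA_RTA()/IFLA_PAYLOAD() helpers above (not part of the header). It assumes <linux/rtnetlink.h> for struct ifinfomsg, struct rtattr and the RTA_* iteration macros, and that nlh points at an RTM_NEWLINK message already read from a NETLINK_ROUTE socket.

#include <stdio.h>
#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>  /* struct ifinfomsg, struct rtattr, RTA_* */

/* Print the interface name carried in an RTM_NEWLINK message. */
static void print_ifname(struct nlmsghdr *nlh)
{
        struct ifinfomsg *ifi = NLMSG_DATA(nlh);
        struct rtattr *rta = IFLA_RTA(ifi);
        int len = IFLA_PAYLOAD(nlh);

        for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
                if (rta->rta_type == IFLA_IFNAME) {
                        printf("ifindex %d is %s\n", ifi->ifi_index,
                               (char *)RTA_DATA(rta));
                        return;
                }
        }
}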
tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? */ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority */ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + 
IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; +}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + +/* 
IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* VXLAN section */ +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, + IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + 
IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * [IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] 
+ * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... + * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + IFLA_HSR_SLAVE2, + IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. 
+ * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
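A brief sketch of how the XDP_FLAGS_* values above are typically used when attaching a program (not part of the header): it assumes libbpf's bpf_xdp_attach(), available in recent libbpf releases, and an already-loaded program whose fd and target ifindex are supplied by the caller.

#include <stdio.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>   /* XDP_FLAGS_* */

/* Attach an already-loaded XDP program in generic (skb) mode, failing
 * instead of silently replacing a program that is already attached. */
static int attach_xdp_generic(int ifindex, int prog_fd)
{
        __u32 flags = XDP_FLAGS_SKB_MODE | XDP_FLAGS_UPDATE_IF_NOEXIST;
        int err = bpf_xdp_attach(ifindex, prog_fd, flags, NULL);

        if (err)
                fprintf(stderr, "bpf_xdp_attach failed: %d\n", err);
        return err;
}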
arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/if_xdp.h b/third_party/bpftool/libbpf/include/uapi/linux/if_xdp.h new file mode 100644 index 0000000..a78a809 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/if_xdp.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ +/* If this option is set, the driver might go sleep and in that case + * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be + * set. If it is set, the application need to explicitly wake up the + * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are + * running the driver and the application on the same core, you should + * use this option so that the kernel will yield to the user space + * application. 
+ */ +#define XDP_USE_NEED_WAKEUP (1 << 3) + +/* Flags for xsk_umem_config flags */ +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +/* XDP_RING flags */ +#define XDP_RING_NEED_WAKEUP (1 << 0) + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; + __u64 flags; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for other reasons */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ +}; + +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Masks for unaligned chunks mode */ +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 +#define XSK_UNALIGNED_BUF_ADDR_MASK \ + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +#endif /* _LINUX_IF_XDP_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/netdev.h b/third_party/bpftool/libbpf/include/uapi/linux/netdev.h new file mode 100644 index 0000000..639524b --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/netdev.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NETDEV_H +#define _UAPI_LINUX_NETDEV_H + +#define NETDEV_FAMILY_NAME "netdev" +#define NETDEV_FAMILY_VERSION 1 + +/** + * enum netdev_xdp_act + * @NETDEV_XDP_ACT_BASIC: XDP feautues set supported by all drivers + * (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX) + * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT + * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements + * ndo_xdp_xmit callback. + * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP + * in zero copy mode. + * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw + * offloading. + * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear + * XDP buffer support in the driver napi callback. + * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements + * non-linear XDP buffer support in ndo_xdp_xmit callback. 
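A condensed sketch of bringing up an AF_XDP socket with the structures and socket options above (not part of the header). It assumes AF_XDP and SOL_XDP are exposed by <sys/socket.h> on a recent glibc, that umem_area is a page-aligned buffer mmap'ed beforehand, and it trims most error handling; real code would go on to mmap the rings at the offsets reported by XDP_MMAP_OFFSETS.

#include <stdio.h>
#include <sys/socket.h>      /* AF_XDP, SOL_XDP on recent glibc */
#include <linux/if_xdp.h>

/* Register a UMEM and bind the socket to one rx queue of an interface. */
static int xsk_bring_up(void *umem_area, __u64 umem_size,
                        int ifindex, __u32 queue_id)
{
        int fd = socket(AF_XDP, SOCK_RAW, 0);
        int ring_sz = 2048;                      /* descriptors per ring */
        struct xdp_umem_reg reg = {
                .addr = (__u64)(unsigned long)umem_area,
                .len = umem_size,
                .chunk_size = 2048,
                .headroom = 0,
        };
        struct xdp_mmap_offsets off;
        socklen_t optlen = sizeof(off);
        struct sockaddr_xdp sxdp = {
                .sxdp_family = AF_XDP,
                .sxdp_ifindex = ifindex,
                .sxdp_queue_id = queue_id,
                .sxdp_flags = XDP_USE_NEED_WAKEUP,
        };

        if (fd < 0)
                return -1;

        /* rings must be sized before bind() */
        setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &reg, sizeof(reg));
        setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_sz, sizeof(ring_sz));
        setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_sz, sizeof(ring_sz));
        setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_sz, sizeof(ring_sz));

        /* the kernel reports where each ring lives for a later mmap() */
        getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen);

        if (bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0) {
                perror("bind(AF_XDP)");
                return -1;
        }
        return fd;
}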
+ */ +enum netdev_xdp_act { + NETDEV_XDP_ACT_BASIC = 1, + NETDEV_XDP_ACT_REDIRECT = 2, + NETDEV_XDP_ACT_NDO_XMIT = 4, + NETDEV_XDP_ACT_XSK_ZEROCOPY = 8, + NETDEV_XDP_ACT_HW_OFFLOAD = 16, + NETDEV_XDP_ACT_RX_SG = 32, + NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + NETDEV_XDP_ACT_MASK = 127, +}; + +enum { + NETDEV_A_DEV_IFINDEX = 1, + NETDEV_A_DEV_PAD, + NETDEV_A_DEV_XDP_FEATURES, + + __NETDEV_A_DEV_MAX, + NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) +}; + +enum { + NETDEV_CMD_DEV_GET = 1, + NETDEV_CMD_DEV_ADD_NTF, + NETDEV_CMD_DEV_DEL_NTF, + NETDEV_CMD_DEV_CHANGE_NTF, + + __NETDEV_CMD_MAX, + NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) +}; + +#define NETDEV_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_NETDEV_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/netlink.h b/third_party/bpftool/libbpf/include/uapi/linux/netlink.h new file mode 100644 index 0000000..0a4d733 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/netlink.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include +#include /* for __kernel_sa_family_t */ +#include + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Sending process port ID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. 
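A tiny sketch of testing the NETDEV_XDP_ACT_* bits above (not part of the header). The feats value is a placeholder; in practice it would come from the NETDEV_A_DEV_XDP_FEATURES attribute of a NETDEV_CMD_DEV_GET reply over the "netdev" generic netlink family, or from a library that wraps that query.

#include <stdio.h>
#include <linux/types.h>
#include <linux/netdev.h>

/* Report what an interface's XDP feature mask allows. */
static void report_xdp_features(__u64 feats)
{
        if (feats & NETDEV_XDP_ACT_BASIC)
                puts("native XDP_PASS/XDP_DROP/XDP_ABORTED/XDP_TX");
        if (feats & NETDEV_XDP_ACT_REDIRECT)
                puts("XDP_REDIRECT supported");
        if (feats & NETDEV_XDP_ACT_XSK_ZEROCOPY)
                puts("AF_XDP zero-copy supported");
        if (feats & NETDEV_XDP_ACT_NDO_XMIT)
                puts("can be a target of XDP_REDIRECT");
}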
*/ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. 
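The NLMSG_* macros above are meant to be chained into the canonical receive loop; a sketch follows (not part of the header), assuming buf/len hold the bytes returned by one recv() on a netlink socket.

#include <stdio.h>
#include <linux/netlink.h>

/* Walk every message in one received buffer.  Returns 0 when the dump
 * is complete, -1 if the kernel reported an error. */
static int drain_netlink(char *buf, int len)
{
        struct nlmsghdr *nlh;

        for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
             nlh = NLMSG_NEXT(nlh, len)) {
                if (nlh->nlmsg_type == NLMSG_DONE)
                        return 0;
                if (nlh->nlmsg_type == NLMSG_ERROR) {
                        struct nlmsgerr *err = NLMSG_DATA(nlh);

                        fprintf(stderr, "netlink error %d\n", err->error);
                        return -1;
                }
                /* the payload starts at NLMSG_DATA(nlh) and is
                 * nlh->nlmsg_len - NLMSG_HDRLEN bytes; dispatch on
                 * nlmsg_type here */
        }
        return 0;
}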
*/ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. + */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. 
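A sketch of walking a run of attribute TLVs with the NLA_* helpers above (not part of the header); attrs/len are assumed to point at the attribute area of an already-validated message, i.e. just past the family-specific header.

#include <stdio.h>
#include <linux/netlink.h>

/* Iterate struct nlattr TLVs: every attribute is NLA_ALIGN()ed and the
 * flag bits must be masked off nla_type before using it as a type. */
static void walk_attrs(void *attrs, int len)
{
        struct nlattr *nla = attrs;

        while (len >= NLA_HDRLEN && nla->nla_len >= NLA_HDRLEN &&
               nla->nla_len <= len) {
                int type = nla->nla_type & NLA_TYPE_MASK;

                /* payload: (char *)nla + NLA_HDRLEN, nla_len - NLA_HDRLEN bytes */
                printf("attr type %d, %d payload bytes%s\n",
                       type, nla->nla_len - NLA_HDRLEN,
                       (nla->nla_type & NLA_F_NESTED) ? " (nested)" : "");

                len -= NLA_ALIGN(nla->nla_len);
                nla = (struct nlattr *)((char *)nla + NLA_ALIGN(nla->nla_len));
        }
}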
+ * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/openat2.h b/third_party/bpftool/libbpf/include/uapi/linux/openat2.h new file mode 100644 index 0000000..a5feb76 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/openat2.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ + +#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/perf_event.h b/third_party/bpftool/libbpf/include/uapi/linux/perf_event.h new file mode 100644 index 0000000..3767543 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/perf_event.h @@ -0,0 +1,1448 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. 
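A sketch of invoking openat2(2) with the structure and RESOLVE_* flags above (not part of the header). glibc provides no wrapper, so the raw syscall number is used; SYS_openat2 is only defined on kernels/libcs that know the call (v5.6+), and the relative path is a placeholder.

#include <fcntl.h>        /* O_RDONLY, AT_FDCWD */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>  /* SYS_openat2 */
#include <unistd.h>
#include <linux/openat2.h>

int main(void)
{
        struct open_how how;
        long fd;

        memset(&how, 0, sizeof(how));     /* unused fields must be zero */
        how.flags = O_RDONLY;
        how.resolve = RESOLVE_BENEATH;    /* refuse "..", absolute paths, escapes */

        /* placeholder path, resolved relative to (and confined below) the CWD */
        fd = syscall(SYS_openat2, AT_FDCWD, "some/file", &how, sizeof(how));
        if (fd < 0)
                perror("openat2");
        else
                close((int)fd);
        return 0;
}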
+ * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _UAPI_LINUX_PERF_EVENT_H +#define _UAPI_LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, + PERF_COUNT_HW_REF_CPU_CYCLES = 9, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + PERF_COUNT_HW_CACHE_NODE = 6, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, + PERF_COUNT_SW_EMULATION_FAULTS = 8, + PERF_COUNT_SW_DUMMY = 9, + PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. 
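A minimal sketch of programming one of the generalized hardware events above (not part of the header). There is no glibc wrapper for perf_event_open(2), so the raw syscall is used; this counts user-space instructions of the calling thread only, and the workload in the middle is left as a placeholder.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;

        /* measure this thread (pid = 0) on any CPU (cpu = -1), no group */
        fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* ... workload to measure goes here ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
                printf("instructions: %lld\n", count);
        close(fd);
        return 0;
}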
+ */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, + PERF_SAMPLE_REGS_USER = 1U << 12, + PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, + PERF_SAMPLE_TRANSACTION = 1U << 17, + PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ +}; + +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ + PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ + + PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ + PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << 
PERF_SAMPLE_BRANCH_COND_SHIFT, + + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_MAX, +}; + +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + +/* + * Values to determine ABI of the registers dump. + */ +enum perf_sample_regs_abi { + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, +}; + +/* + * Values for the memory transaction event qualifier, mostly for + * abort events. Multiple bits can be set. 
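
A hedged sketch of how the branch-sampling bits above combine in practice: PERF_SAMPLE_BRANCH_STACK is requested in sample_type, and the priv level plus branch filter go into branch_sample_type (the perf_event_attr fields appear further down in this header; the period is an arbitrary example value):

#include <linux/perf_event.h>
#include <string.h>

/* Fill only the fields relevant to LBR-style branch-stack sampling. */
static void setup_branch_sampling(struct perf_event_attr *attr)
{
    memset(attr, 0, sizeof(*attr));
    attr->type = PERF_TYPE_HARDWARE;
    attr->size = sizeof(*attr);
    attr->config = PERF_COUNT_HW_CPU_CYCLES;
    attr->sample_period = 100000;              /* example period */
    attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
                        PERF_SAMPLE_BRANCH_STACK;
    /* Branch priv level is checked on its own, independent of exclude_*. */
    attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
                               PERF_SAMPLE_BRANCH_ANY;
}
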
+ */ +enum { + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + + PERF_TXN_MAX = (1 << 8), /* non-ABI */ + + /* bits 32..63 are reserved for the abort code */ + + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, + + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ +#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ + /* add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + * + * @sample_max_stack: Max number of frame pointers in a callchain, + * should be < /proc/sys/kernel/perf_event_max_stack + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. 
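
The read_format layout documented above can be decoded directly in user space; a sketch for a group leader opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID (the fixed-size buffer and missing error handling are simplifications):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct group_read {            /* matches the PERF_FORMAT_GROUP branch above */
    uint64_t nr;
    struct { uint64_t value, id; } cntr[];
};

static void dump_group(int group_fd)
{
    uint64_t buf[512];
    struct group_read *gr = (struct group_read *)buf;

    if (read(group_fd, buf, sizeof(buf)) <= 0)
        return;
    for (uint64_t i = 0; i < gr->nr; i++)
        printf("id %llu = %llu\n",
               (unsigned long long)gr->cntr[i].id,
               (unsigned long long)gr->cntr[i].value);
}
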
+ */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + /* + * precise_ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ + + exclude_host : 1, /* don't count in host */ + exclude_guest : 1, /* don't count in guest */ + + exclude_callchain_kernel : 1, /* exclude kernel callchains */ + exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ + comm_exec : 1, /* flag comm events that are due to an exec */ + use_clockid : 1, /* use @clockid for time fields */ + context_switch : 1, /* context switch data */ + write_backward : 1, /* Write ring buffer from end to beginning */ + namespaces : 1, /* include namespaces data */ + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + aux_output : 1, /* generate AUX records instead of events */ + cgroup : 1, /* include cgroup events */ + text_poke : 1, /* include text poke events */ + build_id : 1, /* use build id in mmap2 events */ + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + + __u32 bp_type; + union { + __u64 bp_addr; + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ + __u64 config2; /* extension of config1 */ + }; + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + + /* + * Defines set of user regs to dump on samples. + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_user; + + /* + * Defines size of the user stack to dump on samples. + */ + __u32 sample_stack_user; + + __s32 clockid; + /* + * Defines set of regs to dump for each sample + * state captured on: + * - precise = 0: PMU interrupt + * - precise > 0: sampled instruction + * + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u16 sample_max_stack; + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. 
+ * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. + */ + __u64 sig_data; + + __u64 config3; /* extension of config2 */ +}; + +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. + */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[]; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq, time_mult, time_shift, index, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; + * + * do { + * seq = pc->lock; + * barrier() + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * index = pc->index; + * count = pc->offset; + * if (pc->cap_user_rdpmc && index) { + * width = pc->pmc_width; + * pmc = rdpmc(index - 1); + * } + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + union { + __u64 capabilities; + struct { + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; + }; + }; + + /* + * If cap_user_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. 
This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & (((u64)1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if index), improving the scaling: + * + * enabled += delta; + * if (index) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; + __u64 time_offset; + /* + * If cap_usr_time_zero, the hardware clock (e.g. TSC) can be calculated + * from sample timestamps. + * + * time = timestamp - time_zero; + * quot = time / time_mult; + * rem = time % time_mult; + * cyc = (quot << time_shift) + (rem << time_shift) / time_mult; + * + * And vice versa: + * + * quot = cyc >> time_shift; + * rem = cyc & (((u64)1 << time_shift) - 1); + * timestamp = time_zero + quot * time_mult + + * ((rem * time_mult) >> time_shift); + */ + __u64 time_zero; + + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; + + /* + * Hole for extension of the self monitor capabilities + */ + + __u8 __reserved[116*8]; /* align to 1k. */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. 
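
A user-space sketch of the read loop described above, using compiler acquire/release built-ins where the comment says smp_rmb()/smp_mb(); base is assumed to point at the data area (this page plus data_offset) and size to be data_size, a power of two. Records that wrap around the buffer end are glossed over:

#include <linux/perf_event.h>
#include <stdint.h>

static void drain_ring(struct perf_event_mmap_page *meta,
                       char *base, uint64_t size)
{
    uint64_t head = __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE);
    uint64_t tail = meta->data_tail;

    while (tail < head) {
        struct perf_event_header *hdr =
            (struct perf_event_header *)(base + (tail & (size - 1)));

        /* handle_record(hdr);  real code must copy out records that wrap */
        tail += hdr->size;
    }
    /* Publish how far we consumed so the kernel can reuse the space. */
    __atomic_store_n(&meta->data_tail, tail, __ATOMIC_RELEASE);
}
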
+ */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; +}; + +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) + +/* + * Indicates that /proc/PID/maps parsing are truncated by time out. + */ +#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12) +/* + * Following PERF_RECORD_MISC_* are used on different + * events, so can reuse the same bit position: + * + * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events + * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) + * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events + */ +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) +/* + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. + */ +#define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) +/* + * Reserve the last bit to indicate some extended misc field + */ +#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +}; + +enum perf_event_type { + + /* + * If perf_event_attr.sample_id_all is set then all event types will + * have the sample_type selected fields related to where/when + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. 
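
Two small helpers sketching how the misc bits above are usually tested on records pulled from the ring buffer (the helper names are illustrative, not from this header):

#include <linux/perf_event.h>
#include <stdbool.h>

/* Which CPU mode produced this record (kernel, user, guest, ...)? */
static unsigned int record_cpumode(const struct perf_event_header *hdr)
{
    return hdr->misc & PERF_RECORD_MISC_CPUMODE_MASK;
}

/* EXACT_IP is only defined for PERF_RECORD_SAMPLE of precise events. */
static bool sample_ip_is_exact(const struct perf_event_header *hdr)
{
    return hdr->type == PERF_RECORD_SAMPLE &&
           (hdr->misc & PERF_RECORD_MISC_EXACT_IP);
}
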
+ * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. + */ + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 nr; + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK + * + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 transaction; } && PERF_SAMPLE_TRANSACTION + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE + * }; + */ + PERF_RECORD_SAMPLE = 9, + + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; + * u32 prot, flags; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + + /* + * Records the dropped/lost sample number. + * + * struct { + * struct perf_event_header header; + * + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST_SAMPLES = 13, + + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. 
+ * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + + /* + * Data written to the AUX area by hardware due to aux_output, may need + * to be matched to the event by an architecture-specific hardware ID. + * This records the hardware ID, but requires sample_id to provide the + * event ID. e.g. Intel PT uses this record to disambiguate PEBS-via-PT + * records from multiple events. + * + * struct { + * struct perf_event_header header; + * u64 hw_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. 
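
Tying the record types above together, a sketch of a per-record dispatcher; the PERF_RECORD_SAMPLE layout assumed here corresponds to sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID only:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

struct ip_tid_sample {           /* layout for PERF_SAMPLE_IP | PERF_SAMPLE_TID */
    struct perf_event_header header;
    uint64_t ip;                 /* PERF_SAMPLE_IP */
    uint32_t pid, tid;           /* PERF_SAMPLE_TID */
};

static void handle_record(struct perf_event_header *hdr)
{
    switch (hdr->type) {
    case PERF_RECORD_SAMPLE: {
        struct ip_tid_sample *s = (struct ip_tid_sample *)hdr;
        printf("sample: pid %u tid %u ip %#llx\n",
               s->pid, s->tid, (unsigned long long)s->ip);
        break;
    }
    case PERF_RECORD_LOST:
        /* u64 id; u64 lost; follow the header */
        break;
    default:
        break;
    }
}
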
+ */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ + +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ + +#if defined(__LITTLE_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_blk:3, /* access blocked */ + mem_hops:3, /* hop level */ + mem_rsvd:18; + }; +}; +#elif defined(__BIG_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ + mem_blk:3, /* access blocked */ + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_dtlb:7, /* tlb access */ + mem_lock:2, /* lock instr */ + mem_snoop:5, /* snoop mode */ + mem_lvl:14, /* memory hierarchy level */ + mem_op:5; /* type of opcode */ + }; +}; +#else +#error "Unknown endianness" +#endif + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* + * PERF_MEM_LVL_* namespace being depricated to some extent in the + * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace inorder to not break defined ABIs. 
+ * + * memory hierarchy (memory level, hit or miss) + */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 */ +#define PERF_MEM_LVL_L3 0x40 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ +/* 5-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + +#define PERF_MEM_S(a, s) \ + (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. 
+ * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) + * type: branch type + * spec: branch speculation info (or 0 if not supported) + */ +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + cycles:16, /* cycle count to last branch */ + type:4, /* branch type */ + spec:2, /* branch speculation info */ + new_type:4, /* additional branch type */ + priv:3, /* privilege level */ + reserved:31; +}; + +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + +#endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/third_party/bpftool/libbpf/include/uapi/linux/pkt_cls.h b/third_party/bpftool/libbpf/include/uapi/linux/pkt_cls.h new file mode 100644 index 0000000..3faee01 --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/pkt_cls.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include +#include + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. 
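
The TC_ACT_* verdicts above are what a tc BPF program returns; a minimal direct-action sketch in the spirit of the examples elsewhere in this repository (ETH_P_IPV6 is spelled out as 0x86DD to keep the snippet self-contained, and SEC("tc") assumes a recent libbpf):

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("tc")
int drop_ipv6(struct __sk_buff *skb)
{
    /* skb->protocol carries the L3 protocol in network byte order. */
    if (skb->protocol == bpf_htons(0x86DD)) /* ETH_P_IPV6 */
        return TC_ACT_SHOT;                 /* drop the packet */
    return TC_ACT_OK;                       /* continue processing */
}

char LICENSE[] SEC("license") = "GPL";
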
+ */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum { + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, + TCA_RSVP_ACT, + __TCA_RSVP_MAX +}; + +#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) + +struct tc_rsvp_gpi { + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo { + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; + __u8 pad; +}; + +/* ROUTE filter */ + +enum { + 
TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* TC index filter */ + +enum { + TCA_TCINDEX_UNSPEC, + TCA_TCINDEX_HASH, + TCA_TCINDEX_MASK, + TCA_TCINDEX_SHIFT, + TCA_TCINDEX_FALL_THROUGH, + TCA_TCINDEX_CLASSID, + TCA_TCINDEX_POLICE, + TCA_TCINDEX_ACT, + __TCA_TCINDEX_MAX +}; + +#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct 
in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX 
(__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/third_party/bpftool/libbpf/include/uapi/linux/pkt_sched.h b/third_party/bpftool/libbpf/include/uapi/linux/pkt_sched.h new file mode 100644 index 0000000..5c903ab --- /dev/null +++ b/third_party/bpftool/libbpf/include/uapi/linux/pkt_sched.h @@ -0,0 +1,1164 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. 
qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). 
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + 
TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + TCA_HTB_OFFLOAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' 
*/ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* 
delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. 
+ */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* 
obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u32 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + 
__TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif diff --git a/third_party/bpftool/libbpf/scripts/build-fuzzers.sh b/third_party/bpftool/libbpf/scripts/build-fuzzers.sh new file mode 100755 index 0000000..5171e95 --- /dev/null +++ b/third_party/bpftool/libbpf/scripts/build-fuzzers.sh @@ -0,0 +1,82 @@ +#!/bin/bash +set -eux + +SANITIZER=${SANITIZER:-address} +flags="-O1 -fno-omit-frame-pointer -g -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=$SANITIZER 
-fsanitize=fuzzer-no-link" + +export CC=${CC:-clang} +export CFLAGS=${CFLAGS:-$flags} + +export CXX=${CXX:-clang++} +export CXXFLAGS=${CXXFLAGS:-$flags} + +cd "$(dirname -- "$0")/.." + +export OUT=${OUT:-"$(pwd)/out"} +mkdir -p "$OUT" + +export LIB_FUZZING_ENGINE=${LIB_FUZZING_ENGINE:--fsanitize=fuzzer} + +# libelf is compiled with _FORTIFY_SOURCE by default and it +# isn't compatible with MSan. It was borrowed +# from https://github.com/google/oss-fuzz/pull/7422 +if [[ "$SANITIZER" == memory ]]; then + CFLAGS+=" -U_FORTIFY_SOURCE" + CXXFLAGS+=" -U_FORTIFY_SOURCE" +fi + +# The alignment check is turned off by default on OSS-Fuzz/CFLite so it should be +# turned on explicitly there. It was borrowed from +# https://github.com/google/oss-fuzz/pull/7092 +if [[ "$SANITIZER" == undefined ]]; then + additional_ubsan_checks=alignment + UBSAN_FLAGS="-fsanitize=$additional_ubsan_checks -fno-sanitize-recover=$additional_ubsan_checks" + CFLAGS+=" $UBSAN_FLAGS" + CXXFLAGS+=" $UBSAN_FLAGS" +fi + +# Ideally libbelf should be built using release tarballs available +# at https://sourceware.org/elfutils/ftp/. Unfortunately sometimes they +# fail to compile (for example, elfutils-0.185 fails to compile with LDFLAGS enabled +# due to https://bugs.gentoo.org/794601) so let's just point the script to +# commits referring to versions of libelf that actually can be built +rm -rf elfutils +git clone git://sourceware.org/git/elfutils.git +( +cd elfutils +git checkout 67a187d4c1790058fc7fd218317851cb68bb087c +git log --oneline -1 + +# ASan isn't compatible with -Wl,--no-undefined: https://github.com/google/sanitizers/issues/380 +sed -i 's/^\(NO_UNDEFINED=\).*/\1/' configure.ac + +# ASan isn't compatible with -Wl,-z,defs either: +# https://clang.llvm.org/docs/AddressSanitizer.html#usage +sed -i 's/^\(ZDEFS_LDFLAGS=\).*/\1/' configure.ac + +if [[ "$SANITIZER" == undefined ]]; then + # That's basicaly what --enable-sanitize-undefined does to turn off unaligned access + # elfutils heavily relies on on i386/x86_64 but without changing compiler flags along the way + sed -i 's/\(check_undefined_val\)=[0-9]/\1=1/' configure.ac +fi + +autoreconf -i -f +if ! 
./configure --enable-maintainer-mode --disable-debuginfod --disable-libdebuginfod \ + --disable-demangler --without-bzlib --without-lzma --without-zstd \ + CC="$CC" CFLAGS="-Wno-error $CFLAGS" CXX="$CXX" CXXFLAGS="-Wno-error $CXXFLAGS" LDFLAGS="$CFLAGS"; then + cat config.log + exit 1 +fi + +make -C config -j$(nproc) V=1 +make -C lib -j$(nproc) V=1 +make -C libelf -j$(nproc) V=1 +) + +make -C src BUILD_STATIC_ONLY=y V=1 clean +make -C src -j$(nproc) CFLAGS="-I$(pwd)/elfutils/libelf $CFLAGS" BUILD_STATIC_ONLY=y V=1 + +$CC $CFLAGS -Isrc -Iinclude -Iinclude/uapi -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -c fuzz/bpf-object-fuzzer.c -o bpf-object-fuzzer.o +$CXX $CXXFLAGS $LIB_FUZZING_ENGINE bpf-object-fuzzer.o src/libbpf.a "$(pwd)/elfutils/libelf/libelf.a" -l:libz.a -o "$OUT/bpf-object-fuzzer" + +cp fuzz/bpf-object-fuzzer_seed_corpus.zip "$OUT" diff --git a/third_party/bpftool/libbpf/scripts/coverity.sh b/third_party/bpftool/libbpf/scripts/coverity.sh new file mode 100755 index 0000000..99e4809 --- /dev/null +++ b/third_party/bpftool/libbpf/scripts/coverity.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Taken from: https://scan.coverity.com/scripts/travisci_build_coverity_scan.sh +# Local changes are annotated with "#[local]" + +set -e + +# Environment check +echo -e "\033[33;1mNote: COVERITY_SCAN_PROJECT_NAME and COVERITY_SCAN_TOKEN are available on Project Settings page on scan.coverity.com\033[0m" +[ -z "$COVERITY_SCAN_PROJECT_NAME" ] && echo "ERROR: COVERITY_SCAN_PROJECT_NAME must be set" && exit 1 +[ -z "$COVERITY_SCAN_NOTIFICATION_EMAIL" ] && echo "ERROR: COVERITY_SCAN_NOTIFICATION_EMAIL must be set" && exit 1 +[ -z "$COVERITY_SCAN_BRANCH_PATTERN" ] && echo "ERROR: COVERITY_SCAN_BRANCH_PATTERN must be set" && exit 1 +[ -z "$COVERITY_SCAN_BUILD_COMMAND" ] && echo "ERROR: COVERITY_SCAN_BUILD_COMMAND must be set" && exit 1 +[ -z "$COVERITY_SCAN_TOKEN" ] && echo "ERROR: COVERITY_SCAN_TOKEN must be set" && exit 1 + +PLATFORM=`uname` +#[local] Use /var/tmp for TOOL_ARCHIVE and TOOL_BASE, as on certain systems +# /tmp is tmpfs and is sometimes too small to handle all necessary tooling +TOOL_ARCHIVE=/var//tmp/cov-analysis-${PLATFORM}.tgz +TOOL_URL=https://scan.coverity.com/download/${PLATFORM} +TOOL_BASE=/var/tmp/coverity-scan-analysis +UPLOAD_URL="https://scan.coverity.com/builds" +SCAN_URL="https://scan.coverity.com" + +# Do not run on pull requests +if [ "${TRAVIS_PULL_REQUEST}" = "true" ]; then + echo -e "\033[33;1mINFO: Skipping Coverity Analysis: branch is a pull request.\033[0m" + exit 0 +fi + +# Verify this branch should run +IS_COVERITY_SCAN_BRANCH=`ruby -e "puts '${TRAVIS_BRANCH}' =~ /\\A$COVERITY_SCAN_BRANCH_PATTERN\\z/ ? 1 : 0"` +if [ "$IS_COVERITY_SCAN_BRANCH" = "1" ]; then + echo -e "\033[33;1mCoverity Scan configured to run on branch ${TRAVIS_BRANCH}\033[0m" +else + echo -e "\033[33;1mCoverity Scan NOT configured to run on branch ${TRAVIS_BRANCH}\033[0m" + exit 1 +fi + +# Verify upload is permitted +AUTH_RES=`curl -s --form project="$COVERITY_SCAN_PROJECT_NAME" --form token="$COVERITY_SCAN_TOKEN" $SCAN_URL/api/upload_permitted` +if [ "$AUTH_RES" = "Access denied" ]; then + echo -e "\033[33;1mCoverity Scan API access denied. 
Check COVERITY_SCAN_PROJECT_NAME and COVERITY_SCAN_TOKEN.\033[0m" + exit 1 +else + AUTH=`echo $AUTH_RES | ruby -e "require 'rubygems'; require 'json'; puts JSON[STDIN.read]['upload_permitted']"` + if [ "$AUTH" = "true" ]; then + echo -e "\033[33;1mCoverity Scan analysis authorized per quota.\033[0m" + else + WHEN=`echo $AUTH_RES | ruby -e "require 'rubygems'; require 'json'; puts JSON[STDIN.read]['next_upload_permitted_at']"` + echo -e "\033[33;1mCoverity Scan analysis NOT authorized until $WHEN.\033[0m" + exit 0 + fi +fi + +if [ ! -d $TOOL_BASE ]; then + # Download Coverity Scan Analysis Tool + if [ ! -e $TOOL_ARCHIVE ]; then + echo -e "\033[33;1mDownloading Coverity Scan Analysis Tool...\033[0m" + wget -nv -O $TOOL_ARCHIVE $TOOL_URL --post-data "project=$COVERITY_SCAN_PROJECT_NAME&token=$COVERITY_SCAN_TOKEN" + fi + + # Extract Coverity Scan Analysis Tool + echo -e "\033[33;1mExtracting Coverity Scan Analysis Tool...\033[0m" + mkdir -p $TOOL_BASE + pushd $TOOL_BASE + tar xzf $TOOL_ARCHIVE + popd +fi + +TOOL_DIR=`find $TOOL_BASE -type d -name 'cov-analysis*'` +export PATH=$TOOL_DIR/bin:$PATH + +# Build +echo -e "\033[33;1mRunning Coverity Scan Analysis Tool...\033[0m" +COV_BUILD_OPTIONS="" +#COV_BUILD_OPTIONS="--return-emit-failures 8 --parse-error-threshold 85" +RESULTS_DIR="cov-int" +eval "${COVERITY_SCAN_BUILD_COMMAND_PREPEND}" +COVERITY_UNSUPPORTED=1 cov-build --dir $RESULTS_DIR $COV_BUILD_OPTIONS $COVERITY_SCAN_BUILD_COMMAND +cov-import-scm --dir $RESULTS_DIR --scm git --log $RESULTS_DIR/scm_log.txt 2>&1 + +# Upload results +echo -e "\033[33;1mTarring Coverity Scan Analysis results...\033[0m" +RESULTS_ARCHIVE=analysis-results.tgz +tar czf $RESULTS_ARCHIVE $RESULTS_DIR +SHA=`git rev-parse --short HEAD` + +echo -e "\033[33;1mUploading Coverity Scan Analysis results...\033[0m" +response=$(curl \ + --silent --write-out "\n%{http_code}\n" \ + --form project=$COVERITY_SCAN_PROJECT_NAME \ + --form token=$COVERITY_SCAN_TOKEN \ + --form email=$COVERITY_SCAN_NOTIFICATION_EMAIL \ + --form file=@$RESULTS_ARCHIVE \ + --form version=$SHA \ + --form description="Travis CI build" \ + $UPLOAD_URL) +status_code=$(echo "$response" | sed -n '$p') +#[local] Coverity used to return 201 on success, but it's 200 now +# See https://github.com/systemd/systemd/blob/master/tools/coverity.sh#L145 +if [ "$status_code" != "200" ]; then + TEXT=$(echo "$response" | sed '$d') + echo -e "\033[33;1mCoverity Scan upload failed: $TEXT.\033[0m" + exit 1 +fi diff --git a/third_party/bpftool/libbpf/scripts/sync-kernel.sh b/third_party/bpftool/libbpf/scripts/sync-kernel.sh new file mode 100755 index 0000000..55ffcd1 --- /dev/null +++ b/third_party/bpftool/libbpf/scripts/sync-kernel.sh @@ -0,0 +1,355 @@ +#!/bin/bash + +usage () { + echo "USAGE: ./sync-kernel.sh " + echo "" + echo "Set BPF_NEXT_BASELINE to override bpf-next tree commit, otherwise read from /CHECKPOINT-COMMIT." + echo "Set BPF_BASELINE to override bpf tree commit, otherwise read from /BPF-CHECKPOINT-COMMIT." + echo "Set MANUAL_MODE to 1 to manually control every cherry-picked commits." 
+ exit 1 +} + +set -eu + +LIBBPF_REPO=${1-""} +LINUX_REPO=${2-""} +BPF_BRANCH=${3-""} +BASELINE_COMMIT=${BPF_NEXT_BASELINE:-$(cat ${LIBBPF_REPO}/CHECKPOINT-COMMIT)} +BPF_BASELINE_COMMIT=${BPF_BASELINE:-$(cat ${LIBBPF_REPO}/BPF-CHECKPOINT-COMMIT)} + +if [ -z "${LIBBPF_REPO}" ] || [ -z "${LINUX_REPO}" ]; then + echo "Error: libbpf or linux repos are not specified" + usage +fi +if [ -z "${BPF_BRANCH}" ]; then + echo "Error: linux's bpf tree branch is not specified" + usage +fi +if [ -z "${BASELINE_COMMIT}" ] || [ -z "${BPF_BASELINE_COMMIT}" ]; then + echo "Error: bpf or bpf-next baseline commits are not provided" + usage +fi + +SUFFIX=$(date --utc +%Y-%m-%dT%H-%M-%S.%3NZ) +WORKDIR=$(pwd) +TMP_DIR=$(mktemp -d) + +trap "cd ${WORKDIR}; exit" INT TERM EXIT + +declare -A PATH_MAP +PATH_MAP=( \ + [tools/lib/bpf]=src \ + [tools/include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \ + [tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \ + [tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \ + [tools/include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \ + [tools/include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \ + [tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \ + [tools/include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \ + [tools/include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \ + [tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \ + [tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \ + [tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \ + [include/uapi/linux/perf_event.h]=include/uapi/linux/perf_event.h \ + [Documentation/bpf/libbpf]=docs \ +) + +LIBBPF_PATHS=("${!PATH_MAP[@]}" ":^tools/lib/bpf/Makefile" ":^tools/lib/bpf/Build" ":^tools/lib/bpf/.gitignore" ":^tools/include/tools/libc_compat.h") +LIBBPF_VIEW_PATHS=("${PATH_MAP[@]}") +LIBBPF_VIEW_EXCLUDE_REGEX='^src/(Makefile|Build|test_libbpf\.c|bpf_helper_defs\.h|\.gitignore)$|^docs/(\.gitignore|api\.rst|conf\.py)$|^docs/sphinx/.*' +LINUX_VIEW_EXCLUDE_REGEX='^include/tools/libc_compat.h$' + +LIBBPF_TREE_FILTER="mkdir -p __libbpf/include/uapi/linux __libbpf/include/tools && "$'\\\n' +for p in "${!PATH_MAP[@]}"; do + LIBBPF_TREE_FILTER+="git mv -kf ${p} __libbpf/${PATH_MAP[${p}]} && "$'\\\n' +done +LIBBPF_TREE_FILTER+="git rm --ignore-unmatch -f __libbpf/src/{Makefile,Build,test_libbpf.c,.gitignore} >/dev/null" + +cd_to() +{ + cd ${WORKDIR} && cd "$1" +} + +# Output brief single-line commit description +# $1 - commit ref +commit_desc() +{ + git log -n1 --pretty='%h ("%s")' $1 +} + +# Create commit single-line signature, which consists of: +# - full commit subject +# - author date in ISO8601 format +# - full commit body with newlines replaced with vertical bars (|) +# - shortstat appended at the end +# The idea is that this single-line signature is good enough to make final +# decision about whether two commits are the same, across different repos. 
+# $1 - commit ref +# $2 - paths filter +commit_signature() +{ + local ref=$1 + shift + git show --pretty='("%s")|%aI|%b' --shortstat $ref -- "${@-.}" | tr '\n' '|' +} + +# Cherry-pick commits touching libbpf-related files +# $1 - baseline_tag +# $2 - tip_tag +cherry_pick_commits() +{ + local manual_mode=${MANUAL_MODE:-0} + local baseline_tag=$1 + local tip_tag=$2 + local new_commits + local signature + local should_skip + local synced_cnt + local manual_check + local libbpf_conflict_cnt + local desc + + new_commits=$(git rev-list --no-merges --topo-order --reverse ${baseline_tag}..${tip_tag} -- "${LIBBPF_PATHS[@]}") + for new_commit in ${new_commits}; do + desc="$(commit_desc ${new_commit})" + signature="$(commit_signature ${new_commit} "${LIBBPF_PATHS[@]}")" + synced_cnt=$(grep -F "${signature}" ${TMP_DIR}/libbpf_commits.txt | wc -l) + manual_check=0 + if ((${synced_cnt} > 0)); then + # commit with the same subject is already in libbpf, but it's + # not 100% the same commit, so check with user + echo "Commit '${desc}' is synced into libbpf as:" + grep -F "${signature}" ${TMP_DIR}/libbpf_commits.txt | \ + cut -d'|' -f1 | sed -e 's/^/- /' + if ((${manual_mode} != 1 && ${synced_cnt} == 1)); then + echo "Skipping '${desc}' due to unique match..." + continue + fi + if ((${synced_cnt} > 1)); then + echo "'${desc} matches multiple commits, please, double-check!" + manual_check=1 + fi + fi + if ((${manual_mode} == 1 || ${manual_check} == 1)); then + read -p "Do you want to skip '${desc}'? [y/N]: " should_skip + case "${should_skip}" in + "y" | "Y") + echo "Skipping '${desc}'..." + continue + ;; + esac + fi + # commit hasn't been synced into libbpf yet + echo "Picking '${desc}'..." + if ! git cherry-pick ${new_commit} &>/dev/null; then + echo "Warning! Cherry-picking '${desc} failed, checking if it's non-libbpf files causing problems..." + libbpf_conflict_cnt=$(git diff --name-only --diff-filter=U -- "${LIBBPF_PATHS[@]}" | wc -l) + conflict_cnt=$(git diff --name-only | wc -l) + prompt_resolution=1 + + if ((${libbpf_conflict_cnt} == 0)); then + echo "Looks like only non-libbpf files have conflicts, ignoring..." + if ((${conflict_cnt} == 0)); then + echo "Empty cherry-pick, skipping it..." + git cherry-pick --abort + continue + fi + + git add . + # GIT_EDITOR=true to avoid editor popping up to edit commit message + if ! GIT_EDITOR=true git cherry-pick --continue &>/dev/null; then + echo "Error! That still failed! Please resolve manually." + else + echo "Success! All cherry-pick conflicts were resolved for '${desc}'!" + prompt_resolution=0 + fi + fi + + if ((${prompt_resolution} == 1)); then + read -p "Error! Cherry-picking '${desc}' failed, please fix manually and press to proceed..." + fi + fi + # Append signature of just cherry-picked commit to avoid + # potentially cherry-picking the same commit twice later when + # processing bpf tree commits. At this point we don't know yet + # the final commit sha in libbpf repo, so we record Linux SHA + # instead as LINUX_. + echo LINUX_$(git log --pretty='%h' -n1) "${signature}" >> ${TMP_DIR}/libbpf_commits.txt + done +} + +cleanup() +{ + echo "Cleaning up..." + rm -r ${TMP_DIR} + cd_to ${LINUX_REPO} + git checkout ${TIP_SYM_REF} + git branch -D ${BASELINE_TAG} ${TIP_TAG} ${BPF_BASELINE_TAG} ${BPF_TIP_TAG} \ + ${SQUASH_BASE_TAG} ${SQUASH_TIP_TAG} ${VIEW_TAG} || true + + cd_to . + echo "DONE." +} + + +cd_to ${LIBBPF_REPO} +GITHUB_ABS_DIR=$(pwd) +echo "Dumping existing libbpf commit signatures..." 
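+# For illustration, a line appended to ${TMP_DIR}/libbpf_commits.txt by the loop
+# below has the form "<short-sha> <signature>", where commit_signature() joins the
+# commit subject, ISO8601 author date, commit body and shortstat with '|' separators.
+# A hypothetical entry might look like:
+#   1a2b3c4 ("libbpf: fix example bug")|2023-05-01T10:00:00+02:00|Body text| 1 file changed, 2 insertions(+)|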
+for h in $(git log --pretty='%h' -n500); do + echo $h "$(commit_signature $h)" >> ${TMP_DIR}/libbpf_commits.txt +done + +# Use current kernel repo HEAD as a source of patches +cd_to ${LINUX_REPO} +LINUX_ABS_DIR=$(pwd) +TIP_SYM_REF=$(git symbolic-ref -q --short HEAD || git rev-parse HEAD) +TIP_COMMIT=$(git rev-parse HEAD) +BPF_TIP_COMMIT=$(git rev-parse ${BPF_BRANCH}) +BASELINE_TAG=libbpf-baseline-${SUFFIX} +TIP_TAG=libbpf-tip-${SUFFIX} +BPF_BASELINE_TAG=libbpf-bpf-baseline-${SUFFIX} +BPF_TIP_TAG=libbpf-bpf-tip-${SUFFIX} +VIEW_TAG=libbpf-view-${SUFFIX} +LIBBPF_SYNC_TAG=libbpf-sync-${SUFFIX} + +# Squash state of kernel repo at baseline into single commit +SQUASH_BASE_TAG=libbpf-squash-base-${SUFFIX} +SQUASH_TIP_TAG=libbpf-squash-tip-${SUFFIX} +SQUASH_COMMIT=$(git commit-tree ${BASELINE_COMMIT}^{tree} -m "BASELINE SQUASH ${BASELINE_COMMIT}") + +echo "WORKDIR: ${WORKDIR}" +echo "LINUX REPO: ${LINUX_REPO}" +echo "LIBBPF REPO: ${LIBBPF_REPO}" +echo "TEMP DIR: ${TMP_DIR}" +echo "SUFFIX: ${SUFFIX}" +echo "BASE COMMIT: '$(commit_desc ${BASELINE_COMMIT})'" +echo "TIP COMMIT: '$(commit_desc ${TIP_COMMIT})'" +echo "BPF BASE COMMIT: '$(commit_desc ${BPF_BASELINE_COMMIT})'" +echo "BPF TIP COMMIT: '$(commit_desc ${BPF_TIP_COMMIT})'" +echo "SQUASH COMMIT: ${SQUASH_COMMIT}" +echo "BASELINE TAG: ${BASELINE_TAG}" +echo "TIP TAG: ${TIP_TAG}" +echo "BPF BASELINE TAG: ${BPF_BASELINE_TAG}" +echo "BPF TIP TAG: ${BPF_TIP_TAG}" +echo "SQUASH BASE TAG: ${SQUASH_BASE_TAG}" +echo "SQUASH TIP TAG: ${SQUASH_TIP_TAG}" +echo "VIEW TAG: ${VIEW_TAG}" +echo "LIBBPF SYNC TAG: ${LIBBPF_SYNC_TAG}" +echo "PATCHES: ${TMP_DIR}/patches" + +git branch ${BASELINE_TAG} ${BASELINE_COMMIT} +git branch ${TIP_TAG} ${TIP_COMMIT} +git branch ${BPF_BASELINE_TAG} ${BPF_BASELINE_COMMIT} +git branch ${BPF_TIP_TAG} ${BPF_TIP_COMMIT} +git branch ${SQUASH_BASE_TAG} ${SQUASH_COMMIT} +git checkout -b ${SQUASH_TIP_TAG} ${SQUASH_COMMIT} + +# Cherry-pick new commits onto squashed baseline commit +cherry_pick_commits ${BASELINE_TAG} ${TIP_TAG} +cherry_pick_commits ${BPF_BASELINE_TAG} ${BPF_TIP_TAG} + +# Move all libbpf files into __libbpf directory. +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --tree-filter "${LIBBPF_TREE_FILTER}" ${SQUASH_TIP_TAG} ${SQUASH_BASE_TAG} +# Make __libbpf a new root directory +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --subdirectory-filter __libbpf ${SQUASH_TIP_TAG} ${SQUASH_BASE_TAG} + +# If there are no new commits with libbpf-related changes, bail out +COMMIT_CNT=$(git rev-list --count ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG}) +if ((${COMMIT_CNT} <= 0)); then + echo "No new changes to apply, we are done!" + cleanup + exit 2 +fi + +# Exclude baseline commit and generate nice cover letter with summary +git format-patch --no-signature ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG} --cover-letter -o ${TMP_DIR}/patches + +# Now is time to re-apply libbpf-related linux patches to libbpf repo +cd_to ${LIBBPF_REPO} +git checkout -b ${LIBBPF_SYNC_TAG} + +for patch in $(ls -1 ${TMP_DIR}/patches | tail -n +2); do + if ! git am -3 --committer-date-is-author-date "${TMP_DIR}/patches/${patch}"; then + if ! patch -p1 --merge < "${TMP_DIR}/patches/${patch}"; then + read -p "Applying ${TMP_DIR}/patches/${patch} failed, please resolve manually and press to proceed..." 
+ fi + git am --continue + fi +done + +# Generate bpf_helper_defs.h and commit, if anything changed +# restore Linux tip to use bpf_doc.py +cd_to ${LINUX_REPO} +git checkout ${TIP_TAG} +# re-generate bpf_helper_defs.h +cd_to ${LIBBPF_REPO} +"${LINUX_ABS_DIR}/scripts/bpf_doc.py" --header \ + --file include/uapi/linux/bpf.h > src/bpf_helper_defs.h +# if anything changed, commit it +helpers_changes=$(git status --porcelain src/bpf_helper_defs.h | wc -l) +if ((${helpers_changes} == 1)); then + git add src/bpf_helper_defs.h + git commit -s -m "sync: auto-generate latest BPF helpers + +Latest changes to BPF helper definitions. +" -- src/bpf_helper_defs.h +fi + +# Use generated cover-letter as a template for "sync commit" with +# baseline and checkpoint commits from kernel repo (and leave summary +# from cover letter intact, of course) +echo ${TIP_COMMIT} > CHECKPOINT-COMMIT && \ +echo ${BPF_TIP_COMMIT} > BPF-CHECKPOINT-COMMIT && \ +git add CHECKPOINT-COMMIT && \ +git add BPF-CHECKPOINT-COMMIT && \ +awk '/\*\*\* BLURB HERE \*\*\*/ {p=1} p' ${TMP_DIR}/patches/0000-cover-letter.patch | \ +sed "s/\*\*\* BLURB HERE \*\*\*/\ +sync: latest libbpf changes from kernel\n\ +\n\ +Syncing latest libbpf commits from kernel repository.\n\ +Baseline bpf-next commit: ${BASELINE_COMMIT}\n\ +Checkpoint bpf-next commit: ${TIP_COMMIT}\n\ +Baseline bpf commit: ${BPF_BASELINE_COMMIT}\n\ +Checkpoint bpf commit: ${BPF_TIP_COMMIT}/" | \ +git commit -s --file=- + +echo "SUCCESS! ${COMMIT_CNT} commits synced." + +echo "Verifying Linux's and Github's libbpf state" + +cd_to ${LINUX_REPO} +git checkout -b ${VIEW_TAG} ${TIP_COMMIT} +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --tree-filter "${LIBBPF_TREE_FILTER}" ${VIEW_TAG}^..${VIEW_TAG} +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --subdirectory-filter __libbpf ${VIEW_TAG}^..${VIEW_TAG} +git ls-files -- "${LIBBPF_VIEW_PATHS[@]}" | grep -v -E "${LINUX_VIEW_EXCLUDE_REGEX}" > ${TMP_DIR}/linux-view.ls + +cd_to ${LIBBPF_REPO} +git ls-files -- "${LIBBPF_VIEW_PATHS[@]}" | grep -v -E "${LIBBPF_VIEW_EXCLUDE_REGEX}" > ${TMP_DIR}/github-view.ls + +echo "Comparing list of files..." +diff -u ${TMP_DIR}/linux-view.ls ${TMP_DIR}/github-view.ls +echo "Comparing file contents..." +CONSISTENT=1 +for F in $(cat ${TMP_DIR}/linux-view.ls); do + if ! diff -u "${LINUX_ABS_DIR}/${F}" "${GITHUB_ABS_DIR}/${F}"; then + echo "${LINUX_ABS_DIR}/${F} and ${GITHUB_ABS_DIR}/${F} are different!" + CONSISTENT=0 + fi +done +if ((${CONSISTENT} == 1)); then + echo "Great! Content is identical!" +else + ignore_inconsistency=n + echo "Unfortunately, there are some inconsistencies, please double check." + read -p "Does everything look good? [y/N]: " ignore_inconsistency + case "${ignore_inconsistency}" in + "y" | "Y") + echo "Ok, proceeding..." + ;; + *) + echo "Oops, exiting with error..." 
+ exit 4 + esac +fi + +cleanup diff --git a/third_party/bpftool/libbpf/src/.gitignore b/third_party/bpftool/libbpf/src/.gitignore new file mode 100644 index 0000000..0473afb --- /dev/null +++ b/third_party/bpftool/libbpf/src/.gitignore @@ -0,0 +1,6 @@ +*.o +*.a +/libbpf.pc +/libbpf.so* +/staticobjs +/sharedobjs diff --git a/third_party/bpftool/libbpf/src/Makefile b/third_party/bpftool/libbpf/src/Makefile new file mode 100644 index 0000000..52188db --- /dev/null +++ b/third_party/bpftool/libbpf/src/Makefile @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' "$(1)" "$(2)" "$(if $(3), $(3))"; +endif + +LIBBPF_MAJOR_VERSION := 1 +LIBBPF_MINOR_VERSION := 3 +LIBBPF_PATCH_VERSION := 0 +LIBBPF_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).$(LIBBPF_PATCH_VERSION) +LIBBPF_MAJMIN_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).0 +LIBBPF_MAP_VERSION := $(shell grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | sort -rV | head -n1 | cut -d'_' -f2) +ifneq ($(LIBBPF_MAJMIN_VERSION), $(LIBBPF_MAP_VERSION)) +$(error Libbpf release ($(LIBBPF_VERSION)) and map ($(LIBBPF_MAP_VERSION)) versions are out of sync!) +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +TOPDIR = .. + +INCLUDES := -I. -I$(TOPDIR)/include -I$(TOPDIR)/include/uapi +ALL_CFLAGS := $(INCLUDES) + +SHARED_CFLAGS += -fPIC -fvisibility=hidden -DSHARED + +CFLAGS ?= -g -O2 -Werror -Wall -std=gnu89 +ALL_CFLAGS += $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(EXTRA_CFLAGS) +ALL_LDFLAGS += $(LDFLAGS) $(EXTRA_LDFLAGS) + +ifdef NO_PKG_CONFIG + ALL_LDFLAGS += -lelf -lz +else + PKG_CONFIG ?= pkg-config + ALL_CFLAGS += $(shell $(PKG_CONFIG) --cflags libelf zlib) + ALL_LDFLAGS += $(shell $(PKG_CONFIG) --libs libelf zlib) +endif + +OBJDIR ?= . 
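+# Worked example for the version consistency check above (hypothetical libbpf.map
+# contents): with LIBBPF_MAJOR/MINOR/PATCH set to 1/3/0, LIBBPF_MAJMIN_VERSION is
+# "1.3.0"; if the newest version node in libbpf.map is "LIBBPF_1.3.0", the
+# grep | sort -rV | head -n1 | cut pipeline also yields "1.3.0" and the $(error ...)
+# branch is not taken. A mismatch (say, only "LIBBPF_1.2.0" in the map) would abort
+# the build with the "versions are out of sync" error.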
+SHARED_OBJDIR := $(OBJDIR)/sharedobjs +STATIC_OBJDIR := $(OBJDIR)/staticobjs +OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \ + nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \ + btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \ + relo_core.o usdt.o zip.o +SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS)) +STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) + +STATIC_LIBS := $(OBJDIR)/libbpf.a +ifndef BUILD_STATIC_ONLY + SHARED_LIBS := $(OBJDIR)/libbpf.so \ + $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + VERSION_SCRIPT := libbpf.map +endif + +HEADERS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ + bpf_helpers.h bpf_helper_defs.h bpf_tracing.h \ + bpf_endian.h bpf_core_read.h skel_internal.h libbpf_version.h \ + usdt.bpf.h +UAPI_HEADERS := $(addprefix $(TOPDIR)/include/uapi/linux/,\ + bpf.h bpf_common.h btf.h) + +PC_FILE := $(OBJDIR)/libbpf.pc + +INSTALL = install + +DESTDIR ?= + +HOSTARCH = $(firstword $(subst -, ,$(shell $(CC) -dumpmachine))) +ifeq ($(filter-out %64 %64be %64eb %64le %64el s390x, $(HOSTARCH)),) + LIBSUBDIR := lib64 +else + LIBSUBDIR := lib +endif + +# By default let the pc file itself use ${prefix} in includedir/libdir so that +# the prefix can be overridden at runtime (eg: --define-prefix) +ifndef LIBDIR + LIBDIR_PC := $$\{prefix\}/$(LIBSUBDIR) +else + LIBDIR_PC := $(LIBDIR) +endif +PREFIX ?= /usr +LIBDIR ?= $(PREFIX)/$(LIBSUBDIR) +INCLUDEDIR ?= $(PREFIX)/include +UAPIDIR ?= $(PREFIX)/include + +TAGS_PROG := $(if $(shell which etags 2>/dev/null),etags,ctags) + +all: $(STATIC_LIBS) $(SHARED_LIBS) $(PC_FILE) + +$(OBJDIR)/libbpf.a: $(STATIC_OBJS) + $(call msg,AR,$@) + $(Q)$(AR) rcs $@ $^ + +$(OBJDIR)/libbpf.so: $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION): $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_VERSION): $(SHARED_OBJS) + $(call msg,CC,$@) + $(Q)$(CC) -shared -Wl,--version-script=$(VERSION_SCRIPT) \ + -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $^ $(ALL_LDFLAGS) -o $@ + +$(OBJDIR)/libbpf.pc: force + $(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \ + -e "s|@LIBDIR@|$(LIBDIR_PC)|" \ + -e "s|@VERSION@|$(LIBBPF_VERSION)|" \ + < libbpf.pc.template > $@ + +$(STATIC_OBJDIR) $(SHARED_OBJDIR): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +$(STATIC_OBJDIR)/%.o: %.c | $(STATIC_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +$(SHARED_OBJDIR)/%.o: %.c | $(SHARED_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(SHARED_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +define do_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)$(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR)$2' +endef + +# Preserve symlinks at installation. +define do_s_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)cp -fR $1 '$(DESTDIR)$2' +endef + +install: all install_headers install_pkgconfig + $(call do_s_install,$(STATIC_LIBS) $(SHARED_LIBS),$(LIBDIR)) + +install_headers: + $(call do_install,$(HEADERS),$(INCLUDEDIR)/bpf,644) + +# UAPI headers can be installed by a different package so they're not installed +# in by install rule. 
+install_uapi_headers: + $(call do_install,$(UAPI_HEADERS),$(UAPIDIR)/linux,644) + +install_pkgconfig: $(PC_FILE) + $(call do_install,$(PC_FILE),$(LIBDIR)/pkgconfig,644) + +clean: + $(call msg,CLEAN) + $(Q)rm -rf *.o *.a *.so *.so.* *.pc $(SHARED_OBJDIR) $(STATIC_OBJDIR) + +.PHONY: cscope tags force +cscope: + $(call msg,CSCOPE) + $(Q)ls *.c *.h > cscope.files + $(Q)cscope -b -q -f cscope.out + +tags: + $(call msg,CTAGS) + $(Q)rm -f TAGS tags + $(Q)ls *.c *.h | xargs $(TAGS_PROG) -a + +force: diff --git a/third_party/bpftool/libbpf/src/bpf.c b/third_party/bpftool/libbpf/src/bpf.c new file mode 100644 index 0000000..3b0da19 --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf.c @@ -0,0 +1,1211 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +/* + * When building perf, unistd.h is overridden. __NR_bpf is + * required to be defined explicitly. + */ +#ifndef __NR_bpf +# if defined(__i386__) +# define __NR_bpf 357 +# elif defined(__x86_64__) +# define __NR_bpf 321 +# elif defined(__aarch64__) +# define __NR_bpf 280 +# elif defined(__sparc__) +# define __NR_bpf 349 +# elif defined(__s390__) +# define __NR_bpf 351 +# elif defined(__arc__) +# define __NR_bpf 280 +# elif defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# else +# error __NR_bpf not defined. libbpf does not support your arch. +# endif +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + return syscall(__NR_bpf, cmd, attr, size); +} + +static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + int fd; + + fd = sys_bpf(cmd, attr, size); + return ensure_good_fd(fd); +} + +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) +{ + int fd; + + do { + fd = sys_bpf_fd(BPF_PROG_LOAD, attr, size); + } while (fd < 0 && errno == EAGAIN && --attempts > 0); + + return fd; +} + +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and progs. This was done in [0]. + * We use the support for bpf_ktime_get_coarse_ns() helper, which was added in + * the same 5.11 Linux release ([1]), to detect memcg-based accounting for BPF. 
+ * + * [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/ + * [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") + */ +int probe_memcg_account(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + struct bpf_insn insns[] = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd; + + /* attempt loading freplace trying to use custom BTF */ + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); + if (prog_fd >= 0) { + close(prog_fd); + return 1; + } + return 0; +} + +static bool memlock_bumped; +static rlim_t memlock_rlim = RLIM_INFINITY; + +int libbpf_set_memlock_rlim(size_t memlock_bytes) +{ + if (memlock_bumped) + return libbpf_err(-EBUSY); + + memlock_rlim = memlock_bytes; + return 0; +} + +int bump_rlimit_memlock(void) +{ + struct rlimit rlim; + + /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ + if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) + return 0; + + memlock_bumped = true; + + /* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */ + if (memlock_rlim == 0) + return 0; + + rlim.rlim_cur = rlim.rlim_max = memlock_rlim; + if (setrlimit(RLIMIT_MEMLOCK, &rlim)) + return -errno; + + return 0; +} + +int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_map_create_opts)) + return libbpf_err(-EINVAL); + + attr.map_type = map_type; + if (map_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + attr.btf_fd = OPTS_GET(opts, btf_fd, 0); + attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0); + attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0); + attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0); + + attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0); + attr.map_flags = OPTS_GET(opts, map_flags, 0); + attr.map_extra = OPTS_GET(opts, map_extra, 0); + attr.numa_node = OPTS_GET(opts, numa_node, 0); + attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); + + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +static void * +alloc_zero_tailing_info(const void *orecord, __u32 cnt, + __u32 actual_rec_size, __u32 expected_rec_size) +{ + __u64 info_len = (__u64)actual_rec_size * cnt; + void *info, *nrecord; + int i; + + info = malloc(info_len); + if (!info) + return NULL; + + /* zero out bytes kernel does not understand */ + nrecord = info; + for (i = 0; i < cnt; i++) { + memcpy(nrecord, orecord, expected_rec_size); + memset(nrecord + expected_rec_size, 0, + actual_rec_size - expected_rec_size); + orecord += actual_rec_size; + nrecord += actual_rec_size; + } + + return info; +} + +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *opts) +{ + const size_t attr_sz = 
offsetofend(union bpf_attr, log_true_size); + void *finfo = NULL, *linfo = NULL; + const char *func_info, *line_info; + __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; + __u32 func_info_rec_size, line_info_rec_size; + int fd, attempts; + union bpf_attr attr; + char *log_buf; + + bump_rlimit_memlock(); + + if (!OPTS_VALID(opts, bpf_prog_load_opts)) + return libbpf_err(-EINVAL); + + attempts = OPTS_GET(opts, attempts, 0); + if (attempts < 0) + return libbpf_err(-EINVAL); + if (attempts == 0) + attempts = PROG_LOAD_ATTEMPTS; + + memset(&attr, 0, attr_sz); + + attr.prog_type = prog_type; + attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0); + + attr.prog_btf_fd = OPTS_GET(opts, prog_btf_fd, 0); + attr.prog_flags = OPTS_GET(opts, prog_flags, 0); + attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); + attr.kern_version = OPTS_GET(opts, kern_version, 0); + + if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + attr.license = ptr_to_u64(license); + + if (insn_cnt > UINT_MAX) + return libbpf_err(-E2BIG); + + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)insn_cnt; + + attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); + attach_btf_obj_fd = OPTS_GET(opts, attach_btf_obj_fd, 0); + + if (attach_prog_fd && attach_btf_obj_fd) + return libbpf_err(-EINVAL); + + attr.attach_btf_id = OPTS_GET(opts, attach_btf_id, 0); + if (attach_prog_fd) + attr.attach_prog_fd = attach_prog_fd; + else + attr.attach_btf_obj_fd = attach_btf_obj_fd; + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (!!log_buf != !!log_size) + return libbpf_err(-EINVAL); + + func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); + func_info = OPTS_GET(opts, func_info, NULL); + attr.func_info_rec_size = func_info_rec_size; + attr.func_info = ptr_to_u64(func_info); + attr.func_info_cnt = OPTS_GET(opts, func_info_cnt, 0); + + line_info_rec_size = OPTS_GET(opts, line_info_rec_size, 0); + line_info = OPTS_GET(opts, line_info, NULL); + attr.line_info_rec_size = line_info_rec_size; + attr.line_info = ptr_to_u64(line_info); + attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0); + + attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL)); + + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = log_level; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + if (fd >= 0) + return fd; + + /* After bpf_prog_load, the kernel may modify certain attributes + * to give user space a hint how to deal with loading failure. + * Check to see whether we can make some changes and load again. 
+ */ + while (errno == E2BIG && (!finfo || !linfo)) { + if (!finfo && attr.func_info_cnt && + attr.func_info_rec_size < func_info_rec_size) { + /* try with corrected func info records */ + finfo = alloc_zero_tailing_info(func_info, + attr.func_info_cnt, + func_info_rec_size, + attr.func_info_rec_size); + if (!finfo) { + errno = E2BIG; + goto done; + } + + attr.func_info = ptr_to_u64(finfo); + attr.func_info_rec_size = func_info_rec_size; + } else if (!linfo && attr.line_info_cnt && + attr.line_info_rec_size < line_info_rec_size) { + linfo = alloc_zero_tailing_info(line_info, + attr.line_info_cnt, + line_info_rec_size, + attr.line_info_rec_size); + if (!linfo) { + errno = E2BIG; + goto done; + } + + attr.line_info = ptr_to_u64(linfo); + attr.line_info_rec_size = line_info_rec_size; + } else { + break; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + if (fd >= 0) + goto done; + } + + if (log_level == 0 && log_buf) { + /* log_level == 0 with non-NULL log_buf requires retrying on error + * with log_level == 1 and log_buf/log_buf_size set, to get details of + * failure + */ + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = 1; + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + } +done: + /* free() doesn't affect errno, so we don't need to restore it */ + free(finfo); + free(linfo); + return libbpf_err_errno(fd); +} + +int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem(int fd, const void *key) 
+{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_get_next_key(int fd, const void *key, void *next_key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, next_key); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.next_key = ptr_to_u64(next_key); + + ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + ret = sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +static int bpf_map_batch_common(int cmd, int fd, void *in_batch, + void *out_batch, void *keys, void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, batch); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_map_batch_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.batch.map_fd = fd; + attr.batch.in_batch = ptr_to_u64(in_batch); + attr.batch.out_batch = ptr_to_u64(out_batch); + attr.batch.keys = ptr_to_u64(keys); + attr.batch.values = ptr_to_u64(values); + attr.batch.count = *count; + attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); + attr.batch.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(cmd, &attr, attr_sz); + *count = attr.batch.count; + + return libbpf_err_errno(ret); +} + +int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, + NULL, (void *)keys, NULL, count, opts); +} + +int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, in_batch, + out_batch, keys, values, count, opts); +} + +int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH, + fd, in_batch, out_batch, keys, values, + count, opts); +} + +int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, + (void *)keys, (void *)values, count, opts); +} + +int bpf_obj_pin_opts(int fd, const char *pathname, const struct bpf_obj_pin_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_obj_pin_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); + attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, 
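/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): walking every key of a hash map with
 * bpf_map_get_next_key(). Passing NULL as the "previous" key on the first
 * call makes the kernel return the first key; a -ENOENT return marks the
 * end of the iteration. A __u32 key type is assumed here.
 */
#include <bpf/bpf.h>

static int demo_count_keys(int map_fd)
{
	__u32 cur, next;
	void *prev = NULL;
	int n = 0;

	while (bpf_map_get_next_key(map_fd, prev, &next) == 0) {
		cur = next;
		prev = &cur;
		n++;
	}
	return n;	/* the final -ENOENT simply terminates the loop */
}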
file_flags, 0); + attr.bpf_fd = fd; + + ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_obj_pin(int fd, const char *pathname) +{ + return bpf_obj_pin_opts(fd, pathname, NULL); +} + +int bpf_obj_get(const char *pathname) +{ + return bpf_obj_get_opts(pathname, NULL); +} + +int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_obj_get_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); + attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); + + fd = sys_bpf_fd(BPF_OBJ_GET, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = flags, + ); + + return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts); +} + +int bpf_prog_attach_opts(int prog_fd, int target_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_attach_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.attach_flags = OPTS_GET(opts, flags, 0); + attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); + + ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create); + __u32 target_btf_id, iter_info_len; + union bpf_attr attr; + int fd, err; + + if (!OPTS_VALID(opts, bpf_link_create_opts)) + return libbpf_err(-EINVAL); + + iter_info_len = OPTS_GET(opts, iter_info_len, 0); + target_btf_id = OPTS_GET(opts, target_btf_id, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (iter_info_len || target_btf_id) { + if (iter_info_len && target_btf_id) + return libbpf_err(-EINVAL); + if (!OPTS_ZEROED(opts, target_btf_id)) + return libbpf_err(-EINVAL); + } + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + attr.link_create.flags = OPTS_GET(opts, flags, 0); + + if (target_btf_id) { + attr.link_create.target_btf_id = target_btf_id; + goto proceed; + } + + switch (attach_type) { + case BPF_TRACE_ITER: + attr.link_create.iter_info = 
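/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): bpf_obj_pin()/bpf_obj_get() persist and recover an fd
 * through bpffs. A bpffs mount at the conventional /sys/fs/bpf path is
 * assumed, and "demo_map" is a made-up pin name.
 */
#include <bpf/bpf.h>

static int demo_pin_and_reopen(int map_fd)
{
	int err, fd2;

	err = bpf_obj_pin(map_fd, "/sys/fs/bpf/demo_map");
	if (err)
		return err;

	/* any process that can open the pin path can now recover the map */
	fd2 = bpf_obj_get("/sys/fs/bpf/demo_map");
	return fd2;	/* a new fd on success, negative error otherwise */
}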
ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0)); + attr.link_create.iter_info_len = iter_info_len; + break; + case BPF_PERF_EVENT: + attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0); + if (!OPTS_ZEROED(opts, perf_event)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_KPROBE_MULTI: + attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); + attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); + attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); + attr.link_create.kprobe_multi.addrs = ptr_to_u64(OPTS_GET(opts, kprobe_multi.addrs, 0)); + attr.link_create.kprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, kprobe_multi.cookies, 0)); + if (!OPTS_ZEROED(opts, kprobe_multi)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: + attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); + if (!OPTS_ZEROED(opts, tracing)) + return libbpf_err(-EINVAL); + break; + case BPF_NETFILTER: + attr.link_create.netfilter.pf = OPTS_GET(opts, netfilter.pf, 0); + attr.link_create.netfilter.hooknum = OPTS_GET(opts, netfilter.hooknum, 0); + attr.link_create.netfilter.priority = OPTS_GET(opts, netfilter.priority, 0); + attr.link_create.netfilter.flags = OPTS_GET(opts, netfilter.flags, 0); + if (!OPTS_ZEROED(opts, netfilter)) + return libbpf_err(-EINVAL); + break; + default: + if (!OPTS_ZEROED(opts, flags)) + return libbpf_err(-EINVAL); + break; + } +proceed: + fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz); + if (fd >= 0) + return fd; + /* we'll get EINVAL if LINK_CREATE doesn't support attaching fentry + * and other similar programs + */ + err = -errno; + if (err != -EINVAL) + return libbpf_err(err); + + /* if user used features not supported by + * BPF_RAW_TRACEPOINT_OPEN command, then just give up immediately + */ + if (attr.link_create.target_fd || attr.link_create.target_btf_id) + return libbpf_err(err); + if (!OPTS_ZEROED(opts, sz)) + return libbpf_err(err); + + /* otherwise, for few select kinds of programs that can be + * attached using BPF_RAW_TRACEPOINT_OPEN command, try that as + * a fallback for older kernels + */ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + case BPF_LSM_MAC: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + return bpf_raw_tracepoint_open(NULL, prog_fd); + default: + return libbpf_err(err); + } +} + +int bpf_link_detach(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_detach); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.link_detach.link_fd = link_fd; + + ret = sys_bpf(BPF_LINK_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_update); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_link_update_opts)) + return libbpf_err(-EINVAL); + + if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_update.link_fd = link_fd; + attr.link_update.new_prog_fd = new_prog_fd; + attr.link_update.flags = OPTS_GET(opts, flags, 0); + if (OPTS_GET(opts, old_prog_fd, 0)) + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + else if (OPTS_GET(opts, old_map_fd, 0)) + attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0); + + ret = 
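/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): creating a BPF link over an existing perf event fd and
 * passing a cookie the program can read back with bpf_get_attach_cookie().
 * Both fds are assumed to exist already, e.g. from perf_event_open() and
 * bpf_prog_load(); BPF_PERF_EVENT links need a reasonably recent kernel.
 */
#include <bpf/bpf.h>

static int demo_attach_perf_event(int prog_fd, int perf_fd)
{
	LIBBPF_OPTS(bpf_link_create_opts, opts,
		.perf_event.bpf_cookie = 0x1234,
	);

	/* returns a link fd; the attachment lives as long as the link does */
	return bpf_link_create(prog_fd, perf_fd, BPF_PERF_EVENT, &opts);
}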
sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_iter_create(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, iter_create); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.iter_create.link_fd = link_fd; + + fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, query); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + + attr.query.target_fd = target_fd; + attr.query.attach_type = type; + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + + ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); + + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + + return libbpf_err_errno(ret); +} + +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + LIBBPF_OPTS(bpf_prog_query_opts, opts); + int ret; + + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; + + ret = bpf_prog_query_opts(target_fd, type, &opts); + + if (attach_flags) + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; + + return libbpf_err_errno(ret); +} + +int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, test); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_test_run_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.batch_size = OPTS_GET(opts, batch_size, 0); + attr.test.cpu = OPTS_GET(opts, cpu, 0); + attr.test.flags = OPTS_GET(opts, flags, 0); + attr.test.repeat = OPTS_GET(opts, repeat, 0); + attr.test.duration = OPTS_GET(opts, duration, 0); + attr.test.ctx_size_in = OPTS_GET(opts, ctx_size_in, 0); + attr.test.ctx_size_out = OPTS_GET(opts, ctx_size_out, 0); + attr.test.data_size_in = OPTS_GET(opts, data_size_in, 0); + attr.test.data_size_out = OPTS_GET(opts, data_size_out, 0); + attr.test.ctx_in = ptr_to_u64(OPTS_GET(opts, ctx_in, NULL)); + attr.test.ctx_out = ptr_to_u64(OPTS_GET(opts, ctx_out, NULL)); + attr.test.data_in = ptr_to_u64(OPTS_GET(opts, data_in, NULL)); + attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL)); + + ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, attr_sz); + + OPTS_SET(opts, data_size_out, attr.test.data_size_out); + OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out); + OPTS_SET(opts, duration, attr.test.duration); + OPTS_SET(opts, retval, attr.test.retval); + + return libbpf_err_errno(ret); +} + +static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.start_id = start_id; + + err = sys_bpf(cmd, &attr, attr_sz); + if (!err) + *next_id = attr.next_id; + + return libbpf_err_errno(err); +} + +int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID); +} + +int 
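/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): exercising a loaded program once on a dummy packet via
 * BPF_PROG_TEST_RUN and reading back its return value. prog_fd is assumed
 * to be e.g. an XDP or socket filter program loaded with bpf_prog_load().
 */
#include <bpf/bpf.h>

static int demo_test_run(int prog_fd)
{
	char pkt_in[64] = {0}, pkt_out[256];
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = pkt_in,
		.data_size_in = sizeof(pkt_in),
		.data_out = pkt_out,
		.data_size_out = sizeof(pkt_out),
		.repeat = 1,
	);
	int err;

	err = bpf_prog_test_run_opts(prog_fd, &opts);
	if (err)
		return err;

	/* opts.retval is the program's return code, opts.duration the
	 * average runtime per repetition in nanoseconds */
	return opts.retval;
}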
bpf_map_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID); +} + +int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID); +} + +int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); +} + +int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_get_fd_by_id(__u32 id) +{ + return bpf_prog_get_fd_by_id_opts(id, NULL); +} + +int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.map_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_map_get_fd_by_id(__u32 id) +{ + return bpf_map_get_fd_by_id_opts(id, NULL); +} + +int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.btf_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_get_fd_by_id(__u32 id) +{ + return bpf_btf_get_fd_by_id_opts(id, NULL); +} + +int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_link_get_fd_by_id(__u32 id) +{ + return bpf_link_get_fd_by_id_opts(id, NULL); +} + +int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = bpf_fd; + attr.info.info_len = *info_len; + attr.info.info = ptr_to_u64(info); + + err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); + if (!err) + *info_len = attr.info.info_len; + return libbpf_err_errno(err); +} + +int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(prog_fd, info, info_len); +} + +int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(map_fd, info, info_len); +} + +int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(btf_fd, info, info_len); +} + +int bpf_link_get_info_by_fd(int 
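/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): enumerating every loaded BPF program, bpftool-style, by
 * walking the ID space and converting each ID into an fd plus bpf_prog_info.
 * Requires sufficient privileges (CAP_SYS_ADMIN / CAP_BPF).
 */
#include <bpf/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void demo_list_progs(void)
{
	__u32 id = 0;

	while (bpf_prog_get_next_id(id, &id) == 0) {
		struct bpf_prog_info info;
		__u32 len = sizeof(info);
		int fd = bpf_prog_get_fd_by_id(id);

		if (fd < 0)
			continue;	/* program may have vanished meanwhile */
		memset(&info, 0, sizeof(info));
		if (bpf_prog_get_info_by_fd(fd, &info, &len) == 0)
			printf("prog id %u name %s\n", info.id, info.name);
		close(fd);
	}
}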
link_fd, struct bpf_link_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(link_fd, info, info_len); +} + +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = ptr_to_u64(name); + attr.raw_tracepoint.prog_fd = prog_fd; + + fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size); + union bpf_attr attr; + char *log_buf; + size_t log_size; + __u32 log_level; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_btf_load_opts)) + return libbpf_err(-EINVAL); + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (log_size > UINT_MAX) + return libbpf_err(-EINVAL); + if (log_size && !log_buf) + return libbpf_err(-EINVAL); + + attr.btf = ptr_to_u64(btf_data); + attr.btf_size = btf_size; + /* log_level == 0 and log_buf != NULL means "try loading without + * log_buf, but retry with log_buf and log_level=1 on error", which is + * consistent across low-level and high-level BTF and program loading + * APIs within libbpf and provides a sensible behavior in practice + */ + if (log_level) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = log_level; + } + + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + if (fd < 0 && log_buf && log_level == 0) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = 1; + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + } + + OPTS_SET(opts, log_true_size, attr.btf_log_true_size); + return libbpf_err_errno(fd); +} + +int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, + __u64 *probe_addr) +{ + const size_t attr_sz = offsetofend(union bpf_attr, task_fd_query); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.task_fd_query.pid = pid; + attr.task_fd_query.fd = fd; + attr.task_fd_query.flags = flags; + attr.task_fd_query.buf = ptr_to_u64(buf); + attr.task_fd_query.buf_len = *buf_len; + + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, attr_sz); + + *buf_len = attr.task_fd_query.buf_len; + *prog_id = attr.task_fd_query.prog_id; + *fd_type = attr.task_fd_query.fd_type; + *probe_offset = attr.task_fd_query.probe_offset; + *probe_addr = attr.task_fd_query.probe_addr; + + return libbpf_err_errno(err); +} + +int bpf_enable_stats(enum bpf_stats_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, enable_stats); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.enable_stats.type = type; + + fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_bind_map); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_bind_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_bind_map.prog_fd = prog_fd; + attr.prog_bind_map.map_fd = map_fd; + attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, 
attr_sz); + return libbpf_err_errno(ret); +} diff --git a/third_party/bpftool/libbpf/src/bpf.h b/third_party/bpftool/libbpf/src/bpf.h new file mode 100644 index 0000000..c676295 --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf.h @@ -0,0 +1,564 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common BPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see + */ +#ifndef __LIBBPF_BPF_H +#define __LIBBPF_BPF_H + +#include +#include +#include +#include + +#include "libbpf_common.h" +#include "libbpf_legacy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int libbpf_set_memlock_rlim(size_t memlock_bytes); + +struct bpf_map_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + + __u32 inner_map_fd; + __u32 map_flags; + __u64 map_extra; + + __u32 numa_node; + __u32 map_ifindex; +}; +#define bpf_map_create_opts__last_field map_ifindex + +LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts); + +struct bpf_prog_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* libbpf can retry BPF_PROG_LOAD command if bpf() syscall returns + * -EAGAIN. This field determines how many attempts libbpf has to + * make. If not specified, libbpf will use default value of 5. + */ + int attempts; + + enum bpf_attach_type expected_attach_type; + __u32 prog_btf_fd; + __u32 prog_flags; + __u32 prog_ifindex; + __u32 kern_version; + + __u32 attach_btf_id; + __u32 attach_prog_fd; + __u32 attach_btf_obj_fd; + + const int *fd_array; + + /* .BTF.ext func info data */ + const void *func_info; + __u32 func_info_cnt; + __u32 func_info_rec_size; + + /* .BTF.ext line info data */ + const void *line_info; + __u32 line_info_cnt; + __u32 line_info_rec_size; + + /* verifier log options */ + __u32 log_level; + __u32 log_size; + char *log_buf; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. 
+ */ + __u32 log_true_size; + size_t :0; +}; +#define bpf_prog_load_opts__last_field log_true_size + +LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *opts); + +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + +/* Recommended log buffer size */ +#define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ + +struct bpf_btf_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* kernel log options */ + char *log_buf; + __u32 log_level; + __u32 log_size; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + size_t :0; +}; +#define bpf_btf_load_opts__last_field log_true_size + +LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, + struct bpf_btf_load_opts *opts); + +LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); + +LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, + __u64 flags); +LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, + void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, + void *value, __u64 flags); +LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags); +LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); +LIBBPF_API int bpf_map_freeze(int fd); + +struct bpf_map_batch_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 elem_flags; + __u64 flags; +}; +#define bpf_map_batch_opts__last_field flags + + +/** + * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple + * elements in a BPF map. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param count input and output parameter; on input **count** represents the + * number of elements in the map to delete in batch; + * on output if a non-EFAULT error is returned, **count** represents the number of deleted + * elements if the output **count** value is not equal to the input **count** value + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch deletion works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. + * + * The parameter *in_batch* is the address of the first element in the batch to read. + * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent + * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate + * that the batched lookup starts from the beginning of the map. 
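/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): loading the smallest possible program ("r0 = 0; exit")
 * through bpf_prog_load() while capturing the verifier log. The two raw
 * instructions are hand-encoded purely for illustration; real programs are
 * normally compiled by clang and loaded from a .bpf.o object or skeleton.
 */
#include <bpf/bpf.h>
#include <linux/bpf.h>

static int demo_prog_load(void)
{
	struct bpf_insn insns[] = {
		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },	/* r0 = 0 */
		{ .code = BPF_JMP | BPF_EXIT },							/* exit   */
	};
	static char log[4096];
	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		.log_buf = log,
		.log_size = sizeof(log),
		/* log_level 0 with log_buf set: the log is filled in only if
		 * loading fails, via the retry logic implemented in bpf.c above */
		.log_level = 0,
	);

	return bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "demo_ret0", "GPL",
			     insns, sizeof(insns) / sizeof(insns[0]), &opts);
}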
+ * + * The *keys* and *values* are output parameters which must point to memory large enough to + * hold *count* items based on the key and value size of the map *map_fd*. The *keys* + * buffer must be of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * indicate that the batched lookup starts from the beginning of the map. + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array large enough for *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read in batch; on output it's the number of elements that were + * successfully read. + * If a non-EFAULT error is returned, count will be set as the number of elements + * that were read before the error occurred. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch lookup works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion + * of BPF map elements where each element is deleted after being retrieved. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * get address of the first element in *out_batch* + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array of *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read and delete in batch; on output it represents the number of + * elements that were successfully read and deleted + * If a non-**EFAULT** error code is returned and if the output **count** value + * is not equal to the input **count** value, up to **count** elements may + * have been deleted. + * if **EFAULT** is returned up to *count* elements may have been deleted without + * being returned via the *keys* and *values* output parameters. + * @param opts options for configuring the way the batch lookup and delete works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, + void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_update_batch()** updates multiple elements in a map + * by specifying keys and their corresponding values. + * + * The *keys* and *values* parameters must point to memory large enough + * to hold *count* items based on the key and value size of the map. + * + * The *opts* parameter can be used to control how *bpf_map_update_batch()* + * should handle keys that either do or do not already exist in the map. 
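/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): draining a hash map in chunks of 64 elements with
 * bpf_map_lookup_batch(). __u32 keys and __u64 values are assumed; passing
 * NULL opts means default (zero) elem_flags/flags, and -ENOENT signals that
 * the final batch has been returned.
 */
#include <bpf/bpf.h>
#include <errno.h>

static int demo_lookup_batch(int map_fd)
{
	__u32 keys[64];
	__u64 vals[64];
	__u32 batch, count;
	void *in = NULL;
	int err, total = 0;

	do {
		count = 64;
		err = bpf_map_lookup_batch(map_fd, in, &batch,
					   keys, vals, &count, NULL);
		if (err && err != -ENOENT)
			return err;
		total += count;		/* count now says how many were read */
		in = &batch;		/* resume where the kernel stopped */
	} while (!err);

	return total;
}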
+ * In particular the *flags* parameter of *bpf_map_batch_opts* can be + * one of the following: + * + * Note that *count* is an input and output parameter, where on output it + * represents how many elements were successfully updated. Also note that if + * **EFAULT** then *count* should not be trusted to be correct. + * + * **BPF_ANY** + * Create new elements or update existing. + * + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * + * **BPF_EXIST** + * Update existing elements. + * + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param values pointer to an array of *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to update in batch; on output if a non-EFAULT error is returned, + * **count** represents the number of updated elements if the output **count** + * value is not equal to the input **count** value. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch update works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +struct bpf_obj_pin_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_pin_opts__last_field path_fd + +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_pin_opts(int fd, const char *pathname, + const struct bpf_obj_pin_opts *opts); + +struct bpf_obj_get_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_get_opts__last_field path_fd + +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_obj_get_opts(const char *pathname, + const struct bpf_obj_get_opts *opts); + +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + unsigned int flags; + int replace_prog_fd; +}; +#define bpf_prog_attach_opts__last_field replace_prog_fd + +LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, + enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); +LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, + enum bpf_attach_type type); + +union bpf_iter_link_info; /* defined in up-to-date linux/bpf.h */ +struct bpf_link_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + union bpf_iter_link_info *iter_info; + __u32 iter_info_len; + __u32 target_btf_id; + union { + struct { + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + const char **syms; + const unsigned long *addrs; + const __u64 *cookies; + } kprobe_multi; + struct { + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + size_t :0; +}; +#define bpf_link_create_opts__last_field kprobe_multi.cookies + +LIBBPF_API int bpf_link_create(int 
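/*
 * Illustrative usage sketch (editor's addition, not part of the vendored
 * libbpf sources): the legacy (pre-bpf_link) cgroup attachment flow. A
 * cgroup v2 directory "/sys/fs/cgroup/demo" is assumed to exist and prog_fd
 * is assumed to be a loaded BPF_PROG_TYPE_CGROUP_SKB program.
 */
#include <bpf/bpf.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int demo_cgroup_attach(int prog_fd)
{
	int err, cg_fd = open("/sys/fs/cgroup/demo", O_RDONLY);

	if (cg_fd < 0)
		return -errno;
	/* BPF_F_ALLOW_MULTI lets other programs share the same hook */
	err = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_INGRESS,
			      BPF_F_ALLOW_MULTI);
	close(cg_fd);	/* the attachment outlives this fd */
	return err;
}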
prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts); + +LIBBPF_API int bpf_link_detach(int link_fd); + +struct bpf_link_update_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; /* extra flags */ + __u32 old_prog_fd; /* expected old program FD */ + __u32 old_map_fd; /* expected old map FD */ +}; +#define bpf_link_update_opts__last_field old_map_fd + +LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts); + +LIBBPF_API int bpf_iter_create(int link_fd); + +struct bpf_prog_test_run_attr { + int prog_fd; + int repeat; + const void *data_in; + __u32 data_size_in; + void *data_out; /* optional */ + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out */ + __u32 retval; /* out: return code of the BPF program */ + __u32 duration; /* out: average per repetition in ns */ + const void *ctx_in; /* optional */ + __u32 ctx_size_in; + void *ctx_out; /* optional */ + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out */ +}; + +LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); + +struct bpf_get_fd_by_id_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 open_flags; /* permissions requested for the operation on fd */ + size_t :0; +}; +#define bpf_get_fd_by_id_opts__last_field open_flags + +LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +/** + * @brief **bpf_prog_get_info_by_fd()** obtains information about the BPF + * program corresponding to *prog_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param prog_fd BPF program file descriptor + * @param info pointer to **struct bpf_prog_info** that will be populated with + * BPF program information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len); + +/** + * @brief **bpf_map_get_info_by_fd()** obtains information about the BPF + * map corresponding to *map_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. 
+ * + * @param map_fd BPF map file descriptor + * @param info pointer to **struct bpf_map_info** that will be populated with + * BPF map information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the + * BTF object corresponding to *btf_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param btf_fd BTF object file descriptor + * @param info pointer to **struct bpf_btf_info** that will be populated with + * BTF object information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len); + +/** + * @brief **bpf_link_get_info_by_fd()** obtains information about the BPF + * link corresponding to *link_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param link_fd BPF link file descriptor + * @param info pointer to **struct bpf_link_info** that will be populated with + * BPF link information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + __u32 prog_cnt; /* input+output argument */ + __u32 *prog_attach_flags; +}; +#define bpf_prog_query_opts__last_field prog_attach_flags + +LIBBPF_API int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); +LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, + __u32 query_flags, __u32 *attach_flags, + __u32 *prog_ids, __u32 *prog_cnt); + +LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); +LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, + __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr); + +#ifdef __cplusplus +/* forward-declaring enums in C++ isn't compatible with pure C enums, so + * instead define bpf_enable_stats() as accepting int as an input + */ +LIBBPF_API int bpf_enable_stats(int type); +#else +enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ +LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +#endif + +struct bpf_prog_bind_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; +}; +#define bpf_prog_bind_opts__last_field flags + +LIBBPF_API int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts); + +struct bpf_test_run_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const void *data_in; /* optional */ + void *data_out; /* optional */ + __u32
data_size_in; + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out + */ + const void *ctx_in; /* optional */ + void *ctx_out; /* optional */ + __u32 ctx_size_in; + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out + */ + __u32 retval; /* out: return code of the BPF program */ + int repeat; + __u32 duration; /* out: average per repetition in ns */ + __u32 flags; + __u32 cpu; + __u32 batch_size; +}; +#define bpf_test_run_opts__last_field batch_size + +LIBBPF_API int bpf_prog_test_run_opts(int prog_fd, + struct bpf_test_run_opts *opts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BPF_H */ diff --git a/third_party/bpftool/libbpf/src/bpf_core_read.h b/third_party/bpftool/libbpf/src/bpf_core_read.h new file mode 100644 index 0000000..1ac57bb --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_core_read.h @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_CORE_READ_H__ +#define __BPF_CORE_READ_H__ + +/* + * enum bpf_field_info_kind is passed as a second argument into + * __builtin_preserve_field_info() built-in to get a specific aspect of + * a field, captured as a first argument. __builtin_preserve_field_info(field, + * info_kind) returns __u32 integer and produces BTF field relocation, which + * is understood and processed by libbpf during BPF object loading. See + * selftests/bpf for examples. + */ +enum bpf_field_info_kind { + BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_FIELD_BYTE_SIZE = 1, + BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_FIELD_SIGNED = 3, + BPF_FIELD_LSHIFT_U64 = 4, + BPF_FIELD_RSHIFT_U64 = 5, +}; + +/* second argument to __builtin_btf_type_id() built-in */ +enum bpf_type_id_kind { + BPF_TYPE_ID_LOCAL = 0, /* BTF type ID in local program */ + BPF_TYPE_ID_TARGET = 1, /* BTF type ID in target kernel */ +}; + +/* second argument to __builtin_preserve_type_info() built-in */ +enum bpf_type_info_kind { + BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ + BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ +}; + +/* second argument to __builtin_preserve_enum_value() built-in */ +enum bpf_enum_value_kind { + BPF_ENUMVAL_EXISTS = 0, /* enum value existence in kernel */ + BPF_ENUMVAL_VALUE = 1, /* enum value value relocation */ +}; + +#define __CORE_RELO(src, field, info) \ + __builtin_preserve_field_info((src)->field, BPF_FIELD_##info) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst, \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#else +/* semantics of LSHIFT_64 assumes loading values into low-ordered bytes, so + * for big-endian we need to adjust destination pointer accordingly, based on + * field byte size + */ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst + (8 - __CORE_RELO(src, fld, BYTE_SIZE)), \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#endif + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * All this is done in relocatable manner, so bitfield changes such as + * signedness, bit size, offset changes, this will be handled automatically. + * This version of macro is using bpf_probe_read_kernel() to read underlying + * integer storage. 
Macro functions as an expression and its return type is + * bpf_probe_read_kernel()'s return value: 0, on success, <0 on error. + */ +#define BPF_CORE_READ_BITFIELD_PROBED(s, field) ({ \ + unsigned long long val = 0; \ + \ + __CORE_BITFIELD_PROBE_READ(&val, s, field); \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * This version of macro is using direct memory reads and should be used from + * BPF program types that support such functionality (e.g., typed raw + * tracepoints). + */ +#define BPF_CORE_READ_BITFIELD(s, field) ({ \ + const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ + unsigned long long val; \ + \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. \ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ + switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ + } \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +#define ___bpf_field_ref1(field) (field) +#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field) +#define ___bpf_field_ref(args...) \ + ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) + +/* + * Convenience macro to check that field actually exists in target kernel's. + * Returns: + * 1, if matching field is present in target kernel; + * 0, if no matching field found. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_exists(p->my_field); + * - field reference through type and field names: + * bpf_core_field_exists(struct my_type, my_field). + */ +#define bpf_core_field_exists(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_EXISTS) + +/* + * Convenience macro to get the byte size of a field. Works for integers, + * struct/unions, pointers, arrays, and enums. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_size(p->my_field); + * - field reference through type and field names: + * bpf_core_field_size(struct my_type, my_field). + */ +#define bpf_core_field_size(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_SIZE) + +/* + * Convenience macro to get field's byte offset. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_offset(p->my_field); + * - field reference through type and field names: + * bpf_core_field_offset(struct my_type, my_field). + */ +#define bpf_core_field_offset(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_OFFSET) + +/* + * Convenience macro to get BTF type ID of a specified type, using a local BTF + * information. 
Return 32-bit unsigned integer with type ID from program's own + * BTF. Always succeeds. + */ +#define bpf_core_type_id_local(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_LOCAL) + +/* + * Convenience macro to get BTF type ID of a target kernel's type that matches + * specified local type. + * Returns: + * - valid 32-bit unsigned type ID in kernel BTF; + * - 0, if no matching type was found in a target kernel BTF. + */ +#define bpf_core_type_id_kernel(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) exists in a target kernel. + * Returns: + * 1, if such type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_exists(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + +/* + * Convenience macro to get the byte size of a provided named type + * (struct/union/enum/typedef) in a target kernel. + * Returns: + * >= 0 size (in bytes), if type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_size(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_SIZE) + +/* + * Convenience macro to check that provided enumerator value is defined in + * a target kernel. + * Returns: + * 1, if specified enum type and its enumerator value are present in target + * kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) + +/* + * Convenience macro to get the integer value of an enumerator value in + * a target kernel. + * Returns: + * 64-bit value, if specified enum type and its enumerator value are + * present in target kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) + +/* + * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures + * offset relocation for source address using __builtin_preserve_access_index() + * built-in, provided by Clang. + * + * __builtin_preserve_access_index() takes as an argument an expression of + * taking an address of a field within struct/union. It makes compiler emit + * a relocation, which records BTF type ID describing root struct/union and an + * accessor string which describes exact embedded field that was used to take + * an address. See detailed description of this relocation format and + * semantics in comments to struct bpf_field_reloc in libbpf_internal.h. + * + * This relocation allows libbpf to adjust BPF instruction to use correct + * actual field offset, based on target kernel BTF type that matches original + * (local) BTF, used to record relocation. + */ +#define bpf_core_read(dst, sz, src) \ + bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. 
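/*
 * Illustrative BPF-side sketch (editor's addition, not part of the vendored
 * libbpf sources): using the existence checks above to guard access to
 * kernel features that may be missing. The field and enumerator chosen here
 * are placeholders and must be present in the vmlinux.h used at compile time.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

char LICENSE[] SEC("license") = "Dual BSD/GPL";

SEC("kprobe/do_unlinkat")
int probe_feature_checks(void *ctx)
{
	struct task_struct *task = (struct task_struct *)bpf_get_current_task();

	/* both checks collapse to constants during libbpf's CO-RE relocation,
	 * so the verifier can prune whichever branch does not apply */
	if (bpf_core_field_exists(task->migration_disabled))
		bpf_printk("migration_disabled occupies %u bytes",
			   bpf_core_field_size(task->migration_disabled));

	if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_ktime_get_boot_ns))
		bpf_printk("bpf_ktime_get_boot_ns() is available");

	return 0;
}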
*/ +#define bpf_core_read_user(dst, sz, src) \ + bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* + * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str() + * additionally emitting BPF CO-RE field relocation for specified source + * argument. + */ +#define bpf_core_read_str(dst, sz, src) \ + bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user_str(dst, sz, src) \ + bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +#define ___concat(a, b) a ## b +#define ___apply(fn, n) ___concat(fn, n) +#define ___nth(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, __11, N, ...) N + +/* + * return number of provided arguments; used for switch-based variadic macro + * definitions (see ___last, ___arrow, etc below) + */ +#define ___narg(...) ___nth(_, ##__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +/* + * return 0 if no arguments are passed, N - otherwise; used for + * recursively-defined macros to specify termination (0) case, and generic + * (N) case (e.g., ___read_ptrs, ___core_read) + */ +#define ___empty(...) ___nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) + +#define ___last1(x) x +#define ___last2(a, x) x +#define ___last3(a, b, x) x +#define ___last4(a, b, c, x) x +#define ___last5(a, b, c, d, x) x +#define ___last6(a, b, c, d, e, x) x +#define ___last7(a, b, c, d, e, f, x) x +#define ___last8(a, b, c, d, e, f, g, x) x +#define ___last9(a, b, c, d, e, f, g, h, x) x +#define ___last10(a, b, c, d, e, f, g, h, i, x) x +#define ___last(...) ___apply(___last, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___nolast2(a, _) a +#define ___nolast3(a, b, _) a, b +#define ___nolast4(a, b, c, _) a, b, c +#define ___nolast5(a, b, c, d, _) a, b, c, d +#define ___nolast6(a, b, c, d, e, _) a, b, c, d, e +#define ___nolast7(a, b, c, d, e, f, _) a, b, c, d, e, f +#define ___nolast8(a, b, c, d, e, f, g, _) a, b, c, d, e, f, g +#define ___nolast9(a, b, c, d, e, f, g, h, _) a, b, c, d, e, f, g, h +#define ___nolast10(a, b, c, d, e, f, g, h, i, _) a, b, c, d, e, f, g, h, i +#define ___nolast(...) ___apply(___nolast, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___arrow1(a) a +#define ___arrow2(a, b) a->b +#define ___arrow3(a, b, c) a->b->c +#define ___arrow4(a, b, c, d) a->b->c->d +#define ___arrow5(a, b, c, d, e) a->b->c->d->e +#define ___arrow6(a, b, c, d, e, f) a->b->c->d->e->f +#define ___arrow7(a, b, c, d, e, f, g) a->b->c->d->e->f->g +#define ___arrow8(a, b, c, d, e, f, g, h) a->b->c->d->e->f->g->h +#define ___arrow9(a, b, c, d, e, f, g, h, i) a->b->c->d->e->f->g->h->i +#define ___arrow10(a, b, c, d, e, f, g, h, i, j) a->b->c->d->e->f->g->h->i->j +#define ___arrow(...) ___apply(___arrow, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___type(...) typeof(___arrow(__VA_ARGS__)) + +#define ___read(read_fn, dst, src_type, src, accessor) \ + read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor) + +/* "recursively" read a sequence of inner pointers using local __t var */ +#define ___rd_first(fn, src, a) ___read(fn, &__t, ___type(src), src, a); +#define ___rd_last(fn, ...) \ + ___read(fn, &__t, ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); +#define ___rd_p1(fn, ...) const void *__t; ___rd_first(fn, __VA_ARGS__) +#define ___rd_p2(fn, ...) ___rd_p1(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p3(fn, ...) 
___rd_p2(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p4(fn, ...) ___rd_p3(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p5(fn, ...) ___rd_p4(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p6(fn, ...) ___rd_p5(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p7(fn, ...) ___rd_p6(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p8(fn, ...) ___rd_p7(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p9(fn, ...) ___rd_p8(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___read_ptrs(fn, src, ...) \ + ___apply(___rd_p, ___narg(__VA_ARGS__))(fn, src, __VA_ARGS__) + +#define ___core_read0(fn, fn_ptr, dst, src, a) \ + ___read(fn, dst, ___type(src), src, a); +#define ___core_readN(fn, fn_ptr, dst, src, ...) \ + ___read_ptrs(fn_ptr, src, ___nolast(__VA_ARGS__)) \ + ___read(fn, dst, ___type(src, ___nolast(__VA_ARGS__)), __t, \ + ___last(__VA_ARGS__)); +#define ___core_read(fn, fn_ptr, dst, src, a, ...) \ + ___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \ + src, a, ##__VA_ARGS__) + +/* + * BPF_CORE_READ_INTO() is a more performance-conscious variant of + * BPF_CORE_READ(), in which final field is read into user-provided storage. + * See BPF_CORE_READ() below for more details on general usage. + */ +#define BPF_CORE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_INTO() */ +#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as + * BPF_CORE_READ() for intermediate pointers, but then executes (and returns + * corresponding error code) bpf_core_read_str() for final string read. + */ +#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_str, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ +#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) 
({ \ + ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially + * when there are few pointer chasing steps. + * E.g., what in non-BPF world (or in BPF w/ BCC) would be something like: + * int x = s->a.b.c->d.e->f->g; + * can be succinctly achieved using BPF_CORE_READ as: + * int x = BPF_CORE_READ(s, a.b.c, d.e, f, g); + * + * BPF_CORE_READ will decompose above statement into 4 bpf_core_read (BPF + * CO-RE relocatable bpf_probe_read_kernel() wrapper) calls, logically + * equivalent to: + * 1. const void *__t = s->a.b.c; + * 2. __t = __t->d.e; + * 3. __t = __t->f; + * 4. return __t->g; + * + * Equivalence is logical, because there is a heavy type casting/preservation + * involved, as well as all the reads are happening through + * bpf_probe_read_kernel() calls using __builtin_preserve_access_index() to + * emit CO-RE relocations. + * + * N.B. Only up to 9 "field accessors" are supported, which should be more + * than enough for any practical purpose. + */ +#define BPF_CORE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Variant of BPF_CORE_READ() for reading from user-space memory. + * + * NOTE: all the source types involved are still *kernel types* and need to + * exist in kernel (or kernel module) BTF, otherwise CO-RE relocation will + * fail. Custom user types are not relocatable with CO-RE. + * The typical situation in which BPF_CORE_READ_USER() might be used is to + * read kernel UAPI types from the user-space memory passed in as a syscall + * input argument. + */ +#define BPF_CORE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ() */ +#define BPF_PROBE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +#endif + diff --git a/third_party/bpftool/libbpf/src/bpf_endian.h b/third_party/bpftool/libbpf/src/bpf_endian.h new file mode 100644 index 0000000..ec9db4f --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_endian.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_ENDIAN__ +#define __BPF_ENDIAN__ + +/* + * Isolate byte #n and put it into byte #m, for __u##b type. 
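/*
 * Usage sketch (illustrative, not part of the upstream header): how the
 * CO-RE read macros above are typically used from a tracing program.
 * Assumes <vmlinux.h> provides struct task_struct with the usual
 * real_parent, pid and comm fields; the hook and program name are
 * arbitrary example choices.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

char LICENSE[] SEC("license") = "GPL";

SEC("kprobe/do_unlinkat")
int read_parent_info(void *ctx)
{
	struct task_struct *task = (struct task_struct *)bpf_get_current_task();
	char pcomm[16];
	int ppid;

	/* one relocatable probe read per pointer hop: task->real_parent->pid */
	ppid = BPF_CORE_READ(task, real_parent, pid);
	/* string variant: chase real_parent, then read comm as a string */
	BPF_CORE_READ_STR_INTO(&pcomm, task, real_parent, comm);
	bpf_printk("parent pid=%d comm=%s", ppid, pcomm);
	return 0;
}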
+ * E.g., moving byte #6 (nnnnnnnn) into byte #1 (mmmmmmmm) for __u64: + * 1) xxxxxxxx nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx + * 2) nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 00000000 + * 3) 00000000 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn + * 4) 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 00000000 + */ +#define ___bpf_mvb(x, b, n, m) ((__u##b)(x) << (b-(n+1)*8) >> (b-8) << (m*8)) + +#define ___bpf_swab16(x) ((__u16)( \ + ___bpf_mvb(x, 16, 0, 1) | \ + ___bpf_mvb(x, 16, 1, 0))) + +#define ___bpf_swab32(x) ((__u32)( \ + ___bpf_mvb(x, 32, 0, 3) | \ + ___bpf_mvb(x, 32, 1, 2) | \ + ___bpf_mvb(x, 32, 2, 1) | \ + ___bpf_mvb(x, 32, 3, 0))) + +#define ___bpf_swab64(x) ((__u64)( \ + ___bpf_mvb(x, 64, 0, 7) | \ + ___bpf_mvb(x, 64, 1, 6) | \ + ___bpf_mvb(x, 64, 2, 5) | \ + ___bpf_mvb(x, 64, 3, 4) | \ + ___bpf_mvb(x, 64, 4, 3) | \ + ___bpf_mvb(x, 64, 5, 2) | \ + ___bpf_mvb(x, 64, 6, 1) | \ + ___bpf_mvb(x, 64, 7, 0))) + +/* LLVM's BPF target selects the endianness of the CPU + * it compiles on, or the user specifies (bpfel/bpfeb), + * respectively. The used __BYTE_ORDER__ is defined by + * the compiler, we cannot rely on __BYTE_ORDER from + * libc headers, since it doesn't reflect the actual + * requested byte order. + * + * Note, LLVM's BPF target has different __builtin_bswapX() + * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE + * in bpfel and bpfeb case, which means below, that we map + * to cpu_to_be16(). We could use it unconditionally in BPF + * case, but better not rely on it, so that this header here + * can be used from application and BPF program side, which + * use different targets. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define __bpf_ntohs(x) __builtin_bswap16(x) +# define __bpf_htons(x) __builtin_bswap16(x) +# define __bpf_constant_ntohs(x) ___bpf_swab16(x) +# define __bpf_constant_htons(x) ___bpf_swab16(x) +# define __bpf_ntohl(x) __builtin_bswap32(x) +# define __bpf_htonl(x) __builtin_bswap32(x) +# define __bpf_constant_ntohl(x) ___bpf_swab32(x) +# define __bpf_constant_htonl(x) ___bpf_swab32(x) +# define __bpf_be64_to_cpu(x) __builtin_bswap64(x) +# define __bpf_cpu_to_be64(x) __builtin_bswap64(x) +# define __bpf_constant_be64_to_cpu(x) ___bpf_swab64(x) +# define __bpf_constant_cpu_to_be64(x) ___bpf_swab64(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define __bpf_ntohs(x) (x) +# define __bpf_htons(x) (x) +# define __bpf_constant_ntohs(x) (x) +# define __bpf_constant_htons(x) (x) +# define __bpf_ntohl(x) (x) +# define __bpf_htonl(x) (x) +# define __bpf_constant_ntohl(x) (x) +# define __bpf_constant_htonl(x) (x) +# define __bpf_be64_to_cpu(x) (x) +# define __bpf_cpu_to_be64(x) (x) +# define __bpf_constant_be64_to_cpu(x) (x) +# define __bpf_constant_cpu_to_be64(x) (x) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#define bpf_htons(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htons(x) : __bpf_htons(x)) +#define bpf_ntohs(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohs(x) : __bpf_ntohs(x)) +#define bpf_htonl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htonl(x) : __bpf_htonl(x)) +#define bpf_ntohl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohl(x) : __bpf_ntohl(x)) +#define bpf_cpu_to_be64(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x)) +#define bpf_be64_to_cpu(x) \ + (__builtin_constant_p(x) ? 
\ + __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x)) + +#endif /* __BPF_ENDIAN__ */ diff --git a/third_party/bpftool/libbpf/src/bpf_gen_internal.h b/third_party/bpftool/libbpf/src/bpf_gen_internal.h new file mode 100644 index 0000000..fdf4440 --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_gen_internal.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __BPF_GEN_INTERNAL_H +#define __BPF_GEN_INTERNAL_H + +#include "bpf.h" + +struct ksym_relo_desc { + const char *name; + int kind; + int insn_idx; + bool is_weak; + bool is_typeless; + bool is_ld64; +}; + +struct ksym_desc { + const char *name; + int ref; + int kind; + union { + /* used for kfunc */ + int off; + /* used for typeless ksym */ + bool typeless; + }; + int insn; + bool is_ld64; +}; + +struct bpf_gen { + struct gen_loader_opts *opts; + void *data_start; + void *data_cur; + void *insn_start; + void *insn_cur; + ssize_t cleanup_label; + __u32 nr_progs; + __u32 nr_maps; + int log_level; + int error; + struct ksym_relo_desc *relos; + int relo_cnt; + struct bpf_core_relo *core_relos; + int core_relo_cnt; + char attach_target[128]; + int attach_kind; + struct ksym_desc *ksyms; + __u32 nr_ksyms; + int fd_array; + int nr_fd_array; +}; + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps); +void bpf_gen__free(struct bpf_gen *gen); +void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size); +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx); +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx); +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u32 value_size); +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, bool is_ld64, int kind, int insn_idx); +void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); + +#endif diff --git a/third_party/bpftool/libbpf/src/bpf_helper_defs.h b/third_party/bpftool/libbpf/src/bpf_helper_defs.h new file mode 100644 index 0000000..e338dbe --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_helper_defs.h @@ -0,0 +1,4752 @@ +/* This is auto-generated file. See bpf_doc.py for details. 
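/*
 * Usage sketch (illustrative, not from upstream): the byte-order macros
 * from bpf_endian.h above in a minimal XDP parser. h_proto is carried in
 * network byte order, so it is compared against bpf_htons(ETH_P_IP);
 * with a constant argument the macro folds to the ___bpf_swab16() form
 * at compile time.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int pass_only_ipv4(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	if ((void *)(eth + 1) > data_end)
		return XDP_DROP;
	if (eth->h_proto != bpf_htons(ETH_P_IP))
		return XDP_DROP;
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";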
*/ + +/* Forward declarations of BPF structs */ +struct bpf_fib_lookup; +struct bpf_sk_lookup; +struct bpf_perf_event_data; +struct bpf_perf_event_value; +struct bpf_pidns_info; +struct bpf_redir_neigh; +struct bpf_sock; +struct bpf_sock_addr; +struct bpf_sock_ops; +struct bpf_sock_tuple; +struct bpf_spin_lock; +struct bpf_sysctl; +struct bpf_tcp_sock; +struct bpf_tunnel_key; +struct bpf_xfrm_state; +struct linux_binprm; +struct pt_regs; +struct sk_reuseport_md; +struct sockaddr; +struct tcphdr; +struct seq_file; +struct tcp6_sock; +struct tcp_sock; +struct tcp_timewait_sock; +struct tcp_request_sock; +struct udp6_sock; +struct unix_sock; +struct task_struct; +struct cgroup; +struct __sk_buff; +struct sk_msg_md; +struct xdp_md; +struct path; +struct btf_ptr; +struct inode; +struct socket; +struct file; +struct bpf_timer; +struct mptcp_sock; +struct bpf_dynptr; +struct iphdr; +struct ipv6hdr; + +/* + * bpf_map_lookup_elem + * + * Perform a lookup in *map* for an entry associated to *key*. + * + * Returns + * Map value associated to *key*, or **NULL** if no entry was + * found. + */ +static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; + +/* + * bpf_map_update_elem + * + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; + +/* + * bpf_map_delete_elem + * + * Delete entry with *key* from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; + +/* + * bpf_probe_read + * + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; + +/* + * bpf_ktime_get_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; + +/* + * bpf_trace_printk + * + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/tracing/trace* from TraceFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. 
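/*
 * Usage sketch (illustrative, not from upstream): the classic
 * bpf_map_lookup_elem()/bpf_map_update_elem() pattern documented above,
 * here as a per-PID counter. Map name, sizing and the tracepoint hook
 * are arbitrary choices for the example.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u32);
	__type(value, __u64);
} counts SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_openat")
int count_openat(void *ctx)
{
	__u32 pid = bpf_get_current_pid_tgid() >> 32;
	__u64 init = 1, *val;

	val = bpf_map_lookup_elem(&counts, &pid);
	if (val)
		__sync_fetch_and_add(val, 1);	/* existing entry: bump it */
	else
		/* BPF_NOEXIST: only create the entry if the key is absent */
		bpf_map_update_elem(&counts, &pid, &init, BPF_NOEXIST);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";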
+ * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; + +/* + * bpf_get_prandom_u32 + * + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * + * Returns + * A random 32-bit unsigned value. + */ +static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; + +/* + * bpf_get_smp_processor_id + * + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * + * Returns + * The SMP id of the processor running the program. + */ +static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; + +/* + * bpf_skb_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. 
Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; + +/* + * bpf_l3_csum_replace + * + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; + +/* + * bpf_l4_csum_replace + * + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. 
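/*
 * Usage sketch (illustrative, not from upstream): rewriting the IPv4
 * destination address from a TC program with bpf_skb_store_bytes() and
 * repairing the IP header checksum incrementally with
 * bpf_l3_csum_replace(). Offsets assume an untagged Ethernet frame and
 * the new address is an arbitrary example value; a real program would
 * also check the EtherType/protocol and, for TCP/UDP, update the L4
 * pseudo-header checksum with bpf_l4_csum_replace(... BPF_F_PSEUDO_HDR).
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#define IP_DST_OFF	(14 + 16)	/* ETH_HLEN + offsetof(struct iphdr, daddr) */
#define IP_CSUM_OFF	(14 + 10)	/* ETH_HLEN + offsetof(struct iphdr, check) */

SEC("tc")
int rewrite_daddr(struct __sk_buff *skb)
{
	__be32 old_daddr, new_daddr = bpf_htonl(0x0a000002);	/* 10.0.0.2 */

	if (bpf_skb_load_bytes(skb, IP_DST_OFF, &old_daddr, sizeof(old_daddr)))
		return TC_ACT_OK;
	if (bpf_skb_store_bytes(skb, IP_DST_OFF, &new_daddr, sizeof(new_daddr), 0))
		return TC_ACT_SHOT;
	/* incremental update: old value, new value, 4-byte field */
	if (bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_daddr, new_daddr, sizeof(new_daddr)))
		return TC_ACT_SHOT;
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";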
+ */ +static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; + +/* + * bpf_tail_call + * + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; + +/* + * bpf_clone_redirect + * + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; + +/* + * bpf_get_current_pid_tgid + * + * Get the current pid and tgid. + * + * Returns + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + */ +static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; + +/* + * bpf_get_current_uid_gid + * + * Get the current uid and gid. + * + * Returns + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. 
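/*
 * Usage sketch (illustrative, not from upstream): unpacking the 64-bit
 * values returned by bpf_get_current_pid_tgid() and
 * bpf_get_current_uid_gid() according to the layout documented above
 * (tgid/gid in the upper 32 bits, pid/uid in the lower 32 bits).
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("kprobe/do_unlinkat")
int show_ids(void *ctx)
{
	__u64 pid_tgid = bpf_get_current_pid_tgid();
	__u64 uid_gid = bpf_get_current_uid_gid();
	__u32 tgid = pid_tgid >> 32;	/* what user space calls the PID */
	__u32 pid = (__u32)pid_tgid;	/* kernel task (thread) id */
	__u32 gid = uid_gid >> 32;
	__u32 uid = (__u32)uid_gid;

	bpf_printk("tgid=%u pid=%u", tgid, pid);
	bpf_printk("uid=%u gid=%u", uid, gid);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";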
+ */ +static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; + +/* + * bpf_get_current_comm + * + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; + +/* + * bpf_get_cgroup_classid + * + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * + * Returns + * The classid, or 0 for the default unconfigured classid. + */ +static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; + +/* + * bpf_skb_vlan_push + * + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; + +/* + * bpf_skb_vlan_pop + * + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; + +/* + * bpf_skb_get_tunnel_key + * + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. 
+ * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; + +/* + * bpf_skb_set_tunnel_key + * + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; + +/* + * bpf_perf_event_read + * + * Read the value of a perf event counter. 
This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * + * Returns + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + */ +static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; + +/* + * bpf_redirect + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * + * Returns + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + */ +static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; + +/* + * bpf_get_route_realm + * + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. 
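/*
 * Usage sketch (illustrative, not from upstream): minimal XDP use of
 * bpf_redirect() as documented above. OUT_IFINDEX is an arbitrary
 * example value; XDP only supports egress redirection, so flags are 0.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define OUT_IFINDEX 2	/* example target interface index */

SEC("xdp")
int xdp_forward_all(struct xdp_md *ctx)
{
	/* XDP_REDIRECT on success, XDP_ABORTED on error */
	return bpf_redirect(OUT_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";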
+ * + * Returns + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + */ +static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; + +/* + * bpf_perf_event_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; + +/* + * bpf_skb_load_bytes + * + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; + +/* + * bpf_get_stackid + * + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. 
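/*
 * Usage sketch (illustrative, not from upstream): streaming an event to
 * user space with bpf_perf_event_output() through a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY, as documented above. The event layout
 * and hook are arbitrary; user space reads it via perf_event_open().
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct event {
	__u32 pid;
	char comm[16];
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} events SEC(".maps");

SEC("tracepoint/sched/sched_process_exec")
int report_exec(void *ctx)
{
	struct event e = {};

	e.pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(&e.comm, sizeof(e.comm));
	/* BPF_F_CURRENT_CPU: submit to the current core's per-CPU buffer */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
	return 0;
}

char LICENSE[] SEC("license") = "GPL";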
The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The positive or null stack id on success, or a negative error + * in case of failure. + */ +static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; + +/* + * bpf_csum_diff + * + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * + * Returns + * The checksum result, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; + +/* + * bpf_skb_get_tunnel_opt + * + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * + * Returns + * The size of the option data retrieved. 
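/*
 * Usage sketch (illustrative, not from upstream): collecting kernel and
 * user stack ids with bpf_get_stackid() into a BPF_MAP_TYPE_STACK_TRACE
 * map, as documented above. Map sizing and the perf_event hook are
 * arbitrary example choices.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define MAX_STACK_DEPTH 127

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 1024);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACK_DEPTH * sizeof(__u64));
} stacks SEC(".maps");

SEC("perf_event")
int sample_stacks(void *ctx)
{
	long kstack_id = bpf_get_stackid(ctx, &stacks, 0);
	long ustack_id = bpf_get_stackid(ctx, &stacks, BPF_F_USER_STACK);

	bpf_printk("kstack=%ld ustack=%ld", kstack_id, ustack_id);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";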
+ */ +static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; + +/* + * bpf_skb_set_tunnel_opt + * + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; + +/* + * bpf_skb_change_proto + * + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; + +/* + * bpf_skb_change_type + * + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; + +/* + * bpf_skb_under_cgroup + * + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + */ +static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; + +/* + * bpf_get_hash_recalc + * + * Retrieve the hash of the packet, *skb*\ **->hash**. 
If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * + * Returns + * The 32-bit hash. + */ +static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; + +/* + * bpf_get_current_task + * + * Get the current task. + * + * Returns + * A pointer to the current task struct. + */ +static __u64 (*bpf_get_current_task)(void) = (void *) 35; + +/* + * bpf_probe_write_user + * + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; + +/* + * bpf_current_task_under_cgroup + * + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + */ +static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; + +/* + * bpf_skb_change_tail + * + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. 
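/*
 * Usage sketch (illustrative, not from upstream): gating a tracing
 * program on cgroup membership with bpf_current_task_under_cgroup(), as
 * documented above. User space is expected to store a cgroup fd at
 * index 0 of the BPF_MAP_TYPE_CGROUP_ARRAY; names are arbitrary.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} cgroup_map SEC(".maps");

SEC("kprobe/do_unlinkat")
int trace_in_cgroup(void *ctx)
{
	/* 1: task is in the cgroup2, 0: it is not, <0: error */
	if (bpf_current_task_under_cgroup(&cgroup_map, 0) != 1)
		return 0;
	bpf_printk("unlink from the traced cgroup");
	return 0;
}

char LICENSE[] SEC("license") = "GPL";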
+ */ +static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; + +/* + * bpf_skb_pull_data + * + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; + +/* + * bpf_csum_update + * + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * + * Returns + * The checksum on success, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; + +/* + * bpf_set_hash_invalid + * + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * Returns + * void. + */ +static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; + +/* + * bpf_get_numa_node_id + * + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * + * Returns + * The id of current NUMA node. 
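/*
 * Usage sketch (illustrative, not from upstream): the retry pattern
 * described above for bpf_skb_pull_data(): when a direct-access bounds
 * check fails because the bytes sit in a non-linear part of the skb,
 * pull them into the linear area and retest. NEEDED is an example size.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define NEEDED 54	/* e.g. Ethernet + IPv4 + TCP headers */

SEC("tc")
int ensure_headers(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;

	if (data + NEEDED > data_end) {
		/* may change the packet buffer: reload pointers afterwards */
		if (bpf_skb_pull_data(skb, NEEDED))
			return TC_ACT_OK;
		data = (void *)(long)skb->data;
		data_end = (void *)(long)skb->data_end;
		if (data + NEEDED > data_end)
			return TC_ACT_OK;
	}
	/* the first NEEDED bytes are now linear and directly accessible */
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";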
+ */ +static long (*bpf_get_numa_node_id)(void) = (void *) 42; + +/* + * bpf_skb_change_head + * + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; + +/* + * bpf_xdp_adjust_head + * + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; + +/* + * bpf_probe_read_str + * + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; + +/* + * bpf_get_socket_cookie + * + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * + * Returns + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + */ +static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; + +/* + * bpf_get_socket_uid + * + * Get the owner UID of the socked associated to *skb*. + * + * Returns + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + */ +static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; + +/* + * bpf_set_hash + * + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. 
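/*
 * Usage sketch (illustrative, not from upstream): growing packet
 * headroom with bpf_xdp_adjust_head() as documented above. ENCAP_LEN is
 * an arbitrary example size; every packet pointer must be re-derived
 * and re-checked after the call.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define ENCAP_LEN 8	/* example size of a header to be pushed later */

SEC("xdp")
int reserve_headroom(struct xdp_md *ctx)
{
	void *data, *data_end;

	/* negative delta moves data towards the start, creating room */
	if (bpf_xdp_adjust_head(ctx, -ENCAP_LEN))
		return XDP_ABORTED;

	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;
	if (data + ENCAP_LEN > data_end)
		return XDP_ABORTED;
	/* the new ENCAP_LEN bytes at data are now writable */
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";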
+ * + * Returns + * 0 + */ +static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; + +/* + * bpf_setsockopt + * + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; + +/* + * bpf_skb_adjust_room + * + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. 
+ * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; + +/* + * bpf_redirect_map + * + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * + * Returns + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + */ +static long (*bpf_redirect_map)(void *map, __u64 key, __u64 flags) = (void *) 51; + +/* + * bpf_sk_redirect_map + * + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; + +/* + * bpf_sock_map_update + * + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; + +/* + * bpf_xdp_adjust_meta + * + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). 
Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; + +/* + * bpf_perf_event_read_value + * + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. 
To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; + +/* + * bpf_perf_prog_read_value + * + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; + +/* + * bpf_getsockopt + * + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; + +/* + * bpf_override_return + * + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * + * Returns + * 0 + */ +static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; + +/* + * bpf_sock_ops_cb_flags_set + * + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. 
+ * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * + * Returns + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + */ +static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; + +/* + * bpf_msg_redirect_map + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; + +/* + * bpf_msg_apply_bytes + * + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. 
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * + * Returns + * 0 + */ +static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; + +/* + * bpf_msg_cork_bytes + * + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * + * Returns + * 0 + */ +static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; + +/* + * bpf_msg_pull_data + * + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; + +/* + * bpf_bind + * + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. 
This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; + +/* + * bpf_xdp_adjust_tail + * + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; + +/* + * bpf_skb_get_xfrm_state + * + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; + +/* + * bpf_get_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. 
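+ *
+ * As an illustration (the attach point and buffer size are
+ * arbitrary), a tracing program might collect a user-space stack
+ * into a fixed buffer:
+ *
+ * ::
+ *
+ *        SEC("kprobe/do_sys_openat2")
+ *        int collect_stack(struct pt_regs *ctx)
+ *        {
+ *                __u64 ips[32]; // room for 32 return addresses
+ *                long len;
+ *
+ *                len = bpf_get_stack(ctx, ips, sizeof(ips), BPF_F_USER_STACK);
+ *                if (len < 0)
+ *                        return 0; // stack not available
+ *                // len bytes of ips[] are valid, i.e. len / 8 frames
+ *                return 0;
+ *        }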
+ * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; + +/* + * bpf_skb_load_bytes_relative + * + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; + +/* + * bpf_fib_lookup + * + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. 
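+ *
+ * A sketch of the intended flow (packet parsing and the Ethernet
+ * header rewrite are elided; names are illustrative): an XDP
+ * program fills a **struct bpf_fib_lookup**, performs the lookup
+ * and redirects on success:
+ *
+ * ::
+ *
+ *        SEC("xdp")
+ *        int xdp_fwd(struct xdp_md *ctx)
+ *        {
+ *                struct bpf_fib_lookup fib = {};
+ *                long rc;
+ *
+ *                fib.family  = 2; // AF_INET
+ *                fib.ifindex = ctx->ingress_ifindex;
+ *                // ... fill fib.ipv4_src, fib.ipv4_dst, fib.l4_protocol, ...
+ *
+ *                rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), 0);
+ *                if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
+ *                        // rewrite the Ethernet header with fib.smac/fib.dmac here
+ *                        return bpf_redirect(fib.ifindex, 0);
+ *                }
+ *                return XDP_PASS;
+ *        }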
+ * + * Returns + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + */ +static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; + +/* + * bpf_sock_hash_update + * + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; + +/* + * bpf_msg_redirect_hash + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; + +/* + * bpf_sk_redirect_hash + * + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; + +/* + * bpf_lwt_push_encap + * + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. 
+ * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; + +/* + * bpf_lwt_seg6_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; + +/* + * bpf_lwt_seg6_adjust_srh + * + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; + +/* + * bpf_lwt_seg6_action + * + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. 
+ * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; + +/* + * bpf_rc_repeat + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; + +/* + * bpf_rc_keydown + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; + +/* + * bpf_skb_cgroup_id + * + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; + +/* + * bpf_get_current_cgroup_id + * + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * + * Returns + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + */ +static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; + +/* + * bpf_get_local_storage + * + * Get the pointer to the local storage area. 
+ * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * + * Returns + * A pointer to the local storage area. + */ +static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; + +/* + * bpf_sk_select_reuseport + * + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; + +/* + * bpf_skb_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; + +/* + * bpf_sk_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. 
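+ *
+ * A minimal sketch (tuple filling is elided; the section and
+ * program name are illustrative) showing the mandatory pairing
+ * with **bpf_sk_release**\ ():
+ *
+ * ::
+ *
+ *        SEC("tc")
+ *        int lookup_example(struct __sk_buff *skb)
+ *        {
+ *                struct bpf_sock_tuple tuple = {};
+ *                struct bpf_sock *sk;
+ *
+ *                // ... fill tuple.ipv4.saddr/daddr/sport/dport from the packet ...
+ *                sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
+ *                                       BPF_F_CURRENT_NETNS, 0);
+ *                if (sk)
+ *                        bpf_sk_release(sk); // required for any non-NULL result
+ *                return TC_ACT_OK;
+ *        }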
+ */ +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; + +/* + * bpf_sk_lookup_udp + * + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; + +/* + * bpf_sk_release + * + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_release)(void *sock) = (void *) 86; + +/* + * bpf_map_push_elem + * + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; + +/* + * bpf_map_pop_elem + * + * Pop an element from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; + +/* + * bpf_map_peek_elem + * + * Get an element from *map* without removing it. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; + +/* + * bpf_msg_push_data + * + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Returns + * 0 on success, or a negative error in case of failure. 
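+ *
+ * A minimal sketch (offset and length are arbitrary): an sk_msg
+ * program reserving 8 bytes of metadata at the start of the
+ * message:
+ *
+ * ::
+ *
+ *        SEC("sk_msg")
+ *        int msg_reserve_hdr(struct sk_msg_md *msg)
+ *        {
+ *                if (bpf_msg_push_data(msg, 0, 8, 0))
+ *                        return SK_DROP; // e.g. allocation failure
+ *                // write the 8 bytes via msg->data after a bounds check
+ *                return SK_PASS;
+ *        }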
+ */ +static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; + +/* + * bpf_msg_pop_data + * + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; + +/* + * bpf_rc_pointer_rel + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; + +/* + * bpf_spin_lock + * + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. 
+ * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * + * Returns + * 0 + */ +static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; + +/* + * bpf_spin_unlock + * + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * + * Returns + * 0 + */ +static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; + +/* + * bpf_sk_fullsock + * + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; + +/* + * bpf_tcp_sock + * + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Returns + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; + +/* + * bpf_skb_ecn_set_ce + * + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * + * Returns + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + */ +static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; + +/* + * bpf_get_listener_sock + * + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; + +/* + * bpf_skc_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; + +/* + * bpf_tcp_check_syncookie + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. 
+ */ +static long (*bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; + +/* + * bpf_sysctl_get_name + * + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + */ +static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; + +/* + * bpf_sysctl_get_current_value + * + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + */ +static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; + +/* + * bpf_sysctl_get_new_value + * + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; + +/* + * bpf_sysctl_set_new_value + * + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * + * Returns + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; + +/* + * bpf_strtol + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). 
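+ *
+ * For instance (buffer size and the policy threshold are
+ * illustrative), a **BPF_PROG_TYPE_CGROUP_SYSCTL** program could
+ * parse the value user space is writing:
+ *
+ * ::
+ *
+ *        SEC("cgroup/sysctl")
+ *        int sysctl_limit(struct bpf_sysctl *ctx)
+ *        {
+ *                char buf[16] = {};
+ *                long val = 0;
+ *
+ *                if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) < 0)
+ *                        return 1; // not a write (or too long): allow as-is
+ *                if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
+ *                        return 0; // reject unparsable input
+ *                return val > 0 && val <= 4096; // example policy
+ *        }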
+ * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; + +/* + * bpf_strtoul + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; + +/* + * bpf_sk_storage_get + * + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * + * Returns + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + */ +static void *(*bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags) = (void *) 107; + +/* + * bpf_sk_storage_delete + * + * Delete a bpf-local-storage from a *sk*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + */ +static long (*bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; + +/* + * bpf_send_signal + * + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. 
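+ *
+ * A small sketch (the PID check is a placeholder for real policy
+ * logic): a tracepoint program that kills a specific offending
+ * process:
+ *
+ * ::
+ *
+ *        SEC("tracepoint/syscalls/sys_enter_execve")
+ *        int deny_exec(void *ctx)
+ *        {
+ *                __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ *
+ *                if (pid == 1234)            // placeholder PID
+ *                        bpf_send_signal(9); // 9 == SIGKILL
+ *                return 0;
+ *        }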
+ */ +static long (*bpf_send_signal)(__u32 sig) = (void *) 109; + +/* + * bpf_tcp_gen_syncookie + * + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + */ +static __s64 (*bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; + +/* + * bpf_skb_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; + +/* + * bpf_probe_read_user + * + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; + +/* + * bpf_probe_read_kernel + * + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; + +/* + * bpf_probe_read_user_str + * + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. 
See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * + * Returns + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; + +/* + * bpf_probe_read_kernel_str + * + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * + * Returns + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + */ +static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; + +/* + * bpf_tcp_send_ack + * + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; + +/* + * bpf_send_signal_thread + * + * Send signal *sig* to the thread corresponding to the current task. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + */ +static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; + +/* + * bpf_jiffies64 + * + * Obtain the 64bit jiffies + * + * Returns + * The 64 bit jiffies + */ +static __u64 (*bpf_jiffies64)(void) = (void *) 118; + +/* + * bpf_read_branch_records + * + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. 
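+
+/*
+ * Usage sketch (illustrative): copy a NUL-terminated user-space string
+ * with bpf_probe_read_user_str(). Assumes BPF_KPROBE() from
+ * bpf_tracing.h and bpf_printk() from bpf_helpers.h; the probed symbol
+ * is a placeholder for a kernel function whose first argument is a
+ * user-space string pointer.
+ */
+SEC("kprobe/some_func_with_user_string_arg")
+int BPF_KPROBE(trace_user_str, const char *user_str)
+{
+	char buf[64];
+	long n = bpf_probe_read_user_str(buf, sizeof(buf), user_str);
+
+	/* n includes the trailing NUL on success, negative on error */
+	if (n > 0)
+		bpf_printk("copied %ld bytes: %s", n, buf);
+	return 0;
+}
+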
+ */ +static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; + +/* + * bpf_get_ns_current_pid_tgid + * + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * Returns + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + */ +static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; + +/* + * bpf_xdp_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; + +/* + * bpf_get_netns_cookie + * + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * + * Returns + * A 8-byte long opaque number. + */ +static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; + +/* + * bpf_get_current_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; + +/* + * bpf_sk_assign + * + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. 
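+
+/*
+ * Usage sketch (illustrative): filter by cgroup v2 subtree with
+ * bpf_get_current_ancestor_cgroup_id(). The ancestor level, cgroup id
+ * and attach point are placeholders; assumes SEC() and bpf_printk()
+ * from bpf_helpers.h.
+ */
+const volatile __u64 target_cgid = 0; /* filled in by the loader */
+
+SEC("tracepoint/syscalls/sys_enter_execve")
+int filter_by_cgroup(void *ctx)
+{
+	/* ancestor level 1 is the first cgroup below the root */
+	if (target_cgid && bpf_get_current_ancestor_cgroup_id(1) != target_cgid)
+		return 0;
+	bpf_printk("execve inside the watched cgroup subtree");
+	return 0;
+}
+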
When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + */ +static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; + +/* + * bpf_ktime_get_boot_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; + +/* + * bpf_seq_printf + * + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; + +/* + * bpf_seq_write + * + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; + +/* + * bpf_sk_cgroup_id + * + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. 
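+
+/*
+ * Usage sketch (illustrative): print one line per task from a BPF
+ * iterator with bpf_seq_printf(). Assumes struct bpf_iter__task from
+ * vmlinux.h and SEC() from bpf_helpers.h; the program name is made up.
+ */
+SEC("iter/task")
+int dump_tasks(struct bpf_iter__task *ctx)
+{
+	static const char fmt[] = "%u %s\n";
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	__u64 args[2];
+
+	if (!task) /* NULL on the final call of the iteration */
+		return 0;
+	args[0] = task->tgid;
+	args[1] = (__u64)(long)task->comm; /* %s reads the pointed-to kernel memory */
+	bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
+	return 0;
+}
+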
The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_cgroup_id)(void *sk) = (void *) 128; + +/* + * bpf_sk_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void *) 129; + +/* + * bpf_ringbuf_output + * + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; + +/* + * bpf_ringbuf_reserve + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * + * Returns + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + */ +static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; + +/* + * bpf_ringbuf_submit + * + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; + +/* + * bpf_ringbuf_discard + * + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. 
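+
+/*
+ * Usage sketch (illustrative): reserve, fill and submit a sample in a
+ * BPF_MAP_TYPE_RINGBUF. Assumes bpf_helpers.h macros; the map name,
+ * event layout and attach point are made up.
+ */
+struct exec_event {
+	__u32 pid;
+	char comm[16];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+	__uint(max_entries, 256 * 1024); /* power-of-2 multiple of the page size */
+} rb SEC(".maps");
+
+SEC("tracepoint/sched/sched_process_exec")
+int on_exec(void *ctx)
+{
+	struct exec_event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+
+	if (!e)
+		return 0; /* ring buffer full */
+	e->pid = bpf_get_current_pid_tgid() >> 32;
+	bpf_get_current_comm(e->comm, sizeof(e->comm));
+	bpf_ringbuf_submit(e, 0); /* or bpf_ringbuf_discard(e, 0) to drop it */
+	return 0;
+}
+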
+ * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; + +/* + * bpf_ringbuf_query + * + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * + * Returns + * Requested value, or 0, if *flags* are not recognized. + */ +static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; + +/* + * bpf_csum_level + * + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * + * Returns + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + */ +static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; + +/* + * bpf_skc_to_tcp6_sock + * + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; + +/* + * bpf_skc_to_tcp_sock + * + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; + +/* + * bpf_skc_to_tcp_timewait_sock + * + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; + +/* + * bpf_skc_to_tcp_request_sock + * + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. 
+ */ +static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; + +/* + * bpf_skc_to_udp6_sock + * + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; + +/* + * bpf_get_task_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; + +/* + * bpf_load_hdr_opt + * + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * + * Returns + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. 
+ * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, __u32 len, __u64 flags) = (void *) 142; + +/* + * bpf_store_hdr_opt + * + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, __u32 len, __u64 flags) = (void *) 143; + +/* + * bpf_reserve_hdr_opt + * + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 flags) = (void *) 144; + +/* + * bpf_inode_storage_get + * + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. 
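+
+/*
+ * Usage sketch (illustrative): reserve and write an experimental TCP
+ * header option from a sockops program with bpf_reserve_hdr_opt() and
+ * bpf_store_hdr_opt(). The option kind and payload are placeholders;
+ * assumes vmlinux.h and bpf_helpers.h, and that the HDR_OPT callbacks
+ * have been enabled via bpf_sock_ops_cb_flags_set() (not shown).
+ */
+SEC("sockops")
+int write_exp_opt(struct bpf_sock_ops *skops)
+{
+	/* kind 254 (experimental), kind-length 4, then two payload bytes */
+	__u8 opt[4] = { 254, 4, 0xeB, 0x9F };
+
+	switch (skops->op) {
+	case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+		bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
+		break;
+	case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+		bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
+		break;
+	}
+	return 1;
+}
+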
+ * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 flags) = (void *) 145; + +/* + * bpf_inode_storage_delete + * + * Delete a bpf_local_storage from an *inode*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static int (*bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; + +/* + * bpf_d_path + * + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147; + +/* + * bpf_copy_from_user + * + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = (void *) 148; + +/* + * bpf_snprintf_btf + * + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * + * Returns + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + */ +static long (*bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, __u32 btf_ptr_size, __u64 flags) = (void *) 149; + +/* + * bpf_seq_printf_btf + * + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * + * Returns + * 0 on success or a negative error in case of failure. + */ +static long (*bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 ptr_size, __u64 flags) = (void *) 150; + +/* + * bpf_skb_cgroup_classid + * + * See **bpf_get_cgroup_classid**\ () for the main description. 
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; + +/* + * bpf_redirect_neigh + * + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + */ +static long (*bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, int plen, __u64 flags) = (void *) 152; + +/* + * bpf_per_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * + * Returns + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + */ +static void *(*bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 153; + +/* + * bpf_this_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * + * Returns + * A pointer pointing to the kernel percpu variable on this cpu. + */ +static void *(*bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; + +/* + * bpf_redirect_peer + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
+ */ +static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; + +/* + * bpf_task_storage_get + * + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_task_storage_get)(void *map, struct task_struct *task, void *value, __u64 flags) = (void *) 156; + +/* + * bpf_task_storage_delete + * + * Delete a bpf_local_storage from a *task*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_task_storage_delete)(void *map, struct task_struct *task) = (void *) 157; + +/* + * bpf_get_current_task_btf + * + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * + * Returns + * Pointer to the current task. + */ +static struct task_struct *(*bpf_get_current_task_btf)(void) = (void *) 158; + +/* + * bpf_bprm_opts_set + * + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * + * Returns + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + */ +static long (*bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void *) 159; + +/* + * bpf_ktime_get_coarse_ns + * + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_coarse_ns)(void) = (void *) 160; + +/* + * bpf_ima_inode_hash + * + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + */ +static long (*bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161; + +/* + * bpf_sock_from_file + * + * If the given file represents a socket, returns the associated + * socket. + * + * Returns + * A pointer to a struct socket on success or NULL if the file is + * not a socket. 
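+
+/*
+ * Usage sketch (illustrative): per-task storage keyed by the current
+ * task, combining bpf_get_current_task_btf() with
+ * bpf_task_storage_get(). Assumes bpf_helpers.h macros and a kernel
+ * recent enough to allow task storage from tracing programs; names and
+ * the attach point are made up.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, __u64);
+} syscalls_per_task SEC(".maps");
+
+SEC("tracepoint/raw_syscalls/sys_enter")
+int count_per_task(void *ctx)
+{
+	struct task_struct *task = bpf_get_current_task_btf();
+	__u64 *cnt = bpf_task_storage_get(&syscalls_per_task, task, 0,
+					  BPF_LOCAL_STORAGE_GET_F_CREATE);
+
+	if (cnt)
+		__sync_fetch_and_add(cnt, 1);
+	return 0;
+}
+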
+ */ +static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; + +/* + * bpf_check_mtu + * + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * + * Returns + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + */ +static long (*bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len_diff, __u64 flags) = (void *) 163; + +/* + * bpf_for_each_map_elem + * + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. 
+ * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * + * Returns + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + */ +static long (*bpf_for_each_map_elem)(void *map, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 164; + +/* + * bpf_snprintf + * + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * + * Returns + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + */ +static long (*bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *data, __u32 data_len) = (void *) 165; + +/* + * bpf_sys_bpf + * + * Execute bpf syscall with given arguments. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 166; + +/* + * bpf_btf_find_by_name_kind + * + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * + * Returns + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + */ +static long (*bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, int flags) = (void *) 167; + +/* + * bpf_sys_close + * + * Execute close syscall for given FD. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_close)(__u32 fd) = (void *) 168; + +/* + * bpf_timer_init + * + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * + * Returns + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. 
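+
+/*
+ * Usage sketch (illustrative): walk a hash map with
+ * bpf_for_each_map_elem(). Assumes bpf_helpers.h macros; the map, the
+ * callback and the attach point are made up.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1024);
+	__type(key, __u32);
+	__type(value, __u64);
+} counters SEC(".maps");
+
+struct sum_ctx {
+	__u64 total;
+};
+
+static long sum_one(struct bpf_map *map, const void *key, void *value, void *ctx)
+{
+	struct sum_ctx *c = ctx;
+
+	c->total += *(__u64 *)value;
+	return 0; /* 0 = keep iterating, 1 = stop early */
+}
+
+SEC("tracepoint/syscalls/sys_enter_getpid")
+int sum_counters(void *tp_ctx)
+{
+	struct sum_ctx c = { .total = 0 };
+
+	bpf_for_each_map_elem(&counters, sum_one, &c, 0);
+	bpf_printk("running total: %llu", c.total);
+	return 0;
+}
+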
+ * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = (void *) 169; + +/* + * bpf_timer_set_callback + * + * Configure the timer to call *callback_fn* static function. + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn) = (void *) 170; + +/* + * bpf_timer_start + * + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + */ +static long (*bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags) = (void *) 171; + +/* + * bpf_timer_cancel + * + * Cancel the timer and wait for callback_fn to finish if it was running. + * + * Returns + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + */ +static long (*bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; + +/* + * bpf_get_func_ip + * + * Get address of the traced function (for tracing and kprobe programs). + * + * Returns + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). + */ +static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; + +/* + * bpf_get_attach_cookie + * + * Get bpf_cookie value provided (optionally) during the program + * attachment. 
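+
+/*
+ * Usage sketch (illustrative): arm a map-embedded struct bpf_timer with
+ * bpf_timer_init()/bpf_timer_set_callback()/bpf_timer_start(). Assumes
+ * bpf_helpers.h macros; the map, callback and attach point are made up.
+ */
+struct timer_elem {
+	struct bpf_timer t;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct timer_elem);
+} timers SEC(".maps");
+
+static int timer_cb(void *map, __u32 *key, struct timer_elem *val)
+{
+	bpf_printk("timer fired");
+	return 0;
+}
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int arm_timer(void *ctx)
+{
+	__u32 key = 0;
+	struct timer_elem *val = bpf_map_lookup_elem(&timers, &key);
+
+	if (!val)
+		return 0;
+	/* init returns -EBUSY once already initialized; ignored for brevity */
+	bpf_timer_init(&val->t, &timers, 1 /* CLOCK_MONOTONIC */);
+	bpf_timer_set_callback(&val->t, timer_cb);
+	bpf_timer_start(&val->t, 1000000000 /* 1 s */, 0);
+	return 0;
+}
+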
It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * + * Returns + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + */ +static __u64 (*bpf_get_attach_cookie)(void *ctx) = (void *) 174; + +/* + * bpf_task_pt_regs + * + * Get the struct pt_regs associated with **task**. + * + * Returns + * A pointer to struct pt_regs. + */ +static long (*bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; + +/* + * bpf_get_branch_snapshot + * + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + */ +static long (*bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = (void *) 176; + +/* + * bpf_trace_vprintk + * + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 177; + +/* + * bpf_skc_to_unix_sock + * + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct unix_sock *(*bpf_skc_to_unix_sock)(void *sk) = (void *) 178; + +/* + * bpf_kallsyms_lookup_name + * + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * + * Returns + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + */ +static long (*bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags, __u64 *res) = (void *) 179; + +/* + * bpf_find_vma + * + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * + * Returns + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. 
+ */ +static long (*bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 180; + +/* + * bpf_loop + * + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * + * Returns + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + */ +static long (*bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 181; + +/* + * bpf_strncmp + * + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * + * Returns + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + */ +static long (*bpf_strncmp)(const char *s1, __u32 s1_sz, const char *s2) = (void *) 182; + +/* + * bpf_get_func_arg + * + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * + * Returns + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + */ +static long (*bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183; + +/* + * bpf_get_func_ret + * + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * + * Returns + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + */ +static long (*bpf_get_func_ret)(void *ctx, __u64 *value) = (void *) 184; + +/* + * bpf_get_func_arg_cnt + * + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * + * Returns + * The number of argument registers of the traced function. + */ +static long (*bpf_get_func_arg_cnt)(void *ctx) = (void *) 185; + +/* + * bpf_get_retval + * + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Returns + * The BPF program's return value. + */ +static int (*bpf_get_retval)(void) = (void *) 186; + +/* + * bpf_set_retval + * + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. 
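+
+/*
+ * Usage sketch (illustrative): a bounded loop via bpf_loop(). Assumes
+ * SEC() and bpf_printk() from bpf_helpers.h; the attach point is a
+ * placeholder.
+ */
+struct loop_ctx {
+	__u64 sum;
+};
+
+static long add_index(__u32 index, void *ctx)
+{
+	struct loop_ctx *c = ctx;
+
+	c->sum += index;
+	return 0; /* 0 = next iteration, 1 = break out of the loop */
+}
+
+SEC("tracepoint/syscalls/sys_enter_getuid")
+int loop_demo(void *tp_ctx)
+{
+	struct loop_ctx c = { .sum = 0 };
+
+	bpf_loop(100, add_index, &c, 0); /* 0 + 1 + ... + 99 = 4950 */
+	bpf_printk("sum = %llu", c.sum);
+	return 0;
+}
+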
+ * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static int (*bpf_set_retval)(int retval) = (void *) 187; + +/* + * bpf_xdp_get_buff_len + * + * Get the total size of a given xdp buff (linear and paged area) + * + * Returns + * The total size of a given xdp buffer. + */ +static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; + +/* + * bpf_xdp_load_bytes + * + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; + +/* + * bpf_xdp_store_bytes + * + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; + +/* + * bpf_copy_from_user_task + * + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * + * Returns + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + */ +static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr, struct task_struct *tsk, __u64 flags) = (void *) 191; + +/* + * bpf_skb_set_tstamp + * + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * + * Returns + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + */ +static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) = (void *) 192; + +/* + * bpf_ima_file_hash + * + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. 
+ */ +static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193; + +/* + * bpf_kptr_xchg + * + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * + * Returns + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + */ +static void *(*bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194; + +/* + * bpf_map_lookup_percpu_elem + * + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * + * Returns + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + */ +static void *(*bpf_map_lookup_percpu_elem)(void *map, const void *key, __u32 cpu) = (void *) 195; + +/* + * bpf_skc_to_mptcp_sock + * + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct mptcp_sock *(*bpf_skc_to_mptcp_sock)(void *sk) = (void *) 196; + +/* + * bpf_dynptr_from_mem + * + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + */ +static long (*bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197; + +/* + * bpf_ringbuf_reserve_dynptr + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_reserve_dynptr)(void *ringbuf, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 198; + +/* + * bpf_ringbuf_submit_dynptr + * + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 199; + +/* + * bpf_ringbuf_discard_dynptr + * + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 200; + +/* + * bpf_dynptr_read + * + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. 
+ */ +static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201; + +/* + * bpf_dynptr_write + * + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + */ +static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202; + +/* + * bpf_dynptr_data + * + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * + * Returns + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + */ +static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203; + +/* + * bpf_tcp_raw_gen_syncookie_ipv4 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 204; + +/* + * bpf_tcp_raw_gen_syncookie_ipv6 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 205; + +/* + * bpf_tcp_raw_check_syncookie_ipv4 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. 
+ * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th) = (void *) 206; + +/* + * bpf_tcp_raw_check_syncookie_ipv6 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th) = (void *) 207; + +/* + * bpf_ktime_get_tai_ns + * + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_tai_ns)(void) = (void *) 208; + +/* + * bpf_user_ringbuf_drain + * + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * + * Returns + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + */ +static long (*bpf_user_ringbuf_drain)(void *map, void *callback_fn, void *ctx, __u64 flags) = (void *) 209; + +/* + * bpf_cgrp_storage_get + * + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. 
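A sketch of the bpf_user_ringbuf_drain() callback contract described above (return 0 to keep draining, 1 to stop). The map name, sample layout, and attach point are assumptions for illustration.

struct {
    __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
    __uint(max_entries, 256 * 1024);
} user_rb SEC(".maps");

static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
{
    __u64 sample;

    if (bpf_dynptr_read(&sample, sizeof(sample), dynptr, 0, 0))
        return 1;   /* stop draining on a malformed sample */
    bpf_printk("user sample: %llu", sample);
    return 0;       /* keep draining */
}

SEC("kprobe/do_unlinkat")
int drain_example(void *ctx)
{
    long n = bpf_user_ringbuf_drain(&user_rb, handle_sample, NULL, 0);

    if (n < 0)
        bpf_printk("drain failed: %ld", n);
    return 0;
}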
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_cgrp_storage_get)(void *map, struct cgroup *cgroup, void *value, __u64 flags) = (void *) 210; + +/* + * bpf_cgrp_storage_delete + * + * Delete a bpf_local_storage from a *cgroup*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_cgrp_storage_delete)(void *map, struct cgroup *cgroup) = (void *) 211; + + diff --git a/third_party/bpftool/libbpf/src/bpf_helpers.h b/third_party/bpftool/libbpf/src/bpf_helpers.h new file mode 100644 index 0000000..bbab9ad --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_helpers.h @@ -0,0 +1,402 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_HELPERS__ +#define __BPF_HELPERS__ + +/* + * Note that bpf programs need to include either + * vmlinux.h (auto-generated from BTF) or linux/types.h + * in advance since bpf_helper_defs.h uses such types + * as __u64. + */ +#include "bpf_helper_defs.h" + +#define __uint(name, val) int (*name)[val] +#define __type(name, val) typeof(val) *name +#define __array(name, val) typeof(val) *name[] + +/* + * Helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. + */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ + +#endif + +/* Avoid 'linux/stddef.h' definition of '__always_inline'. 
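To show how the __uint()/__type() declaration macros and SEC(".maps") defined above combine in practice, here is a hypothetical BTF-defined hash map plus a program that updates it. The map name, sizes, and attach point are assumptions; vmlinux.h is assumed to be included.

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 1024);
    __type(key, __u32);
    __type(value, __u64);
} start_ns SEC(".maps");

SEC("kprobe/do_unlinkat")
int map_macro_example(void *ctx)
{
    __u32 pid = bpf_get_current_pid_tgid() >> 32;
    __u64 ts = bpf_ktime_get_ns();

    bpf_map_update_elem(&start_ns, &pid, &ts, BPF_ANY);
    return 0;
}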
*/ +#undef __always_inline +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __noinline +#define __noinline __attribute__((noinline)) +#endif +#ifndef __weak +#define __weak __attribute__((weak)) +#endif + +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) +#endif + +/* + * Helper macros to manipulate data structures + */ + +/* offsetof() definition that uses __builtin_offset() might not preserve field + * offset CO-RE relocation properly, so force-redefine offsetof() using + * old-school approach which works with CO-RE correctly + */ +#undef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) + +/* redefined container_of() to ensure we use the above offsetof() macro */ +#undef container_of +#define container_of(ptr, type, member) \ + ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) + +/* + * Compiler (optimization) barrier. + */ +#ifndef barrier +#define barrier() asm volatile("" ::: "memory") +#endif + +/* Variable-specific compiler (optimization) barrier. It's a no-op which makes + * compiler believe that there is some black box modification of a given + * variable and thus prevents compiler from making extra assumption about its + * value and potential simplifications and optimizations on this variable. + * + * E.g., compiler might often delay or even omit 32-bit to 64-bit casting of + * a variable, making some code patterns unverifiable. Putting barrier_var() + * in place will ensure that cast is performed before the barrier_var() + * invocation, because compiler has to pessimistically assume that embedded + * asm section might perform some extra operations on that variable. + * + * This is a variable-specific variant of more global barrier(). + */ +#ifndef barrier_var +#define barrier_var(var) asm volatile("" : "+r"(var)) +#endif + +/* + * Helper macro to throw a compilation error if __bpf_unreachable() gets + * built into the resulting code. This works given BPF back end does not + * implement __builtin_trap(). This is useful to assert that certain paths + * of the program code are never used and hence eliminated by the compiler. + * + * For example, consider a switch statement that covers known cases used by + * the program. __bpf_unreachable() can then reside in the default case. If + * the program gets extended such that a case is not covered in the switch + * statement, then it will throw a build error due to the default case not + * being compiled out. + */ +#ifndef __bpf_unreachable +# define __bpf_unreachable() __builtin_trap() +#endif + +/* + * Helper function to perform a tail call with a constant/immediate map slot. 
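A small sketch of the __bpf_unreachable() pattern described above, under the assumption that the object is built with optimization (as BPF objects normally are) so the compiler can prove the default case dead and eliminate the retained __builtin_trap(). The function itself is illustrative.

static __always_inline int op_cost(int op)
{
    op &= 1;    /* only 0 or 1 can reach the switch, so 'default' is provably dead */
    switch (op) {
    case 0:
        return 1;
    case 1:
        return 4;
    default:
        /* if a new value ever becomes reachable here, the build fails
         * instead of silently falling through */
        __bpf_unreachable();
    }
}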
+ */ +#if __clang_major__ >= 8 && defined(__bpf__) +static __always_inline void +bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) +{ + if (!__builtin_constant_p(slot)) + __bpf_unreachable(); + + /* + * Provide a hard guarantee that LLVM won't optimize setting r2 (map + * pointer) and r3 (constant map index) from _different paths_ ending + * up at the _same_ call insn as otherwise we won't be able to use the + * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel + * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key + * tracking for prog array pokes") for details on verifier tracking. + * + * Note on clobber list: we need to stay in-line with BPF calling + * convention, so even if we don't end up using r0, r4, r5, we need + * to mark them as clobber so that LLVM doesn't end up using them + * before / after the call. + */ + asm volatile("r1 = %[ctx]\n\t" + "r2 = %[map]\n\t" + "r3 = %[slot]\n\t" + "call 12" + :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) + : "r0", "r1", "r2", "r3", "r4", "r5"); +} +#endif + +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +#define __kconfig __attribute__((section(".kconfig"))) +#define __ksym __attribute__((section(".ksyms"))) +#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) +#define __kptr __attribute__((btf_type_tag("kptr"))) + +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ + !!sym; \ +}) + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) \ + ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + +/* + * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values + * in a structure. + */ +#define BPF_SEQ_PRINTF(seq, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) + +#ifdef BPF_NO_GLOBAL_DATA +#define BPF_PRINTK_FMT_MOD +#else +#define BPF_PRINTK_FMT_MOD static const +#endif + +#define __bpf_printk(fmt, ...) \ +({ \ + BPF_PRINTK_FMT_MOD char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +/* + * __bpf_vprintk wraps the bpf_trace_vprintk helper with variadic arguments + * instead of an array of u64. + */ +#define __bpf_vprintk(fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_trace_vprintk(___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args + * Otherwise use __bpf_vprintk + */ +#define ___bpf_pick_printk(...) \ + ___bpf_nth(_, ##__VA_ARGS__, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_printk /*3*/, __bpf_printk /*2*/,\ + __bpf_printk /*1*/, __bpf_printk /*0*/) + +/* Helper macro to print out debug messages */ +#define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) + +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. 
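A sketch of how the bpf_printk() dispatch defined above behaves: with three or fewer format arguments it expands to __bpf_printk (bpf_trace_printk), with more it expands to __bpf_vprintk (bpf_trace_vprintk, which needs a kernel that provides that helper). The attach point is illustrative.

SEC("kprobe/do_unlinkat")
int printk_example(void *ctx)
{
    int pid = bpf_get_current_pid_tgid() >> 32;
    __u64 uid_gid = bpf_get_current_uid_gid();

    /* up to 3 format args: routed to bpf_trace_printk() */
    bpf_printk("pid %d", pid);

    /* more than 3 args: routed to bpf_trace_vprintk() */
    bpf_printk("pid %d uid %u gid %u ts %llu",
               pid, (__u32)uid_gid, (__u32)(uid_gid >> 32),
               bpf_ktime_get_ns());
    return 0;
}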
+ * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup())), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. 
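A sketch of bpf_for() (and bpf_repeat(), defined next) in use. It assumes a kernel that exposes the bpf_iter_num kfuncs; the array and attach point are illustrative.

SEC("kprobe/do_unlinkat")
int iter_example(void *ctx)
{
    __u64 vals[8] = {}, sum = 0;
    int i;

    /* the verifier knows i stays in [0, 8), so vals[i] needs no extra check */
    bpf_for(i, 0, 8)
        sum += vals[i];

    /* run the body a fixed number of times without exposing an index */
    bpf_repeat(2)
        bpf_printk("sum: %llu", sum);

    return 0;
}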
+ */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + +#endif diff --git a/third_party/bpftool/libbpf/src/bpf_prog_linfo.c b/third_party/bpftool/libbpf/src/bpf_prog_linfo.c new file mode 100644 index 0000000..5c50309 --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_prog_linfo.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include "libbpf.h" +#include "libbpf_internal.h" + +struct bpf_prog_linfo { + void *raw_linfo; + void *raw_jited_linfo; + __u32 *nr_jited_linfo_per_func; + __u32 *jited_linfo_func_idx; + __u32 nr_linfo; + __u32 nr_jited_func; + __u32 rec_size; + __u32 jited_rec_size; +}; + +static int dissect_jited_func(struct bpf_prog_linfo *prog_linfo, + const __u64 *ksym_func, const __u32 *ksym_len) +{ + __u32 nr_jited_func, nr_linfo; + const void *raw_jited_linfo; + const __u64 *jited_linfo; + __u64 last_jited_linfo; + /* + * Index to raw_jited_linfo: + * i: Index for searching the next ksym_func + * prev_i: Index to the last found ksym_func + */ + __u32 i, prev_i; + __u32 f; /* Index to ksym_func */ + + raw_jited_linfo = prog_linfo->raw_jited_linfo; + jited_linfo = raw_jited_linfo; + if (ksym_func[0] != *jited_linfo) + goto errout; + + prog_linfo->jited_linfo_func_idx[0] = 0; + nr_jited_func = prog_linfo->nr_jited_func; + nr_linfo = prog_linfo->nr_linfo; + + for (prev_i = 0, i = 1, f = 1; + i < nr_linfo && f < nr_jited_func; + i++) { + raw_jited_linfo += prog_linfo->jited_rec_size; + last_jited_linfo = *jited_linfo; + jited_linfo = raw_jited_linfo; + + if (ksym_func[f] == *jited_linfo) { + prog_linfo->jited_linfo_func_idx[f] = i; + + /* Sanity check */ + if (last_jited_linfo - ksym_func[f - 1] + 1 > + ksym_len[f - 1]) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[f - 1] = + i - prev_i; + prev_i = i; + + /* + * The ksym_func[f] is found in jited_linfo. + * Look for the next one. + */ + f++; + } else if (*jited_linfo <= last_jited_linfo) { + /* Ensure the addr is increasing _within_ a func */ + goto errout; + } + } + + if (f != nr_jited_func) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[nr_jited_func - 1] = + nr_linfo - prev_i; + + return 0; + +errout: + return -EINVAL; +} + +void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo) +{ + if (!prog_linfo) + return; + + free(prog_linfo->raw_linfo); + free(prog_linfo->raw_jited_linfo); + free(prog_linfo->nr_jited_linfo_per_func); + free(prog_linfo->jited_linfo_func_idx); + free(prog_linfo); +} + +struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info) +{ + struct bpf_prog_linfo *prog_linfo; + __u32 nr_linfo, nr_jited_func; + __u64 data_sz; + + nr_linfo = info->nr_line_info; + + if (!nr_linfo) + return errno = EINVAL, NULL; + + /* + * The min size that bpf_prog_linfo has to access for + * searching purpose. 
+ */ + if (info->line_info_rec_size < + offsetof(struct bpf_line_info, file_name_off)) + return errno = EINVAL, NULL; + + prog_linfo = calloc(1, sizeof(*prog_linfo)); + if (!prog_linfo) + return errno = ENOMEM, NULL; + + /* Copy xlated line_info */ + prog_linfo->nr_linfo = nr_linfo; + prog_linfo->rec_size = info->line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->rec_size; + prog_linfo->raw_linfo = malloc(data_sz); + if (!prog_linfo->raw_linfo) + goto err_free; + memcpy(prog_linfo->raw_linfo, (void *)(long)info->line_info, data_sz); + + nr_jited_func = info->nr_jited_ksyms; + if (!nr_jited_func || + !info->jited_line_info || + info->nr_jited_line_info != nr_linfo || + info->jited_line_info_rec_size < sizeof(__u64) || + info->nr_jited_func_lens != nr_jited_func || + !info->jited_ksyms || + !info->jited_func_lens) + /* Not enough info to provide jited_line_info */ + return prog_linfo; + + /* Copy jited_line_info */ + prog_linfo->nr_jited_func = nr_jited_func; + prog_linfo->jited_rec_size = info->jited_line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->jited_rec_size; + prog_linfo->raw_jited_linfo = malloc(data_sz); + if (!prog_linfo->raw_jited_linfo) + goto err_free; + memcpy(prog_linfo->raw_jited_linfo, + (void *)(long)info->jited_line_info, data_sz); + + /* Number of jited_line_info per jited func */ + prog_linfo->nr_jited_linfo_per_func = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->nr_jited_linfo_per_func) + goto err_free; + + /* + * For each jited func, + * the start idx to the "linfo" and "jited_linfo" array, + */ + prog_linfo->jited_linfo_func_idx = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->jited_linfo_func_idx) + goto err_free; + + if (dissect_jited_func(prog_linfo, + (__u64 *)(long)info->jited_ksyms, + (__u32 *)(long)info->jited_func_lens)) + goto err_free; + + return prog_linfo; + +err_free: + bpf_prog_linfo__free(prog_linfo); + return errno = EINVAL, NULL; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip) +{ + __u32 jited_rec_size, rec_size, nr_linfo, start, i; + const void *raw_jited_linfo, *raw_linfo; + const __u64 *jited_linfo; + + if (func_idx >= prog_linfo->nr_jited_func) + return errno = ENOENT, NULL; + + nr_linfo = prog_linfo->nr_jited_linfo_per_func[func_idx]; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + start = prog_linfo->jited_linfo_func_idx[func_idx] + nr_skip; + jited_rec_size = prog_linfo->jited_rec_size; + raw_jited_linfo = prog_linfo->raw_jited_linfo + + (start * jited_rec_size); + jited_linfo = raw_jited_linfo; + if (addr < *jited_linfo) + return errno = ENOENT, NULL; + + nr_linfo -= nr_skip; + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (start * rec_size); + for (i = 0; i < nr_linfo; i++) { + if (addr < *jited_linfo) + break; + + raw_linfo += rec_size; + raw_jited_linfo += jited_rec_size; + jited_linfo = raw_jited_linfo; + } + + return raw_linfo - rec_size; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip) +{ + const struct bpf_line_info *linfo; + __u32 rec_size, nr_linfo, i; + const void *raw_linfo; + + nr_linfo = prog_linfo->nr_linfo; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (nr_skip * rec_size); + linfo = raw_linfo; + if (insn_off < linfo->insn_off) + return errno = ENOENT, NULL; + + 
nr_linfo -= nr_skip; + for (i = 0; i < nr_linfo; i++) { + if (insn_off < linfo->insn_off) + break; + + raw_linfo += rec_size; + linfo = raw_linfo; + } + + return raw_linfo - rec_size; +} diff --git a/third_party/bpftool/libbpf/src/bpf_tracing.h b/third_party/bpftool/libbpf/src/bpf_tracing.h new file mode 100644 index 0000000..be076a4 --- /dev/null +++ b/third_party/bpftool/libbpf/src/bpf_tracing.h @@ -0,0 +1,924 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_TRACING_H__ +#define __BPF_TRACING_H__ + +#include + +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_riscv) + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arc) + #define bpf_target_arc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_loongarch) + #define bpf_target_loongarch + #define bpf_target_defined +#else + +/* Fall back to what the compiler says */ +#if defined(__x86_64__) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__s390__) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__arm__) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__aarch64__) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__mips__) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__powerpc__) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__sparc__) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__riscv) && __riscv_xlen == 64 + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__arc__) + #define bpf_target_arc + #define bpf_target_defined +#elif defined(__loongarch__) + #define bpf_target_loongarch + #define bpf_target_defined +#endif /* no compiler target */ + +#endif + +#ifndef __BPF_TARGET_MISSING +#define __BPF_TARGET_MISSING "GCC error \"Must specify a BPF target arch via __TARGET_ARCH_xxx\"" +#endif + +#if defined(bpf_target_x86) + +/* + * https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI + */ + +#if defined(__KERNEL__) || defined(__VMLINUX_H__) + +#define __PT_PARM1_REG di +#define __PT_PARM2_REG si +#define __PT_PARM3_REG dx +#define __PT_PARM4_REG cx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +/* + * Syscall uses r10 for PARM4. See arch/x86/entry/entry_64.S:entry_SYSCALL_64 + * comments in Linux sources. And refer to syscall(2) manpage. 
+ */ +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG sp +#define __PT_FP_REG bp +#define __PT_RC_REG ax +#define __PT_SP_REG sp +#define __PT_IP_REG ip + +#else + +#ifdef __i386__ + +/* i386 kernel is built with -mregparm=3 */ +#define __PT_PARM1_REG eax +#define __PT_PARM2_REG edx +#define __PT_PARM3_REG ecx +/* i386 syscall ABI is very different, refer to syscall(2) manpage */ +#define __PT_PARM1_SYSCALL_REG ebx +#define __PT_PARM2_SYSCALL_REG ecx +#define __PT_PARM3_SYSCALL_REG edx +#define __PT_PARM4_SYSCALL_REG esi +#define __PT_PARM5_SYSCALL_REG edi +#define __PT_PARM6_SYSCALL_REG ebp + +#define __PT_RET_REG esp +#define __PT_FP_REG ebp +#define __PT_RC_REG eax +#define __PT_SP_REG esp +#define __PT_IP_REG eip + +#else /* __i386__ */ + +#define __PT_PARM1_REG rdi +#define __PT_PARM2_REG rsi +#define __PT_PARM3_REG rdx +#define __PT_PARM4_REG rcx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG rsp +#define __PT_FP_REG rbp +#define __PT_RC_REG rax +#define __PT_SP_REG rsp +#define __PT_IP_REG rip + +#endif /* __i386__ */ + +#endif /* __KERNEL__ || __VMLINUX_H__ */ + +#elif defined(bpf_target_s390) + +/* + * https://github.com/IBM/s390x-abi/releases/download/v1.6/lzsabi_s390x.pdf + */ + +struct pt_regs___s390 { + unsigned long orig_gpr2; +}; + +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const user_pt_regs *)(x)) +#define __PT_PARM1_REG gprs[2] +#define __PT_PARM2_REG gprs[3] +#define __PT_PARM3_REG gprs[4] +#define __PT_PARM4_REG gprs[5] +#define __PT_PARM5_REG gprs[6] + +#define __PT_PARM1_SYSCALL_REG orig_gpr2 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG gprs[7] +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG gprs[14] +#define __PT_FP_REG gprs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG gprs[2] +#define __PT_SP_REG gprs[15] +#define __PT_IP_REG psw.addr + +#elif defined(bpf_target_arm) + +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst#machine-registers + */ + +#define __PT_PARM1_REG uregs[0] +#define __PT_PARM2_REG uregs[1] +#define __PT_PARM3_REG uregs[2] +#define __PT_PARM4_REG uregs[3] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + +#define __PT_RET_REG uregs[14] +#define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG uregs[0] +#define __PT_SP_REG uregs[13] +#define __PT_IP_REG uregs[12] + +#elif 
defined(bpf_target_arm64) + +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#machine-registers + */ + +struct pt_regs___arm64 { + unsigned long orig_x0; +}; + +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[0] +#define __PT_PARM2_REG regs[1] +#define __PT_PARM3_REG regs[2] +#define __PT_PARM4_REG regs[3] +#define __PT_PARM5_REG regs[4] +#define __PT_PARM6_REG regs[5] +#define __PT_PARM7_REG regs[6] +#define __PT_PARM8_REG regs[7] + +#define __PT_PARM1_SYSCALL_REG orig_x0 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG regs[30] +#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[0] +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#elif defined(bpf_target_mips) + +/* + * N64 ABI is assumed right now. + * https://en.wikipedia.org/wiki/MIPS_architecture#Calling_conventions + */ + +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG /* only N32/N64 */ +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG /* only N32/N64 */ + +#define __PT_RET_REG regs[31] +#define __PT_FP_REG regs[30] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[2] +#define __PT_SP_REG regs[29] +#define __PT_IP_REG cp0_epc + +#elif defined(bpf_target_powerpc) + +/* + * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf (page 3-14, + * section "Function Calling Sequence") + */ + +#define __PT_PARM1_REG gpr[3] +#define __PT_PARM2_REG gpr[4] +#define __PT_PARM3_REG gpr[5] +#define __PT_PARM4_REG gpr[6] +#define __PT_PARM5_REG gpr[7] +#define __PT_PARM6_REG gpr[8] +#define __PT_PARM7_REG gpr[9] +#define __PT_PARM8_REG gpr[10] + +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. 
*/ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG orig_gpr3 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#if !defined(__arch64__) +#define __PT_PARM7_SYSCALL_REG __PT_PARM7_REG /* only powerpc (not powerpc64) */ +#endif + +#define __PT_RET_REG regs[31] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG gpr[3] +#define __PT_SP_REG sp +#define __PT_IP_REG nip + +#elif defined(bpf_target_sparc) + +/* + * https://en.wikipedia.org/wiki/Calling_convention#SPARC + */ + +#define __PT_PARM1_REG u_regs[UREG_I0] +#define __PT_PARM2_REG u_regs[UREG_I1] +#define __PT_PARM3_REG u_regs[UREG_I2] +#define __PT_PARM4_REG u_regs[UREG_I3] +#define __PT_PARM5_REG u_regs[UREG_I4] +#define __PT_PARM6_REG u_regs[UREG_I5] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG u_regs[UREG_I7] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG u_regs[UREG_I0] +#define __PT_SP_REG u_regs[UREG_FP] +/* Should this also be a bpf_target check for the sparc case? */ +#if defined(__arch64__) +#define __PT_IP_REG tpc +#else +#define __PT_IP_REG pc +#endif + +#elif defined(bpf_target_riscv) + +/* + * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions + */ + +/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG a0 +#define __PT_PARM2_REG a1 +#define __PT_PARM3_REG a2 +#define __PT_PARM4_REG a3 +#define __PT_PARM5_REG a4 +#define __PT_PARM6_REG a5 +#define __PT_PARM7_REG a6 +#define __PT_PARM8_REG a7 + +/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG ra +#define __PT_FP_REG s0 +#define __PT_RC_REG a0 +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#elif defined(bpf_target_arc) + +/* + * Section "Function Calling Sequence" (page 24): + * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf + */ + +/* arc provides struct user_regs_struct instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG scratch.r0 +#define __PT_PARM2_REG scratch.r1 +#define __PT_PARM3_REG scratch.r2 +#define __PT_PARM4_REG scratch.r3 +#define __PT_PARM5_REG scratch.r4 +#define __PT_PARM6_REG scratch.r5 +#define __PT_PARM7_REG scratch.r6 +#define __PT_PARM8_REG scratch.r7 + +/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. 
*/ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG scratch.blink +#define __PT_FP_REG scratch.fp +#define __PT_RC_REG scratch.r0 +#define __PT_SP_REG scratch.sp +#define __PT_IP_REG scratch.ret + +#elif defined(bpf_target_loongarch) + +/* + * https://docs.kernel.org/loongarch/introduction.html + * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html + */ + +/* loongarch provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG regs[1] +#define __PT_FP_REG regs[22] +#define __PT_RC_REG regs[4] +#define __PT_SP_REG regs[3] +#define __PT_IP_REG csr_era + +#endif + +#if defined(bpf_target_defined) + +struct pt_regs; + +/* allow some architectures to override `struct pt_regs` */ +#ifndef __PT_REGS_CAST +#define __PT_REGS_CAST(x) (x) +#endif + +/* + * Different architectures support different number of arguments passed + * through registers. i386 supports just 3, some arches support up to 8. + */ +#ifndef __PT_PARM4_REG +#define __PT_PARM4_REG __unsupported__ +#endif +#ifndef __PT_PARM5_REG +#define __PT_PARM5_REG __unsupported__ +#endif +#ifndef __PT_PARM6_REG +#define __PT_PARM6_REG __unsupported__ +#endif +#ifndef __PT_PARM7_REG +#define __PT_PARM7_REG __unsupported__ +#endif +#ifndef __PT_PARM8_REG +#define __PT_PARM8_REG __unsupported__ +#endif +/* + * Similarly, syscall-specific conventions might differ between function call + * conventions within each architecutre. All supported architectures pass + * either 6 or 7 syscall arguments in registers. + * + * See syscall(2) manpage for succinct table with information on each arch. 
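To show how the per-architecture register definitions above are consumed, here is a hypothetical kprobe that reads its target's first two arguments through the PT_REGS_PARMn() accessors defined just below (do_unlinkat takes an int dfd and a struct filename pointer); vmlinux.h and bpf_helpers.h are assumed to be included.

SEC("kprobe/do_unlinkat")
int trace_unlinkat(struct pt_regs *ctx)
{
    int dfd = (int)PT_REGS_PARM1(ctx);
    struct filename *name = (struct filename *)PT_REGS_PARM2(ctx);

    bpf_printk("do_unlinkat dfd=%d name=%p", dfd, name);
    return 0;
}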
+ */ +#ifndef __PT_PARM7_SYSCALL_REG +#define __PT_PARM7_SYSCALL_REG __unsupported__ +#endif + +#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) +#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) +#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) +#define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG) +#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_PARM7(x) (__PT_REGS_CAST(x)->__PT_PARM7_REG) +#define PT_REGS_PARM8(x) (__PT_REGS_CAST(x)->__PT_PARM8_REG) +#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) +#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) +#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) +#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) +#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_REG) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_REG) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG) +#define PT_REGS_PARM6_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_REG) +#define PT_REGS_PARM7_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_REG) +#define PT_REGS_PARM8_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM8_REG) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_SP_REG) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_IP_REG) + +#if defined(bpf_target_powerpc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#elif defined(bpf_target_sparc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#else + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) + +#endif + +#ifndef PT_REGS_PARM1_SYSCALL +#define PT_REGS_PARM1_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM2_SYSCALL +#define PT_REGS_PARM2_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM2_SYSCALL_REG) +#define PT_REGS_PARM2_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM3_SYSCALL +#define PT_REGS_PARM3_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM3_SYSCALL_REG) +#define PT_REGS_PARM3_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM4_SYSCALL +#define PT_REGS_PARM4_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM4_SYSCALL_REG) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM5_SYSCALL +#define PT_REGS_PARM5_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM5_SYSCALL_REG) +#define PT_REGS_PARM5_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_SYSCALL_REG) 
+#endif +#ifndef PT_REGS_PARM6_SYSCALL +#define PT_REGS_PARM6_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM6_SYSCALL_REG) +#define PT_REGS_PARM6_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM7_SYSCALL +#define PT_REGS_PARM7_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM7_SYSCALL_REG) +#define PT_REGS_PARM7_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_SYSCALL_REG) +#endif + +#else /* defined(bpf_target_defined) */ + +#define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#endif /* 
defined(bpf_target_defined) */ + +/* + * When invoked from a syscall handler kprobe, returns a pointer to a + * struct pt_regs containing syscall arguments and suitable for passing to + * PT_REGS_PARMn_SYSCALL() and PT_REGS_PARMn_CORE_SYSCALL(). + */ +#ifndef PT_REGS_SYSCALL_REGS +/* By default, assume that the arch selects ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ((struct pt_regs *)PT_REGS_PARM1(ctx)) +#endif + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_ctx_cast0() ctx +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) + +/* + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and + * similar kinds of BPF programs, that accept input arguments as a single + * pointer to untyped u64 array, where each u64 can actually be a typed + * pointer or integer of different size. Instead of requring user to write + * manual casts and work with array elements by index, BPF_PROG macro + * allows user to declare a list of named and typed input arguments in the + * same syntax as for normal C function. All the casting is hidden and + * performed transparently, while user code can just assume working with + * function arguments of specified type and name. + * + * Original raw context argument is preserved as well as 'ctx' argument. + * This is useful when using BPF helpers that expect original context + * as one of the parameters (e.g., for bpf_perf_event_output()). + */ +#define BPF_PROG(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_ctx_cast(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args) + +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) 
\ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))))) + +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) + +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) 
, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) + +/* + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) + +struct pt_regs; + +#define ___bpf_kprobe_args0() ctx +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args(args...) 
___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for + * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific + * low-level way of getting kprobe input arguments from struct pt_regs, and + * provides a familiar typed and named function arguments syntax and + * semantics of accessing kprobe input paremeters. + * + * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + */ +#define BPF_KPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define ___bpf_kretprobe_args0() ctx +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional + * return value (in addition to `struct pt_regs *ctx`), but no input + * arguments, because they will be clobbered by the time probed function + * returns. + */ +#define BPF_KRETPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kretprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) + +/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ +#define ___bpf_syscall_args0() ctx +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) + +/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ +#define ___bpf_syswrap_args0() ctx +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) 
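/*
 * Illustrative sketch (not from the original header): pairing BPF_KPROBE and
 * BPF_KRETPROBE on the same kernel function. Assumes <vmlinux.h>,
 * <bpf/bpf_helpers.h> and <bpf/bpf_tracing.h>; the argument types of
 * do_unlinkat() are taken from the kernel sources.
 */
SEC("kprobe/do_unlinkat")
int BPF_KPROBE(unlinkat_entry, int dfd, struct filename *name)
{
	bpf_printk("do_unlinkat entered, dfd=%d", dfd);
	return 0;
}

SEC("kretprobe/do_unlinkat")
int BPF_KRETPROBE(unlinkat_exit, long ret)
{
	/* only the return value is available here; entry args are gone */
	bpf_printk("do_unlinkat returned %ld", ret);
	return 0;
}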
___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) + +/* + * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for + * tracing syscall functions, like __x64_sys_close. It hides the underlying + * platform-specific low-level way of getting syscall input arguments from + * struct pt_regs, and provides a familiar typed and named function arguments + * syntax and semantics of accessing syscall input parameters. + * + * Original struct pt_regs * context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + * + * At the moment BPF_KSYSCALL does not transparently handle all the calling + * convention quirks for the following syscalls: + * + * - mmap(): __ARCH_WANT_SYS_OLD_MMAP. + * - clone(): CONFIG_CLONE_BACKWARDS, CONFIG_CLONE_BACKWARDS2 and + * CONFIG_CLONE_BACKWARDS3. + * - socket-related syscalls: __ARCH_WANT_SYS_SOCKETCALL. + * - compat syscalls. + * + * This may or may not change in the future. User needs to take extra measures + * to handle such quirks explicitly, if necessary. + * + * This macro relies on BPF CO-RE support and virtual __kconfig externs. + */ +#define BPF_KSYSCALL(name, args...) \ +name(struct pt_regs *ctx); \ +extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER \ + ? (struct pt_regs *)PT_REGS_PARM1(ctx) \ + : ctx; \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + if (LINUX_HAS_SYSCALL_WRAPPER) \ + return ____##name(___bpf_syswrap_args(args)); \ + else \ + return ____##name(___bpf_syscall_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define BPF_KPROBE_SYSCALL BPF_KSYSCALL + +/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE, + * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe") + * use cases. + */ +#define BPF_UPROBE(name, args...) BPF_KPROBE(name, ##args) +#define BPF_URETPROBE(name, args...) 
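/*
 * Illustrative sketch (not from the original header): a BPF_KSYSCALL program
 * attached via SEC("ksyscall/..."), which assumes a libbpf version with
 * ksyscall attach support and the usual <vmlinux.h>, <bpf/bpf_helpers.h>,
 * <bpf/bpf_tracing.h> includes. pathname is a user-space pointer, so it is
 * read with bpf_probe_read_user_str().
 */
SEC("ksyscall/unlinkat")
int BPF_KSYSCALL(ksys_unlinkat, int dfd, const char *pathname, int flag)
{
	char path[64];

	bpf_probe_read_user_str(path, sizeof(path), pathname);
	bpf_printk("unlinkat(%d, %s, %d)", dfd, path, flag);
	return 0;
}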
BPF_KRETPROBE(name, ##args) + +#endif diff --git a/third_party/bpftool/libbpf/src/btf.c b/third_party/bpftool/libbpf/src/btf.c new file mode 100644 index 0000000..8484b56 --- /dev/null +++ b/third_party/bpftool/libbpf/src/btf.c @@ -0,0 +1,5025 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "strset.h" + +#define BTF_MAX_NR_TYPES 0x7fffffffU +#define BTF_MAX_STR_OFFSET 0x7fffffffU + +static struct btf_type btf_void; + +struct btf { + /* raw BTF data in native endianness */ + void *raw_data; + /* raw BTF data in non-native endianness */ + void *raw_data_swapped; + __u32 raw_size; + /* whether target endianness differs from the native one */ + bool swapped_endian; + + /* + * When BTF is loaded from an ELF or raw memory it is stored + * in a contiguous memory block. The hdr, type_data, and, strs_data + * point inside that memory region to their respective parts of BTF + * representation: + * + * +--------------------------------+ + * | Header | Types | Strings | + * +--------------------------------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data-+ | + * strs_data------------+ + * + * If BTF data is later modified, e.g., due to types added or + * removed, BTF deduplication performed, etc, this contiguous + * representation is broken up into three independently allocated + * memory regions to be able to modify them independently. + * raw_data is nulled out at that point, but can be later allocated + * and cached again if user calls btf__raw_data(), at which point + * raw_data will contain a contiguous copy of header, types, and + * strings: + * + * +----------+ +---------+ +-----------+ + * | Header | | Types | | Strings | + * +----------+ +---------+ +-----------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data----+ | + * strset__data(strs_set)-----+ + * + * +----------+---------+-----------+ + * | Header | Types | Strings | + * raw_data----->+----------+---------+-----------+ + */ + struct btf_header *hdr; + + void *types_data; + size_t types_data_cap; /* used size stored in hdr->type_len */ + + /* type ID to `struct btf_type *` lookup index + * type_offs[0] corresponds to the first non-VOID type: + * - for base BTF it's type [1]; + * - for split BTF it's the first non-base BTF type. + */ + __u32 *type_offs; + size_t type_offs_cap; + /* number of types in this BTF instance: + * - doesn't include special [0] void type; + * - for split BTF counts number of types added on top of base BTF. + */ + __u32 nr_types; + /* if not NULL, points to the base BTF on top of which the current + * split BTF is based + */ + struct btf *base_btf; + /* BTF type ID of the first type in this BTF instance: + * - for base BTF it's equal to 1; + * - for split BTF it's equal to biggest type ID of base BTF plus 1. + */ + int start_id; + /* logical string offset of this BTF instance: + * - for base BTF it's equal to 0; + * - for split BTF it's equal to total size of base BTF's string section size. 
+ */ + int start_str_off; + + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ + void *strs_data; + /* a set of unique strings */ + struct strset *strs_set; + /* whether strings are already deduplicated */ + bool strs_deduped; + + /* BTF object FD, if loaded into kernel */ + int fd; + + /* Pointer size (in bytes) for a target architecture of this BTF */ + int ptr_sz; +}; + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +/* Ensure given dynamically allocated memory region pointed to by *data* with + * capacity of *cap_cnt* elements each taking *elem_sz* bytes has enough + * memory to accommodate *add_cnt* new elements, assuming *cur_cnt* elements + * are already used. At most *max_cnt* elements can be ever allocated. + * If necessary, memory is reallocated and all existing data is copied over, + * new pointer to the memory region is stored at *data, new memory region + * capacity (in number of elements) is stored in *cap. + * On success, memory pointer to the beginning of unused memory is returned. + * On error, NULL is returned. + */ +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) +{ + size_t new_cnt; + void *new_data; + + if (cur_cnt + add_cnt <= *cap_cnt) + return *data + cur_cnt * elem_sz; + + /* requested more than the set limit */ + if (cur_cnt + add_cnt > max_cnt) + return NULL; + + new_cnt = *cap_cnt; + new_cnt += new_cnt / 4; /* expand by 25% */ + if (new_cnt < 16) /* but at least 16 elements */ + new_cnt = 16; + if (new_cnt > max_cnt) /* but not exceeding a set limit */ + new_cnt = max_cnt; + if (new_cnt < cur_cnt + add_cnt) /* also ensure we have enough memory */ + new_cnt = cur_cnt + add_cnt; + + new_data = libbpf_reallocarray(*data, new_cnt, elem_sz); + if (!new_data) + return NULL; + + /* zero out newly allocated portion of memory */ + memset(new_data + (*cap_cnt) * elem_sz, 0, (new_cnt - *cap_cnt) * elem_sz); + + *data = new_data; + *cap_cnt = new_cnt; + return new_data + cur_cnt * elem_sz; +} + +/* Ensure given dynamically allocated memory region has enough allocated space + * to accommodate *need_cnt* elements of size *elem_sz* bytes each + */ +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +{ + void *p; + + if (need_cnt <= *cap_cnt) + return 0; + + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + if (!p) + return -ENOMEM; + + return 0; +} + +static void *btf_add_type_offs_mem(struct btf *btf, size_t add_cnt) +{ + return libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, add_cnt); +} + +static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) +{ + __u32 *p; + + p = btf_add_type_offs_mem(btf, 1); + if (!p) + return -ENOMEM; + + *p = type_off; + return 0; +} + +static void btf_bswap_hdr(struct btf_header *h) +{ + h->magic = bswap_16(h->magic); + h->hdr_len = bswap_32(h->hdr_len); + h->type_off = bswap_32(h->type_off); + h->type_len = bswap_32(h->type_len); + h->str_off = bswap_32(h->str_off); + h->str_len = bswap_32(h->str_len); +} + +static int btf_parse_hdr(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + __u32 meta_left; + + if (btf->raw_size < sizeof(struct btf_header)) { + pr_debug("BTF header not found\n"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { 
+ btf->swapped_endian = true; + if (bswap_32(hdr->hdr_len) != sizeof(struct btf_header)) { + pr_warn("Can't load BTF with non-native endianness due to unsupported header length %u\n", + bswap_32(hdr->hdr_len)); + return -ENOTSUP; + } + btf_bswap_hdr(hdr); + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF magic: %x\n", hdr->magic); + return -EINVAL; + } + + if (btf->raw_size < hdr->hdr_len) { + pr_debug("BTF header len %u larger than data size %u\n", + hdr->hdr_len, btf->raw_size); + return -EINVAL; + } + + meta_left = btf->raw_size - hdr->hdr_len; + if (meta_left < (long long)hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size: %u\n", btf->raw_size); + return -EINVAL; + } + + if ((long long)hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); + return -EINVAL; + } + + if (hdr->type_off % 4) { + pr_debug("BTF type section is not aligned to 4 bytes\n"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_str_sec(struct btf *btf) +{ + const struct btf_header *hdr = btf->hdr; + const char *start = btf->strs_data; + const char *end = start + btf->hdr->str_len; + + if (btf->base_btf && hdr->str_len == 0) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || end[-1]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + return 0; +} + +static int btf_type_size(const struct btf_type *t) +{ + const int base_size = sizeof(struct btf_type); + __u16 vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return base_size; + case BTF_KIND_INT: + return base_size + sizeof(__u32); + case BTF_KIND_ENUM: + return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); + case BTF_KIND_ARRAY: + return base_size + sizeof(struct btf_array); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return base_size + vlen * sizeof(struct btf_member); + case BTF_KIND_FUNC_PROTO: + return base_size + vlen * sizeof(struct btf_param); + case BTF_KIND_VAR: + return base_size + sizeof(struct btf_var); + case BTF_KIND_DATASEC: + return base_size + vlen * sizeof(struct btf_var_secinfo); + case BTF_KIND_DECL_TAG: + return base_size + sizeof(struct btf_decl_tag); + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static void btf_bswap_type_base(struct btf_type *t) +{ + t->name_off = bswap_32(t->name_off); + t->info = bswap_32(t->info); + t->type = bswap_32(t->type); +} + +static int btf_bswap_type_rest(struct btf_type *t) +{ + struct btf_var_secinfo *v; + struct btf_enum64 *e64; + struct btf_member *m; + struct btf_array *a; + struct btf_param *p; + struct btf_enum *e; + __u16 vlen = btf_vlen(t); + int i; + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return 0; + case BTF_KIND_INT: + *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); + return 0; + case BTF_KIND_ENUM: + for (i = 0, e = btf_enum(t); i < vlen; 
i++, e++) { + e->name_off = bswap_32(e->name_off); + e->val = bswap_32(e->val); + } + return 0; + case BTF_KIND_ENUM64: + for (i = 0, e64 = btf_enum64(t); i < vlen; i++, e64++) { + e64->name_off = bswap_32(e64->name_off); + e64->val_lo32 = bswap_32(e64->val_lo32); + e64->val_hi32 = bswap_32(e64->val_hi32); + } + return 0; + case BTF_KIND_ARRAY: + a = btf_array(t); + a->type = bswap_32(a->type); + a->index_type = bswap_32(a->index_type); + a->nelems = bswap_32(a->nelems); + return 0; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + for (i = 0, m = btf_members(t); i < vlen; i++, m++) { + m->name_off = bswap_32(m->name_off); + m->type = bswap_32(m->type); + m->offset = bswap_32(m->offset); + } + return 0; + case BTF_KIND_FUNC_PROTO: + for (i = 0, p = btf_params(t); i < vlen; i++, p++) { + p->name_off = bswap_32(p->name_off); + p->type = bswap_32(p->type); + } + return 0; + case BTF_KIND_VAR: + btf_var(t)->linkage = bswap_32(btf_var(t)->linkage); + return 0; + case BTF_KIND_DATASEC: + for (i = 0, v = btf_var_secinfos(t); i < vlen; i++, v++) { + v->type = bswap_32(v->type); + v->offset = bswap_32(v->offset); + v->size = bswap_32(v->size); + } + return 0; + case BTF_KIND_DECL_TAG: + btf_decl_tag(t)->component_idx = bswap_32(btf_decl_tag(t)->component_idx); + return 0; + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static int btf_parse_type_sec(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + void *next_type = btf->types_data; + void *end_type = next_type + hdr->type_len; + int err, type_size; + + while (next_type + sizeof(struct btf_type) <= end_type) { + if (btf->swapped_endian) + btf_bswap_type_base(next_type); + + type_size = btf_type_size(next_type); + if (type_size < 0) + return type_size; + if (next_type + type_size > end_type) { + pr_warn("BTF type [%d] is malformed\n", btf->start_id + btf->nr_types); + return -EINVAL; + } + + if (btf->swapped_endian && btf_bswap_type_rest(next_type)) + return -EINVAL; + + err = btf_add_type_idx_entry(btf, next_type - btf->types_data); + if (err) + return err; + + next_type += type_size; + btf->nr_types++; + } + + if (next_type != end_type) { + pr_warn("BTF types data is malformed\n"); + return -EINVAL; + } + + return 0; +} + +__u32 btf__type_cnt(const struct btf *btf) +{ + return btf->start_id + btf->nr_types; +} + +const struct btf *btf__base_btf(const struct btf *btf) +{ + return btf->base_btf; +} + +/* internal helper returning non-const pointer to a type */ +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id == 0) + return &btf_void; + if (type_id < btf->start_id) + return btf_type_by_id(btf->base_btf, type_id); + return btf->types_data + btf->type_offs[type_id - btf->start_id]; +} + +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id >= btf->start_id + btf->nr_types) + return errno = EINVAL, NULL; + return btf_type_by_id((struct btf *)btf, type_id); +} + +static int determine_ptr_size(const struct btf *btf) +{ + static const char * const long_aliases[] = { + "long", + "long int", + "int long", + "unsigned long", + "long unsigned", + "unsigned long int", + "unsigned int long", + "long unsigned int", + "long int unsigned", + "int unsigned long", + "int long unsigned", + }; + const struct btf_type *t; + const char *name; + int i, j, n; + + if (btf->base_btf && btf->base_btf->ptr_sz > 0) + return btf->base_btf->ptr_sz; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + if 
(!btf_is_int(t)) + continue; + + if (t->size != 4 && t->size != 8) + continue; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + continue; + + for (j = 0; j < ARRAY_SIZE(long_aliases); j++) { + if (strcmp(name, long_aliases[j]) == 0) + return t->size; + } + } + + return -1; +} + +static size_t btf_ptr_sz(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + return btf->ptr_sz < 0 ? sizeof(void *) : btf->ptr_sz; +} + +/* Return pointer size this BTF instance assumes. The size is heuristically + * determined by looking for 'long' or 'unsigned long' integer type and + * recording its size in bytes. If BTF type information doesn't have any such + * type, this function returns 0. In the latter case, native architecture's + * pointer size is assumed, so will be either 4 or 8, depending on + * architecture that libbpf was compiled for. It's possible to override + * guessed value by using btf__set_pointer_size() API. + */ +size_t btf__pointer_size(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + + if (btf->ptr_sz < 0) + /* not enough BTF type info to guess */ + return 0; + + return btf->ptr_sz; +} + +/* Override or set pointer size in bytes. Only values of 4 and 8 are + * supported. + */ +int btf__set_pointer_size(struct btf *btf, size_t ptr_sz) +{ + if (ptr_sz != 4 && ptr_sz != 8) + return libbpf_err(-EINVAL); + btf->ptr_sz = ptr_sz; + return 0; +} + +static bool is_host_big_endian(void) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return false; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return true; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif +} + +enum btf_endianness btf__endianness(const struct btf *btf) +{ + if (is_host_big_endian()) + return btf->swapped_endian ? BTF_LITTLE_ENDIAN : BTF_BIG_ENDIAN; + else + return btf->swapped_endian ? 
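/*
 * Illustrative sketch (not part of the original source): overriding the
 * guessed pointer size, e.g. when the BTF has no 'long' type to infer it
 * from. 'btf' is assumed to be a valid struct btf * obtained elsewhere.
 */
static int example_force_32bit_ptrs(struct btf *btf)
{
	/* btf__pointer_size() returns 0 when libbpf could not guess */
	if (btf__pointer_size(btf) == 0)
		return btf__set_pointer_size(btf, 4); /* only 4 or 8 allowed */
	return 0;
}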
BTF_BIG_ENDIAN : BTF_LITTLE_ENDIAN; +} + +int btf__set_endianness(struct btf *btf, enum btf_endianness endian) +{ + if (endian != BTF_LITTLE_ENDIAN && endian != BTF_BIG_ENDIAN) + return libbpf_err(-EINVAL); + + btf->swapped_endian = is_host_big_endian() != (endian == BTF_BIG_ENDIAN); + if (!btf->swapped_endian) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } + return 0; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + return t == &btf_void || btf_is_fwd(t); +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +#define MAX_RESOLVE_DEPTH 32 + +__s64 btf__resolve_size(const struct btf *btf, __u32 type_id) +{ + const struct btf_array *array; + const struct btf_type *t; + __u32 nelems = 1; + __s64 size = -1; + int i; + + t = btf__type_by_id(btf, type_id); + for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t); i++) { + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: + size = t->size; + goto done; + case BTF_KIND_PTR: + size = btf_ptr_sz(btf); + goto done; + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + type_id = t->type; + break; + case BTF_KIND_ARRAY: + array = btf_array(t); + if (nelems && array->nelems > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + nelems *= array->nelems; + type_id = array->type; + break; + default: + return libbpf_err(-EINVAL); + } + + t = btf__type_by_id(btf, type_id); + } + +done: + if (size < 0) + return libbpf_err(-EINVAL); + if (nelems && size > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + + return nelems * size; +} + +int btf__align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + __u16 kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FLOAT: + return min(btf_ptr_sz(btf), (size_t)t->size); + case BTF_KIND_PTR: + return btf_ptr_sz(btf); + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf__align_of(btf, t->type); + case BTF_KIND_ARRAY: + return btf__align_of(btf, btf_array(t)->type); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, max_align = 1, align; + + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + if (align <= 0) + return libbpf_err(align); + max_align = max(max_align, align); + + /* if field offset isn't aligned according to field + * type's alignment, then struct must be packed + */ + if (btf_member_bitfield_size(t, i) == 0 && + (m->offset % (8 * align)) != 0) + return 1; + } + + /* if struct/union size isn't a multiple of its alignment, + * then struct must be packed + */ + if ((t->size % max_align) != 0) + return 1; + + return max_align; + } + default: + pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); + return errno = EINVAL, 0; + } +} + +int btf__resolve_type(const struct btf *btf, __u32 type_id) +{ + const struct btf_type *t; + int depth = 0; + + t = btf__type_by_id(btf, type_id); + while (depth < MAX_RESOLVE_DEPTH && + !btf_type_is_void_or_null(t) && + (btf_is_mod(t) || btf_is_typedef(t) || btf_is_var(t))) { + type_id = t->type; + t = btf__type_by_id(btf, 
type_id); + depth++; + } + + if (depth == MAX_RESOLVE_DEPTH || btf_type_is_void_or_null(t)) + return libbpf_err(-EINVAL); + + return type_id; +} + +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (!strcmp(type_name, "void")) + return 0; + + for (i = 1; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name = btf__name_by_offset(btf, t->name_off); + + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, + const char *type_name, __u32 kind) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) + return 0; + + for (i = start_id; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name; + + if (btf_kind(t) != kind) + continue; + name = btf__name_by_offset(btf, t->name_off); + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, btf->start_id, type_name, kind); +} + +__s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, 1, type_name, kind); +} + +static bool btf_is_modifiable(const struct btf *btf) +{ + return (void *)btf->hdr != btf->raw_data; +} + +void btf__free(struct btf *btf) +{ + if (IS_ERR_OR_NULL(btf)) + return; + + if (btf->fd >= 0) + close(btf->fd); + + if (btf_is_modifiable(btf)) { + /* if BTF was modified after loading, it will have a split + * in-memory representation for header, types, and strings + * sections, so we need to free all of them individually. It + * might still have a cached contiguous raw data present, + * which will be unconditionally freed below. + */ + free(btf->hdr); + free(btf->types_data); + strset__free(btf->strs_set); + } + free(btf->raw_data); + free(btf->raw_data_swapped); + free(btf->type_offs); + free(btf); +} + +static struct btf *btf_new_empty(struct btf *base_btf) +{ + struct btf *btf; + + btf = calloc(1, sizeof(*btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + btf->ptr_sz = sizeof(void *); + btf->swapped_endian = false; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + /* +1 for empty string at offset 0 */ + btf->raw_size = sizeof(struct btf_header) + (base_btf ? 0 : 1); + btf->raw_data = calloc(1, btf->raw_size); + if (!btf->raw_data) { + free(btf); + return ERR_PTR(-ENOMEM); + } + + btf->hdr = btf->raw_data; + btf->hdr->hdr_len = sizeof(struct btf_header); + btf->hdr->magic = BTF_MAGIC; + btf->hdr->version = BTF_VERSION; + + btf->types_data = btf->raw_data + btf->hdr->hdr_len; + btf->strs_data = btf->raw_data + btf->hdr->hdr_len; + btf->hdr->str_len = base_btf ? 
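/*
 * Illustrative sketch (not part of the original source): look up a struct by
 * name and query its size and alignment. 'btf' is assumed to be valid BTF,
 * e.g. vmlinux BTF parsed by the caller; "task_struct" is just an example
 * type name.
 */
static int example_struct_layout(const struct btf *btf, __s64 *sz, int *align)
{
	__s32 id = btf__find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);

	if (id < 0)
		return id;                        /* -ENOENT if not found */
	*sz = btf__resolve_size(btf, id);         /* follows modifiers/arrays */
	*align = btf__align_of(btf, id);          /* 0 on error, errno set */
	return 0;
}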
0 : 1; /* empty string at offset 0 */ + + return btf; +} + +struct btf *btf__new_empty(void) +{ + return libbpf_ptr(btf_new_empty(NULL)); +} + +struct btf *btf__new_empty_split(struct btf *base_btf) +{ + return libbpf_ptr(btf_new_empty(base_btf)); +} + +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +{ + struct btf *btf; + int err; + + btf = calloc(1, sizeof(struct btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + btf->raw_data = malloc(size); + if (!btf->raw_data) { + err = -ENOMEM; + goto done; + } + memcpy(btf->raw_data, data, size); + btf->raw_size = size; + + btf->hdr = btf->raw_data; + err = btf_parse_hdr(btf); + if (err) + goto done; + + btf->strs_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->str_off; + btf->types_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->type_off; + + err = btf_parse_str_sec(btf); + err = err ?: btf_parse_type_sec(btf); + if (err) + goto done; + +done: + if (err) { + btf__free(btf); + return ERR_PTR(err); + } + + return btf; +} + +struct btf *btf__new(const void *data, __u32 size) +{ + return libbpf_ptr(btf_new(data, size, NULL)); +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) +{ + Elf_Data *btf_data = NULL, *btf_ext_data = NULL; + int err = 0, fd = -1, idx = 0; + struct btf *btf = NULL; + Elf_Scn *scn = NULL; + Elf *elf = NULL; + GElf_Ehdr ehdr; + size_t shstrndx; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", path); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open %s: %s\n", path, strerror(errno)); + return ERR_PTR(err); + } + + err = -LIBBPF_ERRNO__FORMAT; + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_warn("failed to open %s as ELF file\n", path); + goto done; + } + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("failed to get EHDR from %s\n", path); + goto done; + } + + if (elf_getshdrstrndx(elf, &shstrndx)) { + pr_warn("failed to get section names section index for %s\n", + path); + goto done; + } + + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) { + pr_warn("failed to get e_shstrndx from %s\n", path); + goto done; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + char *name; + + idx++; + if (gelf_getshdr(scn, &sh) != &sh) { + pr_warn("failed to get section(%d) header from %s\n", + idx, path); + goto done; + } + name = elf_strptr(elf, shstrndx, sh.sh_name); + if (!name) { + pr_warn("failed to get section(%d) name from %s\n", + idx, path); + goto done; + } + if (strcmp(name, BTF_ELF_SEC) == 0) { + btf_data = elf_getdata(scn, 0); + if (!btf_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } else if (btf_ext && strcmp(name, BTF_EXT_ELF_SEC) == 0) { + btf_ext_data = elf_getdata(scn, 0); + if (!btf_ext_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } + } + + if (!btf_data) { + pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path); + err = -ENODATA; + goto done; + } + btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); + err = libbpf_get_error(btf); + if (err) + goto done; + + switch 
(gelf_getclass(elf)) { + case ELFCLASS32: + btf__set_pointer_size(btf, 4); + break; + case ELFCLASS64: + btf__set_pointer_size(btf, 8); + break; + default: + pr_warn("failed to get ELF class (bitness) for %s\n", path); + break; + } + + if (btf_ext && btf_ext_data) { + *btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(*btf_ext); + if (err) + goto done; + } else if (btf_ext) { + *btf_ext = NULL; + } +done: + if (elf) + elf_end(elf); + close(fd); + + if (!err) + return btf; + + if (btf_ext) + btf_ext__free(*btf_ext); + btf__free(btf); + + return ERR_PTR(err); +} + +struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse_elf(path, NULL, btf_ext)); +} + +struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_elf(path, base_btf, NULL)); +} + +static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) +{ + struct btf *btf = NULL; + void *data = NULL; + FILE *f = NULL; + __u16 magic; + int err = 0; + long sz; + + f = fopen(path, "rbe"); + if (!f) { + err = -errno; + goto err_out; + } + + /* check BTF magic */ + if (fread(&magic, 1, sizeof(magic), f) < sizeof(magic)) { + err = -EIO; + goto err_out; + } + if (magic != BTF_MAGIC && magic != bswap_16(BTF_MAGIC)) { + /* definitely not a raw BTF */ + err = -EPROTO; + goto err_out; + } + + /* get file size */ + if (fseek(f, 0, SEEK_END)) { + err = -errno; + goto err_out; + } + sz = ftell(f); + if (sz < 0) { + err = -errno; + goto err_out; + } + /* rewind to the start */ + if (fseek(f, 0, SEEK_SET)) { + err = -errno; + goto err_out; + } + + /* pre-alloc memory and read all of BTF data */ + data = malloc(sz); + if (!data) { + err = -ENOMEM; + goto err_out; + } + if (fread(data, 1, sz, f) < sz) { + err = -EIO; + goto err_out; + } + + /* finally parse BTF data */ + btf = btf_new(data, sz, base_btf); + +err_out: + free(data); + if (f) + fclose(f); + return err ? 
ERR_PTR(err) : btf; +} + +struct btf *btf__parse_raw(const char *path) +{ + return libbpf_ptr(btf_parse_raw(path, NULL)); +} + +struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_raw(path, base_btf)); +} + +static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) +{ + struct btf *btf; + int err; + + if (btf_ext) + *btf_ext = NULL; + + btf = btf_parse_raw(path, base_btf); + err = libbpf_get_error(btf); + if (!err) + return btf; + if (err != -EPROTO) + return ERR_PTR(err); + return btf_parse_elf(path, base_btf, btf_ext); +} + +struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse(path, NULL, btf_ext)); +} + +struct btf *btf__parse_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse(path, base_btf, NULL)); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); + +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + __u32 buf_sz = 0, raw_size; + char *buf = NULL, *tmp; + void *raw_data; + int err = 0; + + if (btf->fd >= 0) + return libbpf_err(-EEXIST); + if (log_sz && !log_buf) + return libbpf_err(-EINVAL); + + /* cache native raw data representation */ + raw_data = btf_get_raw_data(btf, &raw_size, false); + if (!raw_data) { + err = -ENOMEM; + goto done; + } + btf->raw_size = raw_size; + btf->raw_data = raw_data; + +retry_load: + /* if log_level is 0, we won't provide log_buf/log_size to the kernel, + * initially. Only if BTF loading fails, we bump log_level to 1 and + * retry, using either auto-allocated or custom log_buf. This way + * non-NULL custom log_buf provides a buffer just in case, but hopes + * for successful load and no need for log_buf. + */ + if (log_level) { + /* if caller didn't provide custom log_buf, we'll keep + * allocating our own progressively bigger buffers for BTF + * verification log + */ + if (!log_buf) { + buf_sz = max((__u32)BPF_LOG_BUF_SIZE, buf_sz * 2); + tmp = realloc(buf, buf_sz); + if (!tmp) { + err = -ENOMEM; + goto done; + } + buf = tmp; + buf[0] = '\0'; + } + + opts.log_buf = log_buf ? log_buf : buf; + opts.log_size = log_buf ? log_sz : buf_sz; + opts.log_level = log_level; + } + + btf->fd = bpf_btf_load(raw_data, raw_size, &opts); + if (btf->fd < 0) { + /* time to turn on verbose mode and try again */ + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* only retry if caller didn't provide custom log_buf, but + * make sure we can never overflow buf_sz + */ + if (!log_buf && errno == ENOSPC && buf_sz <= UINT_MAX / 2) + goto retry_load; + + err = -errno; + pr_warn("BTF loading error: %d\n", err); + /* don't print out contents of custom log_buf */ + if (!log_buf && buf[0]) + pr_warn("-- BEGIN BTF LOAD LOG ---\n%s\n-- END BTF LOAD LOG --\n", buf); + } + +done: + free(buf); + return libbpf_err(err); +} + +int btf__load_into_kernel(struct btf *btf) +{ + return btf_load_into_kernel(btf, NULL, 0, 0); +} + +int btf__fd(const struct btf *btf) +{ + return btf->fd; +} + +void btf__set_fd(struct btf *btf, int fd) +{ + btf->fd = fd; +} + +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? 
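/*
 * Illustrative sketch (not part of the original source): parse BTF from a
 * file (raw BTF or an ELF with a .BTF section) and load it into the kernel.
 * Note that btf__free() also closes the kernel FD, so a real user would keep
 * 'btf' alive for as long as the FD is needed.
 */
static int example_parse_and_load(const char *path)
{
	struct btf *btf = btf__parse(path, NULL /* no .BTF.ext needed */);
	int err;

	if (!btf)
		return -errno;        /* btf__parse() sets errno on failure */

	err = btf__load_into_kernel(btf);
	/* while 'btf' is alive, btf__fd(btf) returns the kernel BTF FD */
	btf__free(btf);
	return err;
}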
btf->strs_data : strset__data(btf->strs_set); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) +{ + struct btf_header *hdr = btf->hdr; + struct btf_type *t; + void *data, *p; + __u32 data_sz; + int i; + + data = swap_endian ? btf->raw_data_swapped : btf->raw_data; + if (data) { + *size = btf->raw_size; + return data; + } + + data_sz = hdr->hdr_len + hdr->type_len + hdr->str_len; + data = calloc(1, data_sz); + if (!data) + return NULL; + p = data; + + memcpy(p, hdr, hdr->hdr_len); + if (swap_endian) + btf_bswap_hdr(p); + p += hdr->hdr_len; + + memcpy(p, btf->types_data, hdr->type_len); + if (swap_endian) { + for (i = 0; i < btf->nr_types; i++) { + t = p + btf->type_offs[i]; + /* btf_bswap_type_rest() relies on native t->info, so + * we swap base type info after we swapped all the + * additional information + */ + if (btf_bswap_type_rest(t)) + goto err_out; + btf_bswap_type_base(t); + } + } + p += hdr->type_len; + + memcpy(p, btf_strs_data(btf), hdr->str_len); + p += hdr->str_len; + + *size = data_sz; + return data; +err_out: + free(data); + return NULL; +} + +const void *btf__raw_data(const struct btf *btf_ro, __u32 *size) +{ + struct btf *btf = (struct btf *)btf_ro; + __u32 data_sz; + void *data; + + data = btf_get_raw_data(btf, &data_sz, btf->swapped_endian); + if (!data) + return errno = ENOMEM, NULL; + + btf->raw_size = data_sz; + if (btf->swapped_endian) + btf->raw_data_swapped = data; + else + btf->raw_data = data; + *size = data_sz; + return data; +} + +__attribute__((alias("btf__raw_data"))) +const void *btf__get_raw_data(const struct btf *btf, __u32 *size); + +const char *btf__str_by_offset(const struct btf *btf, __u32 offset) +{ + if (offset < btf->start_str_off) + return btf__str_by_offset(btf->base_btf, offset); + else if (offset - btf->start_str_off < btf->hdr->str_len) + return btf_strs_data(btf) + (offset - btf->start_str_off); + else + return errno = EINVAL, NULL; +} + +const char *btf__name_by_offset(const struct btf *btf, __u32 offset) +{ + return btf__str_by_offset(btf, offset); +} + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) +{ + struct bpf_btf_info btf_info; + __u32 len = sizeof(btf_info); + __u32 last_size; + struct btf *btf; + void *ptr; + int err; + + /* we won't know btf_size until we call bpf_btf_get_info_by_fd(). so + * let's start with a sane default - 4KiB here - and resize it only if + * bpf_btf_get_info_by_fd() needs a bigger buffer. + */ + last_size = 4096; + ptr = malloc(last_size); + if (!ptr) + return ERR_PTR(-ENOMEM); + + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); + + if (!err && btf_info.btf_size > last_size) { + void *temp_ptr; + + last_size = btf_info.btf_size; + temp_ptr = realloc(ptr, last_size); + if (!temp_ptr) { + btf = ERR_PTR(-ENOMEM); + goto exit_free; + } + ptr = temp_ptr; + + len = sizeof(btf_info); + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); + } + + if (err || btf_info.btf_size > last_size) { + btf = err ? 
ERR_PTR(-errno) : ERR_PTR(-E2BIG); + goto exit_free; + } + + btf = btf_new(ptr, btf_info.btf_size, base_btf); + +exit_free: + free(ptr); + return btf; +} + +struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf) +{ + struct btf *btf; + int btf_fd; + + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return libbpf_err_ptr(-errno); + + btf = btf_get_from_fd(btf_fd, base_btf); + close(btf_fd); + + return libbpf_ptr(btf); +} + +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + return btf__load_from_kernel_by_id_split(id, NULL); +} + +static void btf_invalidate_raw_data(struct btf *btf) +{ + if (btf->raw_data) { + free(btf->raw_data); + btf->raw_data = NULL; + } + if (btf->raw_data_swapped) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } +} + +/* Ensure BTF is ready to be modified (by splitting into a three memory + * regions for header, types, and strings). Also invalidate cached + * raw_data, if any. + */ +static int btf_ensure_modifiable(struct btf *btf) +{ + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; + + if (btf_is_modifiable(btf)) { + /* any BTF modification invalidates raw_data */ + btf_invalidate_raw_data(btf); + return 0; + } + + /* split raw data into three memory regions */ + hdr = malloc(btf->hdr->hdr_len); + types = malloc(btf->hdr->type_len); + if (!hdr || !types) + goto err_out; + + memcpy(hdr, btf->hdr, btf->hdr->hdr_len); + memcpy(types, btf->types_data, btf->hdr->type_len); + + /* build lookup index for all strings */ + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); + goto err_out; + } + + /* only when everything was successful, update internal state */ + btf->hdr = hdr; + btf->types_data = types; + btf->types_data_cap = btf->hdr->type_len; + btf->strs_data = NULL; + btf->strs_set = set; + /* if BTF was created from scratch, all strings are guaranteed to be + * unique and deduplicated + */ + if (btf->hdr->str_len == 0) + btf->strs_deduped = true; + if (!btf->base_btf && btf->hdr->str_len == 1) + btf->strs_deduped = true; + + /* invalidate raw_data representation */ + btf_invalidate_raw_data(btf); + + return 0; + +err_out: + strset__free(set); + free(hdr); + free(types); + return err; +} + +/* Find an offset in BTF string section that corresponds to a given string *s*. + * Returns: + * - >0 offset into string section, if string is found; + * - -ENOENT, if string is not in the string section; + * - <0, on any other error. + */ +int btf__find_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + /* BTF needs to be in a modifiable state to build string lookup index */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + return btf->start_str_off + off; +} + +/* Add a string s to the BTF string section. + * Returns: + * - > 0 offset into string section, on success; + * - < 0, on error. 
+ */ +int btf__add_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + btf->hdr->str_len = strset__data_size(btf->strs_set); + + return btf->start_str_off + off; +} + +static void *btf_add_type_mem(struct btf *btf, size_t add_sz) +{ + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); +} + +static void btf_type_inc_vlen(struct btf_type *t) +{ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); +} + +static int btf_commit_type(struct btf *btf, int data_sz) +{ + int err; + + err = btf_add_type_idx_entry(btf, btf->hdr->type_len); + if (err) + return libbpf_err(err); + + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types++; + return btf->start_id + btf->nr_types - 1; +} + +struct btf_pipe { + const struct btf *src; + struct btf *dst; + struct hashmap *str_off_map; /* map string offsets from src to dst */ +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + long mapped_off; + int off, err; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + if (p->str_off_map && + hashmap__find(p->str_off_map, *str_off, &mapped_off)) { + *str_off = mapped_off; + return 0; + } + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + /* Remember string mapping from src to dst. It avoids + * performing expensive string comparisons. + */ + if (p->str_off_map) { + err = hashmap__append(p->str_off_map, *str_off, off); + if (err) + return err; + } + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return libbpf_err(sz); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return libbpf_err(err); + + return btf_commit_type(btf, sz); +} + +static int btf_rewrite_type_ids(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (!*type_id) /* nothing to do for VOID references */ + return 0; + + /* we haven't updated btf's type count yet, so + * btf->start_id + btf->nr_types - 1 is the type ID offset we should + * add to all newly added BTF types + */ + *type_id += btf->start_id + btf->nr_types - 1; + return 0; +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx); +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx); + +int btf__add_btf(struct btf *btf, const struct btf *src_btf) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + int data_sz, sz, cnt, i, err, old_strs_len; + __u32 *off; + void *t; + + /* appending split BTF isn't supported yet */ + if (src_btf->base_btf) + return libbpf_err(-ENOTSUP); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + /* remember original strings section size if we have to roll back + * partial strings section changes + */ + old_strs_len = 
btf->hdr->str_len; + + data_sz = src_btf->hdr->type_len; + cnt = btf__type_cnt(src_btf) - 1; + + /* pre-allocate enough memory for new types */ + t = btf_add_type_mem(btf, data_sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* pre-allocate enough memory for type offset index for new types */ + off = btf_add_type_offs_mem(btf, cnt); + if (!off) + return libbpf_err(-ENOMEM); + + /* Map the string offsets from src_btf to the offsets from btf to improve performance */ + p.str_off_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(p.str_off_map)) + return libbpf_err(-ENOMEM); + + /* bulk copy types data for all types from src_btf */ + memcpy(t, src_btf->types_data, data_sz); + + for (i = 0; i < cnt; i++) { + sz = btf_type_size(t); + if (sz < 0) { + /* unlikely, has to be corrupted src_btf */ + err = sz; + goto err_out; + } + + /* fill out type ID to type offset mapping for lookups by type ID */ + *off = t - btf->types_data; + + /* add, dedup, and remap strings referenced by this BTF type */ + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + goto err_out; + + /* remap all type IDs referenced from this BTF type */ + err = btf_type_visit_type_ids(t, btf_rewrite_type_ids, btf); + if (err) + goto err_out; + + /* go to next type data and type offset index entry */ + t += sz; + off++; + } + + /* Up until now any of the copied type data was effectively invisible, + * so if we exited early before this point due to error, BTF would be + * effectively unmodified. There would be extra internal memory + * pre-allocated, but it would not be available for querying. But now + * that we've copied and rewritten all the data successfully, we can + * update type count and various internal offsets and sizes to + * "commit" the changes and made them visible to the outside world. + */ + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types += cnt; + + hashmap__free(p.str_off_map); + + /* return type ID of the first added BTF type */ + return btf->start_id + btf->nr_types - cnt; +err_out: + /* zero out preallocated memory as if it was just allocated with + * libbpf_add_mem() + */ + memset(btf->types_data + btf->hdr->type_len, 0, data_sz); + memset(btf->strs_data + old_strs_len, 0, btf->hdr->str_len - old_strs_len); + + /* and now restore original strings section size; types data size + * wasn't modified, so doesn't need restoring, see big comment above + */ + btf->hdr->str_len = old_strs_len; + + hashmap__free(p.str_off_map); + + return libbpf_err(err); +} + +/* + * Append new BTF_KIND_INT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - power-of-2 (1, 2, 4, ..) size of the type, in bytes; + * - encoding is a combination of BTF_INT_SIGNED, BTF_INT_CHAR, BTF_INT_BOOL. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
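/*
 * Illustrative sketch (not part of the original source): merging all types
 * from 'src' (which must not be split BTF) into 'dst' with btf__add_btf().
 * The return value is the new ID of the first copied type, which lets the
 * caller remap references that pointed into 'src'.
 */
static int example_merge_btf(struct btf *dst, const struct btf *src)
{
	int first_id = btf__add_btf(dst, src);

	if (first_id < 0)
		return first_id;
	/* type [i] from src is now type [first_id + i - 1] in dst */
	return 0;
}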
+ */ +int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) + return libbpf_err(-EINVAL); + if (encoding & ~(BTF_INT_SIGNED | BTF_INT_CHAR | BTF_INT_BOOL)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(int); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* if something goes wrong later, we might end up with an extra string, + * but that shouldn't be a problem, because BTF can't be constructed + * completely anyway and will most probably be just discarded + */ + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_INT, 0, 0); + t->size = byte_sz; + /* set INT info, we don't allow setting legacy bit offset/size */ + *(__u32 *)(t + 1) = (encoding << 24) | (byte_sz * 8); + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* it's completely legal to append BTF types with type IDs pointing forward to + * types that haven't been appended yet, so we only make sure that id looks + * sane, we can't guarantee that ID will always be valid + */ +static int validate_type_id(int id) +{ + if (id < 0 || id > BTF_MAX_NR_TYPES) + return -EINVAL; + return 0; +} + +/* generic append function for PTR, TYPEDEF, CONST/VOLATILE/RESTRICT */ +static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref_type_id) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->type = ref_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_PTR type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
+ */ +int btf__add_ptr(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_PTR, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_ARRAY type with: + * - *index_type_id* - type ID of the type describing array index; + * - *elem_type_id* - type ID of the type describing array element; + * - *nr_elems* - the size of the array; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems) +{ + struct btf_type *t; + struct btf_array *a; + int sz; + + if (validate_type_id(index_type_id) || validate_type_id(elem_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_array); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_ARRAY, 0, 0); + t->size = 0; + + a = btf_array(t); + a->type = elem_type_id; + a->index_type = index_type_id; + a->nelems = nr_elems; + + return btf_commit_type(btf, sz); +} + +/* generic STRUCT/UNION append function */ +static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 bytes_sz) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0 and no kflag; this will be adjusted when + * adding each member + */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->size = bytes_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_STRUCT type with: + * - *name* - name of the struct, can be NULL or empty for anonymous structs; + * - *byte_sz* - size of the struct, in bytes; + * + * Struct initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_struct() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_struct(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_STRUCT, name, byte_sz); +} + +/* + * Append new BTF_KIND_UNION type with: + * - *name* - name of the union, can be NULL or empty for anonymous union; + * - *byte_sz* - size of the union, in bytes; + * + * Union initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_union() succeeds. All fields + * should have *bit_offset* of 0. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_union(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_UNION, name, byte_sz); +} + +static struct btf_type *btf_last_type(struct btf *btf) +{ + return btf_type_by_id(btf, btf__type_cnt(btf) - 1); +} + +/* + * Append new field for the current STRUCT/UNION type with: + * - *name* - name of the field, can be NULL or empty for anonymous field; + * - *type_id* - type ID for the type describing field type; + * - *bit_offset* - bit offset of the start of the field within struct/union; + * - *bit_size* - bit size of a bitfield, 0 for non-bitfield fields; + * Returns: + * - 0, on success; + * - <0, on error. 
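+ *
+ * As an illustrative sketch (assuming *int_id* refers to a 4-byte int type
+ * appended earlier; error handling omitted), a struct with two int members
+ * could be built as:
+ *
+ *	int pair_id = btf__add_struct(btf, "pair", 8);
+ *	btf__add_field(btf, "a", int_id, 0, 0);   /* bit offset 0 */
+ *	btf__add_field(btf, "b", int_id, 32, 0);  /* bit offset 32 = byte 4 */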
+ */ +int btf__add_field(struct btf *btf, const char *name, int type_id, + __u32 bit_offset, __u32 bit_size) +{ + struct btf_type *t; + struct btf_member *m; + bool is_bitfield; + int sz, name_off = 0; + + /* last type should be union/struct */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_composite(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + /* best-effort bit field offset/size enforcement */ + is_bitfield = bit_size || (bit_offset % 8 != 0); + if (is_bitfield && (bit_size == 0 || bit_size > 255 || bit_offset > 0xffffff)) + return libbpf_err(-EINVAL); + + /* only offset 0 is allowed for unions */ + if (btf_is_union(t) && bit_offset) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_member); + m = btf_add_type_mem(btf, sz); + if (!m) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + m->name_off = name_off; + m->type = type_id; + m->offset = bit_offset | (bit_size << 24); + + /* btf_add_type_mem can invalidate t pointer */ + t = btf_last_type(btf); + /* update parent type's vlen and kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed, __u8 kind) +{ + struct btf_type *t; + int sz, name_off = 0; + + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0; it will be adjusted when adding enum values */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, is_signed); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_ENUM type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum_value() + * immediately after btf__add_enum() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +{ + /* + * set the signedness to be unsigned, it will change to signed + * if any later enumerator is negative. + */ + return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); +} + +/* + * Append new enum value for the current ENUM type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. 
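+ *
+ * For example (error checking omitted for brevity), a small hypothetical
+ * enum could be appended as:
+ *
+ *	int enum_id = btf__add_enum(btf, "state", 4);
+ *	btf__add_enum_value(btf, "STATE_OFF", 0);
+ *	btf__add_enum_value(btf, "STATE_ON", 1);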
+ */ +int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) +{ + struct btf_type *t; + struct btf_enum *v; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (value < INT_MIN || value > UINT_MAX) + return libbpf_err(-E2BIG); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val = value; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + /* if negative value, set signedness to signed */ + if (value < 0) + t->info = btf_type_info(btf_kind(t), btf_vlen(t), true); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_ENUM64 type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * - *is_signed* - whether the enum values are signed or not; + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum64_value() + * immediately after btf__add_enum64() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum64(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed) +{ + return btf_add_enum_common(btf, name, byte_sz, is_signed, + BTF_KIND_ENUM64); +} + +/* + * Append new enum value for the current ENUM64 type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) +{ + struct btf_enum64 *v; + struct btf_type *t; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM64 */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum64(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum64); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val_lo32 = (__u32)value; + v->val_hi32 = value >> 32; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_FWD type with: + * - *name*, non-empty/non-NULL name; + * - *fwd_kind*, kind of forward declaration, one of BTF_FWD_STRUCT, + * BTF_FWD_UNION, or BTF_FWD_ENUM; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
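+ *
+ * E.g., a forward declaration corresponding to "struct task_struct;" could
+ * roughly be appended as:
+ *
+ *	int fwd_id = btf__add_fwd(btf, "task_struct", BTF_FWD_STRUCT);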
+ */ +int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + switch (fwd_kind) { + case BTF_FWD_STRUCT: + case BTF_FWD_UNION: { + struct btf_type *t; + int id; + + id = btf_add_ref_kind(btf, BTF_KIND_FWD, name, 0); + if (id <= 0) + return id; + t = btf_type_by_id(btf, id); + t->info = btf_type_info(BTF_KIND_FWD, 0, fwd_kind == BTF_FWD_UNION); + return id; + } + case BTF_FWD_ENUM: + /* enum forward in BTF currently is just an enum with no enum + * values; we also assume a standard 4-byte size for it + */ + return btf__add_enum(btf, name, sizeof(int)); + default: + return libbpf_err(-EINVAL); + } +} + +/* + * Append new BTF_KING_TYPEDEF type with: + * - *name*, non-empty/non-NULL name; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id); +} + +/* + * Append new BTF_KIND_VOLATILE type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_volatile(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_VOLATILE, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_CONST type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_const(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_CONST, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_RESTRICT type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_restrict(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_RESTRICT, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_TYPE_TAG type with: + * - *value*, non-empty/non-NULL tag value; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) +{ + if (!value || !value[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id); +} + +/* + * Append new BTF_KIND_FUNC type with: + * - *name*, non-empty/non-NULL name; + * - *proto_type_id* - FUNC_PROTO's type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id) +{ + int id; + + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && + linkage != BTF_FUNC_EXTERN) + return libbpf_err(-EINVAL); + + id = btf_add_ref_kind(btf, BTF_KIND_FUNC, name, proto_type_id); + if (id > 0) { + struct btf_type *t = btf_type_by_id(btf, id); + + t->info = btf_type_info(BTF_KIND_FUNC, linkage, 0); + } + return libbpf_err(id); +} + +/* + * Append new BTF_KIND_FUNC_PROTO with: + * - *ret_type_id* - type ID for return result of a function. 
+ * + * Function prototype initially has no arguments, but they can be added by + * btf__add_func_param() one by one, immediately after + * btf__add_func_proto() succeeded. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func_proto(struct btf *btf, int ret_type_id) +{ + struct btf_type *t; + int sz; + + if (validate_type_id(ret_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* start out with vlen=0; this will be adjusted when adding enum + * values, if necessary + */ + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_FUNC_PROTO, 0, 0); + t->type = ret_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new function parameter for current FUNC_PROTO type with: + * - *name* - parameter name, can be NULL or empty; + * - *type_id* - type ID describing the type of the parameter. + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_func_param(struct btf *btf, const char *name, int type_id) +{ + struct btf_type *t; + struct btf_param *p; + int sz, name_off = 0; + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* last type should be BTF_KIND_FUNC_PROTO */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_func_proto(t)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_param); + p = btf_add_type_mem(btf, sz); + if (!p) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + p->name_off = name_off; + p->type = type_id; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_VAR type with: + * - *name* - non-empty/non-NULL name; + * - *linkage* - variable linkage, one of BTF_VAR_STATIC, + * BTF_VAR_GLOBAL_ALLOCATED, or BTF_VAR_GLOBAL_EXTERN; + * - *type_id* - type ID of the type describing the type of the variable. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) +{ + struct btf_type *t; + struct btf_var *v; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && + linkage != BTF_VAR_GLOBAL_EXTERN) + return libbpf_err(-EINVAL); + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_var); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_VAR, 0, 0); + t->type = type_id; + + v = btf_var(t); + v->linkage = linkage; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_DATASEC type with: + * - *name* - non-empty/non-NULL name; + * - *byte_sz* - data section size, in bytes. + * + * Data section is initially empty. 
Variables info can be added with + * btf__add_datasec_var_info() calls, after btf__add_datasec() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + /* start with vlen=0, which will be update as var_secinfos are added */ + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_DATASEC, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new data section variable information entry for current DATASEC type: + * - *var_type_id* - type ID, describing type of the variable; + * - *offset* - variable offset within data section, in bytes; + * - *byte_sz* - variable size, in bytes. + * + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __u32 byte_sz) +{ + struct btf_type *t; + struct btf_var_secinfo *v; + int sz; + + /* last type should be BTF_KIND_DATASEC */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_datasec(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(var_type_id)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_var_secinfo); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + v->type = var_type_id; + v->offset = offset; + v->size = byte_sz; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_DECL_TAG type with: + * - *value* - non-empty/non-NULL string; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * - *component_idx* - -1 for tagging reference type, otherwise struct/union + * member or function argument index; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
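+ *
+ * As a rough sketch, given some previously appended struct type *struct_id*
+ * (a hypothetical ID), tagging the struct itself and its first member could
+ * look like:
+ *
+ *	btf__add_decl_tag(btf, "user", struct_id, -1); /* tag the type itself */
+ *	btf__add_decl_tag(btf, "user", struct_id, 0);  /* tag member #0 */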
+ */ +int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx) +{ + struct btf_type *t; + int sz, value_off; + + if (!value || !value[0] || component_idx < -1) + return libbpf_err(-EINVAL); + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_decl_tag); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + value_off = btf__add_str(btf, value); + if (value_off < 0) + return value_off; + + t->name_off = value_off; + t->info = btf_type_info(BTF_KIND_DECL_TAG, 0, false); + t->type = ref_type_id; + btf_decl_tag(t)->component_idx = component_idx; + + return btf_commit_type(btf, sz); +} + +struct btf_ext_sec_setup_param { + __u32 off; + __u32 len; + __u32 min_rec_size; + struct btf_ext_info *ext_info; + const char *desc; +}; + +static int btf_ext_setup_info(struct btf_ext *btf_ext, + struct btf_ext_sec_setup_param *ext_sec) +{ + const struct btf_ext_info_sec *sinfo; + struct btf_ext_info *ext_info; + __u32 info_left, record_size; + size_t sec_cnt = 0; + /* The start of the info sec (including the __u32 record_size). */ + void *info; + + if (ext_sec->len == 0) + return 0; + + if (ext_sec->off & 0x03) { + pr_debug(".BTF.ext %s section is not aligned to 4 bytes\n", + ext_sec->desc); + return -EINVAL; + } + + info = btf_ext->data + btf_ext->hdr->hdr_len + ext_sec->off; + info_left = ext_sec->len; + + if (btf_ext->data + btf_ext->data_size < info + ext_sec->len) { + pr_debug("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", + ext_sec->desc, ext_sec->off, ext_sec->len); + return -EINVAL; + } + + /* At least a record size */ + if (info_left < sizeof(__u32)) { + pr_debug(".BTF.ext %s record size not found\n", ext_sec->desc); + return -EINVAL; + } + + /* The record size needs to meet the minimum standard */ + record_size = *(__u32 *)info; + if (record_size < ext_sec->min_rec_size || + record_size & 0x03) { + pr_debug("%s section in .BTF.ext has invalid record size %u\n", + ext_sec->desc, record_size); + return -EINVAL; + } + + sinfo = info + sizeof(__u32); + info_left -= sizeof(__u32); + + /* If no records, return failure now so .BTF.ext won't be used. 
*/ + if (!info_left) { + pr_debug("%s section in .BTF.ext has no records", ext_sec->desc); + return -EINVAL; + } + + while (info_left) { + unsigned int sec_hdrlen = sizeof(struct btf_ext_info_sec); + __u64 total_record_size; + __u32 num_records; + + if (info_left < sec_hdrlen) { + pr_debug("%s section header is not found in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + num_records = sinfo->num_info; + if (num_records == 0) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + total_record_size = sec_hdrlen + (__u64)num_records * record_size; + if (info_left < total_record_size) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + info_left -= total_record_size; + sinfo = (void *)sinfo + total_record_size; + sec_cnt++; + } + + ext_info = ext_sec->ext_info; + ext_info->len = ext_sec->len - sizeof(__u32); + ext_info->rec_size = record_size; + ext_info->info = info + sizeof(__u32); + ext_info->sec_cnt = sec_cnt; + + return 0; +} + +static int btf_ext_setup_func_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->func_info_off, + .len = btf_ext->hdr->func_info_len, + .min_rec_size = sizeof(struct bpf_func_info_min), + .ext_info = &btf_ext->func_info, + .desc = "func_info" + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_line_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->line_info_off, + .len = btf_ext->hdr->line_info_len, + .min_rec_size = sizeof(struct bpf_line_info_min), + .ext_info = &btf_ext->line_info, + .desc = "line_info", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_core_relos(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->core_relo_off, + .len = btf_ext->hdr->core_relo_len, + .min_rec_size = sizeof(struct bpf_core_relo), + .ext_info = &btf_ext->core_relo_info, + .desc = "core_relo", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) +{ + const struct btf_ext_header *hdr = (struct btf_ext_header *)data; + + if (data_size < offsetofend(struct btf_ext_header, hdr_len) || + data_size < hdr->hdr_len) { + pr_debug("BTF.ext header not found"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { + pr_warn("BTF.ext in non-native endianness is not supported\n"); + return -ENOTSUP; + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF.ext magic:%x\n", hdr->magic); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + pr_debug("Unsupported BTF.ext version:%u\n", hdr->version); + return -ENOTSUP; + } + + if (hdr->flags) { + pr_debug("Unsupported BTF.ext flags:%x\n", hdr->flags); + return -ENOTSUP; + } + + if (data_size == hdr->hdr_len) { + pr_debug("BTF.ext has no data\n"); + return -EINVAL; + } + + return 0; +} + +void btf_ext__free(struct btf_ext *btf_ext) +{ + if (IS_ERR_OR_NULL(btf_ext)) + return; + free(btf_ext->func_info.sec_idxs); + free(btf_ext->line_info.sec_idxs); + free(btf_ext->core_relo_info.sec_idxs); + free(btf_ext->data); + free(btf_ext); +} + +struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) +{ + struct btf_ext *btf_ext; + int err; + + btf_ext = calloc(1, sizeof(struct btf_ext)); + if (!btf_ext) + return libbpf_err_ptr(-ENOMEM); + + btf_ext->data_size = size; + btf_ext->data = malloc(size); + if (!btf_ext->data) { + err = -ENOMEM; + goto done; + 
} + memcpy(btf_ext->data, data, size); + + err = btf_ext_parse_hdr(btf_ext->data, size); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) { + err = -EINVAL; + goto done; + } + + err = btf_ext_setup_func_info(btf_ext); + if (err) + goto done; + + err = btf_ext_setup_line_info(btf_ext); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) + goto done; /* skip core relos parsing */ + + err = btf_ext_setup_core_relos(btf_ext); + if (err) + goto done; + +done: + if (err) { + btf_ext__free(btf_ext); + return libbpf_err_ptr(err); + } + + return btf_ext; +} + +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +{ + *size = btf_ext->data_size; + return btf_ext->data; +} + +struct btf_dedup; + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); +static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_prep(struct btf_dedup *d); +static int btf_dedup_strings(struct btf_dedup *d); +static int btf_dedup_prim_types(struct btf_dedup *d); +static int btf_dedup_struct_types(struct btf_dedup *d); +static int btf_dedup_ref_types(struct btf_dedup *d); +static int btf_dedup_resolve_fwds(struct btf_dedup *d); +static int btf_dedup_compact_types(struct btf_dedup *d); +static int btf_dedup_remap_types(struct btf_dedup *d); + +/* + * Deduplicate BTF types and strings. + * + * BTF dedup algorithm takes as an input `struct btf` representing `.BTF` ELF + * section with all BTF type descriptors and string data. It overwrites that + * memory in-place with deduplicated types and strings without any loss of + * information. If optional `struct btf_ext` representing '.BTF.ext' ELF section + * is provided, all the strings referenced from .BTF.ext section are honored + * and updated to point to the right offsets after deduplication. + * + * If function returns with error, type/string data might be garbled and should + * be discarded. + * + * More verbose and detailed description of both problem btf_dedup is solving, + * as well as solution could be found at: + * https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html + * + * Problem description and justification + * ===================================== + * + * BTF type information is typically emitted either as a result of conversion + * from DWARF to BTF or directly by compiler. In both cases, each compilation + * unit contains information about a subset of all the types that are used + * in an application. These subsets are frequently overlapping and contain a lot + * of duplicated information when later concatenated together into a single + * binary. This algorithm ensures that each unique type is represented by single + * BTF type descriptor, greatly reducing resulting size of BTF data. + * + * Compilation unit isolation and subsequent duplication of data is not the only + * problem. The same type hierarchy (e.g., struct and all the type that struct + * references) in different compilation units can be represented in BTF to + * various degrees of completeness (or, rather, incompleteness) due to + * struct/union forward declarations. + * + * Let's take a look at an example, that we'll use to better understand the + * problem (and solution). 
Suppose we have two compilation units, each using + * same `struct S`, but each of them having incomplete type information about + * struct's fields: + * + * // CU #1: + * struct S; + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * // CU #2: + * struct S; + * struct A; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * In case of CU #1, BTF data will know only that `struct B` exist (but no + * more), but will know the complete type information about `struct A`. While + * for CU #2, it will know full type information about `struct B`, but will + * only know about forward declaration of `struct A` (in BTF terms, it will + * have `BTF_KIND_FWD` type descriptor with name `B`). + * + * This compilation unit isolation means that it's possible that there is no + * single CU with complete type information describing structs `S`, `A`, and + * `B`. Also, we might get tons of duplicated and redundant type information. + * + * Additional complication we need to keep in mind comes from the fact that + * types, in general, can form graphs containing cycles, not just DAGs. + * + * While algorithm does deduplication, it also merges and resolves type + * information (unless disabled throught `struct btf_opts`), whenever possible. + * E.g., in the example above with two compilation units having partial type + * information for structs `A` and `B`, the output of algorithm will emit + * a single copy of each BTF type that describes structs `A`, `B`, and `S` + * (as well as type information for `int` and pointers), as if they were defined + * in a single compilation unit as: + * + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * Algorithm summary + * ================= + * + * Algorithm completes its work in 7 separate passes: + * + * 1. Strings deduplication. + * 2. Primitive types deduplication (int, enum, fwd). + * 3. Struct/union types deduplication. + * 4. Resolve unambiguous forward declarations. + * 5. Reference types deduplication (pointers, typedefs, arrays, funcs, func + * protos, and const/volatile/restrict modifiers). + * 6. Types compaction. + * 7. Types remapping. + * + * Algorithm determines canonical type descriptor, which is a single + * representative type for each truly unique type. This canonical type is the + * one that will go into final deduplicated BTF type information. For + * struct/unions, it is also the type that algorithm will merge additional type + * information into (while resolving FWDs), as it discovers it from data in + * other CUs. Each input BTF type eventually gets either mapped to itself, if + * that type is canonical, or to some other type, if that type is equivalent + * and was chosen as canonical representative. This mapping is stored in + * `btf_dedup->map` array. This map is also used to record STRUCT/UNION that + * FWD type got resolved to. + * + * To facilitate fast discovery of canonical types, we also maintain canonical + * index (`btf_dedup->dedup_table`), which maps type descriptor's signature hash + * (i.e., hashed kind, name, size, fields, etc) into a list of canonical types + * that match that signature. 
With sufficiently good choice of type signature + * hashing function, we can limit number of canonical types for each unique type + * signature to a very small number, allowing to find canonical type for any + * duplicated type very quickly. + * + * Struct/union deduplication is the most critical part and algorithm for + * deduplicating structs/unions is described in greater details in comments for + * `btf_dedup_is_equiv` function. + */ +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d; + int err; + + if (!OPTS_VALID(opts, btf_dedup_opts)) + return libbpf_err(-EINVAL); + + d = btf_dedup_new(btf, opts); + if (IS_ERR(d)) { + pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d)); + return libbpf_err(-EINVAL); + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + err = btf_dedup_prep(d); + if (err) { + pr_debug("btf_dedup_prep failed:%d\n", err); + goto done; + } + err = btf_dedup_strings(d); + if (err < 0) { + pr_debug("btf_dedup_strings failed:%d\n", err); + goto done; + } + err = btf_dedup_prim_types(d); + if (err < 0) { + pr_debug("btf_dedup_prim_types failed:%d\n", err); + goto done; + } + err = btf_dedup_struct_types(d); + if (err < 0) { + pr_debug("btf_dedup_struct_types failed:%d\n", err); + goto done; + } + err = btf_dedup_resolve_fwds(d); + if (err < 0) { + pr_debug("btf_dedup_resolve_fwds failed:%d\n", err); + goto done; + } + err = btf_dedup_ref_types(d); + if (err < 0) { + pr_debug("btf_dedup_ref_types failed:%d\n", err); + goto done; + } + err = btf_dedup_compact_types(d); + if (err < 0) { + pr_debug("btf_dedup_compact_types failed:%d\n", err); + goto done; + } + err = btf_dedup_remap_types(d); + if (err < 0) { + pr_debug("btf_dedup_remap_types failed:%d\n", err); + goto done; + } + +done: + btf_dedup_free(d); + return libbpf_err(err); +} + +#define BTF_UNPROCESSED_ID ((__u32)-1) +#define BTF_IN_PROGRESS_ID ((__u32)-2) + +struct btf_dedup { + /* .BTF section to be deduped in-place */ + struct btf *btf; + /* + * Optional .BTF.ext section. When provided, any strings referenced + * from it will be taken into account when deduping strings + */ + struct btf_ext *btf_ext; + /* + * This is a map from any type's signature hash to a list of possible + * canonical representative type candidates. Hash collisions are + * ignored, so even types of various kinds can share same list of + * candidates, which is fine because we rely on subsequent + * btf_xxx_equal() checks to authoritatively verify type equality. + */ + struct hashmap *dedup_table; + /* Canonical types map */ + __u32 *map; + /* Hypothetical mapping, used during type graph equivalence checks */ + __u32 *hypot_map; + __u32 *hypot_list; + size_t hypot_cnt; + size_t hypot_cap; + /* Whether hypothetical mapping, if successful, would need to adjust + * already canonicalized types (due to a new forward declaration to + * concrete type resolution). In such case, during split BTF dedup + * candidate type would still be considered as different, because base + * BTF is considered to be immutable. 
+ */ + bool hypot_adjust_canon; + /* Various option modifying behavior of algorithm */ + struct btf_dedup_opts opts; + /* temporary strings deduplication state */ + struct strset *strs_set; +}; + +static long hash_combine(long h, long value) +{ + return h * 31 + value; +} + +#define for_each_dedup_cand(d, node, hash) \ + hashmap__for_each_key_entry(d->dedup_table, node, hash) + +static int btf_dedup_table_add(struct btf_dedup *d, long hash, __u32 type_id) +{ + return hashmap__append(d->dedup_table, hash, type_id); +} + +static int btf_dedup_hypot_map_add(struct btf_dedup *d, + __u32 from_id, __u32 to_id) +{ + if (d->hypot_cnt == d->hypot_cap) { + __u32 *new_list; + + d->hypot_cap += max((size_t)16, d->hypot_cap / 2); + new_list = libbpf_reallocarray(d->hypot_list, d->hypot_cap, sizeof(__u32)); + if (!new_list) + return -ENOMEM; + d->hypot_list = new_list; + } + d->hypot_list[d->hypot_cnt++] = from_id; + d->hypot_map[from_id] = to_id; + return 0; +} + +static void btf_dedup_clear_hypot_map(struct btf_dedup *d) +{ + int i; + + for (i = 0; i < d->hypot_cnt; i++) + d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; + d->hypot_cnt = 0; + d->hypot_adjust_canon = false; +} + +static void btf_dedup_free(struct btf_dedup *d) +{ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + + free(d->map); + d->map = NULL; + + free(d->hypot_map); + d->hypot_map = NULL; + + free(d->hypot_list); + d->hypot_list = NULL; + + free(d); +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx) +{ + return key; +} + +static size_t btf_dedup_collision_hash_fn(long key, void *ctx) +{ + return 0; +} + +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx) +{ + return k1 == k2; +} + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); + hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn; + int i, err = 0, type_cnt; + + if (!d) + return ERR_PTR(-ENOMEM); + + if (OPTS_GET(opts, force_collisions, false)) + hash_fn = btf_dedup_collision_hash_fn; + + d->btf = btf; + d->btf_ext = OPTS_GET(opts, btf_ext, NULL); + + d->dedup_table = hashmap__new(hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(d->dedup_table)) { + err = PTR_ERR(d->dedup_table); + d->dedup_table = NULL; + goto done; + } + + type_cnt = btf__type_cnt(btf); + d->map = malloc(sizeof(__u32) * type_cnt); + if (!d->map) { + err = -ENOMEM; + goto done; + } + /* special BTF "void" type is made canonical immediately */ + d->map[0] = 0; + for (i = 1; i < type_cnt; i++) { + struct btf_type *t = btf_type_by_id(d->btf, i); + + /* VAR and DATASEC are never deduped and are self-canonical */ + if (btf_is_var(t) || btf_is_datasec(t)) + d->map[i] = i; + else + d->map[i] = BTF_UNPROCESSED_ID; + } + + d->hypot_map = malloc(sizeof(__u32) * type_cnt); + if (!d->hypot_map) { + err = -ENOMEM; + goto done; + } + for (i = 0; i < type_cnt; i++) + d->hypot_map[i] = BTF_UNPROCESSED_ID; + +done: + if (err) { + btf_dedup_free(d); + return ERR_PTR(err); + } + + return d; +} + +/* + * Iterate over all possible places in .BTF and .BTF.ext that can reference + * string and pass pointer to it to a provided callback `fn`. 
+ */ +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; + + return 0; +} + +static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 str_off = *str_off_ptr; + const char *s; + int off, err; + + /* don't touch empty string or string in main BTF */ + if (str_off == 0 || str_off < d->btf->start_str_off) + return 0; + + s = btf__str_by_offset(d->btf, str_off); + if (d->btf->base_btf) { + err = btf__find_str(d->btf->base_btf, s); + if (err >= 0) { + *str_off_ptr = err; + return 0; + } + if (err != -ENOENT) + return err; + } + + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; + + *str_off_ptr = d->btf->start_str_off + off; + return 0; +} + +/* + * Dedup string and filter out those that are not referenced from either .BTF + * or .BTF.ext (if provided) sections. + * + * This is done by building index of all strings in BTF's string section, + * then iterating over all entities that can reference strings (e.g., type + * names, struct field names, .BTF.ext line info, etc) and marking corresponding + * strings as used. After that all used strings are deduped and compacted into + * sequential blob of memory and new offsets are calculated. Then all the string + * references are iterated again and rewritten using new offsets. + */ +static int btf_dedup_strings(struct btf_dedup *d) +{ + int err; + + if (d->btf->strs_deduped) + return 0; + + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); + goto err_out; + } + + if (!d->btf->base_btf) { + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups + */ + err = strset__add_str(d->strs_set, ""); + if (err < 0) + goto err_out; + } + + /* remap string offsets */ + err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); + if (err) + goto err_out; + + /* replace BTF string data and hash with deduped ones */ + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; + d->btf->strs_deduped = true; + return 0; + +err_out: + strset__free(d->strs_set); + d->strs_set = NULL; + + return err; +} + +static long btf_hash_common(struct btf_type *t) +{ + long h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + h = hash_combine(h, t->size); + return h; +} + +static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info && + t1->size == t2->size; +} + +/* Calculate type signature hash of INT or TAG. */ +static long btf_hash_int_decl_tag(struct btf_type *t) +{ + __u32 info = *(__u32 *)(t + 1); + long h; + + h = btf_hash_common(t); + h = hash_combine(h, info); + return h; +} + +/* Check structural equality of two INTs or TAGs. */ +static bool btf_equal_int_tag(struct btf_type *t1, struct btf_type *t2) +{ + __u32 info1, info2; + + if (!btf_equal_common(t1, t2)) + return false; + info1 = *(__u32 *)(t1 + 1); + info2 = *(__u32 *)(t2 + 1); + return info1 == info2; +} + +/* Calculate type signature hash of ENUM/ENUM64. 
*/ +static long btf_hash_enum(struct btf_type *t) +{ + long h; + + /* don't hash vlen, enum members and size to support enum fwd resolving */ + h = hash_combine(0, t->name_off); + return h; +} + +static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum(t1); + m2 = btf_enum(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val != m2->val) + return false; + m1++; + m2++; + } + return true; +} + +static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum64 *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum64(t1); + m2 = btf_enum64(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val_lo32 != m2->val_lo32 || + m1->val_hi32 != m2->val_hi32) + return false; + m1++; + m2++; + } + return true; +} + +/* Check structural equality of two ENUMs or ENUM64s. */ +static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + /* t1 & t2 kinds are identical because of btf_equal_common */ + if (btf_kind(t1) == BTF_KIND_ENUM) + return btf_equal_enum_members(t1, t2); + else + return btf_equal_enum64_members(t1, t2); +} + +static inline bool btf_is_enum_fwd(struct btf_type *t) +{ + return btf_is_any_enum(t) && btf_vlen(t) == 0; +} + +static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_is_enum_fwd(t1) && !btf_is_enum_fwd(t2)) + return btf_equal_enum(t1, t2); + /* At this point either t1 or t2 or both are forward declarations, thus: + * - skip comparing vlen because it is zero for forward declarations; + * - skip comparing size to allow enum forward declarations + * to be compatible with enum64 full declarations; + * - skip comparing kind for the same reason. + */ + return t1->name_off == t2->name_off && + btf_is_any_enum(t1) && btf_is_any_enum(t2); +} + +/* + * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. + */ +static long btf_hash_struct(struct btf_type *t) +{ + const struct btf_member *member = btf_members(t); + __u32 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->offset); + /* no hashing of referenced type ID, it can be unresolved yet */ + member++; + } + return h; +} + +/* + * Check structural compatibility of two STRUCTs/UNIONs, ignoring referenced + * type IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_member *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->offset != m2->offset) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Calculate type signature hash of ARRAY, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. 
+ */ +static long btf_hash_array(struct btf_type *t) +{ + const struct btf_array *info = btf_array(t); + long h = btf_hash_common(t); + + h = hash_combine(h, info->type); + h = hash_combine(h, info->index_type); + h = hash_combine(h, info->nelems); + return h; +} + +/* + * Check exact equality of two ARRAYs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * ARRAY to potential canonical representative. + */ +static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_array *info1, *info2; + + if (!btf_equal_common(t1, t2)) + return false; + + info1 = btf_array(t1); + info2 = btf_array(t2); + return info1->type == info2->type && + info1->index_type == info2->index_type && + info1->nelems == info2->nelems; +} + +/* + * Check structural compatibility of two ARRAYs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + return btf_array(t1)->nelems == btf_array(t2)->nelems; +} + +/* + * Calculate type signature hash of FUNC_PROTO, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static long btf_hash_fnproto(struct btf_type *t) +{ + const struct btf_param *member = btf_params(t); + __u16 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->type); + member++; + } + return h; +} + +/* + * Check exact equality of two FUNC_PROTOs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * FUNC_PROTO to potential canonical representative. + */ +static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->type != m2->type) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + /* skip return type ID */ + if (t1->name_off != t2->name_off || t1->info != t2->info) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off) + return false; + m1++; + m2++; + } + return true; +} + +/* Prepare split BTF for deduplication by calculating hashes of base BTF's + * types and initializing the rest of the state (canonical type mapping) for + * the fixed base BTF part. 
+ */ +static int btf_dedup_prep(struct btf_dedup *d) +{ + struct btf_type *t; + int type_id; + long h; + + if (!d->btf->base_btf) + return 0; + + for (type_id = 1; type_id < d->btf->start_id; type_id++) { + t = btf_type_by_id(d->btf, type_id); + + /* all base BTF types are self-canonical by definition */ + d->map[type_id] = type_id; + + switch (btf_kind(t)) { + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + /* VAR and DATASEC are never hash/deduplicated */ + continue; + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_FWD: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + h = btf_hash_common(t); + break; + case BTF_KIND_INT: + case BTF_KIND_DECL_TAG: + h = btf_hash_int_decl_tag(t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + h = btf_hash_struct(t); + break; + case BTF_KIND_ARRAY: + h = btf_hash_array(t); + break; + case BTF_KIND_FUNC_PROTO: + h = btf_hash_fnproto(t); + break; + default: + pr_debug("unknown kind %d for type [%d]\n", btf_kind(t), type_id); + return -EINVAL; + } + if (btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + } + + return 0; +} + +/* + * Deduplicate primitive types, that can't reference other types, by calculating + * their type signature hash and comparing them with any possible canonical + * candidate. If no canonical candidate matches, type itself is marked as + * canonical and is added into `btf_dedup->dedup_table` as another candidate. + */ +static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + struct hashmap_entry *hash_entry; + struct btf_type *cand; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u32 cand_id; + long h; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_ARRAY: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return 0; + + case BTF_KIND_INT: + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_enum(t, cand)) { + new_id = cand_id; + break; + } + if (btf_compat_enum(t, cand)) { + if (btf_is_enum_fwd(t)) { + /* resolve fwd to full enum */ + new_id = cand_id; + break; + } + /* resolve canonical enum fwd to full enum */ + d->map[cand_id] = type_id; + } + } + break; + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_prim_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) 
{ + err = btf_dedup_prim_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Check whether type is already mapped into canonical one (could be to itself). + */ +static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id) +{ + return d->map[type_id] <= BTF_MAX_NR_TYPES; +} + +/* + * Resolve type ID into its canonical type ID, if any; otherwise return original + * type ID. If type is FWD and is resolved into STRUCT/UNION already, follow + * STRUCT/UNION link and resolve it into canonical type ID as well. + */ +static inline __u32 resolve_type_id(struct btf_dedup *d, __u32 type_id) +{ + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + return type_id; +} + +/* + * Resolve FWD to underlying STRUCT/UNION, if any; otherwise return original + * type ID. + */ +static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id) +{ + __u32 orig_type_id = type_id; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + return orig_type_id; +} + + +static inline __u16 btf_fwd_kind(struct btf_type *t) +{ + return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT; +} + +/* Check if given two types are identical ARRAY definitions */ +static bool btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + struct btf_type *t1, *t2; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + if (!btf_is_array(t1) || !btf_is_array(t2)) + return false; + + return btf_equal_array(t1, t2); +} + +/* Check if given two types are identical STRUCT/UNION definitions */ +static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + const struct btf_member *m1, *m2; + struct btf_type *t1, *t2; + int n, i; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + + if (!btf_is_composite(t1) || btf_kind(t1) != btf_kind(t2)) + return false; + + if (!btf_shallow_equal_struct(t1, t2)) + return false; + + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { + if (m1->type != m2->type && + !btf_dedup_identical_arrays(d, m1->type, m2->type) && + !btf_dedup_identical_structs(d, m1->type, m2->type)) + return false; + } + return true; +} + +/* + * Check equivalence of BTF type graph formed by candidate struct/union (we'll + * call it "candidate graph" in this description for brevity) to a type graph + * formed by (potential) canonical struct/union ("canonical graph" for brevity + * here, though keep in mind that not all types in canonical graph are + * necessarily canonical representatives themselves, some of them might be + * duplicates or its uniqueness might not have been established yet). + * Returns: + * - >0, if type graphs are equivalent; + * - 0, if not equivalent; + * - <0, on error. + * + * Algorithm performs side-by-side DFS traversal of both type graphs and checks + * equivalence of BTF types at each step. If at any point BTF types in candidate + * and canonical graphs are not compatible structurally, whole graphs are + * incompatible. If types are structurally equivalent (i.e., all information + * except referenced type IDs is exactly the same), a mapping from `canon_id` to + * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). 
+ * If a type references other types, then those referenced types are checked + * for equivalence recursively. + * + * During DFS traversal, if we find that for current `canon_id` type we + * already have some mapping in hypothetical map, we check for two possible + * situations: + * - `canon_id` is mapped to exactly the same type as `cand_id`. This will + * happen when type graphs have cycles. In this case we assume those two + * types are equivalent. + * - `canon_id` is mapped to different type. This is contradiction in our + * hypothetical mapping, because same graph in canonical graph corresponds + * to two different types in candidate graph, which for equivalent type + * graphs shouldn't happen. This condition terminates equivalence check + * with negative result. + * + * If type graphs traversal exhausts types to check and find no contradiction, + * then type graphs are equivalent. + * + * When checking types for equivalence, there is one special case: FWD types. + * If FWD type resolution is allowed and one of the types (either from canonical + * or candidate graph) is FWD and other is STRUCT/UNION (depending on FWD's kind + * flag) and their names match, hypothetical mapping is updated to point from + * FWD to STRUCT/UNION. If graphs will be determined as equivalent successfully, + * this mapping will be used to record FWD -> STRUCT/UNION mapping permanently. + * + * Technically, this could lead to incorrect FWD to STRUCT/UNION resolution, + * if there are two exactly named (or anonymous) structs/unions that are + * compatible structurally, one of which has FWD field, while other is concrete + * STRUCT/UNION, but according to C sources they are different structs/unions + * that are referencing different types with the same name. This is extremely + * unlikely to happen, but btf_dedup API allows to disable FWD resolution if + * this logic is causing problems. + * + * Doing FWD resolution means that both candidate and/or canonical graphs can + * consists of portions of the graph that come from multiple compilation units. + * This is due to the fact that types within single compilation unit are always + * deduplicated and FWDs are already resolved, if referenced struct/union + * definiton is available. So, if we had unresolved FWD and found corresponding + * STRUCT/UNION, they will be from different compilation units. This + * consequently means that when we "link" FWD to corresponding STRUCT/UNION, + * type graph will likely have at least two different BTF types that describe + * same type (e.g., most probably there will be two different BTF types for the + * same 'int' primitive type) and could even have "overlapping" parts of type + * graph that describe same subset of types. + * + * This in turn means that our assumption that each type in canonical graph + * must correspond to exactly one type in candidate graph might not hold + * anymore and will make it harder to detect contradictions using hypothetical + * map. To handle this problem, we allow to follow FWD -> STRUCT/UNION + * resolution only in canonical graph. FWDs in candidate graphs are never + * resolved. To see why it's OK, let's check all possible situations w.r.t. FWDs + * that can occur: + * - Both types in canonical and candidate graphs are FWDs. If they are + * structurally equivalent, then they can either be both resolved to the + * same STRUCT/UNION or not resolved at all. In both cases they are + * equivalent and there is no need to resolve FWD on candidate side. 
+ * - Both types in canonical and candidate graphs are concrete STRUCT/UNION, + * so nothing to resolve as well, algorithm will check equivalence anyway. + * - Type in canonical graph is FWD, while type in candidate is concrete + * STRUCT/UNION. In this case candidate graph comes from single compilation + * unit, so there is exactly one BTF type for each unique C type. After + * resolving FWD into STRUCT/UNION, there might be more than one BTF type + * in canonical graph mapping to single BTF type in candidate graph, but + * because hypothetical mapping maps from canonical to candidate types, it's + * alright, and we still maintain the property of having single `canon_id` + * mapping to single `cand_id` (there could be two different `canon_id` + * mapped to the same `cand_id`, but it's not contradictory). + * - Type in canonical graph is concrete STRUCT/UNION, while type in candidate + * graph is FWD. In this case we are just going to check compatibility of + * STRUCT/UNION and corresponding FWD, and if they are compatible, we'll + * assume that whatever STRUCT/UNION FWD resolves to must be equivalent to + * a concrete STRUCT/UNION from canonical graph. If the rest of type graphs + * turn out equivalent, we'll re-resolve FWD to concrete STRUCT/UNION from + * canonical graph. + */ +static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, + __u32 canon_id) +{ + struct btf_type *cand_type; + struct btf_type *canon_type; + __u32 hypot_type_id; + __u16 cand_kind; + __u16 canon_kind; + int i, eq; + + /* if both resolve to the same canonical, they must be equivalent */ + if (resolve_type_id(d, cand_id) == resolve_type_id(d, canon_id)) + return 1; + + canon_id = resolve_fwd_id(d, canon_id); + + hypot_type_id = d->hypot_map[canon_id]; + if (hypot_type_id <= BTF_MAX_NR_TYPES) { + if (hypot_type_id == cand_id) + return 1; + /* In some cases compiler will generate different DWARF types + * for *identical* array type definitions and use them for + * different fields within the *same* struct. This breaks type + * equivalence check, which makes an assumption that candidate + * types sub-graph has a consistent and deduped-by-compiler + * types within a single CU. So work around that by explicitly + * allowing identical array types here. + */ + if (btf_dedup_identical_arrays(d, hypot_type_id, cand_id)) + return 1; + /* It turns out that similar situation can happen with + * struct/union sometimes, sigh... Handle the case where + * structs/unions are exactly the same, down to the referenced + * type IDs. Anything more complicated (e.g., if referenced + * types are different, but equivalent) is *way more* + * complicated and requires a many-to-many equivalence mapping. 
+ */ + if (btf_dedup_identical_structs(d, hypot_type_id, cand_id)) + return 1; + return 0; + } + + if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) + return -ENOMEM; + + cand_type = btf_type_by_id(d->btf, cand_id); + canon_type = btf_type_by_id(d->btf, canon_id); + cand_kind = btf_kind(cand_type); + canon_kind = btf_kind(canon_type); + + if (cand_type->name_off != canon_type->name_off) + return 0; + + /* FWD <--> STRUCT/UNION equivalence check, if enabled */ + if ((cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD) + && cand_kind != canon_kind) { + __u16 real_kind; + __u16 fwd_kind; + + if (cand_kind == BTF_KIND_FWD) { + real_kind = canon_kind; + fwd_kind = btf_fwd_kind(cand_type); + } else { + real_kind = cand_kind; + fwd_kind = btf_fwd_kind(canon_type); + /* we'd need to resolve base FWD to STRUCT/UNION */ + if (fwd_kind == real_kind && canon_id < d->btf->start_id) + d->hypot_adjust_canon = true; + } + return fwd_kind == real_kind; + } + + if (cand_kind != canon_kind) + return 0; + + switch (cand_kind) { + case BTF_KIND_INT: + return btf_equal_int_tag(cand_type, canon_type); + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return btf_compat_enum(cand_type, canon_type); + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + return btf_equal_common(cand_type, canon_type); + + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + if (cand_type->info != canon_type->info) + return 0; + return btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + + case BTF_KIND_ARRAY: { + const struct btf_array *cand_arr, *canon_arr; + + if (!btf_compat_array(cand_type, canon_type)) + return 0; + cand_arr = btf_array(cand_type); + canon_arr = btf_array(canon_type); + eq = btf_dedup_is_equiv(d, cand_arr->index_type, canon_arr->index_type); + if (eq <= 0) + return eq; + return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *cand_m, *canon_m; + __u16 vlen; + + if (!btf_shallow_equal_struct(cand_type, canon_type)) + return 0; + vlen = btf_vlen(cand_type); + cand_m = btf_members(cand_type); + canon_m = btf_members(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); + if (eq <= 0) + return eq; + cand_m++; + canon_m++; + } + + return 1; + } + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *cand_p, *canon_p; + __u16 vlen; + + if (!btf_compat_fnproto(cand_type, canon_type)) + return 0; + eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + if (eq <= 0) + return eq; + vlen = btf_vlen(cand_type); + cand_p = btf_params(cand_type); + canon_p = btf_params(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type); + if (eq <= 0) + return eq; + cand_p++; + canon_p++; + } + return 1; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Use hypothetical mapping, produced by successful type graph equivalence + * check, to augment existing struct/union canonical mapping, where possible. + * + * If BTF_KIND_FWD resolution is allowed, this mapping is also used to record + * FWD -> STRUCT/UNION correspondence as well. FWD resolution is bidirectional: + * it doesn't matter if FWD type was part of canonical graph or candidate one, + * we are recording the mapping anyway. 
As opposed to carefulness required + * for struct/union correspondence mapping (described below), for FWD resolution + * it's not important, as by the time that FWD type (reference type) will be + * deduplicated all structs/unions will be deduped already anyway. + * + * Recording STRUCT/UNION mapping is purely a performance optimization and is + * not required for correctness. It needs to be done carefully to ensure that + * struct/union from candidate's type graph is not mapped into corresponding + * struct/union from canonical type graph that itself hasn't been resolved into + * canonical representative. The only guarantee we have is that canonical + * struct/union was determined as canonical and that won't change. But any + * types referenced through that struct/union fields could have been not yet + * resolved, so in case like that it's too early to establish any kind of + * correspondence between structs/unions. + * + * No canonical correspondence is derived for primitive types (they are already + * deduplicated completely already anyway) or reference types (they rely on + * stability of struct/union canonical relationship for equivalence checks). + */ +static void btf_dedup_merge_hypot_map(struct btf_dedup *d) +{ + __u32 canon_type_id, targ_type_id; + __u16 t_kind, c_kind; + __u32 t_id, c_id; + int i; + + for (i = 0; i < d->hypot_cnt; i++) { + canon_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[canon_type_id]; + t_id = resolve_type_id(d, targ_type_id); + c_id = resolve_type_id(d, canon_type_id); + t_kind = btf_kind(btf__type_by_id(d->btf, t_id)); + c_kind = btf_kind(btf__type_by_id(d->btf, c_id)); + /* + * Resolve FWD into STRUCT/UNION. + * It's ok to resolve FWD into STRUCT/UNION that's not yet + * mapped to canonical representative (as opposed to + * STRUCT/UNION <--> STRUCT/UNION mapping logic below), because + * eventually that struct is going to be mapped and all resolved + * FWDs will automatically resolve to correct canonical + * representative. This will happen before ref type deduping, + * which critically depends on stability of these mapping. This + * stability is not a requirement for STRUCT/UNION equivalence + * checks, though. + */ + + /* if it's the split BTF case, we still need to point base FWD + * to STRUCT/UNION in a split BTF, because FWDs from split BTF + * will be resolved against base FWD. If we don't point base + * canonical FWD to the resolved STRUCT/UNION, then all the + * FWDs in split BTF won't be correctly resolved to a proper + * STRUCT/UNION. + */ + if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) + d->map[c_id] = t_id; + + /* if graph equivalence determined that we'd need to adjust + * base canonical types, then we need to only point base FWDs + * to STRUCTs/UNIONs and do no more modifications. For all + * other purposes the type graphs were not equivalent. + */ + if (d->hypot_adjust_canon) + continue; + + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + d->map[t_id] = c_id; + + if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && + c_kind != BTF_KIND_FWD && + is_type_mapped(d, c_id) && + !is_type_mapped(d, t_id)) { + /* + * as a perf optimization, we can map struct/union + * that's part of type graph we just verified for + * equivalence. We can do that for struct/union that has + * canonical representative only, though. + */ + d->map[t_id] = c_id; + } + } +} + +/* + * Deduplicate struct/union types. 
+ * + * For each struct/union type its type signature hash is calculated, taking + * into account type's name, size, number, order and names of fields, but + * ignoring type ID's referenced from fields, because they might not be deduped + * completely until after reference types deduplication phase. This type hash + * is used to iterate over all potential canonical types, sharing same hash. + * For each canonical candidate we check whether type graphs that they form + * (through referenced types in fields and so on) are equivalent using algorithm + * implemented in `btf_dedup_is_equiv`. If such equivalence is found and + * BTF_KIND_FWD resolution is allowed, then hypothetical mapping + * (btf_dedup->hypot_map) produced by aforementioned type graph equivalence + * algorithm is used to record FWD -> STRUCT/UNION mapping. It's also used to + * potentially map other structs/unions to their canonical representatives, + * if such relationship hasn't yet been established. This speeds up algorithm + * by eliminating some of the duplicate work. + * + * If no matching canonical representative was found, struct/union is marked + * as canonical for itself and is added into btf_dedup->dedup_table hash map + * for further look ups. + */ +static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *cand_type, *t; + struct hashmap_entry *hash_entry; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u16 kind; + long h; + + /* already deduped or is in process of deduping (loop detected) */ + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return 0; + + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + return 0; + + h = btf_hash_struct(t); + for_each_dedup_cand(d, hash_entry, h) { + __u32 cand_id = hash_entry->value; + int eq; + + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = btf_type_by_id(d->btf, cand_id); + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + + btf_dedup_clear_hypot_map(d); + eq = btf_dedup_is_equiv(d, type_id, cand_id); + if (eq < 0) + return eq; + if (!eq) + continue; + btf_dedup_merge_hypot_map(d); + if (d->hypot_adjust_canon) /* not really equivalent */ + continue; + new_id = cand_id; + break; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_struct_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Deduplicate reference type. + * + * Once all primitive and struct/union types got deduplicated, we can easily + * deduplicate all other (reference) BTF types. This is done in two steps: + * + * 1. Resolve all referenced type IDs into their canonical type IDs. This + * resolution can be done either immediately for primitive or struct/union types + * (because they were deduped in previous two phases) or recursively for + * reference types. 
Recursion will always terminate at either primitive or + * struct/union type, at which point we can "unwind" chain of reference types + * one by one. There is no danger of encountering cycles because in C type + * system the only way to form type cycle is through struct/union, so any chain + * of reference types, even those taking part in a type cycle, will inevitably + * reach struct/union at some point. + * + * 2. Once all referenced type IDs are resolved into canonical ones, BTF type + * becomes "stable", in the sense that no further deduplication will cause + * any changes to it. With that, it's now possible to calculate type's signature + * hash (this time taking into account referenced type IDs) and loop over all + * potential canonical representatives. If no match was found, current type + * will become canonical representative of itself and will be added into + * btf_dedup->dedup_table as another possible canonical representative. + */ +static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) +{ + struct hashmap_entry *hash_entry; + __u32 new_id = type_id, cand_id; + struct btf_type *t, *cand; + /* if we don't find equivalent type, then we are representative type */ + int ref_type_id; + long h; + + if (d->map[type_id] == BTF_IN_PROGRESS_ID) + return -ELOOP; + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return resolve_type_id(d, type_id); + + t = btf_type_by_id(d->btf, type_id); + d->map[type_id] = BTF_IN_PROGRESS_ID; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_DECL_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ARRAY: { + struct btf_array *info = btf_array(t); + + ref_type_id = btf_dedup_ref_type(d, info->type); + if (ref_type_id < 0) + return ref_type_id; + info->type = ref_type_id; + + ref_type_id = btf_dedup_ref_type(d, info->index_type); + if (ref_type_id < 0) + return ref_type_id; + info->index_type = ref_type_id; + + h = btf_hash_array(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_array(t, cand)) { + new_id = cand_id; + break; + } + } + break; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *param; + __u16 vlen; + int i; + + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + vlen = btf_vlen(t); + param = btf_params(t); + for (i = 0; i < vlen; i++) { + ref_type_id = btf_dedup_ref_type(d, param->type); + if (ref_type_id < 0) + return ref_type_id; + param->type = ref_type_id; + param++; + } + + h = btf_hash_fnproto(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_fnproto(t, cand)) { + new_id = cand_id; + 
break; + } + } + break; + } + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return new_id; +} + +static int btf_dedup_ref_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, d->btf->start_id + i); + if (err < 0) + return err; + } + /* we won't need d->dedup_table anymore */ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + return 0; +} + +/* + * Collect a map from type names to type ids for all canonical structs + * and unions. If the same name is shared by several canonical types + * use a special value 0 to indicate this fact. + */ +static int btf_dedup_fill_unique_names_map(struct btf_dedup *d, struct hashmap *names_map) +{ + __u32 nr_types = btf__type_cnt(d->btf); + struct btf_type *t; + __u32 type_id; + __u16 kind; + int err; + + /* + * Iterate over base and split module ids in order to get all + * available structs in the map. + */ + for (type_id = 1; type_id < nr_types; ++type_id) { + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + continue; + + /* Skip non-canonical types */ + if (type_id != d->map[type_id]) + continue; + + err = hashmap__add(names_map, t->name_off, type_id); + if (err == -EEXIST) + err = hashmap__set(names_map, t->name_off, 0, NULL, NULL); + + if (err) + return err; + } + + return 0; +} + +static int btf_dedup_resolve_fwd(struct btf_dedup *d, struct hashmap *names_map, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + enum btf_fwd_kind fwd_kind = btf_kflag(t); + __u16 cand_kind, kind = btf_kind(t); + struct btf_type *cand_t; + uintptr_t cand_id; + + if (kind != BTF_KIND_FWD) + return 0; + + /* Skip if this FWD already has a mapping */ + if (type_id != d->map[type_id]) + return 0; + + if (!hashmap__find(names_map, t->name_off, &cand_id)) + return 0; + + /* Zero is a special value indicating that name is not unique */ + if (!cand_id) + return 0; + + cand_t = btf_type_by_id(d->btf, cand_id); + cand_kind = btf_kind(cand_t); + if ((cand_kind == BTF_KIND_STRUCT && fwd_kind != BTF_FWD_STRUCT) || + (cand_kind == BTF_KIND_UNION && fwd_kind != BTF_FWD_UNION)) + return 0; + + d->map[type_id] = cand_id; + + return 0; +} + +/* + * Resolve unambiguous forward declarations. + * + * The lion's share of all FWD declarations is resolved during + * `btf_dedup_struct_types` phase when different type graphs are + * compared against each other. However, if in some compilation unit a + * FWD declaration is not a part of a type graph compared against + * another type graph that declaration's canonical type would not be + * changed. Example: + * + * CU #1: + * + * struct foo; + * struct foo *some_global; + * + * CU #2: + * + * struct foo { int u; }; + * struct foo *another_global; + * + * After `btf_dedup_struct_types` the BTF looks as follows: + * + * [1] STRUCT 'foo' size=4 vlen=1 ... + * [2] INT 'int' size=4 ... + * [3] PTR '(anon)' type_id=1 + * [4] FWD 'foo' fwd_kind=struct + * [5] PTR '(anon)' type_id=4 + * + * This pass assumes that such FWD declarations should be mapped to + * structs or unions with identical name in case if the name is not + * ambiguous. 
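+ *
+ * For illustration, once this pass maps FWD [4] to STRUCT [1], the remaining
+ * reference-type dedup, compaction and remap phases reduce the example above
+ * to roughly:
+ *
+ * [1] STRUCT 'foo' size=4 vlen=1 ...
+ * [2] INT 'int' size=4 ...
+ * [3] PTR '(anon)' type_id=1
+ *
+ * with both original pointer types collapsed into the single PTR [3].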
+ */ +static int btf_dedup_resolve_fwds(struct btf_dedup *d) +{ + int i, err; + struct hashmap *names_map; + + names_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(names_map)) + return PTR_ERR(names_map); + + err = btf_dedup_fill_unique_names_map(d, names_map); + if (err < 0) + goto exit; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_resolve_fwd(d, names_map, d->btf->start_id + i); + if (err < 0) + break; + } + +exit: + hashmap__free(names_map); + return err; +} + +/* + * Compact types. + * + * After we established for each type its corresponding canonical representative + * type, we now can eliminate types that are not canonical and leave only + * canonical ones layed out sequentially in memory by copying them over + * duplicates. During compaction btf_dedup->hypot_map array is reused to store + * a map from original type ID to a new compacted type ID, which will be used + * during next phase to "fix up" type IDs, referenced from struct/union and + * reference types. + */ +static int btf_dedup_compact_types(struct btf_dedup *d) +{ + __u32 *new_offs; + __u32 next_type_id = d->btf->start_id; + const struct btf_type *t; + void *p; + int i, id, len; + + /* we are going to reuse hypot_map to store compaction remapping */ + d->hypot_map[0] = 0; + /* base BTF types are not renumbered */ + for (id = 1; id < d->btf->start_id; id++) + d->hypot_map[id] = id; + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) + d->hypot_map[id] = BTF_UNPROCESSED_ID; + + p = d->btf->types_data; + + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) { + if (d->map[id] != id) + continue; + + t = btf__type_by_id(d->btf, id); + len = btf_type_size(t); + if (len < 0) + return len; + + memmove(p, t, len); + d->hypot_map[id] = next_type_id; + d->btf->type_offs[next_type_id - d->btf->start_id] = p - d->btf->types_data; + p += len; + next_type_id++; + } + + /* shrink struct btf's internal types index and update btf_header */ + d->btf->nr_types = next_type_id - d->btf->start_id; + d->btf->type_offs_cap = d->btf->nr_types; + d->btf->hdr->type_len = p - d->btf->types_data; + new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, + sizeof(*new_offs)); + if (d->btf->type_offs_cap && !new_offs) + return -ENOMEM; + d->btf->type_offs = new_offs; + d->btf->hdr->str_off = d->btf->hdr->type_len; + d->btf->raw_size = d->btf->hdr->hdr_len + d->btf->hdr->type_len + d->btf->hdr->str_len; + return 0; +} + +/* + * Figure out final (deduplicated and compacted) type ID for provided original + * `type_id` by first resolving it into corresponding canonical type ID and + * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, + * which is populated during compaction phase. + */ +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 resolved_type_id, new_type_id; + + resolved_type_id = resolve_type_id(d, *type_id); + new_type_id = d->hypot_map[resolved_type_id]; + if (new_type_id > BTF_MAX_NR_TYPES) + return -EINVAL; + + *type_id = new_type_id; + return 0; +} + +/* + * Remap referenced type IDs into deduped type IDs. + * + * After BTF types are deduplicated and compacted, their final type IDs may + * differ from original ones. The map from original to a corresponding + * deduped type ID is stored in btf_dedup->hypot_map and is populated during + * compaction phase. 
During remapping phase we are rewriting all type IDs + * referenced from any BTF type (e.g., struct fields, func proto args, etc) to + * their final deduped type IDs. + */ +static int btf_dedup_remap_types(struct btf_dedup *d) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; + + return 0; +} + +/* + * Probe few well-known locations for vmlinux kernel image and try to load BTF + * data out of it to use for target BTF. + */ +struct btf *btf__load_vmlinux_btf(void) +{ + const char *locations[] = { + /* try canonical vmlinux BTF through sysfs first */ + "/sys/kernel/btf/vmlinux", + /* fall back to trying to find vmlinux on disk otherwise */ + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", + }; + char path[PATH_MAX + 1]; + struct utsname buf; + struct btf *btf; + int i, err; + + uname(&buf); + + for (i = 0; i < ARRAY_SIZE(locations); i++) { + snprintf(path, PATH_MAX, locations[i], buf.release); + + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) + continue; + + btf = btf__parse(path, NULL); + err = libbpf_get_error(btf); + pr_debug("loading kernel BTF '%s': %d\n", path, err); + if (err) + continue; + + return btf; + } + + pr_warn("failed to find valid kernel BTF\n"); + return libbpf_err_ptr(-ESRCH); +} + +struct btf *libbpf_find_kernel_btf(void) __attribute__((alias("btf__load_vmlinux_btf"))); + +struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf) +{ + char path[80]; + + snprintf(path, sizeof(path), "/sys/kernel/btf/%s", module_name); + return btf__parse_split(path, vmlinux_btf); +} + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = 
visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM64: { + struct btf_enum64 *m = btf_enum64(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/third_party/bpftool/libbpf/src/btf.h b/third_party/bpftool/libbpf/src/btf.h new file mode 100644 index 0000000..8e6880d --- /dev/null +++ b/third_party/bpftool/libbpf/src/btf.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2018 Facebook */ +/*! 
\file */ + +#ifndef __LIBBPF_BTF_H +#define __LIBBPF_BTF_H + +#include +#include +#include +#include + +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BTF_ELF_SEC ".BTF" +#define BTF_EXT_ELF_SEC ".BTF.ext" +#define MAPS_ELF_SEC ".maps" + +struct btf; +struct btf_ext; +struct btf_type; + +struct bpf_object; + +enum btf_endianness { + BTF_LITTLE_ENDIAN = 0, + BTF_BIG_ENDIAN = 1, +}; + +/** + * @brief **btf__free()** frees all data of a BTF object + * @param btf BTF object to free + */ +LIBBPF_API void btf__free(struct btf *btf); + +/** + * @brief **btf__new()** creates a new instance of a BTF object from the raw + * bytes of an ELF's BTF section + * @param data raw bytes + * @param size number of bytes passed in `data` + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new(const void *data, __u32 size); + +/** + * @brief **btf__new_split()** create a new instance of a BTF object from the + * provided raw data bytes. It takes another BTF instance, **base_btf**, which + * serves as a base BTF, which is extended by types in a newly created BTF + * instance + * @param data raw bytes + * @param size length of raw bytes + * @param base_btf the base BTF object + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_split()` is equivalent to `btf__new()` and + * creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf); + +/** + * @brief **btf__new_empty()** creates an empty BTF object. Use + * `btf__add_*()` to populate such BTF object. + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty(void); + +/** + * @brief **btf__new_empty_split()** creates an unpopulated BTF object from an + * ELF BTF section except with a base BTF on top of which split BTF should be + * based + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_empty_split()` is equivalent to + * `btf__new_empty()` and creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. 
If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); + +LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_raw(const char *path); +LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); + +LIBBPF_API struct btf *btf__load_vmlinux_btf(void); +LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); + +LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); + +LIBBPF_API int btf__load_into_kernel(struct btf *btf); +LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, + const char *type_name); +LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, + const char *type_name, __u32 kind); +LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); +LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); +LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, + __u32 id); +LIBBPF_API size_t btf__pointer_size(const struct btf *btf); +LIBBPF_API int btf__set_pointer_size(struct btf *btf, size_t ptr_sz); +LIBBPF_API enum btf_endianness btf__endianness(const struct btf *btf); +LIBBPF_API int btf__set_endianness(struct btf *btf, enum btf_endianness endian); +LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); +LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API void btf__set_fd(struct btf *btf, int fd); +LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); +LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); + +LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); +LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); +LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); + +LIBBPF_API int btf__find_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); +/** + * @brief **btf__add_btf()** appends all the BTF types from *src_btf* into *btf* + * @param btf BTF object which all the BTF types and strings are added to + * @param src_btf BTF object which all BTF types and referenced strings are copied from + * @return BTF type ID of the first appended BTF type, or negative error code + * + * **btf__add_btf()** can be used to simply and efficiently append the entire + * contents of one BTF object to another one. All the BTF type data is copied + * over, all referenced type IDs are adjusted by adding a necessary ID offset. 
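+ *
+ * For example, a minimal sketch of how the returned ID can be used
+ * (*dst_btf* and *src_btf* are illustrative names):
+ *
+ *	int id_off = btf__add_btf(dst_btf, src_btf);
+ *
+ * If *id_off* is positive, the type that had ID N in *src_btf* can now be
+ * found as type [N + id_off - 1] in *dst_btf*.
+ *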
+ * Only strings referenced from BTF types are copied over and deduplicated, so + * if there were some unused strings in *src_btf*, those won't be copied over, + * which is consistent with the general string deduplication semantics of BTF + * writing APIs. + * + * If any error is encountered during this process, the contents of *btf* is + * left intact, which means that **btf__add_btf()** follows the transactional + * semantics and the operation as a whole is all-or-nothing. + * + * *src_btf* has to be non-split BTF, as of now copying types from split BTF + * is not supported and will result in -ENOTSUP error code returned. + */ +LIBBPF_API int btf__add_btf(struct btf *btf, const struct btf *src_btf); + +LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); +LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_array(struct btf *btf, + int index_type_id, int elem_type_id, __u32 nr_elems); +/* struct/union construction APIs */ +LIBBPF_API int btf__add_struct(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_union(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_field(struct btf *btf, const char *name, int field_type_id, + __u32 bit_offset, __u32 bit_size); + +/* enum construction APIs */ +LIBBPF_API int btf__add_enum(struct btf *btf, const char *name, __u32 bytes_sz); +LIBBPF_API int btf__add_enum_value(struct btf *btf, const char *name, __s64 value); +LIBBPF_API int btf__add_enum64(struct btf *btf, const char *name, __u32 bytes_sz, bool is_signed); +LIBBPF_API int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value); + +enum btf_fwd_kind { + BTF_FWD_STRUCT = 0, + BTF_FWD_UNION = 1, + BTF_FWD_ENUM = 2, +}; + +LIBBPF_API int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind); +LIBBPF_API int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id); +LIBBPF_API int btf__add_volatile(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_const(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_restrict(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id); + +/* func and func_proto construction APIs */ +LIBBPF_API int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id); +LIBBPF_API int btf__add_func_proto(struct btf *btf, int ret_type_id); +LIBBPF_API int btf__add_func_param(struct btf *btf, const char *name, int type_id); + +/* var & datasec construction APIs */ +LIBBPF_API int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id); +LIBBPF_API int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz); +LIBBPF_API int btf__add_datasec_var_info(struct btf *btf, int var_type_id, + __u32 offset, __u32 byte_sz); + +/* tag construction API */ +LIBBPF_API int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx); + +struct btf_dedup_opts { + size_t sz; + /* optional .BTF.ext info to dedup along the main BTF info */ + struct btf_ext *btf_ext; + /* force hash collisions (used for testing) */ + bool force_collisions; + size_t :0; +}; +#define btf_dedup_opts__last_field force_collisions + +LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); + +struct btf_dump; + +struct btf_dump_opts { + size_t sz; +}; +#define 
btf_dump_opts__last_field sz + +typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); + +LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts); + +LIBBPF_API void btf_dump__free(struct btf_dump *d); + +LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); + +struct btf_dump_emit_type_decl_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* optional field name for type declaration, e.g.: + * - struct my_struct + * - void (*)(int) + * - char (*)[123] + */ + const char *field_name; + /* extra indentation level (in number of tabs) to emit for multi-line + * type declarations (e.g., anonymous struct); applies for lines + * starting from the second one (first line is assumed to have + * necessary indentation already + */ + int indent_level; + /* strip all the const/volatile/restrict mods */ + bool strip_mods; + size_t :0; +}; +#define btf_dump_emit_type_decl_opts__last_field strip_mods + +LIBBPF_API int +btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts); + + +struct btf_dump_type_data_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const char *indent_str; + int indent_level; + /* below match "show" flags for bpf_show_snprintf() */ + bool compact; /* no newlines/indentation */ + bool skip_names; /* skip member/type names */ + bool emit_zeroes; /* show 0-valued fields */ + size_t :0; +}; +#define btf_dump_type_data_opts__last_field emit_zeroes + +LIBBPF_API int +btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts); + +/* + * A set of helpers for easier BTF types handling. + * + * The inline functions below rely on constants from the kernel headers which + * may not be available for applications including this header file. To avoid + * compilation errors, we define all the constants here that were added after + * the initial introduction of the BTF_KIND* constants. 
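+ *
+ * As a minimal sketch of how these helpers are commonly combined (variable
+ * names are illustrative), walking the members of a struct type could look
+ * like:
+ *
+ *	const struct btf_type *t = btf__type_by_id(btf, id);
+ *
+ *	if (btf_is_struct(t)) {
+ *		const struct btf_member *m = btf_members(t);
+ *		int i, n = btf_vlen(t);
+ *
+ *		for (i = 0; i < n; i++, m++)
+ *			printf("%s\n", btf__name_by_offset(btf, m->name_off));
+ *	}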
+ */ +#ifndef BTF_KIND_FUNC +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#endif +#ifndef BTF_KIND_VAR +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#endif +#ifndef BTF_KIND_FLOAT +#define BTF_KIND_FLOAT 16 /* Floating point */ +#endif +/* The kernel header switched to enums, so the following were never #defined */ +#define BTF_KIND_DECL_TAG 17 /* Decl Tag */ +#define BTF_KIND_TYPE_TAG 18 /* Type Tag */ +#define BTF_KIND_ENUM64 19 /* Enum for up-to 64bit values */ + +static inline __u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline __u16 btf_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline bool btf_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static inline bool btf_is_void(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNKN; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_struct(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_STRUCT; +} + +static inline bool btf_is_union(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNION; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + +static inline bool btf_is_fwd(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FWD; +} + +static inline bool btf_is_typedef(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_is_volatile(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VOLATILE; +} + +static inline bool btf_is_const(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_CONST; +} + +static inline bool btf_is_restrict(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_RESTRICT; +} + +static inline bool btf_is_mod(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_VOLATILE || + kind == BTF_KIND_CONST || + kind == BTF_KIND_RESTRICT || + kind == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_func(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC; +} + +static inline bool btf_is_func_proto(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC_PROTO; +} + +static inline bool btf_is_var(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VAR; +} + +static inline bool btf_is_datasec(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DATASEC; +} + +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + +static inline bool btf_is_decl_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DECL_TAG; +} + +static inline bool btf_is_type_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return btf_is_enum(t) || btf_is_enum64(t); +} + +static inline bool 
btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return btf_kind(t1) == btf_kind(t2) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + +static inline __u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_bits(const struct btf_type *t) +{ + return BTF_INT_BITS(*(__u32 *)(t + 1)); +} + +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + +struct btf_enum64; + +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + +static inline __u64 btf_enum64_value(const struct btf_enum64 *e) +{ + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct). So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; +} + +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +/* Get bit offset of a member with specified index. */ +static inline __u32 btf_member_bit_offset(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BIT_OFFSET(m->offset) : m->offset; +} +/* + * Get bitfield size of a member, assuming t is BTF_KIND_STRUCT or + * BTF_KIND_UNION. If member is not a bitfield, zero is returned. + */ +static inline __u32 btf_member_bitfield_size(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0; +} + +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + +static inline struct btf_var *btf_var(const struct btf_type *t) +{ + return (struct btf_var *)(t + 1); +} + +static inline struct btf_var_secinfo * +btf_var_secinfos(const struct btf_type *t) +{ + return (struct btf_var_secinfo *)(t + 1); +} + +struct btf_decl_tag; +static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) +{ + return (struct btf_decl_tag *)(t + 1); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BTF_H */ diff --git a/third_party/bpftool/libbpf/src/btf_dump.c b/third_party/bpftool/libbpf/src/btf_dump.c new file mode 100644 index 0000000..4d9f30b --- /dev/null +++ b/third_party/bpftool/libbpf/src/btf_dump.c @@ -0,0 +1,2542 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * BTF-to-C type converter. 
+ * + * Copyright (c) 2019 Facebook + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "hashmap.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; +static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; + +static const char *pfx(int lvl) +{ + return lvl >= PREFIX_CNT ? PREFIXES : &PREFIXES[PREFIX_CNT - lvl]; +} + +enum btf_dump_type_order_state { + NOT_ORDERED, + ORDERING, + ORDERED, +}; + +enum btf_dump_type_emit_state { + NOT_EMITTED, + EMITTING, + EMITTED, +}; + +/* per-type auxiliary state */ +struct btf_dump_type_aux_state { + /* topological sorting state */ + enum btf_dump_type_order_state order_state: 2; + /* emitting state used to determine the need for forward declaration */ + enum btf_dump_type_emit_state emit_state: 2; + /* whether forward declaration was already emitted */ + __u8 fwd_emitted: 1; + /* whether unique non-duplicate name was already assigned */ + __u8 name_resolved: 1; + /* whether type is referenced from any other type */ + __u8 referenced: 1; +}; + +/* indent string length; one indent string is added for each indent level */ +#define BTF_DATA_INDENT_STR_LEN 32 + +/* + * Common internal data for BTF type data dump operations. + */ +struct btf_dump_data { + const void *data_end; /* end of valid data to show */ + bool compact; + bool skip_names; + bool emit_zeroes; + __u8 indent_lvl; /* base indent level */ + char indent_str[BTF_DATA_INDENT_STR_LEN]; + /* below are used during iteration */ + int depth; + bool is_array_member; + bool is_array_terminated; + bool is_array_char; +}; + +struct btf_dump { + const struct btf *btf; + btf_dump_printf_fn_t printf_fn; + void *cb_ctx; + int ptr_sz; + bool strip_mods; + bool skip_anon_defs; + int last_id; + + /* per-type auxiliary state */ + struct btf_dump_type_aux_state *type_states; + size_t type_states_cap; + /* per-type optional cached unique name, must be freed, if present */ + const char **cached_names; + size_t cached_names_cap; + + /* topo-sorted list of dependent type definitions */ + __u32 *emit_queue; + int emit_queue_cap; + int emit_queue_cnt; + + /* + * stack of type declarations (e.g., chain of modifiers, arrays, + * funcs, etc) + */ + __u32 *decl_stack; + int decl_stack_cap; + int decl_stack_cnt; + + /* maps struct/union/enum name to a number of name occurrences */ + struct hashmap *type_names; + /* + * maps typedef identifiers and enum value names to a number of such + * name occurrences + */ + struct hashmap *ident_names; + /* + * data for typed display; allocated if needed. + */ + struct btf_dump_data *typed_dump; +}; + +static size_t str_hash_fn(long key, void *ctx) +{ + return str_hash((void *)key); +} + +static bool str_equal_fn(long a, long b, void *ctx) +{ + return strcmp((void *)a, (void *)b) == 0; +} + +static const char *btf_name_of(const struct btf_dump *d, __u32 name_off) +{ + return btf__name_by_offset(d->btf, name_off); +} + +static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + d->printf_fn(d->cb_ctx, fmt, args); + va_end(args); +} + +static int btf_dump_mark_referenced(struct btf_dump *d); +static int btf_dump_resize(struct btf_dump *d); + +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) +{ + struct btf_dump *d; + int err; + + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + + if (!printf_fn) + return libbpf_err_ptr(-EINVAL); + + d = calloc(1, sizeof(struct btf_dump)); + if (!d) + return libbpf_err_ptr(-ENOMEM); + + d->btf = btf; + d->printf_fn = printf_fn; + d->cb_ctx = ctx; + d->ptr_sz = btf__pointer_size(btf) ? : sizeof(void *); + + d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->type_names)) { + err = PTR_ERR(d->type_names); + d->type_names = NULL; + goto err; + } + d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->ident_names)) { + err = PTR_ERR(d->ident_names); + d->ident_names = NULL; + goto err; + } + + err = btf_dump_resize(d); + if (err) + goto err; + + return d; +err: + btf_dump__free(d); + return libbpf_err_ptr(err); +} + +static int btf_dump_resize(struct btf_dump *d) +{ + int err, last_id = btf__type_cnt(d->btf) - 1; + + if (last_id <= d->last_id) + return 0; + + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) + return -ENOMEM; + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) + return -ENOMEM; + + if (d->last_id == 0) { + /* VOID is special */ + d->type_states[0].order_state = ORDERED; + d->type_states[0].emit_state = EMITTED; + } + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + return err; + + d->last_id = last_id; + return 0; +} + +static void btf_dump_free_names(struct hashmap *map) +{ + size_t bkt; + struct hashmap_entry *cur; + + hashmap__for_each_entry(map, cur, bkt) + free((void *)cur->pkey); + + hashmap__free(map); +} + +void btf_dump__free(struct btf_dump *d) +{ + int i; + + if (IS_ERR_OR_NULL(d)) + return; + + free(d->type_states); + if (d->cached_names) { + /* any set cached name is owned by us and should be freed */ + for (i = 0; i <= d->last_id; i++) { + if (d->cached_names[i]) + free((void *)d->cached_names[i]); + } + } + free(d->cached_names); + free(d->emit_queue); + free(d->decl_stack); + btf_dump_free_names(d->type_names); + btf_dump_free_names(d->ident_names); + + free(d); +} + +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); + +/* + * Dump BTF type in a compilable C syntax, including all the necessary + * dependent types, necessary for compilation. If some of the dependent types + * were already emitted as part of previous btf_dump__dump_type() invocation + * for another type, they won't be emitted again. This API allows callers to + * filter out BTF types according to user-defined criterias and emitted only + * minimal subset of types, necessary to compile everything. Full struct/union + * definitions will still be emitted, even if the only usage is through + * pointer and could be satisfied with just a forward declaration. + * + * Dumping is done in two high-level passes: + * 1. Topologically sort type definitions to satisfy C rules of compilation. + * 2. Emit type definitions in C syntax. 
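+ *
+ * A minimal usage sketch (the callback and variable names below are
+ * illustrative):
+ *
+ *	static void print_cb(void *ctx, const char *fmt, va_list args)
+ *	{
+ *		vprintf(fmt, args);
+ *	}
+ *
+ *	struct btf_dump *d = btf_dump__new(btf, print_cb, NULL, NULL);
+ *
+ *	if (!libbpf_get_error(d))
+ *		btf_dump__dump_type(d, id);
+ *	btf_dump__free(d);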
+ * + * Returns 0 on success; <0, otherwise. + */ +int btf_dump__dump_type(struct btf_dump *d, __u32 id) +{ + int err, i; + + if (id >= btf__type_cnt(d->btf)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + d->emit_queue_cnt = 0; + err = btf_dump_order_type(d, id, false); + if (err < 0) + return libbpf_err(err); + + for (i = 0; i < d->emit_queue_cnt; i++) + btf_dump_emit_type(d, d->emit_queue[i], 0 /*top-level*/); + + return 0; +} + +/* + * Mark all types that are referenced from any other type. This is used to + * determine top-level anonymous enums that need to be emitted as an + * independent type declarations. + * Anonymous enums come in two flavors: either embedded in a struct's field + * definition, in which case they have to be declared inline as part of field + * type declaration; or as a top-level anonymous enum, typically used for + * declaring global constants. It's impossible to distinguish between two + * without knowning whether given enum type was referenced from other type: + * top-level anonymous enum won't be referenced by anything, while embedded + * one will. + */ +static int btf_dump_mark_referenced(struct btf_dump *d) +{ + int i, j, n = btf__type_cnt(d->btf); + const struct btf_type *t; + __u16 vlen; + + for (i = d->last_id + 1; i < n; i++) { + t = btf__type_by_id(d->btf, i); + vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + break; + + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + d->type_states[t->type].referenced = 1; + break; + + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + d->type_states[a->index_type].referenced = 1; + d->type_states[a->type].referenced = 1; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + for (j = 0; j < vlen; j++, m++) + d->type_states[m->type].referenced = 1; + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + for (j = 0; j < vlen; j++, p++) + d->type_states[p->type].referenced = 1; + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + for (j = 0; j < vlen; j++, v++) + d->type_states[v->type].referenced = 1; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + +static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_queue; + size_t new_cap; + + if (d->emit_queue_cnt >= d->emit_queue_cap) { + new_cap = max(16, d->emit_queue_cap * 3 / 2); + new_queue = libbpf_reallocarray(d->emit_queue, new_cap, sizeof(new_queue[0])); + if (!new_queue) + return -ENOMEM; + d->emit_queue = new_queue; + d->emit_queue_cap = new_cap; + } + + d->emit_queue[d->emit_queue_cnt++] = id; + return 0; +} + +/* + * Determine order of emitting dependent types and specified type to satisfy + * C compilation rules. This is done through topological sorting with an + * additional complication which comes from C rules. The main idea for C is + * that if some type is "embedded" into a struct/union, it's size needs to be + * known at the time of definition of containing type. 
E.g., for: + * + * struct A {}; + * struct B { struct A x; } + * + * struct A *HAS* to be defined before struct B, because it's "embedded", + * i.e., it is part of struct B layout. But in the following case: + * + * struct A; + * struct B { struct A *x; } + * struct A {}; + * + * it's enough to just have a forward declaration of struct A at the time of + * struct B definition, as struct B has a pointer to struct A, so the size of + * field x is known without knowing struct A size: it's sizeof(void *). + * + * Unfortunately, there are some trickier cases we need to handle, e.g.: + * + * struct A {}; // if this was forward-declaration: compilation error + * struct B { + * struct { // anonymous struct + * struct A y; + * } *x; + * }; + * + * In this case, struct B's field x is a pointer, so it's size is known + * regardless of the size of (anonymous) struct it points to. But because this + * struct is anonymous and thus defined inline inside struct B, *and* it + * embeds struct A, compiler requires full definition of struct A to be known + * before struct B can be defined. This creates a transitive dependency + * between struct A and struct B. If struct A was forward-declared before + * struct B definition and fully defined after struct B definition, that would + * trigger compilation error. + * + * All this means that while we are doing topological sorting on BTF type + * graph, we need to determine relationships between different types (graph + * nodes): + * - weak link (relationship) between X and Y, if Y *CAN* be + * forward-declared at the point of X definition; + * - strong link, if Y *HAS* to be fully-defined before X can be defined. + * + * The rule is as follows. Given a chain of BTF types from X to Y, if there is + * BTF_KIND_PTR type in the chain and at least one non-anonymous type + * Z (excluding X, including Y), then link is weak. Otherwise, it's strong. + * Weak/strong relationship is determined recursively during DFS traversal and + * is returned as a result from btf_dump_order_type(). + * + * btf_dump_order_type() is trying to avoid unnecessary forward declarations, + * but it is not guaranteeing that no extraneous forward declarations will be + * emitted. + * + * To avoid extra work, algorithm marks some of BTF types as ORDERED, when + * it's done with them, but not for all (e.g., VOLATILE, CONST, RESTRICT, + * ARRAY, FUNC_PROTO), as weak/strong semantics for those depends on the + * entire graph path, so depending where from one came to that BTF type, it + * might cause weak or strong ordering. For types like STRUCT/UNION/INT/ENUM, + * once they are processed, there is no need to do it again, so they are + * marked as ORDERED. We can mark PTR as ORDERED as well, as it semi-forces + * weak link, unless subsequent referenced STRUCT/UNION/ENUM is anonymous. But + * in any case, once those are processed, no need to do it again, as the + * result won't change. + * + * Returns: + * - 1, if type is part of strong link (so there is strong topological + * ordering requirements); + * - 0, if type is part of weak link (so can be satisfied through forward + * declaration); + * - <0, on error (e.g., unsatisfiable type loop detected). + */ +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) +{ + /* + * Order state is used to detect strong link cycles, but only for BTF + * kinds that are or could be an independent definition (i.e., + * stand-alone fwd decl, enum, typedef, struct, union). 
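To make the weak-link rule concrete (a hand-written illustration, not generated output): a self-referential struct only reaches itself through a pointer, so the cycle is resolvable and ordering can stop at a forward declaration:

/* BTF chain: [struct node] -> [ptr] -> [struct node], i.e. a cycle through PTR */
struct node {
	struct node *next;	/* weak link: field size is just sizeof(void *) */
	int val;
};

/* by contrast, a cycle with no pointer in it, e.g. struct a { struct b x; };
 * paired with struct b { struct a y; };, cannot be ordered at all and is
 * reported as -ELOOP by btf_dump_order_type() */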
Ptrs, arrays, + * func_protos, modifiers are just means to get to these definitions. + * Int/void don't need definitions, they are assumed to be always + * properly defined. We also ignore datasec, var, and funcs for now. + * So for all non-defining kinds, we never even set ordering state, + * for defining kinds we set ORDERING and subsequently ORDERED if it + * forms a strong link. + */ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + const struct btf_type *t; + __u16 vlen; + int err, i; + + /* return true, letting typedefs know that it's ok to be emitted */ + if (tstate->order_state == ORDERED) + return 1; + + t = btf__type_by_id(d->btf, id); + + if (tstate->order_state == ORDERING) { + /* type loop, but resolvable through fwd declaration */ + if (btf_is_composite(t) && through_ptr && t->name_off != 0) + return 0; + pr_warn("unsatisfiable type cycle, id:[%u]\n", id); + return -ELOOP; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + tstate->order_state = ORDERED; + return 0; + + case BTF_KIND_PTR: + err = btf_dump_order_type(d, t->type, true); + tstate->order_state = ORDERED; + return err; + + case BTF_KIND_ARRAY: + return btf_dump_order_type(d, btf_array(t)->type, false); + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + /* + * struct/union is part of strong link, only if it's embedded + * (so no ptr in a path) or it's anonymous (so has to be + * defined inline, even if declared through ptr) + */ + if (through_ptr && t->name_off != 0) + return 0; + + tstate->order_state = ORDERING; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, m++) { + err = btf_dump_order_type(d, m->type, false); + if (err < 0) + return err; + } + + if (t->name_off != 0) { + err = btf_dump_add_emit_queue_id(d, id); + if (err < 0) + return err; + } + + tstate->order_state = ORDERED; + return 1; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + /* + * non-anonymous or non-referenced enums are top-level + * declarations and should be emitted. Same logic can be + * applied to FWDs, it won't hurt anyways. + */ + if (t->name_off != 0 || !tstate->referenced) { + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + } + tstate->order_state = ORDERED; + return 1; + + case BTF_KIND_TYPEDEF: { + int is_strong; + + is_strong = btf_dump_order_type(d, t->type, through_ptr); + if (is_strong < 0) + return is_strong; + + /* typedef is similar to struct/union w.r.t. 
fwd-decls */ + if (through_ptr && !is_strong) + return 0; + + /* typedef is always a named definition */ + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + + d->type_states[id].order_state = ORDERED; + return 1; + } + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf_dump_order_type(d, t->type, through_ptr); + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + bool is_strong; + + err = btf_dump_order_type(d, t->type, through_ptr); + if (err < 0) + return err; + is_strong = err > 0; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, p++) { + err = btf_dump_order_type(d, p->type, through_ptr); + if (err < 0) + return err; + if (err > 0) + is_strong = true; + } + return is_strong; + } + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + d->type_states[id].order_state = ORDERED; + return 0; + + default: + return -EINVAL; + } +} + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_struct_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +/* a local view into a shared stack */ +struct id_stack { + const __u32 *ids; + int cnt; +}; + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl); +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decl_stack, + const char *fname, int lvl); + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id); +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id); +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name); + +static bool btf_dump_is_blacklisted(struct btf_dump *d, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(d->btf, id); + + /* __builtin_va_list is a compiler built-in, which causes compilation + * errors, when compiling w/ different compiler, then used to compile + * original code (e.g., GCC to compile kernel, Clang to use generated + * C header from BTF). As it is built-in, it should be already defined + * properly internally in compiler. + */ + if (t->name_off == 0) + return false; + return strcmp(btf_name_of(d, t->name_off), "__builtin_va_list") == 0; +} + +/* + * Emit C-syntax definitions of types from chains of BTF types. + * + * High-level handling of determining necessary forward declarations are handled + * by btf_dump_emit_type() itself, but all nitty-gritty details of emitting type + * declarations/definitions in C syntax are handled by a combo of + * btf_dump_emit_type_decl()/btf_dump_emit_type_chain() w/ delegation to + * corresponding btf_dump_emit_*_{def,fwd}() functions. + * + * We also keep track of "containing struct/union type ID" to determine when + * we reference it from inside and thus can avoid emitting unnecessary forward + * declaration. 
+ * + * This algorithm is designed in such a way, that even if some error occurs + * (either technical, e.g., out of memory, or logical, i.e., malformed BTF + * that doesn't comply to C rules completely), algorithm will try to proceed + * and produce as much meaningful output as possible. + */ +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) +{ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + bool top_level_def = cont_id == 0; + const struct btf_type *t; + __u16 kind; + + if (tstate->emit_state == EMITTED) + return; + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + if (tstate->emit_state == EMITTING) { + if (tstate->fwd_emitted) + return; + + switch (kind) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* + * if we are referencing a struct/union that we are + * part of - then no need for fwd declaration + */ + if (id == cont_id) + return; + if (t->name_off == 0) { + pr_warn("anonymous struct/union loop, id:[%u]\n", + id); + return; + } + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + break; + case BTF_KIND_TYPEDEF: + /* + * for typedef fwd_emitted means typedef definition + * was emitted, but it can be used only for "weak" + * references through pointer only, not for embedding + */ + if (!btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->fwd_emitted = 1; + break; + default: + break; + } + + return; + } + + switch (kind) { + case BTF_KIND_INT: + /* Emit type alias definitions if necessary */ + btf_dump_emit_missing_aliases(d, id, t); + + tstate->emit_state = EMITTED; + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (top_level_def) { + btf_dump_emit_enum_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + btf_dump_emit_type(d, t->type, cont_id); + break; + case BTF_KIND_ARRAY: + btf_dump_emit_type(d, btf_array(t)->type, cont_id); + break; + case BTF_KIND_FWD: + btf_dump_emit_fwd_def(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + break; + case BTF_KIND_TYPEDEF: + tstate->emit_state = EMITTING; + btf_dump_emit_type(d, t->type, id); + /* + * typedef can server as both definition and forward + * declaration; at this stage someone depends on + * typedef as a forward declaration (refers to it + * through pointer), so unless we already did it, + * emit typedef as a forward declaration + */ + if (!tstate->fwd_emitted && !btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + tstate->emit_state = EMITTING; + /* if it's a top-level struct/union definition or struct/union + * is anonymous, then in C we'll be emitting all fields and + * their types (as opposed to just `struct X`), so we need to + * make sure that all types, referenced from struct/union + * members have necessary forward-declarations, where + * applicable + */ + if (top_level_def || t->name_off == 0) { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, new_cont_id; + + new_cont_id = t->name_off == 0 ? 
cont_id : id; + for (i = 0; i < vlen; i++, m++) + btf_dump_emit_type(d, m->type, new_cont_id); + } else if (!tstate->fwd_emitted && id != cont_id) { + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + } + + if (top_level_def) { + btf_dump_emit_struct_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + } else { + tstate->emit_state = NOT_EMITTED; + } + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 n = btf_vlen(t); + int i; + + btf_dump_emit_type(d, t->type, cont_id); + for (i = 0; i < n; i++, p++) + btf_dump_emit_type(d, p->type, cont_id); + + break; + } + default: + break; + } +} + +static bool btf_is_struct_packed(const struct btf *btf, __u32 id, + const struct btf_type *t) +{ + const struct btf_member *m; + int max_align = 1, align, i, bit_sz; + __u16 vlen; + + m = btf_members(t); + vlen = btf_vlen(t); + /* all non-bitfield fields have to be naturally aligned */ + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + bit_sz = btf_member_bitfield_size(t, i); + if (align && bit_sz == 0 && m->offset % (8 * align) != 0) + return true; + max_align = max(align, max_align); + } + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; + /* + * if original struct was marked as packed, but its layout is + * naturally aligned, we'll detect that it's not packed + */ + return false; +} + +static void btf_dump_emit_bit_padding(const struct btf_dump *d, + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) +{ + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; + + if (cur_off >= next_off) + return; /* no gap */ + + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). Note + * that even if struct itself has, let's say 4-byte alignment + * (i.e., it only uses up to int-aligned types), using `long: + * X;` explicit padding doesn't actually change struct's + * overall alignment requirements, but compiler does take into + * account that type's (long, in this example) natural + * alignment requirements when adding implicit padding. We use + * this fact heavily and don't worry about ruining correct + * struct alignment requirement. + */ + for (i = 0; i < ARRAY_SIZE(pads); i++) { + pad_bits = pads[i].bits; + pad_type = pads[i].name; + + new_off = roundup(cur_off, pad_bits); + if (new_off <= next_off) + break; + } + + if (new_off > cur_off && new_off <= next_off) { + /* We need explicit `: 0` aligning mark if next + * field is right on alignment offset and its + * alignment requirement is less strict than 's + * alignment (so compiler won't naturally align to the + * offset we expect), or if subsequent `: X`, + * will actually completely fit in the remaining hole, + * making compiler basically ignore `: X` + * completely. 
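As a hand-written illustration of the mechanism this comment relies on (not claimed to be actual dumper output), unnamed bit-fields act as explicit layout directives that compilers honor:

struct layout_ex {
	char tag;	/* bits 0..7 */
	char: 8;	/* explicitly skip bits 8..15 */
	short val;	/* placed at the next 16-bit-aligned offset (byte 2) */
	long: 0;	/* zero width: push the next field to a long-aligned boundary */
	long big;	/* offset 8 on a typical 64-bit target; sizeof == 16 */
};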
+ */ + if (in_bitfield || + (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) || + (new_off != next_off && next_off - new_off <= new_off - cur_off)) + /* but for bitfields we'll emit explicit bit count */ + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, + in_bitfield ? new_off - cur_off : 0); + cur_off = new_off; + } + + /* Now we know we start at naturally aligned offset for a chosen + * padding type (long, int, short, or char), and so the rest is just + * a straightforward filling of remaining padding gap with full + * `: sizeof();` markers, except for the last one, which + * might need smaller than sizeof() padding. + */ + while (cur_off != next_off) { + bits = min(next_off - cur_off, pad_bits); + if (bits == pad_bits) { + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); + cur_off += bits; + continue; + } + /* For the remainder padding that doesn't cover entire + * pad_type bit length, we pick the smallest necessary type. + * This is pure aesthetics, we could have just used `long`, + * but having smallest necessary one communicates better the + * scale of the padding gap. + */ + for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) { + pad_type = pads[i].name; + pad_bits = pads[i].bits; + if (pad_bits < bits) + continue; + + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits); + cur_off += bits; + break; + } + } +} + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "%s%s%s", + btf_is_struct(t) ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_struct_def(struct btf_dump *d, + __u32 id, + const struct btf_type *t, + int lvl) +{ + const struct btf_member *m = btf_members(t); + bool is_struct = btf_is_struct(t); + bool packed, prev_bitfield = false; + int align, i, off = 0; + __u16 vlen = btf_vlen(t); + + align = btf__align_of(d->btf, id); + packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; + + btf_dump_printf(d, "%s%s%s {", + is_struct ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); + + for (i = 0; i < vlen; i++, m++) { + const char *fname; + int m_off, m_sz, m_align; + bool in_bitfield; + + fname = btf_name_of(d, m->name_off); + m_sz = btf_member_bitfield_size(t, i); + m_off = btf_member_bit_offset(t, i); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); + + in_bitfield = prev_bitfield && m_sz != 0; + + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); + btf_dump_printf(d, "\n%s", pfx(lvl + 1)); + btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); + + if (m_sz) { + btf_dump_printf(d, ": %d", m_sz); + off = m_off + m_sz; + prev_bitfield = true; + } else { + m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); + off = m_off + m_sz * 8; + prev_bitfield = false; + } + + btf_dump_printf(d, ";"); + } + + /* pad at the end, if necessary */ + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); + + /* + * Keep `struct empty {}` on a single line, + * only print newline when there are regular or padding fields. + */ + if (vlen || t->size) { + btf_dump_printf(d, "\n"); + btf_dump_printf(d, "%s}", pfx(lvl)); + } else { + btf_dump_printf(d, "}"); + } + if (packed) + btf_dump_printf(d, " __attribute__((packed))"); +} + +static const char *missing_base_types[][2] = { + /* + * GCC emits typedefs to its internal __PolyX_t types when compiling Arm + * SIMD intrinsics. Alias them to standard base types. 
+ */ + { "__Poly8_t", "unsigned char" }, + { "__Poly16_t", "unsigned short" }, + { "__Poly64_t", "unsigned long long" }, + { "__Poly128_t", "unsigned __int128" }, +}; + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + int i; + + for (i = 0; i < ARRAY_SIZE(missing_base_types); i++) { + if (strcmp(name, missing_base_types[i][0]) == 0) { + btf_dump_printf(d, "typedef %s %s;\n\n", + missing_base_types[i][1], name); + break; + } + } +} + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_enum32_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum *v = btf_enum(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + /* enumerators share namespace with typedef idents */ + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %d," : "\n%s%s___%zd = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, dup_cnt, v->val); + } else { + fmt_str = is_signed ? "\n%s%s = %d," : "\n%s%s = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, v->val); + } + } +} + +static void btf_dump_emit_enum64_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum64 *v = btf_enum64(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + __u64 val; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + val = btf_enum64_value(v); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %lldLL," + : "\n%s%s___%zd = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, dup_cnt, + (unsigned long long)val); + } else { + fmt_str = is_signed ? "\n%s%s = %lldLL," + : "\n%s%s = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, + (unsigned long long)val); + } + } +} +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, + int lvl) +{ + __u16 vlen = btf_vlen(t); + + btf_dump_printf(d, "enum%s%s", + t->name_off ? 
" " : "", + btf_dump_type_name(d, id)); + + if (!vlen) + return; + + btf_dump_printf(d, " {"); + if (btf_is_enum(t)) + btf_dump_emit_enum32_val(d, t, lvl, vlen); + else + btf_dump_emit_enum64_val(d, t, lvl, vlen); + btf_dump_printf(d, "\n%s}", pfx(lvl)); + + /* special case enums with special sizes */ + if (t->size == 1) { + /* one-byte enums can be forced with mode(byte) attribute */ + btf_dump_printf(d, " __attribute__((mode(byte)))"); + } else if (t->size == 8 && d->ptr_sz == 8) { + /* enum can be 8-byte sized if one of the enumerator values + * doesn't fit in 32-bit integer, or by adding mode(word) + * attribute (but probably only on 64-bit architectures); do + * our best here to try to satisfy the contract without adding + * unnecessary attributes + */ + bool needs_word_mode; + + if (btf_is_enum(t)) { + /* enum can't represent 64-bit values, so we need word mode */ + needs_word_mode = true; + } else { + /* enum64 needs mode(word) if none of its values has + * non-zero upper 32-bits (which means that all values + * fit in 32-bit integers and won't cause compiler to + * bump enum to be 64-bit naturally + */ + int i; + + needs_word_mode = true; + for (i = 0; i < vlen; i++) { + if (btf_enum64(t)[i].val_hi32 != 0) { + needs_word_mode = false; + break; + } + } + } + if (needs_word_mode) + btf_dump_printf(d, " __attribute__((mode(word)))"); + } + +} + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + + if (btf_kflag(t)) + btf_dump_printf(d, "union %s", name); + else + btf_dump_printf(d, "struct %s", name); +} + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl) +{ + const char *name = btf_dump_ident_name(d, id); + + /* + * Old GCC versions are emitting invalid typedef for __gnuc_va_list + * pointing to VOID. This generates warnings from btf_dump() and + * results in uncompilable header file, so we are fixing it up here + * with valid typedef into __builtin_va_list. + */ + if (t->type == 0 && strcmp(name, "__gnuc_va_list") == 0) { + btf_dump_printf(d, "typedef __builtin_va_list __gnuc_va_list"); + return; + } + + btf_dump_printf(d, "typedef "); + btf_dump_emit_type_decl(d, t->type, name, lvl); +} + +static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_stack; + size_t new_cap; + + if (d->decl_stack_cnt >= d->decl_stack_cap) { + new_cap = max(16, d->decl_stack_cap * 3 / 2); + new_stack = libbpf_reallocarray(d->decl_stack, new_cap, sizeof(new_stack[0])); + if (!new_stack) + return -ENOMEM; + d->decl_stack = new_stack; + d->decl_stack_cap = new_cap; + } + + d->decl_stack[d->decl_stack_cnt++] = id; + + return 0; +} + +/* + * Emit type declaration (e.g., field type declaration in a struct or argument + * declaration in function prototype) in correct C syntax. + * + * For most types it's trivial, but there are few quirky type declaration + * cases worth mentioning: + * - function prototypes (especially nesting of function prototypes); + * - arrays; + * - const/volatile/restrict for pointers vs other types. + * + * For a good discussion of *PARSING* C syntax (as a human), see + * Peter van der Linden's "Expert C Programming: Deep C Secrets", + * Ch.3 "Unscrambling Declarations in C". + * + * It won't help with BTF to C conversion much, though, as it's an opposite + * problem. So we came up with this algorithm in reverse to van der Linden's + * parsing algorithm. 
It goes from structured BTF representation of type + * declaration to a valid compilable C syntax. + * + * For instance, consider this C typedef: + * typedef const int * const * arr[10] arr_t; + * It will be represented in BTF with this chain of BTF types: + * [typedef] -> [array] -> [ptr] -> [const] -> [ptr] -> [const] -> [int] + * + * Notice how [const] modifier always goes before type it modifies in BTF type + * graph, but in C syntax, const/volatile/restrict modifiers are written to + * the right of pointers, but to the left of other types. There are also other + * quirks, like function pointers, arrays of them, functions returning other + * functions, etc. + * + * We handle that by pushing all the types to a stack, until we hit "terminal" + * type (int/enum/struct/union/fwd). Then depending on the kind of a type on + * top of a stack, modifiers are handled differently. Array/function pointers + * have also wildly different syntax and how nesting of them are done. See + * code for authoritative definition. + * + * To avoid allocating new stack for each independent chain of BTF types, we + * share one bigger stack, with each chain working only on its own local view + * of a stack frame. Some care is required to "pop" stack frames after + * processing type declaration chain. + */ +int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts) +{ + const char *fname; + int lvl, err; + + if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + fname = OPTS_GET(opts, field_name, ""); + lvl = OPTS_GET(opts, indent_level, 0); + d->strip_mods = OPTS_GET(opts, strip_mods, false); + btf_dump_emit_type_decl(d, id, fname, lvl); + d->strip_mods = false; + return 0; +} + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl) +{ + struct id_stack decl_stack; + const struct btf_type *t; + int err, stack_start; + + stack_start = d->decl_stack_cnt; + for (;;) { + t = btf__type_by_id(d->btf, id); + if (d->strip_mods && btf_is_mod(t)) + goto skip_mod; + + err = btf_dump_push_decl_stack_id(d, id); + if (err < 0) { + /* + * if we don't have enough memory for entire type decl + * chain, restore stack, emit warning, and try to + * proceed nevertheless + */ + pr_warn("not enough memory for decl stack:%d", err); + d->decl_stack_cnt = stack_start; + return; + } +skip_mod: + /* VOID */ + if (id == 0) + break; + + switch (btf_kind(t)) { + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_TYPE_TAG: + id = t->type; + break; + case BTF_KIND_ARRAY: + id = btf_array(t)->type; + break; + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: + goto done; + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + btf_kind(t), id); + goto done; + } + } +done: + /* + * We might be inside a chain of declarations (e.g., array of function + * pointers returning anonymous (so inlined) structs, having another + * array field). Each of those needs its own "stack frame" to handle + * emitting of declarations. Those stack frames are non-overlapping + * portions of shared btf_dump->decl_stack. 
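A caller-side sketch of btf_dump__emit_type_decl() defined above (the helper name is made up; `d` is a btf_dump created as in the earlier example):

#include <stdio.h>
#include <bpf/btf.h>

/* emit "<C declaration of type_id> <name>" through d's printf callback */
static int emit_field_decl(struct btf_dump *d, __u32 type_id, const char *name)
{
	LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts,
		.field_name = name,
		.indent_level = 1,
		.strip_mods = true,	/* drop const/volatile/restrict wrappers */
	);

	return btf_dump__emit_type_decl(d, type_id, &opts);
}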
To make it a bit nicer to + * handle this set of nested stacks, we create a view corresponding to + * our own "stack frame" and work with it as an independent stack. + * We'll need to clean up after emit_type_chain() returns, though. + */ + decl_stack.ids = d->decl_stack + stack_start; + decl_stack.cnt = d->decl_stack_cnt - stack_start; + btf_dump_emit_type_chain(d, &decl_stack, fname, lvl); + /* + * emit_type_chain() guarantees that it will pop its entire decl_stack + * frame before returning. But it works with a read-only view into + * decl_stack, so it doesn't actually pop anything from the + * perspective of shared btf_dump->decl_stack, per se. We need to + * reset decl_stack state to how it was before us to avoid it growing + * all the time. + */ + d->decl_stack_cnt = stack_start; +} + +static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + + switch (btf_kind(t)) { + case BTF_KIND_VOLATILE: + btf_dump_printf(d, "volatile "); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, "const "); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, "restrict "); + break; + default: + return; + } + decl_stack->cnt--; + } +} + +static void btf_dump_drop_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + if (!btf_is_mod(t)) + return; + decl_stack->cnt--; + } +} + +static void btf_dump_emit_name(const struct btf_dump *d, + const char *name, bool last_was_ptr) +{ + bool separate = name[0] && !last_was_ptr; + + btf_dump_printf(d, "%s%s", separate ? " " : "", name); +} + +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decls, + const char *fname, int lvl) +{ + /* + * last_was_ptr is used to determine if we need to separate pointer + * asterisk (*) from previous part of type signature with space, so + * that we get `int ***`, instead of `int * * *`. We default to true + * for cases where we have single pointer in a chain. E.g., in ptr -> + * func_proto case. func_proto will start a new emit_type_chain call + * with just ptr, which should be emitted as (*) or (*), so we + * don't want to prepend space for that last pointer. 
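Concretely, the flag controls nothing more than this spacing difference (both are legal C):

int ***p;	/* what the emitter aims for: asterisks glued together */
int * * *q;	/* what naive per-kind emission would produce */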
+ */ + bool last_was_ptr = true; + const struct btf_type *t; + const char *name; + __u16 kind; + __u32 id; + + while (decls->cnt) { + id = decls->ids[--decls->cnt]; + if (id == 0) { + /* VOID is a special snowflake */ + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "void"); + last_was_ptr = false; + continue; + } + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, "%s", name); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + btf_dump_emit_mods(d, decls); + /* inline anonymous struct/union */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_struct_def(d, id, t, lvl); + else + btf_dump_emit_struct_fwd(d, id, t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + btf_dump_emit_mods(d, decls); + /* inline anonymous enum */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_enum_def(d, id, t, lvl); + else + btf_dump_emit_enum_fwd(d, id, t); + break; + case BTF_KIND_FWD: + btf_dump_emit_mods(d, decls); + btf_dump_emit_fwd_def(d, id, t); + break; + case BTF_KIND_TYPEDEF: + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "%s", btf_dump_ident_name(d, id)); + break; + case BTF_KIND_PTR: + btf_dump_printf(d, "%s", last_was_ptr ? "*" : " *"); + break; + case BTF_KIND_VOLATILE: + btf_dump_printf(d, " volatile"); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, " const"); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, " restrict"); + break; + case BTF_KIND_TYPE_TAG: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + const struct btf_type *next_t; + __u32 next_id; + bool multidim; + /* + * GCC has a bug + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=8354) + * which causes it to emit extra const/volatile + * modifiers for an array, if array's element type has + * const/volatile modifiers. Clang doesn't do that. + * In general, it doesn't seem very meaningful to have + * a const/volatile modifier for array, so we are + * going to silently skip them here. + */ + btf_dump_drop_mods(d, decls); + + if (decls->cnt == 0) { + btf_dump_emit_name(d, fname, last_was_ptr); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + + next_id = decls->ids[decls->cnt - 1]; + next_t = btf__type_by_id(d->btf, next_id); + multidim = btf_is_array(next_t); + /* we need space if we have named non-pointer */ + if (fname[0] && !last_was_ptr) + btf_dump_printf(d, " "); + /* no parentheses for multi-dimensional array */ + if (!multidim) + btf_dump_printf(d, "("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + if (!multidim) + btf_dump_printf(d, ")"); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 vlen = btf_vlen(t); + int i; + + /* + * GCC emits extra volatile qualifier for + * __attribute__((noreturn)) function pointers. Clang + * doesn't do it. It's a GCC quirk for backwards + * compatibility with code written for GCC <2.5. So, + * similarly to extra qualifiers for array, just drop + * them, instead of handling them. 
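For orientation, these are the declarator shapes the chain emitter has to reproduce (hand-written examples, not generated output):

int *matrix[4][8];		/* multi-dimensional array: no inner parentheses */
void (*cb)(int err, void *ctx);	/* ptr -> func_proto: parentheses are mandatory */
int (*handlers[16])(void *ctx);	/* array of function pointers: both rules combine */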
+ */ + btf_dump_drop_mods(d, decls); + if (decls->cnt) { + btf_dump_printf(d, " ("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + btf_dump_printf(d, ")"); + } else { + btf_dump_emit_name(d, fname, last_was_ptr); + } + btf_dump_printf(d, "("); + /* + * Clang for BPF target generates func_proto with no + * args as a func_proto with a single void arg (e.g., + * `int (*f)(void)` vs just `int (*f)()`). We are + * going to pretend there are no args for such case. + */ + if (vlen == 1 && p->type == 0) { + btf_dump_printf(d, ")"); + return; + } + + for (i = 0; i < vlen; i++, p++) { + if (i > 0) + btf_dump_printf(d, ", "); + + /* last arg of type void is vararg */ + if (i == vlen - 1 && p->type == 0) { + btf_dump_printf(d, "..."); + break; + } + + name = btf_name_of(d, p->name_off); + btf_dump_emit_type_decl(d, p->type, name, lvl); + } + + btf_dump_printf(d, ")"); + return; + } + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + kind, id); + return; + } + + last_was_ptr = kind == BTF_KIND_PTR; + } + + btf_dump_emit_name(d, fname, last_was_ptr); +} + +/* show type name as (type_name) */ +static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, + bool top_level) +{ + const struct btf_type *t; + + /* for array members, we don't bother emitting type name for each + * member to avoid the redundancy of + * .name = (char[4])[(char)'f',(char)'o',(char)'o',] + */ + if (d->typed_dump->is_array_member) + return; + + /* avoid type name specification for variable/section; it will be done + * for the associated variable value(s). + */ + t = btf__type_by_id(d->btf, id); + if (btf_is_var(t) || btf_is_datasec(t)) + return; + + if (top_level) + btf_dump_printf(d, "("); + + d->skip_anon_defs = true; + d->strip_mods = true; + btf_dump_emit_type_decl(d, id, "", 0); + d->strip_mods = false; + d->skip_anon_defs = false; + + if (top_level) + btf_dump_printf(d, ")"); +} + +/* return number of duplicates (occurrences) of a given name */ +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name) +{ + char *old_name, *new_name; + size_t dup_cnt = 0; + int err; + + new_name = strdup(orig_name); + if (!new_name) + return 1; + + (void)hashmap__find(name_map, orig_name, &dup_cnt); + dup_cnt++; + + err = hashmap__set(name_map, new_name, dup_cnt, &old_name, NULL); + if (err) + free(new_name); + + free(old_name); + + return dup_cnt; +} + +static const char *btf_dump_resolve_name(struct btf_dump *d, __u32 id, + struct hashmap *name_map) +{ + struct btf_dump_type_aux_state *s = &d->type_states[id]; + const struct btf_type *t = btf__type_by_id(d->btf, id); + const char *orig_name = btf_name_of(d, t->name_off); + const char **cached_name = &d->cached_names[id]; + size_t dup_cnt; + + if (t->name_off == 0) + return ""; + + if (s->name_resolved) + return *cached_name ? *cached_name : orig_name; + + if (btf_is_fwd(t) || (btf_is_enum(t) && btf_vlen(t) == 0)) { + s->name_resolved = 1; + return orig_name; + } + + dup_cnt = btf_dump_name_dups(d, name_map, orig_name); + if (dup_cnt > 1) { + const size_t max_len = 256; + char new_name[max_len]; + + snprintf(new_name, max_len, "%s___%zu", orig_name, dup_cnt); + *cached_name = strdup(new_name); + } + + s->name_resolved = 1; + return *cached_name ? 
*cached_name : orig_name; +} + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->type_names); +} + +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->ident_names); +} + +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz); + +static const char *btf_dump_data_newline(struct btf_dump *d) +{ + return d->typed_dump->compact || d->typed_dump->depth == 0 ? "" : "\n"; +} + +static const char *btf_dump_data_delim(struct btf_dump *d) +{ + return d->typed_dump->depth == 0 ? "" : ","; +} + +static void btf_dump_data_pfx(struct btf_dump *d) +{ + int i, lvl = d->typed_dump->indent_lvl + d->typed_dump->depth; + + if (d->typed_dump->compact) + return; + + for (i = 0; i < lvl; i++) + btf_dump_printf(d, "%s", d->typed_dump->indent_str); +} + +/* A macro is used here as btf_type_value[s]() appends format specifiers + * to the format specifier passed in; these do the work of appending + * delimiters etc while the caller simply has to specify the type values + * in the format specifier + value(s). + */ +#define btf_dump_type_values(d, fmt, ...) \ + btf_dump_printf(d, fmt "%s%s", \ + ##__VA_ARGS__, \ + btf_dump_data_delim(d), \ + btf_dump_data_newline(d)) + +static int btf_dump_unsupported_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id) +{ + btf_dump_printf(d, "", btf_kind(t)); + return -ENOTSUP; +} + +static int btf_dump_get_bitfield_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz, + __u64 *value) +{ + __u16 left_shift_bits, right_shift_bits; + const __u8 *bytes = data; + __u8 nr_copy_bits; + __u64 num = 0; + int i; + + /* Maximum supported bitfield size is 64 bits */ + if (t->size > 8) { + pr_warn("unexpected bitfield size %d\n", t->size); + return -EINVAL; + } + + /* Bitfield value retrieval is done in two steps; first relevant bytes are + * stored in num, then we left/right shift num to eliminate irrelevant bits. 
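A standalone check of that two-step shift arithmetic, using a made-up little-endian example (a 5-bit field starting at bit 3 of a 2-byte storage unit holding 0x01f8):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint8_t bytes[2] = { 0xf8, 0x01 };			/* storage unit = 0x01f8 */
	uint64_t num = ((uint64_t)bytes[1] << 8) | bytes[0];	/* step 1: gather bytes */
	uint8_t bits_offset = 3, bit_sz = 5;
	uint8_t nr_copy_bits = bit_sz + bits_offset;		/* 8 */
	uint64_t val = (num << (64 - nr_copy_bits)) >> (64 - bit_sz);	/* step 2: trim */

	assert(val == 0x1f);	/* same as (0x01f8 >> 3) & 0x1f */
	return 0;
}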
+ */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + for (i = t->size - 1; i >= 0; i--) + num = num * 256 + bytes[i]; + nr_copy_bits = bit_sz + bits_offset; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (i = 0; i < t->size; i++) + num = num * 256 + bytes[i]; + nr_copy_bits = t->size * 8 - bits_offset; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + left_shift_bits = 64 - nr_copy_bits; + right_shift_bits = 64 - bit_sz; + + *value = (num << left_shift_bits) >> right_shift_bits; + + return 0; +} + +static int btf_dump_bitfield_check_zero(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 check_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &check_num); + if (err) + return err; + if (check_num == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_bitfield_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 print_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &print_num); + if (err) + return err; + + btf_dump_type_values(d, "0x%llx", (unsigned long long)print_num); + + return 0; +} + +/* ints, floats and ptrs */ +static int btf_dump_base_type_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + static __u8 bytecmp[16] = {}; + int nr_bytes; + + /* For pointer types, pointer size is not defined on a per-type basis. + * On dump creation however, we store the pointer size. + */ + if (btf_kind(t) == BTF_KIND_PTR) + nr_bytes = d->ptr_sz; + else + nr_bytes = t->size; + + if (nr_bytes < 1 || nr_bytes > 16) { + pr_warn("unexpected size %d for id [%u]\n", nr_bytes, id); + return -EINVAL; + } + + if (memcmp(data, bytecmp, nr_bytes) == 0) + return -ENODATA; + return 0; +} + +static bool ptr_is_aligned(const struct btf *btf, __u32 type_id, + const void *data) +{ + int alignment = btf__align_of(btf, type_id); + + if (alignment == 0) + return false; + + return ((uintptr_t)data) % alignment == 0; +} + +static int btf_dump_int_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data, + __u8 bits_offset) +{ + __u8 encoding = btf_int_encoding(t); + bool sign = encoding & BTF_INT_SIGNED; + char buf[16] __attribute__((aligned(16))); + int sz = t->size; + + if (sz == 0 || sz > sizeof(buf)) { + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + + /* handle packed int data - accesses of integers not aligned on + * int boundaries can cause problems on some platforms. + */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(buf, data, sz); + data = buf; + } + + switch (sz) { + case 16: { + const __u64 *ints = data; + __u64 lsi, msi; + + /* avoid use of __int128 as some 32-bit platforms do not + * support it. 
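The 16-byte case below splits the value into low/high 64-bit halves and prints them with "0x%llx%016llx"; a quick standalone check of that formatting with a made-up value:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t lsi = 0xffffffffffffffffULL, msi = 0xa0;	/* 128-bit 0xa0ffff...ff */
	char buf[64];

	snprintf(buf, sizeof(buf), "0x%llx%016llx",
		 (unsigned long long)msi, (unsigned long long)lsi);
	assert(strcmp(buf, "0xa0ffffffffffffffff") == 0);
	return 0;
}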
+ */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + lsi = ints[0]; + msi = ints[1]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + lsi = ints[1]; + msi = ints[0]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (msi == 0) + btf_dump_type_values(d, "0x%llx", (unsigned long long)lsi); + else + btf_dump_type_values(d, "0x%llx%016llx", (unsigned long long)msi, + (unsigned long long)lsi); + break; + } + case 8: + if (sign) + btf_dump_type_values(d, "%lld", *(long long *)data); + else + btf_dump_type_values(d, "%llu", *(unsigned long long *)data); + break; + case 4: + if (sign) + btf_dump_type_values(d, "%d", *(__s32 *)data); + else + btf_dump_type_values(d, "%u", *(__u32 *)data); + break; + case 2: + if (sign) + btf_dump_type_values(d, "%d", *(__s16 *)data); + else + btf_dump_type_values(d, "%u", *(__u16 *)data); + break; + case 1: + if (d->typed_dump->is_array_char) { + /* check for null terminator */ + if (d->typed_dump->is_array_terminated) + break; + if (*(char *)data == '\0') { + d->typed_dump->is_array_terminated = true; + break; + } + if (isprint(*(char *)data)) { + btf_dump_type_values(d, "'%c'", *(char *)data); + break; + } + } + if (sign) + btf_dump_type_values(d, "%d", *(__s8 *)data); + else + btf_dump_type_values(d, "%u", *(__u8 *)data); + break; + default: + pr_warn("unexpected sz %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +union float_data { + long double ld; + double d; + float f; +}; + +static int btf_dump_float_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data) +{ + const union float_data *flp = data; + union float_data fl; + int sz = t->size; + + /* handle unaligned data; copy to local union */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(&fl, data, sz); + flp = &fl; + } + + switch (sz) { + case 16: + btf_dump_type_values(d, "%Lf", flp->ld); + break; + case 8: + btf_dump_type_values(d, "%lf", flp->d); + break; + case 4: + btf_dump_type_values(d, "%f", flp->f); + break; + default: + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +static int btf_dump_var_data(struct btf_dump *d, + const struct btf_type *v, + __u32 id, + const void *data) +{ + enum btf_func_linkage linkage = btf_var(v)->linkage; + const struct btf_type *t; + const char *l; + __u32 type_id; + + switch (linkage) { + case BTF_FUNC_STATIC: + l = "static "; + break; + case BTF_FUNC_EXTERN: + l = "extern "; + break; + case BTF_FUNC_GLOBAL: + default: + l = ""; + break; + } + + /* format of output here is [linkage] [type] [varname] = (type)value, + * for example "static int cpu_profile_flip = (int)1" + */ + btf_dump_printf(d, "%s", l); + type_id = v->type; + t = btf__type_by_id(d->btf, type_id); + btf_dump_emit_type_cast(d, type_id, false); + btf_dump_printf(d, " %s = ", btf_name_of(d, v->name_off)); + return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); +} + +static int btf_dump_array_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 i, elem_type_id; + __s64 elem_size; + bool is_array_member; + + elem_type_id = array->type; + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + elem_size = btf__resolve_size(d->btf, elem_type_id); + if (elem_size <= 0) { + pr_warn("unexpected elem size %zd for array type [%u]\n", + (ssize_t)elem_size, id); + return -EINVAL; + } + + if (btf_is_int(elem_type)) { + /* + * BTF_INT_CHAR encoding 
never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + d->typed_dump->is_array_char = true; + } + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "[%s", btf_dump_data_newline(d)); + + /* may be a multidimensional array, so store current "is array member" + * status so we can restore it correctly later. + */ + is_array_member = d->typed_dump->is_array_member; + d->typed_dump->is_array_member = true; + for (i = 0; i < array->nelems; i++, data += elem_size) { + if (d->typed_dump->is_array_terminated) + break; + btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); + } + d->typed_dump->is_array_member = is_array_member; + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "]"); + + return 0; +} + +static int btf_dump_struct_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + int i, err = 0; + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "{%s", btf_dump_data_newline(d)); + + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + const char *mname; + __u32 moffset; + __u8 bit_sz; + + mtype = btf__type_by_id(d->btf, m->type); + mname = btf_name_of(d, m->name_off); + moffset = btf_member_bit_offset(t, i); + + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_dump_type_data(d, mname, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err < 0) + return err; + } + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "}"); + return err; +} + +union ptr_data { + unsigned int p; + unsigned long long lp; +}; + +static int btf_dump_ptr_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + if (ptr_is_aligned(d->btf, id, data) && d->ptr_sz == sizeof(void *)) { + btf_dump_type_values(d, "%p", *(void **)data); + } else { + union ptr_data pt; + + memcpy(&pt, data, d->ptr_sz); + if (d->ptr_sz == 4) + btf_dump_type_values(d, "0x%x", pt.p); + else + btf_dump_type_values(d, "0x%llx", pt.lp); + } + return 0; +} + +static int btf_dump_get_enum_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u32 id, + __s64 *value) +{ + bool is_signed = btf_kflag(t); + + if (!ptr_is_aligned(d->btf, id, data)) { + __u64 val; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, 0, 0, &val); + if (err) + return err; + *value = (__s64)val; + return 0; + } + + switch (t->size) { + case 8: + *value = *(__s64 *)data; + return 0; + case 4: + *value = is_signed ? (__s64)*(__s32 *)data : *(__u32 *)data; + return 0; + case 2: + *value = is_signed ? *(__s16 *)data : *(__u16 *)data; + return 0; + case 1: + *value = is_signed ? 
*(__s8 *)data : *(__u8 *)data; + return 0; + default: + pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); + return -EINVAL; + } +} + +static int btf_dump_enum_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + bool is_signed; + __s64 value; + int i, err; + + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + + is_signed = btf_kflag(t); + if (btf_is_enum(t)) { + const struct btf_enum *e; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%d" : "%u", value); + } else { + const struct btf_enum64 *e; + + for (i = 0, e = btf_enum64(t); i < btf_vlen(t); i++, e++) { + if (value != btf_enum64_value(e)) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%lldLL" : "%lluULL", + (unsigned long long)value); + } + return 0; +} + +static int btf_dump_datasec_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + __u32 i; + int err; + + btf_dump_type_values(d, "SEC(\"%s\") ", btf_name_of(d, t->name_off)); + + for (i = 0, vsi = btf_var_secinfos(t); i < btf_vlen(t); i++, vsi++) { + var = btf__type_by_id(d->btf, vsi->type); + err = btf_dump_dump_type_data(d, NULL, var, vsi->type, data + vsi->offset, 0, 0); + if (err < 0) + return err; + btf_dump_printf(d, ";"); + } + return 0; +} + +/* return size of type, or if base type overflows, return -E2BIG. */ +static int btf_dump_type_data_check_overflow(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 size; + + if (bit_sz) { + /* bits_offset is at most 7. bit_sz is at most 128. */ + __u8 nr_bytes = (bits_offset + bit_sz + 7) / 8; + + /* When bit_sz is non zero, it is called from + * btf_dump_struct_data() where it only cares about + * negative error value. + * Return nr_bytes in success case to make it + * consistent as the regular integer case below. + */ + return data + nr_bytes > d->typed_dump->data_end ? -E2BIG : nr_bytes; + } + + size = btf__resolve_size(d->btf, id); + + if (size < 0 || size >= INT_MAX) { + pr_warn("unexpected size [%zu] for id [%u]\n", + (size_t)size, id); + return -EINVAL; + } + + /* Only do overflow checking for base types; we do not want to + * avoid showing part of a struct, union or array, even if we + * do not have enough data to show the full object. By + * restricting overflow checking to base types we can ensure + * that partial display succeeds, while avoiding overflowing + * and using bogus data for display. 
+ */ + t = skip_mods_and_typedefs(d->btf, id, NULL); + if (!t) { + pr_warn("unexpected error skipping mods/typedefs for id [%u]\n", + id); + return -EINVAL; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (data + bits_offset / 8 + size > d->typed_dump->data_end) + return -E2BIG; + break; + default: + break; + } + return (int)size; +} + +static int btf_dump_type_data_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 value; + int i, err; + + /* toplevel exceptions; we show zero values if + * - we ask for them (emit_zeros) + * - if we are at top-level so we see "struct empty { }" + * - or if we are an array member and the array is non-empty and + * not a char array; we don't want to be in a situation where we + * have an integer array 0, 1, 0, 1 and only show non-zero values. + * If the array contains zeroes only, or is a char array starting + * with a '\0', the array-level check_zero() will prevent showing it; + * we are concerned with determining zero value at the array member + * level here. + */ + if (d->typed_dump->emit_zeroes || d->typed_dump->depth == 0 || + (d->typed_dump->is_array_member && + !d->typed_dump->is_array_char)) + return 0; + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + if (bit_sz) + return btf_dump_bitfield_check_zero(d, t, data, bits_offset, bit_sz); + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_ARRAY: { + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 elem_type_id, elem_size; + bool ischar; + + elem_type_id = array->type; + elem_size = btf__resolve_size(d->btf, elem_type_id); + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + + ischar = btf_is_int(elem_type) && elem_size == 1; + + /* check all elements; if _any_ element is nonzero, all + * of array is displayed. We make an exception however + * for char arrays where the first element is 0; these + * are considered zeroed also, even if later elements are + * non-zero because the string is terminated. + */ + for (i = 0; i < array->nelems; i++) { + if (i == 0 && ischar && *(char *)data == 0) + return -ENODATA; + err = btf_dump_type_data_check_zero(d, elem_type, + elem_type_id, + data + + (i * elem_size), + bits_offset, 0); + if (err != -ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + + /* if any struct/union member is non-zero, the struct/union + * is considered non-zero and dumped. + */ + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + __u32 moffset; + + mtype = btf__type_by_id(d->btf, m->type); + moffset = btf_member_bit_offset(t, i); + + /* btf_int_bits() does not store member bitfield size; + * bitfield size needs to be stored here so int display + * of member can retrieve it. 
+ */ + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_type_data_check_zero(d, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err != ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + if (value == 0) + return -ENODATA; + return 0; + default: + return 0; + } +} + +/* returns size of data dumped, or error. */ +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + int size, err = 0; + + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset, bit_sz); + if (size < 0) + return size; + err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); + if (err) { + /* zeroed data is expected and not an error, so simply skip + * dumping such data. Record other errors however. + */ + if (err == -ENODATA) + return size; + return err; + } + btf_dump_data_pfx(d); + + if (!d->typed_dump->skip_names) { + if (fname && strlen(fname) > 0) + btf_dump_printf(d, ".%s = ", fname); + btf_dump_emit_type_cast(d, id, true); + } + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_DECL_TAG: + err = btf_dump_unsupported_data(d, t, id); + break; + case BTF_KIND_INT: + if (bit_sz) + err = btf_dump_bitfield_data(d, t, data, bits_offset, bit_sz); + else + err = btf_dump_int_data(d, t, id, data, bits_offset); + break; + case BTF_KIND_FLOAT: + err = btf_dump_float_data(d, t, id, data); + break; + case BTF_KIND_PTR: + err = btf_dump_ptr_data(d, t, id, data); + break; + case BTF_KIND_ARRAY: + err = btf_dump_array_data(d, t, id, data); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + err = btf_dump_struct_data(d, t, id, data); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* handle bitfield and int enum values */ + if (bit_sz) { + __u64 print_num; + __s64 enum_val; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, + &print_num); + if (err) + break; + enum_val = (__s64)print_num; + err = btf_dump_enum_data(d, t, id, &enum_val); + } else + err = btf_dump_enum_data(d, t, id, data); + break; + case BTF_KIND_VAR: + err = btf_dump_var_data(d, t, id, data); + break; + case BTF_KIND_DATASEC: + err = btf_dump_datasec_data(d, t, id, data); + break; + default: + pr_warn("unexpected kind [%u] for id [%u]\n", + BTF_INFO_KIND(t->info), id); + return -EINVAL; + } + if (err < 0) + return err; + return size; +} + +int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts) +{ + struct btf_dump_data typed_dump = {}; + const struct btf_type *t; + int ret; + + if (!OPTS_VALID(opts, btf_dump_type_data_opts)) + return libbpf_err(-EINVAL); + + t = btf__type_by_id(d->btf, id); + if (!t) + return libbpf_err(-ENOENT); + + d->typed_dump = &typed_dump; + d->typed_dump->data_end = data + data_sz; + d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); + + /* default indent string is a tab */ + if (!OPTS_GET(opts, indent_str, NULL)) + d->typed_dump->indent_str[0] = '\t'; + else + libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str, + sizeof(d->typed_dump->indent_str)); + + d->typed_dump->compact = OPTS_GET(opts, compact, false); + d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false); 
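A caller-side sketch of btf_dump__dump_type_data() as it is being set up here (the wrapper name is made up; `d` is a btf_dump created as in the earlier example and `id` is the BTF id describing `data`):

#include <stdio.h>
#include <bpf/btf.h>

static int dump_value(struct btf_dump *d, __u32 id, const void *data, size_t sz)
{
	LIBBPF_OPTS(btf_dump_type_data_opts, opts,
		.compact = true,	/* single-line output, no indentation */
		.emit_zeroes = false,	/* skip zero-valued members */
	);
	int ret = btf_dump__dump_type_data(d, id, data, sz, &opts);

	if (ret < 0)	/* e.g. -E2BIG when sz is smaller than the type */
		fprintf(stderr, "typed dump failed: %d\n", ret);
	return ret;	/* on success: number of bytes dumped */
}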
+ d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false); + + ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); + + d->typed_dump = NULL; + + return libbpf_err(ret); +} diff --git a/third_party/bpftool/libbpf/src/gen_loader.c b/third_party/bpftool/libbpf/src/gen_loader.c new file mode 100644 index 0000000..cf3323f --- /dev/null +++ b/third_party/bpftool/libbpf/src/gen_loader.c @@ -0,0 +1,1123 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" +#include "skel_internal.h" +#include + +#define MAX_USED_MAPS 64 +#define MAX_USED_PROGS 32 +#define MAX_KFUNC_DESCS 256 +#define MAX_FD_ARRAY_SZ (MAX_USED_MAPS + MAX_KFUNC_DESCS) + +/* The following structure describes the stack layout of the loader program. + * In addition R6 contains the pointer to context. + * R7 contains the result of the last sys_bpf command (typically error or FD). + * R9 contains the result of the last sys_close command. + * + * Naming convention: + * ctx - bpf program context + * stack - bpf program stack + * blob - bpf_attr-s, strings, insns, map data. + * All the bytes that loader prog will use for read/write. + */ +struct loader_stack { + __u32 btf_fd; + __u32 inner_map_fd; + __u32 prog_fd[MAX_USED_PROGS]; +}; + +#define stack_off(field) \ + (__s16)(-sizeof(struct loader_stack) + offsetof(struct loader_stack, field)) + +#define attr_field(attr, field) (attr + offsetof(union bpf_attr, field)) + +static int blob_fd_array_off(struct bpf_gen *gen, int index) +{ + return gen->fd_array + index * sizeof(int); +} + +static int realloc_insn_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->insn_cur - gen->insn_start; + void *insn_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + insn_start = realloc(gen->insn_start, off + size); + if (!insn_start) { + gen->error = -ENOMEM; + free(gen->insn_start); + gen->insn_start = NULL; + return -ENOMEM; + } + gen->insn_start = insn_start; + gen->insn_cur = insn_start + off; + return 0; +} + +static int realloc_data_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->data_cur - gen->data_start; + void *data_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + data_start = realloc(gen->data_start, off + size); + if (!data_start) { + gen->error = -ENOMEM; + free(gen->data_start); + gen->data_start = NULL; + return -ENOMEM; + } + gen->data_start = data_start; + gen->data_cur = data_start + off; + return 0; +} + +static void emit(struct bpf_gen *gen, struct bpf_insn insn) +{ + if (realloc_insn_buf(gen, sizeof(insn))) + return; + memcpy(gen->insn_cur, &insn, sizeof(insn)); + gen->insn_cur += sizeof(insn); +} + +static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn insn2) +{ + emit(gen, insn1); + emit(gen, insn2); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size); +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) +{ + size_t stack_sz = sizeof(struct loader_stack), nr_progs_sz; + int i; + + gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); + gen->log_level = log_level; 
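+	/* What follows emits the prologue of the generated loader program,
+	 * roughly:
+	 *
+	 *   r6 = r1                                 // keep ctx pointer
+	 *   probe_read_kernel(stack, stack_sz, 0)   // NULL src fails, but zero-fills dst
+	 *   goto skip_cleanup
+	 * cleanup:                                  // target of emit_check_err()
+	 *   close every prog/map FD recorded so far
+	 *   r0 = r7                                 // last sys_bpf() result
+	 *   exit
+	 * skip_cleanup:
+	 */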
+ /* save ctx pointer into R6 */ + emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + + /* bzero stack */ + emit(gen, BPF_MOV64_REG(BPF_REG_1, BPF_REG_10)); + emit(gen, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + /* amount of stack actually used, only used to calculate iterations, not stack offset */ + nr_progs_sz = offsetof(struct loader_stack, prog_fd[nr_progs]); + /* jump over cleanup code */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, + /* size of cleanup code below (including map fd cleanup) */ + (nr_progs_sz / 4) * 3 + 2 + + /* 6 insns for emit_sys_close_blob, + * 6 insns for debug_regs in emit_sys_close_blob + */ + nr_maps * (6 + (gen->log_level ? 6 : 0)))); + + /* remember the label where all error branches will jump to */ + gen->cleanup_label = gen->insn_cur - gen->insn_start; + /* emit cleanup code: close all temp FDs */ + for (i = 0; i < nr_progs_sz; i += 4) { + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -stack_sz + i)); + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + } + for (i = 0; i < nr_maps; i++) + emit_sys_close_blob(gen, blob_fd_array_off(gen, i)); + /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */ + emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); + emit(gen, BPF_EXIT_INSN()); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size) +{ + __u32 size8 = roundup(size, 8); + __u64 zero = 0; + void *prev; + + if (realloc_data_buf(gen, size8)) + return 0; + prev = gen->data_cur; + if (data) { + memcpy(gen->data_cur, data, size); + memcpy(gen->data_cur + size, &zero, size8 - size); + } else { + memset(gen->data_cur, 0, size8); + } + gen->data_cur += size8; + return prev - gen->data_start; +} + +/* Get index for map_fd/btf_fd slot in reserved fd_array, or in data relative + * to start of fd_array. Caller can decide if it is usable or not. 
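+ * The first MAX_USED_MAPS slots of fd_array are reserved for map FDs and the
+ * next MAX_KFUNC_DESCS slots for module BTF FDs; once the kfunc slots are
+ * exhausted, add_kfunc_btf_fd() falls back to appending extra FD slots to the
+ * data blob.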
+ */ +static int add_map_fd(struct bpf_gen *gen) +{ + if (gen->nr_maps == MAX_USED_MAPS) { + pr_warn("Total maps exceeds %d\n", MAX_USED_MAPS); + gen->error = -E2BIG; + return 0; + } + return gen->nr_maps++; +} + +static int add_kfunc_btf_fd(struct bpf_gen *gen) +{ + int cur; + + if (gen->nr_fd_array == MAX_KFUNC_DESCS) { + cur = add_data(gen, NULL, sizeof(int)); + return (cur - gen->fd_array) / sizeof(int); + } + return MAX_USED_MAPS + gen->nr_fd_array++; +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* *(u64 *)(blob + off) = (u64)(void *)(blob + data) */ +static void emit_rel_store(struct bpf_gen *gen, int off, int data) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, data)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2blob(struct bpf_gen *gen, int off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_2, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2ctx(struct bpf_gen *gen, int ctx_off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_1, 0)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void move_ctx2blob(struct bpf_gen *gen, int off, int size, int ctx_off, + bool check_non_zero) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_6, ctx_off)); + if (check_non_zero) + /* If value in ctx is zero don't update the blob. 
+ * For example: when ctx->map.max_entries == 0, keep default max_entries from bpf.c + */ + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2blob(struct bpf_gen *gen, int off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2ctx(struct bpf_gen *gen, int ctx_off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void emit_sys_bpf(struct bpf_gen *gen, int cmd, int attr, int attr_size) +{ + emit(gen, BPF_MOV64_IMM(BPF_REG_1, cmd)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, attr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, attr_size)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_bpf)); + /* remember the result in R7 */ + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); +} + +static bool is_simm16(__s64 value) +{ + return value == (__s64)(__s16)value; +} + +static void emit_check_err(struct bpf_gen *gen) +{ + __s64 off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + + /* R7 contains result of last sys_bpf command. + * if (R7 < 0) goto cleanup; + */ + if (is_simm16(off)) { + emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } +} + +/* reg1 and reg2 should not be R1 - R5. They can be R0, R6 - R10 */ +static void emit_debug(struct bpf_gen *gen, int reg1, int reg2, + const char *fmt, va_list args) +{ + char buf[1024]; + int addr, len, ret; + + if (!gen->log_level) + return; + ret = vsnprintf(buf, sizeof(buf), fmt, args); + if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0) + /* The special case to accommodate common debug_ret(): + * to avoid specifying BPF_REG_7 and adding " r=%%d" to + * prints explicitly. + */ + strcat(buf, " r=%d"); + len = strlen(buf) + 1; + addr = add_data(gen, buf, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, addr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + if (reg1 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1)); + if (reg2 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk)); +} + +static void debug_regs(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, reg1, reg2, fmt, args); + va_end(args); +} + +static void debug_ret(struct bpf_gen *gen, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, BPF_REG_7, -1, fmt, args); + va_end(args); +} + +static void __emit_sys_close(struct bpf_gen *gen) +{ + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, + /* 2 is the number of the following insns + * * 6 is additional insns in debug_regs + */ + 2 + (gen->log_level ? 
6 : 0))); + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d"); +} + +static void emit_sys_close_stack(struct bpf_gen *gen, int stack_off) +{ + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off)); + __emit_sys_close(gen); +} + +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0)); + __emit_sys_close(gen); +} + +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) +{ + int i; + + if (nr_progs < gen->nr_progs || nr_maps != gen->nr_maps) { + pr_warn("nr_progs %d/%d nr_maps %d/%d mismatch\n", + nr_progs, gen->nr_progs, nr_maps, gen->nr_maps); + gen->error = -EFAULT; + return gen->error; + } + emit_sys_close_stack(gen, stack_off(btf_fd)); + for (i = 0; i < gen->nr_progs; i++) + move_stack2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * gen->nr_maps + + sizeof(struct bpf_prog_desc) * i + + offsetof(struct bpf_prog_desc, prog_fd), 4, + stack_off(prog_fd[i])); + for (i = 0; i < gen->nr_maps; i++) + move_blob2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * i + + offsetof(struct bpf_map_desc, map_fd), 4, + blob_fd_array_off(gen, i)); + emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); + emit(gen, BPF_EXIT_INSN()); + pr_debug("gen: finish %d\n", gen->error); + if (!gen->error) { + struct gen_loader_opts *opts = gen->opts; + + opts->insns = gen->insn_start; + opts->insns_sz = gen->insn_cur - gen->insn_start; + opts->data = gen->data_start; + opts->data_sz = gen->data_cur - gen->data_start; + } + return gen->error; +} + +void bpf_gen__free(struct bpf_gen *gen) +{ + if (!gen) + return; + free(gen->data_start); + free(gen->insn_start); + free(gen); +} + +void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, + __u32 btf_raw_size) +{ + int attr_size = offsetofend(union bpf_attr, btf_log_level); + int btf_data, btf_load_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: load_btf: size %d\n", btf_raw_size); + btf_data = add_data(gen, btf_raw_data, btf_raw_size); + + attr.btf_size = btf_raw_size; + btf_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with a pointer to the BTF data */ + emit_rel_store(gen, attr_field(btf_load_attr, btf), btf_data); + /* emit BTF_LOAD command */ + emit_sys_bpf(gen, BPF_BTF_LOAD, btf_load_attr, attr_size); + debug_ret(gen, "btf_load size %d", btf_raw_size); + emit_check_err(gen); + /* remember btf_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(btf_fd))); +} + +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_extra); + bool close_inner_map_fd = false; + int map_create_attr, idx; + union bpf_attr attr; + + 
memset(&attr, 0, attr_size); + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.map_flags = map_attr->map_flags; + attr.map_extra = map_attr->map_extra; + if (map_name) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.numa_node = map_attr->numa_node; + attr.map_ifindex = map_attr->map_ifindex; + attr.max_entries = max_entries; + attr.btf_key_type_id = map_attr->btf_key_type_id; + attr.btf_value_type_id = map_attr->btf_value_type_id; + + pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n", + attr.map_name, map_idx, map_type, attr.btf_value_type_id); + + map_create_attr = add_data(gen, &attr, attr_size); + if (attr.btf_value_type_id) + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(map_create_attr, btf_fd), 4, + stack_off(btf_fd)); + switch (attr.map_type) { + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + move_stack2blob(gen, attr_field(map_create_attr, inner_map_fd), 4, + stack_off(inner_map_fd)); + close_inner_map_fd = true; + break; + default: + break; + } + /* conditionally update max_entries */ + if (map_idx >= 0) + move_ctx2blob(gen, attr_field(map_create_attr, max_entries), 4, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, max_entries), + true /* check that max_entries != 0 */); + /* emit MAP_CREATE command */ + emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size); + debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d", + attr.map_name, map_idx, map_type, value_size, + attr.btf_value_type_id); + emit_check_err(gen); + /* remember map_fd in the stack, if successful */ + if (map_idx < 0) { + /* This bpf_gen__map_create() function is called with map_idx >= 0 + * for all maps that libbpf loading logic tracks. + * It's called with -1 to create an inner map. 
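+		 * The inner map FD is only needed long enough to create the
+		 * outer map, so it is parked on the loader stack; the later
+		 * bpf_gen__map_create() call for the outer map consumes it and
+		 * then closes it (see close_inner_map_fd above).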
+ */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(inner_map_fd))); + } else if (map_idx != gen->nr_maps) { + gen->error = -EDOM; /* internal bug */ + return; + } else { + /* add_map_fd does gen->nr_maps++ */ + idx = add_map_fd(gen); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, idx))); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_7, 0)); + } + if (close_inner_map_fd) + emit_sys_close_stack(gen, stack_off(inner_map_fd)); +} + +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, + enum bpf_attach_type type) +{ + const char *prefix; + int kind, ret; + + btf_get_kernel_prefix_kind(type, &prefix, &kind); + gen->attach_kind = kind; + ret = snprintf(gen->attach_target, sizeof(gen->attach_target), "%s%s", + prefix, attach_name); + if (ret >= sizeof(gen->attach_target)) + gen->error = -ENOSPC; +} + +static void emit_find_attach_target(struct bpf_gen *gen) +{ + int name, len = strlen(gen->attach_target) + 1; + + pr_debug("gen: find_attach_tgt %s %d\n", gen->attach_target, gen->attach_kind); + name = add_data(gen, gen->attach_target, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, gen->attach_kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", + gen->attach_target, gen->attach_kind); + emit_check_err(gen); + /* if successful, btf_id is in lower 32-bit of R7 and + * btf_obj_fd is in upper 32-bit + */ +} + +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, bool is_ld64, int kind, int insn_idx) +{ + struct ksym_relo_desc *relo; + + relo = libbpf_reallocarray(gen->relos, gen->relo_cnt + 1, sizeof(*relo)); + if (!relo) { + gen->error = -ENOMEM; + return; + } + gen->relos = relo; + relo += gen->relo_cnt; + relo->name = name; + relo->is_weak = is_weak; + relo->is_typeless = is_typeless; + relo->is_ld64 = is_ld64; + relo->kind = kind; + relo->insn_idx = insn_idx; + gen->relo_cnt++; +} + +/* returns existing ksym_desc with ref incremented, or inserts a new one */ +static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + struct ksym_desc *kdesc; + int i; + + for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; + if (kdesc->kind == relo->kind && kdesc->is_ld64 == relo->is_ld64 && + !strcmp(kdesc->name, relo->name)) { + kdesc->ref++; + return kdesc; + } + } + kdesc = libbpf_reallocarray(gen->ksyms, gen->nr_ksyms + 1, sizeof(*kdesc)); + if (!kdesc) { + gen->error = -ENOMEM; + return NULL; + } + gen->ksyms = kdesc; + kdesc = &gen->ksyms[gen->nr_ksyms++]; + kdesc->name = relo->name; + kdesc->kind = relo->kind; + kdesc->ref = 1; + kdesc->off = 0; + kdesc->insn = 0; + kdesc->is_ld64 = relo->is_ld64; + return kdesc; +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + */ +static void emit_bpf_find_by_name_kind(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1; + + name_off = add_data(gen, relo->name, len); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, relo->kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, 
BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", relo->name, relo->kind); +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + * Returns u64 symbol addr in BPF_REG_9 + */ +static void emit_bpf_kallsyms_lookup_name(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1, res_off; + + name_off = add_data(gen, relo->name, len); + res_off = add_data(gen, NULL, 8); /* res is u64 */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_4, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, res_off)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_4)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_kallsyms_lookup_name)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "kallsyms_lookup_name(%s,%d)", relo->name, relo->kind); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + * + * We need to reuse BTF fd for same symbol otherwise each relocation takes a new + * index, while kernel limits total kfunc BTFs to 256. For duplicate symbols, + * this would mean a new BTF fd index for each entry. By pairing symbol name + * with index, we get the insn->imm, insn->off pairing that kernel uses for + * kfunc_tab, which becomes the effective limit even though all of them may + * share same index in fd_array (such that kfunc_btf_tab has 1 element). + */ +static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + int btf_fd_idx; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing bpf_insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + offsetof(struct bpf_insn, off), 2, + kdesc->insn + offsetof(struct bpf_insn, off)); + goto log; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* get index in fd_array to store BTF FD at */ + btf_fd_idx = add_kfunc_btf_fd(gen); + if (btf_fd_idx > INT16_MAX) { + pr_warn("BTF fd off %d for kfunc %s exceeds INT16_MAX, cannot process relocation\n", + btf_fd_idx, relo->name); + gen->error = -E2BIG; + return; + } + kdesc->off = btf_fd_idx; + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set value for imm, off as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 10)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* obtain fd in BPF_REG_9 */ + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + /* load fd_array slot pointer */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); + /* store BTF fd in slot, 0 for vmlinux */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + /* jump to insn[insn_idx].off store if fd 
denotes module BTF */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); + /* set the default value for off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip BTF fd store for vmlinux BTF */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + /* store index into insn[insn_idx].off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); +log: + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, + offsetof(struct bpf_insn, off))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " func (%s:count=%d): imm: %%d, off: %%d", + relo->name, kdesc->ref); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, kdesc->off))); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_0, 0)); + debug_regs(gen, BPF_REG_9, -1, " func (%s:count=%d): btf_fd", + relo->name, kdesc->ref); +} + +static void emit_ksym_relo_log(struct bpf_gen *gen, struct ksym_relo_desc *relo, + int ref) +{ + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, sizeof(struct bpf_insn) + + offsetof(struct bpf_insn, imm))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " var t=%d w=%d (%s:count=%d): imm[0]: %%d, imm[1]: %%d", + relo->is_typeless, relo->is_weak, relo->name, ref); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + debug_regs(gen, BPF_REG_9, -1, " var t=%d w=%d (%s:count=%d): insn.reg", + relo->is_typeless, relo->is_weak, relo->name, ref); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_typeless(struct bpf_gen *gen, + struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + goto log; + } + /* remember insn offset, so we can copy ksym addr later */ + kdesc->insn = insn; + /* skip typeless ksym_desc in fd closing loop in cleanup_relos */ + kdesc->typeless = true; + emit_bpf_kallsyms_lookup_name(gen, relo); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_7, -ENOENT, 1)); + emit_check_err(gen); + /* store lower half of addr into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, offsetof(struct bpf_insn, imm))); + /* store upper half of addr into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); +log: + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +static __u32 src_reg_mask(void) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + return 0x0f; /* src_reg,dst_reg,... */ +#elif defined(__BIG_ENDIAN_BITFIELD) + return 0xf0; /* dst_reg,src_reg,... 
*/ +#else +#error "Unsupported bit endianness, cannot proceed" +#endif +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + __u32 reg_mask; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + /* jump over src_reg adjustment if imm (btf_id) is not 0, reuse BPF_REG_0 from move_blob2blob + * If btf_id is zero, clear BPF_PSEUDO_BTF_ID flag in src_reg of ld_imm64 insn + */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); + goto clear_src_reg; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set values for insn[insn_idx].imm, insn[insn_idx + 1].imm as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* store btf_obj_fd into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); + /* skip src_reg adjustment */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3)); +clear_src_reg: + /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ + reg_mask = src_reg_mask(); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + emit(gen, BPF_ALU32_IMM(BPF_AND, BPF_REG_9, reg_mask)); + emit(gen, BPF_STX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, offsetofend(struct bpf_insn, code))); + + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +void bpf_gen__record_relo_core(struct bpf_gen *gen, + const struct bpf_core_relo *core_relo) +{ + struct bpf_core_relo *relos; + + relos = libbpf_reallocarray(gen->core_relos, gen->core_relo_cnt + 1, sizeof(*relos)); + if (!relos) { + gen->error = -ENOMEM; + return; + } + gen->core_relos = relos; + relos += gen->core_relo_cnt; + memcpy(relos, core_relo, sizeof(*relos)); + gen->core_relo_cnt++; +} + +static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insns) +{ + int insn; + + pr_debug("gen: emit_relo (%d): %s at %d %s\n", + relo->kind, relo->name, relo->insn_idx, relo->is_ld64 ? 
"ld64" : "call"); + insn = insns + sizeof(struct bpf_insn) * relo->insn_idx; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_8, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn)); + if (relo->is_ld64) { + if (relo->is_typeless) + emit_relo_ksym_typeless(gen, relo, insn); + else + emit_relo_ksym_btf(gen, relo, insn); + } else { + emit_relo_kfunc_btf(gen, relo, insn); + } +} + +static void emit_relos(struct bpf_gen *gen, int insns) +{ + int i; + + for (i = 0; i < gen->relo_cnt; i++) + emit_relo(gen, gen->relos + i, insns); +} + +static void cleanup_core_relo(struct bpf_gen *gen) +{ + if (!gen->core_relo_cnt) + return; + free(gen->core_relos); + gen->core_relo_cnt = 0; + gen->core_relos = NULL; +} + +static void cleanup_relos(struct bpf_gen *gen, int insns) +{ + struct ksym_desc *kdesc; + int i, insn; + + for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; + /* only close fds for typed ksyms and kfuncs */ + if (kdesc->is_ld64 && !kdesc->typeless) { + /* close fd recorded in insn[insn_idx + 1].imm */ + insn = kdesc->insn; + insn += sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm); + emit_sys_close_blob(gen, insn); + } else if (!kdesc->is_ld64) { + emit_sys_close_blob(gen, blob_fd_array_off(gen, kdesc->off)); + if (kdesc->off < MAX_FD_ARRAY_SZ) + gen->nr_fd_array--; + } + } + if (gen->nr_ksyms) { + free(gen->ksyms); + gen->nr_ksyms = 0; + gen->ksyms = NULL; + } + if (gen->relo_cnt) { + free(gen->relos); + gen->relo_cnt = 0; + gen->relos = NULL; + } + cleanup_core_relo(gen); +} + +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx) +{ + int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos; + int attr_size = offsetofend(union bpf_attr, core_relo_rec_size); + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n", + prog_type, insn_cnt, prog_idx); + /* add license string to blob of bytes */ + license_off = add_data(gen, license, strlen(license) + 1); + /* add insns to blob of bytes */ + insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn)); + + attr.prog_type = prog_type; + attr.expected_attach_type = load_attr->expected_attach_type; + attr.attach_btf_id = load_attr->attach_btf_id; + attr.prog_ifindex = load_attr->prog_ifindex; + attr.kern_version = 0; + attr.insn_cnt = (__u32)insn_cnt; + attr.prog_flags = load_attr->prog_flags; + + attr.func_info_rec_size = load_attr->func_info_rec_size; + attr.func_info_cnt = load_attr->func_info_cnt; + func_info = add_data(gen, load_attr->func_info, + attr.func_info_cnt * attr.func_info_rec_size); + + attr.line_info_rec_size = load_attr->line_info_rec_size; + attr.line_info_cnt = load_attr->line_info_cnt; + line_info = add_data(gen, load_attr->line_info, + attr.line_info_cnt * attr.line_info_rec_size); + + attr.core_relo_rec_size = sizeof(struct bpf_core_relo); + attr.core_relo_cnt = gen->core_relo_cnt; + core_relos = add_data(gen, gen->core_relos, + attr.core_relo_cnt * attr.core_relo_rec_size); + + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + prog_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with a pointer to license */ + emit_rel_store(gen, attr_field(prog_load_attr, license), license_off); + + /* populate union bpf_attr with a pointer to instructions */ + emit_rel_store(gen, attr_field(prog_load_attr, insns), insns_off); + + /* 
populate union bpf_attr with a pointer to func_info */ + emit_rel_store(gen, attr_field(prog_load_attr, func_info), func_info); + + /* populate union bpf_attr with a pointer to line_info */ + emit_rel_store(gen, attr_field(prog_load_attr, line_info), line_info); + + /* populate union bpf_attr with a pointer to core_relos */ + emit_rel_store(gen, attr_field(prog_load_attr, core_relos), core_relos); + + /* populate union bpf_attr fd_array with a pointer to data where map_fds are saved */ + emit_rel_store(gen, attr_field(prog_load_attr, fd_array), gen->fd_array); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(prog_load_attr, log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(prog_load_attr, prog_btf_fd), 4, + stack_off(btf_fd)); + if (gen->attach_kind) { + emit_find_attach_target(gen); + /* populate union bpf_attr with btf_id and btf_obj_fd found by helper */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, prog_load_attr)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_id))); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_obj_fd))); + } + emit_relos(gen, insns_off); + /* emit PROG_LOAD command */ + emit_sys_bpf(gen, BPF_PROG_LOAD, prog_load_attr, attr_size); + debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt); + /* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */ + cleanup_relos(gen, insns_off); + if (gen->attach_kind) { + emit_sys_close_blob(gen, + attr_field(prog_load_attr, attach_btf_obj_fd)); + gen->attach_kind = 0; + } + emit_check_err(gen); + /* remember prog_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(prog_fd[gen->nr_progs]))); + gen->nr_progs++; +} + +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue, + __u32 value_size) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, value, key; + union bpf_attr attr; + int zero = 0; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_update_elem: idx %d\n", map_idx); + + value = add_data(gen, pvalue, value_size); + key = add_data(gen, &zero, sizeof(zero)); + + /* if (map_desc[map_idx].initial_value) { + * if (ctx->flags & BPF_SKEL_KERNEL) + * bpf_probe_read_kernel(value, value_size, initial_value); + * else + * bpf_copy_from_user(value, value_size, initial_value); + * } + */ + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, initial_value))); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, value)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, + offsetof(struct bpf_loader_ctx, flags))); + emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + emit(gen, 
BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), value); + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "update_elem idx %d value_size %d", map_idx, value_size); + emit_check_err(gen); +} + +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slot, + int inner_map_idx) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, key; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n", + outer_map_idx, slot, inner_map_idx); + + key = add_data(gen, &slot, sizeof(slot)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, outer_map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), + blob_fd_array_off(gen, inner_map_idx)); + + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "populate_outer_map outer %d key %d inner %d", + outer_map_idx, slot, inner_map_idx); + emit_check_err(gen); +} + +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_fd); + int map_freeze_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_freeze: idx %d\n", map_idx); + map_freeze_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_freeze_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + /* emit MAP_FREEZE command */ + emit_sys_bpf(gen, BPF_MAP_FREEZE, map_freeze_attr, attr_size); + debug_ret(gen, "map_freeze"); + emit_check_err(gen); +} diff --git a/third_party/bpftool/libbpf/src/hashmap.c b/third_party/bpftool/libbpf/src/hashmap.c new file mode 100644 index 0000000..140ee40 --- /dev/null +++ b/third_party/bpftool/libbpf/src/hashmap.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Generic non-thread safe hash map implementation. 
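+ *
+ * Collisions are handled with per-bucket singly linked chains; the bucket
+ * array doubles in size once the table is more than ~75% full (see
+ * hashmap_needs_to_grow() below).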
+ * + * Copyright (c) 2019 Facebook + */ +#include +#include +#include +#include +#include +#include "hashmap.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +/* start with 4 buckets */ +#define HASHMAP_MIN_CAP_BITS 2 + +static void hashmap_add_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + entry->next = *pprev; + *pprev = entry; +} + +static void hashmap_del_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + *pprev = entry->next; + entry->next = NULL; +} + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx) +{ + map->hash_fn = hash_fn; + map->equal_fn = equal_fn; + map->ctx = ctx; + + map->buckets = NULL; + map->cap = 0; + map->cap_bits = 0; + map->sz = 0; +} + +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx) +{ + struct hashmap *map = malloc(sizeof(struct hashmap)); + + if (!map) + return ERR_PTR(-ENOMEM); + hashmap__init(map, hash_fn, equal_fn, ctx); + return map; +} + +void hashmap__clear(struct hashmap *map) +{ + struct hashmap_entry *cur, *tmp; + size_t bkt; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + free(cur); + } + free(map->buckets); + map->buckets = NULL; + map->cap = map->cap_bits = map->sz = 0; +} + +void hashmap__free(struct hashmap *map) +{ + if (IS_ERR_OR_NULL(map)) + return; + + hashmap__clear(map); + free(map); +} + +size_t hashmap__size(const struct hashmap *map) +{ + return map->sz; +} + +size_t hashmap__capacity(const struct hashmap *map) +{ + return map->cap; +} + +static bool hashmap_needs_to_grow(struct hashmap *map) +{ + /* grow if empty or more than 75% filled */ + return (map->cap == 0) || ((map->sz + 1) * 4 / 3 > map->cap); +} + +static int hashmap_grow(struct hashmap *map) +{ + struct hashmap_entry **new_buckets; + struct hashmap_entry *cur, *tmp; + size_t new_cap_bits, new_cap; + size_t h, bkt; + + new_cap_bits = map->cap_bits + 1; + if (new_cap_bits < HASHMAP_MIN_CAP_BITS) + new_cap_bits = HASHMAP_MIN_CAP_BITS; + + new_cap = 1UL << new_cap_bits; + new_buckets = calloc(new_cap, sizeof(new_buckets[0])); + if (!new_buckets) + return -ENOMEM; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + h = hash_bits(map->hash_fn(cur->key, map->ctx), new_cap_bits); + hashmap_add_entry(&new_buckets[h], cur); + } + + map->cap = new_cap; + map->cap_bits = new_cap_bits; + free(map->buckets); + map->buckets = new_buckets; + + return 0; +} + +static bool hashmap_find_entry(const struct hashmap *map, + const long key, size_t hash, + struct hashmap_entry ***pprev, + struct hashmap_entry **entry) +{ + struct hashmap_entry *cur, **prev_ptr; + + if (!map->buckets) + return false; + + for (prev_ptr = &map->buckets[hash], cur = *prev_ptr; + cur; + prev_ptr = &cur->next, cur = cur->next) { + if (map->equal_fn(cur->key, key, map->ctx)) { + if (pprev) + *pprev = prev_ptr; + *entry = cur; + return true; + } + } + + return false; +} + +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value) +{ + struct hashmap_entry *entry; + size_t h; + int err; + + if (old_key) + *old_key = 0; + if (old_value) + *old_value = 0; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (strategy != HASHMAP_APPEND && + hashmap_find_entry(map, key, h, NULL, &entry)) { + if (old_key) + *old_key 
= entry->key; + if (old_value) + *old_value = entry->value; + + if (strategy == HASHMAP_SET || strategy == HASHMAP_UPDATE) { + entry->key = key; + entry->value = value; + return 0; + } else if (strategy == HASHMAP_ADD) { + return -EEXIST; + } + } + + if (strategy == HASHMAP_UPDATE) + return -ENOENT; + + if (hashmap_needs_to_grow(map)) { + err = hashmap_grow(map); + if (err) + return err; + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + } + + entry = malloc(sizeof(struct hashmap_entry)); + if (!entry) + return -ENOMEM; + + entry->key = key; + entry->value = value; + hashmap_add_entry(&map->buckets[h], entry); + map->sz++; + + return 0; +} + +bool hashmap_find(const struct hashmap *map, long key, long *value) +{ + struct hashmap_entry *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, NULL, &entry)) + return false; + + if (value) + *value = entry->value; + return true; +} + +bool hashmap_delete(struct hashmap *map, long key, + long *old_key, long *old_value) +{ + struct hashmap_entry **pprev, *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, &pprev, &entry)) + return false; + + if (old_key) + *old_key = entry->key; + if (old_value) + *old_value = entry->value; + + hashmap_del_entry(pprev, entry); + free(entry); + map->sz--; + + return true; +} diff --git a/third_party/bpftool/libbpf/src/hashmap.h b/third_party/bpftool/libbpf/src/hashmap.h new file mode 100644 index 0000000..c12f832 --- /dev/null +++ b/third_party/bpftool/libbpf/src/hashmap.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Generic non-thread safe hash map implementation. + * + * Copyright (c) 2019 Facebook + */ +#ifndef __LIBBPF_HASHMAP_H +#define __LIBBPF_HASHMAP_H + +#include +#include +#include + +static inline size_t hash_bits(size_t h, int bits) +{ + /* shuffle bits and return requested number of upper bits */ + if (bits == 0) + return 0; + +#if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) + /* LP64 case */ + return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); +#elif (__SIZEOF_SIZE_T__ <= __SIZEOF_LONG__) + return (h * 2654435769lu) >> (__SIZEOF_LONG__ * 8 - bits); +#else +# error "Unsupported size_t size" +#endif +} + +/* generic C-string hashing function */ +static inline size_t str_hash(const char *s) +{ + size_t h = 0; + + while (*s) { + h = h * 31 + *s; + s++; + } + return h; +} + +typedef size_t (*hashmap_hash_fn)(long key, void *ctx); +typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx); + +/* + * Hashmap interface is polymorphic, keys and values could be either + * long-sized integers or pointers, this is achieved as follows: + * - interface functions that operate on keys and values are hidden + * behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert; + * - these auxiliary macros cast the key and value parameters as + * long or long *, so the user does not have to specify the casts explicitly; + * - for pointer parameters (e.g. old_key) the size of the pointed + * type is verified by hashmap_cast_ptr using _Static_assert; + * - when iterating using hashmap__for_each_* forms + * hasmap_entry->key should be used for integer keys and + * hasmap_entry->pkey should be used for pointer keys, + * same goes for values. 
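+ *
+ * A minimal usage sketch with long-sized integer keys and values (my_hash and
+ * my_eq are hypothetical, user-supplied callbacks):
+ *
+ *   size_t my_hash(long key, void *ctx) { return key; }
+ *   bool my_eq(long a, long b, void *ctx) { return a == b; }
+ *
+ *   struct hashmap *m = hashmap__new(my_hash, my_eq, NULL);
+ *   long old_val;
+ *   hashmap__add(m, 1, 100);                 // -EEXIST if key 1 already exists
+ *   hashmap__set(m, 1, 200, NULL, &old_val); // old_val == 100
+ *   hashmap__free(m);                        // error handling omitted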
+ */ +struct hashmap_entry { + union { + long key; + const void *pkey; + }; + union { + long value; + void *pvalue; + }; + struct hashmap_entry *next; +}; + +struct hashmap { + hashmap_hash_fn hash_fn; + hashmap_equal_fn equal_fn; + void *ctx; + + struct hashmap_entry **buckets; + size_t cap; + size_t cap_bits; + size_t sz; +}; + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx); +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx); +void hashmap__clear(struct hashmap *map); +void hashmap__free(struct hashmap *map); + +size_t hashmap__size(const struct hashmap *map); +size_t hashmap__capacity(const struct hashmap *map); + +/* + * Hashmap insertion strategy: + * - HASHMAP_ADD - only add key/value if key doesn't exist yet; + * - HASHMAP_SET - add key/value pair if key doesn't exist yet; otherwise, + * update value; + * - HASHMAP_UPDATE - update value, if key already exists; otherwise, do + * nothing and return -ENOENT; + * - HASHMAP_APPEND - always add key/value pair, even if key already exists. + * This turns hashmap into a multimap by allowing multiple values to be + * associated with the same key. Most useful read API for such hashmap is + * hashmap__for_each_key_entry() iteration. If hashmap__find() is still + * used, it will return last inserted key/value entry (first in a bucket + * chain). + */ +enum hashmap_insert_strategy { + HASHMAP_ADD, + HASHMAP_SET, + HASHMAP_UPDATE, + HASHMAP_APPEND, +}; + +#define hashmap_cast_ptr(p) ({ \ + _Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) || \ + sizeof(*(p)) == sizeof(long), \ + #p " pointee should be a long-sized integer or a pointer"); \ + (long *)(p); \ +}) + +/* + * hashmap__insert() adds key/value entry w/ various semantics, depending on + * provided strategy value. If a given key/value pair replaced already + * existing key/value pair, both old key and old value will be returned + * through old_key and old_value to allow calling code do proper memory + * management. 
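+ *
+ * Returns 0 on success, -EEXIST when HASHMAP_ADD finds an existing key,
+ * -ENOENT when HASHMAP_UPDATE does not, and -ENOMEM on allocation failure.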
+ */ +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value); + +#define hashmap__insert(map, key, value, strategy, old_key, old_value) \ + hashmap_insert((map), (long)(key), (long)(value), (strategy), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +#define hashmap__add(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL) + +#define hashmap__set(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value)) + +#define hashmap__update(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value)) + +#define hashmap__append(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL) + +bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value); + +#define hashmap__delete(map, key, old_key, old_value) \ + hashmap_delete((map), (long)(key), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +bool hashmap_find(const struct hashmap *map, long key, long *value); + +#define hashmap__find(map, key, value) \ + hashmap_find((map), (long)(key), hashmap_cast_ptr(value)) + +/* + * hashmap__for_each_entry - iterate over all entries in hashmap + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry(map, cur, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; cur; cur = cur->next) + +/* + * hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe + * against removals + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @tmp: struct hashmap_entry * used as a temporary next cursor storage + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry_safe(map, cur, tmp, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; \ + cur && ({tmp = cur->next; true; }); \ + cur = tmp) + +/* + * hashmap__for_each_key_entry - iterate over entries associated with given key + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @key: key to iterate entries for + */ +#define hashmap__for_each_key_entry(map, cur, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur; \ + cur = cur->next) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur && ({ tmp = cur->next; true; }); \ + cur = tmp) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#endif /* __LIBBPF_HASHMAP_H */ diff --git a/third_party/bpftool/libbpf/src/libbpf.c b/third_party/bpftool/libbpf/src/libbpf.c new file mode 100644 index 0000000..63311a7 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf.c @@ -0,0 +1,13129 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. + * Copyright (C) 2019 Isovalent, Inc. 
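+ *
+ * (Beyond ELF parsing, this file also covers BTF handling, CO-RE and extern
+ * relocations, map creation and program loading/attachment for bpf_object.)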
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" +#include "zip.h" + +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + +#define BPF_INSN_SZ (sizeof(struct bpf_insn)) + +/* vsprintf() in __base_pr() uses nonliteral format string. It may break + * compilation if user enables corresponding warning. Disable it explicitly. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + +#define __printf(a, b) __attribute__((format(printf, a, b))) + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); + +static const char * const attach_type_name[] = { + [BPF_CGROUP_INET_INGRESS] = "cgroup_inet_ingress", + [BPF_CGROUP_INET_EGRESS] = "cgroup_inet_egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "cgroup_inet_sock_create", + [BPF_CGROUP_INET_SOCK_RELEASE] = "cgroup_inet_sock_release", + [BPF_CGROUP_SOCK_OPS] = "cgroup_sock_ops", + [BPF_CGROUP_DEVICE] = "cgroup_device", + [BPF_CGROUP_INET4_BIND] = "cgroup_inet4_bind", + [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", + [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", + [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", + [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", + [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", + [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", + [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", + [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", + [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "trace_raw_tp", + [BPF_TRACE_FENTRY] = "trace_fentry", + [BPF_TRACE_FEXIT] = "trace_fexit", + [BPF_MODIFY_RETURN] = "modify_return", + [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", + [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", + [BPF_PERF_EVENT] = "perf_event", + [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", + [BPF_STRUCT_OPS] = "struct_ops", + [BPF_NETFILTER] = "netfilter", +}; + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] 
= "cgroup", + [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_LINK_TYPE_NETFILTER] = "netfilter", +}; + +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", + [BPF_MAP_TYPE_QUEUE] = "queue", + [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", + [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", + [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", +}; + +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", + [BPF_PROG_TYPE_SK_MSG] = "sk_msg", + [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", + [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", + [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", + [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", + [BPF_PROG_TYPE_NETFILTER] = "netfilter", +}; + +static int __base_pr(enum libbpf_print_level level, const char *format, + va_list args) +{ + if (level == LIBBPF_DEBUG) + return 0; + + return vfprintf(stderr, format, args); +} + +static libbpf_print_fn_t 
__libbpf_pr = __base_pr; + +libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn) +{ + libbpf_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__libbpf_pr, fn, __ATOMIC_RELAXED); + + return old_print_fn; +} + +__printf(2, 3) +void libbpf_print(enum libbpf_print_level level, const char *format, ...) +{ + va_list args; + int old_errno; + libbpf_print_fn_t print_fn; + + print_fn = __atomic_load_n(&__libbpf_pr, __ATOMIC_RELAXED); + if (!print_fn) + return; + + old_errno = errno; + + va_start(args, format); + __libbpf_pr(level, format, args); + va_end(args); + + errno = old_errno; +} + +static void pr_perm_msg(int err) +{ + struct rlimit limit; + char buf[100]; + + if (err != -EPERM || geteuid() != 0) + return; + + err = getrlimit(RLIMIT_MEMLOCK, &limit); + if (err) + return; + + if (limit.rlim_cur == RLIM_INFINITY) + return; + + if (limit.rlim_cur < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", (size_t)limit.rlim_cur); + else if (limit.rlim_cur < 1024*1024) + snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); + else + snprintf(buf, sizeof(buf), "%.1f MiB", (double)limit.rlim_cur / (1024*1024)); + + pr_warn("permission error while running as root; try raising 'ulimit -l'? current value: %s\n", + buf); +} + +#define STRERR_BUFSIZE 128 + +/* Copied from tools/perf/util/util.h */ +#ifndef zfree +# define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) +#endif + +#ifndef zclose +# define zclose(fd) ({ \ + int ___err = 0; \ + if ((fd) >= 0) \ + ___err = close((fd)); \ + fd = -1; \ + ___err; }) +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int libbpf_set_strict_mode(enum libbpf_strict_mode mode) +{ + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ + return 0; +} + +__u32 libbpf_major_version(void) +{ + return LIBBPF_MAJOR_VERSION; +} + +__u32 libbpf_minor_version(void) +{ + return LIBBPF_MINOR_VERSION; +} + +const char *libbpf_version_string(void) +{ +#define __S(X) #X +#define _S(X) __S(X) + return "v" _S(LIBBPF_MAJOR_VERSION) "." _S(LIBBPF_MINOR_VERSION); +#undef _S +#undef __S +} + +enum reloc_type { + RELO_LD64, + RELO_CALL, + RELO_DATA, + RELO_EXTERN_LD64, + RELO_EXTERN_CALL, + RELO_SUBPROG_ADDR, + RELO_CORE, +}; + +struct reloc_desc { + enum reloc_type type; + int insn_idx; + union { + const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ + struct { + int map_idx; + int sym_off; + int ext_idx; + }; + }; +}; + +/* stored as sec_def->cookie for all libbpf-supported SEC()s */ +enum sec_def_flags { + SEC_NONE = 0, + /* expected_attach_type is optional, if kernel doesn't support that */ + SEC_EXP_ATTACH_OPT = 1, + /* legacy, only used by libbpf_get_type_names() and + * libbpf_attach_type_by_name(), not used by libbpf itself at all. + * This used to be associated with cgroup (and few other) BPF programs + * that were attachable through BPF_PROG_ATTACH command. Pretty + * meaningless nowadays, though. 
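As an illustrative aside, a minimal sketch of how an application overrides the default logger through the libbpf_set_print() hook defined above (assumes the standard installed header path <bpf/libbpf.h>; this is caller code, not part of libbpf itself):

#include <stdarg.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static int verbose_print(enum libbpf_print_level level, const char *fmt, va_list args)
{
	/* unlike __base_pr(), do not drop LIBBPF_DEBUG messages */
	return vfprintf(stderr, fmt, args);
}

int main(void)
{
	/* the previous callback is returned so it can be restored later */
	libbpf_print_fn_t prev = libbpf_set_print(verbose_print);

	(void)prev;
	/* ... open/load/attach BPF objects; libbpf logs now flow through verbose_print() ... */
	return 0;
}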
+ */ + SEC_ATTACHABLE = 2, + SEC_ATTACHABLE_OPT = SEC_ATTACHABLE | SEC_EXP_ATTACH_OPT, + /* attachment target is specified through BTF ID in either kernel or + * other BPF program's BTF object + */ + SEC_ATTACH_BTF = 4, + /* BPF program type allows sleeping/blocking in kernel */ + SEC_SLEEPABLE = 8, + /* BPF program support non-linear XDP buffer */ + SEC_XDP_FRAGS = 16, +}; + +struct bpf_sec_def { + char *sec; + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + long cookie; + int handler_id; + + libbpf_prog_setup_fn_t prog_setup_fn; + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + libbpf_prog_attach_fn_t prog_attach_fn; +}; + +/* + * bpf_prog should be a better name but it has been used in + * linux/filter.h. + */ +struct bpf_program { + char *name; + char *sec_name; + size_t sec_idx; + const struct bpf_sec_def *sec_def; + /* this program's instruction offset (in number of instructions) + * within its containing ELF section + */ + size_t sec_insn_off; + /* number of original instructions in ELF section belonging to this + * program, not taking into account subprogram instructions possible + * appended later during relocation + */ + size_t sec_insn_cnt; + /* Offset (in number of instructions) of the start of instruction + * belonging to this BPF program within its containing main BPF + * program. For the entry-point (main) BPF program, this is always + * zero. For a sub-program, this gets reset before each of main BPF + * programs are processed and relocated and is used to determined + * whether sub-program was already appended to the main program, and + * if yes, at which instruction offset. + */ + size_t sub_insn_off; + + /* instructions that belong to BPF program; insns[0] is located at + * sec_insn_off instruction within its ELF section in ELF file, so + * when mapping ELF file instruction index to the local instruction, + * one needs to subtract sec_insn_off; and vice versa. + */ + struct bpf_insn *insns; + /* actual number of instruction in this BPF program's image; for + * entry-point BPF programs this includes the size of main program + * itself plus all the used sub-programs, appended at the end + */ + size_t insns_cnt; + + struct reloc_desc *reloc_desc; + int nr_reloc; + + /* BPF verifier log settings */ + char *log_buf; + size_t log_size; + __u32 log_level; + + struct bpf_object *obj; + + int fd; + bool autoload; + bool autoattach; + bool mark_btf_static; + enum bpf_prog_type type; + enum bpf_attach_type expected_attach_type; + + int prog_ifindex; + __u32 attach_btf_obj_fd; + __u32 attach_btf_id; + __u32 attach_prog_fd; + + void *func_info; + __u32 func_info_rec_size; + __u32 func_info_cnt; + + void *line_info; + __u32 line_info_rec_size; + __u32 line_info_cnt; + __u32 prog_flags; +}; + +struct bpf_struct_ops { + const char *tname; + const struct btf_type *type; + struct bpf_program **progs; + __u32 *kern_func_off; + /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */ + void *data; + /* e.g. struct bpf_struct_ops_tcp_congestion_ops in + * btf_vmlinux's format. + * struct bpf_struct_ops_tcp_congestion_ops { + * [... some other kernel fields ...] + * struct tcp_congestion_ops data; + * } + * kern_vdata-size == sizeof(struct bpf_struct_ops_tcp_congestion_ops) + * bpf_map__init_kern_struct_ops() will populate the "kern_vdata" + * from "data". 
+ */ + void *kern_vdata; + __u32 type_id; +}; + +#define DATA_SEC ".data" +#define BSS_SEC ".bss" +#define RODATA_SEC ".rodata" +#define KCONFIG_SEC ".kconfig" +#define KSYMS_SEC ".ksyms" +#define STRUCT_OPS_SEC ".struct_ops" +#define STRUCT_OPS_LINK_SEC ".struct_ops.link" + +enum libbpf_map_type { + LIBBPF_MAP_UNSPEC, + LIBBPF_MAP_DATA, + LIBBPF_MAP_BSS, + LIBBPF_MAP_RODATA, + LIBBPF_MAP_KCONFIG, +}; + +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + +struct bpf_map { + struct bpf_object *obj; + char *name; + /* real_name is defined for special internal maps (.rodata*, + * .data*, .bss, .kconfig) and preserves their original ELF section + * name. This is important to be able to find corresponding BTF + * DATASEC information. + */ + char *real_name; + int fd; + int sec_idx; + size_t sec_offset; + int map_ifindex; + int inner_map_fd; + struct bpf_map_def def; + __u32 numa_node; + __u32 btf_var_idx; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + enum libbpf_map_type libbpf_type; + void *mmaped; + struct bpf_struct_ops *st_ops; + struct bpf_map *inner_map; + void **init_slots; + int init_slots_sz; + char *pin_path; + bool pinned; + bool reused; + bool autocreate; + __u64 map_extra; +}; + +enum extern_type { + EXT_UNKNOWN, + EXT_KCFG, + EXT_KSYM, +}; + +enum kcfg_type { + KCFG_UNKNOWN, + KCFG_CHAR, + KCFG_BOOL, + KCFG_INT, + KCFG_TRISTATE, + KCFG_CHAR_ARR, +}; + +struct extern_desc { + enum extern_type type; + int sym_idx; + int btf_id; + int sec_btf_id; + const char *name; + bool is_set; + bool is_weak; + union { + struct { + enum kcfg_type type; + int sz; + int align; + int data_off; + bool is_signed; + } kcfg; + struct { + unsigned long long addr; + + /* target btf_id of the corresponding kernel var. */ + int kernel_btf_obj_fd; + int kernel_btf_id; + + /* local btf_id of the ksym extern's type. */ + __u32 type_id; + /* BTF fd index to be patched in for insn->off, this is + * 0 for vmlinux BTF, index in obj->fd_array for module + * BTF + */ + __s16 btf_fd_idx; + } ksym; + }; +}; + +struct module_btf { + struct btf *btf; + char *name; + __u32 id; + int fd; + int fd_array_idx; +}; + +enum sec_type { + SEC_UNUSED = 0, + SEC_RELO, + SEC_BSS, + SEC_DATA, + SEC_RODATA, +}; + +struct elf_sec_desc { + enum sec_type sec_type; + Elf64_Shdr *shdr; + Elf_Data *data; +}; + +struct elf_state { + int fd; + const void *obj_buf; + size_t obj_buf_sz; + Elf *elf; + Elf64_Ehdr *ehdr; + Elf_Data *symbols; + Elf_Data *st_ops_data; + Elf_Data *st_ops_link_data; + size_t shstrndx; /* section index for section name strings */ + size_t strtabidx; + struct elf_sec_desc *secs; + size_t sec_cnt; + int btf_maps_shndx; + __u32 btf_maps_sec_btf_id; + int text_shndx; + int symbols_shndx; + int st_ops_shndx; + int st_ops_link_shndx; +}; + +struct usdt_manager; + +struct bpf_object { + char name[BPF_OBJ_NAME_LEN]; + char license[64]; + __u32 kern_version; + + struct bpf_program *programs; + size_t nr_programs; + struct bpf_map *maps; + size_t nr_maps; + size_t maps_cap; + + char *kconfig; + struct extern_desc *externs; + int nr_extern; + int kconfig_map_idx; + + bool loaded; + bool has_subcalls; + bool has_rodata; + + struct bpf_gen *gen_loader; + + /* Information when doing ELF related work. 
Only valid if efile.elf is not NULL */ + struct elf_state efile; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* Parse and load BTF vmlinux if any of the programs in the object need + * it at load time. + */ + struct btf *btf_vmlinux; + /* Path to the custom BTF to be used for BPF CO-RE relocations as an + * override for vmlinux BTF. + */ + char *btf_custom_path; + /* vmlinux BTF override for CO-RE relocations */ + struct btf *btf_vmlinux_override; + /* Lazily initialized kernel module BTFs */ + struct module_btf *btf_modules; + bool btf_modules_loaded; + size_t btf_module_cnt; + size_t btf_module_cap; + + /* optional log settings passed to BPF_BTF_LOAD and BPF_PROG_LOAD commands */ + char *log_buf; + size_t log_size; + __u32 log_level; + + int *fd_array; + size_t fd_array_cap; + size_t fd_array_cnt; + + struct usdt_manager *usdt_man; + + char path[]; +}; + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off); +static const char *elf_sec_str(const struct bpf_object *obj, size_t off); +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx); +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn); +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx); +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); + +void bpf_program__unload(struct bpf_program *prog) +{ + if (!prog) + return; + + zclose(prog->fd); + + zfree(&prog->func_info); + zfree(&prog->line_info); +} + +static void bpf_program__exit(struct bpf_program *prog) +{ + if (!prog) + return; + + bpf_program__unload(prog); + zfree(&prog->name); + zfree(&prog->sec_name); + zfree(&prog->insns); + zfree(&prog->reloc_desc); + + prog->nr_reloc = 0; + prog->insns_cnt = 0; + prog->sec_idx = -1; +} + +static bool insn_is_subprog_call(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == BPF_PSEUDO_CALL && + insn->dst_reg == 0 && + insn->off == 0; +} + +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + +static int +bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, + const char *name, size_t sec_idx, const char *sec_name, + size_t sec_off, void *insn_data, size_t insn_data_sz) +{ + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { + pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", + sec_name, name, sec_off, insn_data_sz); + return -EINVAL; + } + + memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + + prog->sec_idx = sec_idx; + prog->sec_insn_off = sec_off / BPF_INSN_SZ; + prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; + /* insns_cnt can later be increased by appending used subprograms */ + prog->insns_cnt = prog->sec_insn_cnt; + + prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; + + /* libbpf's convention for SEC("?abc...") is that it's just like + * SEC("abc...") but the corresponding bpf_program starts out with + * autoload set to false. + */ + if (sec_name[0] == '?') { + prog->autoload = false; + /* from now on forget there was ? 
in section name */ + sec_name++; + } else { + prog->autoload = true; + } + + prog->autoattach = true; + + /* inherit object's log_level */ + prog->log_level = obj->log_level; + + prog->sec_name = strdup(sec_name); + if (!prog->sec_name) + goto errout; + + prog->name = strdup(name); + if (!prog->name) + goto errout; + + prog->insns = malloc(insn_data_sz); + if (!prog->insns) + goto errout; + memcpy(prog->insns, insn_data, insn_data_sz); + + return 0; +errout: + pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); + bpf_program__exit(prog); + return -ENOMEM; +} + +static int +bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, + const char *sec_name, int sec_idx) +{ + Elf_Data *symbols = obj->efile.symbols; + struct bpf_program *prog, *progs; + void *data = sec_data->d_buf; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; + const char *name; + Elf64_Sym *sym; + + progs = obj->programs; + nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(Elf64_Sym); + + for (i = 0; i < nr_syms; i++) { + sym = elf_sym_by_idx(obj, i); + + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + continue; + + prog_sz = sym->st_size; + sec_off = sym->st_value; + + name = elf_sym_str(obj, sym->st_name); + if (!name) { + pr_warn("sec '%s': failed to get symbol name for offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_off + prog_sz > sec_sz) { + pr_warn("sec '%s': program at offset %zu crosses section boundary\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_idx != obj->efile.text_shndx && ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { + pr_warn("sec '%s': program '%s' is static and not supported\n", sec_name, name); + return -ENOTSUP; + } + + pr_debug("sec '%s': found program '%s' at insn offset %zu (%zu bytes), code size %zu insns (%zu bytes)\n", + sec_name, name, sec_off / BPF_INSN_SZ, sec_off, prog_sz / BPF_INSN_SZ, prog_sz); + + progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(*progs)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). 
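Returning to the SEC("?...") convention handled in bpf_object__init_prog() above, a short usage sketch (the skeleton field name is hypothetical; not part of libbpf itself):

/* BPF side: '?' marks the program as optional, so it starts with autoload disabled */
SEC("?kprobe/do_unlinkat")
int handle_optional(void *ctx)
{
	return 0;
}

/* user-space side, after open but before load: opt the program back in */
bpf_program__set_autoload(skel->progs.handle_optional, true);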
+ */ + pr_warn("sec '%s': failed to alloc memory for new program '%s'\n", + sec_name, name); + return -ENOMEM; + } + obj->programs = progs; + + prog = &progs[nr_progs]; + + err = bpf_object__init_prog(obj, prog, name, sec_idx, sec_name, + sec_off, data + sec_off, prog_sz); + if (err) + return err; + + /* if function is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF FUNC + * as static to enable more permissive BPF verification mode + * with more outside context available to BPF verifier + */ + if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL + && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) + prog->mark_btf_static = true; + + nr_progs++; + obj->nr_programs = nr_progs; + } + + return 0; +} + +static const struct btf_member * +find_member_by_offset(const struct btf_type *t, __u32 bit_offset) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (btf_member_bit_offset(t, i) == bit_offset) + return m; + } + + return NULL; +} + +static const struct btf_member * +find_member_by_name(const struct btf *btf, const struct btf_type *t, + const char *name) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (!strcmp(btf__name_by_offset(btf, m->name_off), name)) + return m; + } + + return NULL; +} + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind); + +static int +find_struct_ops_kern_types(const struct btf *btf, const char *tname, + const struct btf_type **type, __u32 *type_id, + const struct btf_type **vtype, __u32 *vtype_id, + const struct btf_member **data_member) +{ + const struct btf_type *kern_type, *kern_vtype; + const struct btf_member *kern_data_member; + __s32 kern_vtype_id, kern_type_id; + __u32 i; + + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", + tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + + /* Find the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, + * find "struct bpf_struct_ops_tcp_congestion_ops" from the + * btf_vmlinux. + */ + kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, + tname, BTF_KIND_STRUCT); + if (kern_vtype_id < 0) { + pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", + STRUCT_OPS_VALUE_PREFIX, tname); + return kern_vtype_id; + } + kern_vtype = btf__type_by_id(btf, kern_vtype_id); + + /* Find "struct tcp_congestion_ops" from + * struct bpf_struct_ops_tcp_congestion_ops { + * [ ... 
] + * struct tcp_congestion_ops data; + * } + */ + kern_data_member = btf_members(kern_vtype); + for (i = 0; i < btf_vlen(kern_vtype); i++, kern_data_member++) { + if (kern_data_member->type == kern_type_id) + break; + } + if (i == btf_vlen(kern_vtype)) { + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", + tname, STRUCT_OPS_VALUE_PREFIX, tname); + return -EINVAL; + } + + *type = kern_type; + *type_id = kern_type_id; + *vtype = kern_vtype; + *vtype_id = kern_vtype_id; + *data_member = kern_data_member; + + return 0; +} + +static bool bpf_map__is_struct_ops(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_STRUCT_OPS; +} + +/* Init the map's fields that depend on kern_btf */ +static int bpf_map__init_kern_struct_ops(struct bpf_map *map, + const struct btf *btf, + const struct btf *kern_btf) +{ + const struct btf_member *member, *kern_member, *kern_data_member; + const struct btf_type *type, *kern_type, *kern_vtype; + __u32 i, kern_type_id, kern_vtype_id, kern_data_off; + struct bpf_struct_ops *st_ops; + void *data, *kern_data; + const char *tname; + int err; + + st_ops = map->st_ops; + type = st_ops->type; + tname = st_ops->tname; + err = find_struct_ops_kern_types(kern_btf, tname, + &kern_type, &kern_type_id, + &kern_vtype, &kern_vtype_id, + &kern_data_member); + if (err) + return err; + + pr_debug("struct_ops init_kern %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n", + map->name, st_ops->type_id, kern_type_id, kern_vtype_id); + + map->def.value_size = kern_vtype->size; + map->btf_vmlinux_value_type_id = kern_vtype_id; + + st_ops->kern_vdata = calloc(1, kern_vtype->size); + if (!st_ops->kern_vdata) + return -ENOMEM; + + data = st_ops->data; + kern_data_off = kern_data_member->offset / 8; + kern_data = st_ops->kern_vdata + kern_data_off; + + member = btf_members(type); + for (i = 0; i < btf_vlen(type); i++, member++) { + const struct btf_type *mtype, *kern_mtype; + __u32 mtype_id, kern_mtype_id; + void *mdata, *kern_mdata; + __s64 msize, kern_msize; + __u32 moff, kern_moff; + __u32 kern_member_idx; + const char *mname; + + mname = btf__name_by_offset(btf, member->name_off); + kern_member = find_member_by_name(kern_btf, kern_type, mname); + if (!kern_member) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + kern_member_idx = kern_member - btf_members(kern_type); + if (btf_member_bitfield_size(type, i) || + btf_member_bitfield_size(kern_type, kern_member_idx)) { + pr_warn("struct_ops init_kern %s: bitfield %s is not supported\n", + map->name, mname); + return -ENOTSUP; + } + + moff = member->offset / 8; + kern_moff = kern_member->offset / 8; + + mdata = data + moff; + kern_mdata = kern_data + kern_moff; + + mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_member->type, + &kern_mtype_id); + if (BTF_INFO_KIND(mtype->info) != + BTF_INFO_KIND(kern_mtype->info)) { + pr_warn("struct_ops init_kern %s: Unmatched member type %s %u != %u(kernel)\n", + map->name, mname, BTF_INFO_KIND(mtype->info), + BTF_INFO_KIND(kern_mtype->info)); + return -ENOTSUP; + } + + if (btf_is_ptr(mtype)) { + struct bpf_program *prog; + + prog = st_ops->progs[i]; + if (!prog) + continue; + + kern_mtype = skip_mods_and_typedefs(kern_btf, + kern_mtype->type, + &kern_mtype_id); + + /* mtype->type must be a func_proto which was + * guaranteed in bpf_object__collect_st_ops_relos(), + * so only check kern_mtype for func_proto here. 
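For context, an abridged sketch (not part of libbpf) of the BPF-side declarations this struct_ops matching serves, following the kernel selftests' congestion-control pattern; vmlinux.h, bpf_helpers.h and bpf_tracing.h are assumed, and a real tcp_congestion_ops needs more callbacks than shown:

SEC("struct_ops/my_ssthresh")
__u32 BPF_PROG(my_ssthresh, struct sock *sk)
{
	return 2;	/* placeholder slow-start threshold */
}

SEC(".struct_ops")
struct tcp_congestion_ops my_cc = {
	.ssthresh = (void *)my_ssthresh,
	.name     = "my_cc",
};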
+ */ + if (!btf_is_func_proto(kern_mtype)) { + pr_warn("struct_ops init_kern %s: kernel member %s is not a func ptr\n", + map->name, mname); + return -ENOTSUP; + } + + prog->attach_btf_id = kern_type_id; + prog->expected_attach_type = kern_member_idx; + + st_ops->kern_func_off[i] = kern_data_off + kern_moff; + + pr_debug("struct_ops init_kern %s: func ptr %s is set to prog %s from data(+%u) to kern_data(+%u)\n", + map->name, mname, prog->name, moff, + kern_moff); + + continue; + } + + msize = btf__resolve_size(btf, mtype_id); + kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); + if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", + map->name, mname, (ssize_t)msize, + (ssize_t)kern_msize); + return -ENOTSUP; + } + + pr_debug("struct_ops init_kern %s: copy %s %u bytes from data(+%u) to kern_data(+%u)\n", + map->name, mname, (unsigned int)msize, + moff, kern_moff); + memcpy(kern_mdata, mdata, msize); + } + + return 0; +} + +static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + size_t i; + int err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!bpf_map__is_struct_ops(map)) + continue; + + err = bpf_map__init_kern_struct_ops(map, obj->btf, + obj->btf_vmlinux); + if (err) + return err; + } + + return 0; +} + +static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, + int shndx, Elf_Data *data, __u32 map_flags) +{ + const struct btf_type *type, *datasec; + const struct btf_var_secinfo *vsi; + struct bpf_struct_ops *st_ops; + const char *tname, *var_name; + __s32 type_id, datasec_id; + const struct btf *btf; + struct bpf_map *map; + __u32 i; + + if (shndx == -1) + return 0; + + btf = obj->btf; + datasec_id = btf__find_by_name_kind(btf, sec_name, + BTF_KIND_DATASEC); + if (datasec_id < 0) { + pr_warn("struct_ops init: DATASEC %s not found\n", + sec_name); + return -EINVAL; + } + + datasec = btf__type_by_id(btf, datasec_id); + vsi = btf_var_secinfos(datasec); + for (i = 0; i < btf_vlen(datasec); i++, vsi++) { + type = btf__type_by_id(obj->btf, vsi->type); + var_name = btf__name_by_offset(obj->btf, type->name_off); + + type_id = btf__resolve_type(obj->btf, vsi->type); + if (type_id < 0) { + pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", + vsi->type, sec_name); + return -EINVAL; + } + + type = btf__type_by_id(obj->btf, type_id); + tname = btf__name_by_offset(obj->btf, type->name_off); + if (!tname[0]) { + pr_warn("struct_ops init: anonymous type is not supported\n"); + return -ENOTSUP; + } + if (!btf_is_struct(type)) { + pr_warn("struct_ops init: %s is not a struct\n", tname); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->sec_idx = shndx; + map->sec_offset = vsi->offset; + map->name = strdup(var_name); + if (!map->name) + return -ENOMEM; + + map->def.type = BPF_MAP_TYPE_STRUCT_OPS; + map->def.key_size = sizeof(int); + map->def.value_size = type->size; + map->def.max_entries = 1; + map->def.map_flags = map_flags; + + map->st_ops = calloc(1, sizeof(*map->st_ops)); + if (!map->st_ops) + return -ENOMEM; + st_ops = map->st_ops; + st_ops->data = malloc(type->size); + st_ops->progs = calloc(btf_vlen(type), sizeof(*st_ops->progs)); + st_ops->kern_func_off = malloc(btf_vlen(type) * + sizeof(*st_ops->kern_func_off)); + if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) + return -ENOMEM; + + if (vsi->offset + type->size > 
data->d_size) { + pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", + var_name, sec_name); + return -EINVAL; + } + + memcpy(st_ops->data, + data->d_buf + vsi->offset, + type->size); + st_ops->tname = tname; + st_ops->type = type; + st_ops->type_id = type_id; + + pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", + tname, type_id, var_name, vsi->offset); + } + + return 0; +} + +static int bpf_object_init_struct_ops(struct bpf_object *obj) +{ + int err; + + err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx, + obj->efile.st_ops_data, 0); + err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC, + obj->efile.st_ops_link_shndx, + obj->efile.st_ops_link_data, + BPF_F_LINK); + return err; +} + +static struct bpf_object *bpf_object__new(const char *path, + const void *obj_buf, + size_t obj_buf_sz, + const char *obj_name) +{ + struct bpf_object *obj; + char *end; + + obj = calloc(1, sizeof(struct bpf_object) + strlen(path) + 1); + if (!obj) { + pr_warn("alloc memory failed for %s\n", path); + return ERR_PTR(-ENOMEM); + } + + strcpy(obj->path, path); + if (obj_name) { + libbpf_strlcpy(obj->name, obj_name, sizeof(obj->name)); + } else { + /* Using basename() GNU version which doesn't modify arg. */ + libbpf_strlcpy(obj->name, basename((void *)path), sizeof(obj->name)); + end = strchr(obj->name, '.'); + if (end) + *end = 0; + } + + obj->efile.fd = -1; + /* + * Caller of this function should also call + * bpf_object__elf_finish() after data collection to return + * obj_buf to user. If not, we should duplicate the buffer to + * avoid user freeing them before elf finish. + */ + obj->efile.obj_buf = obj_buf; + obj->efile.obj_buf_sz = obj_buf_sz; + obj->efile.btf_maps_shndx = -1; + obj->efile.st_ops_shndx = -1; + obj->efile.st_ops_link_shndx = -1; + obj->kconfig_map_idx = -1; + + obj->kern_version = get_kernel_version(); + obj->loaded = false; + + return obj; +} + +static void bpf_object__elf_finish(struct bpf_object *obj) +{ + if (!obj->efile.elf) + return; + + elf_end(obj->efile.elf); + obj->efile.elf = NULL; + obj->efile.symbols = NULL; + obj->efile.st_ops_data = NULL; + obj->efile.st_ops_link_data = NULL; + + zfree(&obj->efile.secs); + obj->efile.sec_cnt = 0; + zclose(obj->efile.fd); + obj->efile.obj_buf = NULL; + obj->efile.obj_buf_sz = 0; +} + +static int bpf_object__elf_init(struct bpf_object *obj) +{ + Elf64_Ehdr *ehdr; + int err = 0; + Elf *elf; + + if (obj->efile.elf) { + pr_warn("elf: init internal error\n"); + return -LIBBPF_ERRNO__LIBELF; + } + + if (obj->efile.obj_buf_sz > 0) { + /* obj_buf should have been validated by bpf_object__open_mem(). 
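A usage sketch for the in-memory path referenced here (caller code, not part of libbpf; obj_buf/obj_buf_sz stand for any embedded ELF image, which is how generated skeletons use this API):

struct bpf_object *obj;

obj = bpf_object__open_mem(obj_buf, obj_buf_sz, NULL);	/* NULL: default open opts */
if (!obj)
	return -errno;	/* libbpf 1.x sets errno on failure */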
*/ + elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); + } else { + obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); + if (obj->efile.fd < 0) { + char errmsg[STRERR_BUFSIZE], *cp; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("elf: failed to open %s: %s\n", obj->path, cp); + return err; + } + + elf = elf_begin(obj->efile.fd, ELF_C_READ_MMAP, NULL); + } + + if (!elf) { + pr_warn("elf: failed to open %s as ELF file: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__LIBELF; + goto errout; + } + + obj->efile.elf = elf; + + if (elf_kind(elf) != ELF_K_ELF) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a proper ELF object\n", obj->path); + goto errout; + } + + if (gelf_getclass(elf) != ELFCLASS64) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a 64-bit ELF object\n", obj->path); + goto errout; + } + + obj->efile.ehdr = ehdr = elf64_getehdr(elf); + if (!obj->efile.ehdr) { + pr_warn("elf: failed to get ELF header from %s: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + if (elf_getshdrstrndx(elf, &obj->efile.shstrndx)) { + pr_warn("elf: failed to get section names section index for %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* ELF is corrupted/truncated, avoid calling elf_strptr. */ + if (!elf_rawdata(elf_getscn(elf, obj->efile.shstrndx), NULL)) { + pr_warn("elf: failed to get section names strings from %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* Old LLVM set e_machine to EM_NONE */ + if (ehdr->e_type != ET_REL || (ehdr->e_machine && ehdr->e_machine != EM_BPF)) { + pr_warn("elf: %s is not a valid eBPF object file\n", obj->path); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + return 0; +errout: + bpf_object__elf_finish(obj); + return err; +} + +static int bpf_object__check_endianness(struct bpf_object *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + return 0; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + return 0; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + pr_warn("elf: endianness mismatch in %s.\n", obj->path); + return -LIBBPF_ERRNO__ENDIAN; +} + +static int +bpf_object__init_license(struct bpf_object *obj, void *data, size_t size) +{ + if (!data) { + pr_warn("invalid license section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + /* libbpf_strlcpy() only copies first N - 1 bytes, so size + 1 won't + * go over allowed ELF data section buffer + */ + libbpf_strlcpy(obj->license, data, min(size + 1, sizeof(obj->license))); + pr_debug("license of %s is %s\n", obj->path, obj->license); + return 0; +} + +static int +bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) +{ + __u32 kver; + + if (!data || size != sizeof(kver)) { + pr_warn("invalid kver section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + memcpy(&kver, data, sizeof(kver)); + obj->kern_version = kver; + pr_debug("kernel version of %s is %x\n", obj->path, obj->kern_version); + return 0; +} + +static bool bpf_map_type__is_map_in_map(enum bpf_map_type type) +{ + if (type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + type == BPF_MAP_TYPE_HASH_OF_MAPS) + return true; + return false; +} + +static int find_elf_sec_sz(const struct bpf_object *obj, const char *name, __u32 *size) +{ + Elf_Data *data; + Elf_Scn *scn; + + if (!name) + 
return -EINVAL; + + scn = elf_sec_by_name(obj, name); + data = elf_sec_data(obj, scn); + if (data) { + *size = data->d_size; + return 0; /* found it */ + } + + return -ENOENT; +} + +static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *name) +{ + Elf_Data *symbols = obj->efile.symbols; + const char *sname; + size_t si; + + for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, si); + + if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT) + continue; + + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && + ELF64_ST_BIND(sym->st_info) != STB_WEAK) + continue; + + sname = elf_sym_str(obj, sym->st_name); + if (!sname) { + pr_warn("failed to get sym name string for var %s\n", name); + return ERR_PTR(-EIO); + } + if (strcmp(name, sname) == 0) + return sym; + } + + return ERR_PTR(-ENOENT); +} + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int err; + + err = libbpf_ensure_mem((void **)&obj->maps, &obj->maps_cap, + sizeof(*obj->maps), obj->nr_maps + 1); + if (err) + return ERR_PTR(err); + + map = &obj->maps[obj->nr_maps++]; + map->obj = obj; + map->fd = -1; + map->inner_map_fd = -1; + map->autocreate = true; + + return map; +} + +static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = (size_t)roundup(value_sz, 8) * max_entries; + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + +static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) +{ + void *mmaped; + + if (!map->mmaped) + return -EINVAL; + + if (old_sz == new_sz) + return 0; + + mmaped = mmap(NULL, new_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmaped == MAP_FAILED) + return -errno; + + memcpy(mmaped, map->mmaped, min(old_sz, new_sz)); + munmap(map->mmaped, old_sz); + map->mmaped = mmaped; + return 0; +} + +static char *internal_map_name(struct bpf_object *obj, const char *real_name) +{ + char map_name[BPF_OBJ_NAME_LEN], *p; + int pfx_len, sfx_len = max((size_t)7, strlen(real_name)); + + /* This is one of the more confusing parts of libbpf for various + * reasons, some of which are historical. The original idea for naming + * internal names was to include as much of BPF object name prefix as + * possible, so that it can be distinguished from similar internal + * maps of a different BPF object. + * As an example, let's say we have bpf_object named 'my_object_name' + * and internal map corresponding to '.rodata' ELF section. The final + * map name advertised to user and to the kernel will be + * 'my_objec.rodata', taking first 8 characters of object name and + * entire 7 characters of '.rodata'. + * Somewhat confusingly, if internal map ELF section name is shorter + * than 7 characters, e.g., '.bss', we still reserve 7 characters + * for the suffix, even though we only have 4 actual characters, and + * resulting map will be called 'my_objec.bss', not even using all 15 + * characters allowed by the kernel. Oh well, at least the truncated + * object name is somewhat consistent in this case. But if the map + * name is '.kconfig', we'll still have entirety of '.kconfig' added + * (8 chars) and thus will be left with only first 7 characters of the + * object name ('my_obje'). Happy guessing, user, that the final map + * name will be "my_obje.kconfig". 
+ * Now, with libbpf starting to support arbitrarily named .rodata.* + * and .data.* data sections, it's possible that ELF section name is + * longer than allowed 15 chars, so we now need to be careful to take + * only up to 15 first characters of ELF name, taking no BPF object + * name characters at all. So '.rodata.abracadabra' will result in + * '.rodata.abracad' kernel and user-visible name. + * We need to keep this convoluted logic intact for .data, .bss and + * .rodata maps, but for new custom .data.custom and .rodata.custom + * maps we use their ELF names as is, not prepending bpf_object name + * in front. We still need to truncate them to 15 characters for the + * kernel. Full name can be recovered for such maps by using DATASEC + * BTF type associated with such map's value type, though. + */ + if (sfx_len >= BPF_OBJ_NAME_LEN) + sfx_len = BPF_OBJ_NAME_LEN - 1; + + /* if there are two or more dots in map name, it's a custom dot map */ + if (strchr(real_name + 1, '.') != NULL) + pfx_len = 0; + else + pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, strlen(obj->name)); + + snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, + sfx_len, real_name); + + /* sanitise map name to characters allowed by kernel */ + for (p = map_name; *p && p < map_name + sizeof(map_name); p++) + if (!isalnum(*p) && *p != '_' && *p != '.') + *p = '_'; + + return strdup(map_name); +} + +static int +map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map); + +/* Internal BPF map is mmap()'able only if at least one of corresponding + * DATASEC's VARs are to be exposed through BPF skeleton. I.e., it's a GLOBAL + * variable and it's not marked as __hidden (which turns it into, effectively, + * a STATIC variable). + */ +static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map) +{ + const struct btf_type *t, *vt; + struct btf_var_secinfo *vsi; + int i, n; + + if (!map->btf_value_type_id) + return false; + + t = btf__type_by_id(obj->btf, map->btf_value_type_id); + if (!btf_is_datasec(t)) + return false; + + vsi = btf_var_secinfos(t); + for (i = 0, n = btf_vlen(t); i < n; i++, vsi++) { + vt = btf__type_by_id(obj->btf, vsi->type); + if (!btf_is_var(vt)) + continue; + + if (btf_var(vt)->linkage != BTF_VAR_STATIC) + return true; + } + + return false; +} + +static int +bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, + const char *real_name, int sec_idx, void *data, size_t data_sz) +{ + struct bpf_map_def *def; + struct bpf_map *map; + size_t mmap_sz; + int err; + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->libbpf_type = type; + map->sec_idx = sec_idx; + map->sec_offset = 0; + map->real_name = strdup(real_name); + map->name = internal_map_name(obj, real_name); + if (!map->real_name || !map->name) { + zfree(&map->real_name); + zfree(&map->name); + return -ENOMEM; + } + + def = &map->def; + def->type = BPF_MAP_TYPE_ARRAY; + def->key_size = sizeof(int); + def->value_size = data_sz; + def->max_entries = 1; + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG + ? 
BPF_F_RDONLY_PROG : 0; + + /* failures are fine because of maps like .rodata.str1.1 */ + (void) map_fill_btf_type_info(obj, map); + + if (map_is_mmapable(obj, map)) + def->map_flags |= BPF_F_MMAPABLE; + + pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", + map->name, map->sec_idx, map->sec_offset, def->map_flags); + + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("failed to alloc map '%s' content buffer: %d\n", + map->name, err); + zfree(&map->real_name); + zfree(&map->name); + return err; + } + + if (data) + memcpy(map->mmaped, data, data_sz); + + pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); + return 0; +} + +static int bpf_object__init_global_data_maps(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + const char *sec_name; + int err = 0, sec_idx; + + /* + * Populate obj->maps with libbpf internal maps. + */ + for (sec_idx = 1; sec_idx < obj->efile.sec_cnt; sec_idx++) { + sec_desc = &obj->efile.secs[sec_idx]; + + /* Skip recognized sections with size 0. */ + if (!sec_desc->data || sec_desc->data->d_size == 0) + continue; + + switch (sec_desc->sec_type) { + case SEC_DATA: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_RODATA: + obj->has_rodata = true; + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_BSS: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, + sec_name, sec_idx, + NULL, + sec_desc->data->d_size); + break; + default: + /* skip */ + break; + } + if (err) + return err; + } + return 0; +} + + +static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, + const void *name) +{ + int i; + + for (i = 0; i < obj->nr_extern; i++) { + if (strcmp(obj->externs[i].name, name) == 0) + return &obj->externs[i]; + } + return NULL; +} + +static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, + char value) +{ + switch (ext->kcfg.type) { + case KCFG_BOOL: + if (value == 'm') { + pr_warn("extern (kcfg) '%s': value '%c' implies tristate or char type\n", + ext->name, value); + return -EINVAL; + } + *(bool *)ext_val = value == 'y' ? 
true : false; + break; + case KCFG_TRISTATE: + if (value == 'y') + *(enum libbpf_tristate *)ext_val = TRI_YES; + else if (value == 'm') + *(enum libbpf_tristate *)ext_val = TRI_MODULE; + else /* value == 'n' */ + *(enum libbpf_tristate *)ext_val = TRI_NO; + break; + case KCFG_CHAR: + *(char *)ext_val = value; + break; + case KCFG_UNKNOWN: + case KCFG_INT: + case KCFG_CHAR_ARR: + default: + pr_warn("extern (kcfg) '%s': value '%c' implies bool, tristate, or char type\n", + ext->name, value); + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, + const char *value) +{ + size_t len; + + if (ext->kcfg.type != KCFG_CHAR_ARR) { + pr_warn("extern (kcfg) '%s': value '%s' implies char array type\n", + ext->name, value); + return -EINVAL; + } + + len = strlen(value); + if (value[len - 1] != '"') { + pr_warn("extern (kcfg) '%s': invalid string config '%s'\n", + ext->name, value); + return -EINVAL; + } + + /* strip quotes */ + len -= 2; + if (len >= ext->kcfg.sz) { + pr_warn("extern (kcfg) '%s': long string '%s' of (%zu bytes) truncated to %d bytes\n", + ext->name, value, len, ext->kcfg.sz - 1); + len = ext->kcfg.sz - 1; + } + memcpy(ext_val, value + 1, len); + ext_val[len] = '\0'; + ext->is_set = true; + return 0; +} + +static int parse_u64(const char *value, __u64 *res) +{ + char *value_end; + int err; + + errno = 0; + *res = strtoull(value, &value_end, 0); + if (errno) { + err = -errno; + pr_warn("failed to parse '%s' as integer: %d\n", value, err); + return err; + } + if (*value_end) { + pr_warn("failed to parse '%s' as integer completely\n", value); + return -EINVAL; + } + return 0; +} + +static bool is_kcfg_value_in_range(const struct extern_desc *ext, __u64 v) +{ + int bit_sz = ext->kcfg.sz * 8; + + if (ext->kcfg.sz == 8) + return true; + + /* Validate that value stored in u64 fits in integer of `ext->sz` + * bytes size without any loss of information. If the target integer + * is signed, we rely on the following limits of integer type of + * Y bits and subsequent transformation: + * + * -2^(Y-1) <= X <= 2^(Y-1) - 1 + * 0 <= X + 2^(Y-1) <= 2^Y - 1 + * 0 <= X + 2^(Y-1) < 2^Y + * + * For unsigned target integer, check that all the (64 - Y) bits are + * zero. 
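A worked example: for a signed 2-byte kcfg extern (Y = 16), X = -100 is stored in the u64 as v = 0xffffffffffffff9c, and v + 2^15 wraps around to 0x7f9c, which is below 2^16, so the value fits; X = -40000 gives v + 2^15 = 0xffffffffffffe3c0, which is not below 2^16, so it is correctly rejected (-40000 < -2^15).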
+ */ + if (ext->kcfg.is_signed) + return v + (1ULL << (bit_sz - 1)) < (1ULL << bit_sz); + else + return (v >> bit_sz) == 0; +} + +static int set_kcfg_value_num(struct extern_desc *ext, void *ext_val, + __u64 value) +{ + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR && + ext->kcfg.type != KCFG_BOOL) { + pr_warn("extern (kcfg) '%s': value '%llu' implies integer, char, or boolean type\n", + ext->name, (unsigned long long)value); + return -EINVAL; + } + if (ext->kcfg.type == KCFG_BOOL && value > 1) { + pr_warn("extern (kcfg) '%s': value '%llu' isn't boolean compatible\n", + ext->name, (unsigned long long)value); + return -EINVAL; + + } + if (!is_kcfg_value_in_range(ext, value)) { + pr_warn("extern (kcfg) '%s': value '%llu' doesn't fit in %d bytes\n", + ext->name, (unsigned long long)value, ext->kcfg.sz); + return -ERANGE; + } + switch (ext->kcfg.sz) { + case 1: + *(__u8 *)ext_val = value; + break; + case 2: + *(__u16 *)ext_val = value; + break; + case 4: + *(__u32 *)ext_val = value; + break; + case 8: + *(__u64 *)ext_val = value; + break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int bpf_object__process_kconfig_line(struct bpf_object *obj, + char *buf, void *data) +{ + struct extern_desc *ext; + char *sep, *value; + int len, err = 0; + void *ext_val; + __u64 num; + + if (!str_has_pfx(buf, "CONFIG_")) + return 0; + + sep = strchr(buf, '='); + if (!sep) { + pr_warn("failed to parse '%s': no separator\n", buf); + return -EINVAL; + } + + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) { + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + return -EINVAL; + } + + ext = find_extern_by_name(obj, buf); + if (!ext || ext->is_set) + return 0; + + ext_val = data + ext->kcfg.data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_kcfg_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_kcfg_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern (kcfg) '%s': value '%s' isn't a valid integer\n", ext->name, value); + return err; + } + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { + pr_warn("extern (kcfg) '%s': value '%s' implies integer type\n", ext->name, value); + return -EINVAL; + } + err = set_kcfg_value_num(ext, ext_val, num); + break; + } + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to %s\n", ext->name, value); + return 0; +} + +static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) +{ + char buf[PATH_MAX]; + struct utsname uts; + int len, err = 0; + gzFile file; + + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + + /* gzopen also accepts uncompressed files. 
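For reference, a sketch (not part of libbpf) of the BPF-side externs that the kcfg handling above fills in; the __kconfig attribute comes from bpf_helpers.h and places these variables in the ".kconfig" section:

extern unsigned int CONFIG_HZ __kconfig;            /* parsed via KCFG_INT */
extern _Bool CONFIG_BPF_SYSCALL __kconfig __weak;   /* KCFG_BOOL; __weak tolerates a missing option */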
*/ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); + + if (!file) { + pr_warn("failed to open system Kconfig\n"); + return -ENOENT; + } + + while (gzgets(file, buf, sizeof(buf))) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing system Kconfig line '%s': %d\n", + buf, err); + goto out; + } + } + +out: + gzclose(file); + return err; +} + +static int bpf_object__read_kconfig_mem(struct bpf_object *obj, + const char *config, void *data) +{ + char buf[PATH_MAX]; + int err = 0; + FILE *file; + + file = fmemopen((void *)config, strlen(config), "r"); + if (!file) { + err = -errno; + pr_warn("failed to open in-memory Kconfig: %d\n", err); + return err; + } + + while (fgets(buf, sizeof(buf), file)) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing in-memory Kconfig line '%s': %d\n", + buf, err); + break; + } + } + + fclose(file); + return err; +} + +static int bpf_object__init_kconfig_map(struct bpf_object *obj) +{ + struct extern_desc *last_ext = NULL, *ext; + size_t map_sz; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG) + last_ext = ext; + } + + if (!last_ext) + return 0; + + map_sz = last_ext->kcfg.data_off + last_ext->kcfg.sz; + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_KCONFIG, + ".kconfig", obj->efile.symbols_shndx, + NULL, map_sz); + if (err) + return err; + + obj->kconfig_map_idx = obj->nr_maps - 1; + + return 0; +} + +const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + + if (res_id) + *res_id = id; + + while (btf_is_mod(t) || btf_is_typedef(t)) { + if (res_id) + *res_id = t->type; + t = btf__type_by_id(btf, t->type); + } + + return t; +} + +static const struct btf_type * +resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t; + + t = skip_mods_and_typedefs(btf, id, NULL); + if (!btf_is_ptr(t)) + return NULL; + + t = skip_mods_and_typedefs(btf, t->type, res_id); + + return btf_is_func_proto(t) ? t : NULL; +} + +static const char *__btf_kind_str(__u16 kind) +{ + switch (kind) { + case BTF_KIND_UNKN: return "void"; + case BTF_KIND_INT: return "int"; + case BTF_KIND_PTR: return "ptr"; + case BTF_KIND_ARRAY: return "array"; + case BTF_KIND_STRUCT: return "struct"; + case BTF_KIND_UNION: return "union"; + case BTF_KIND_ENUM: return "enum"; + case BTF_KIND_FWD: return "fwd"; + case BTF_KIND_TYPEDEF: return "typedef"; + case BTF_KIND_VOLATILE: return "volatile"; + case BTF_KIND_CONST: return "const"; + case BTF_KIND_RESTRICT: return "restrict"; + case BTF_KIND_FUNC: return "func"; + case BTF_KIND_FUNC_PROTO: return "func_proto"; + case BTF_KIND_VAR: return "var"; + case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; + case BTF_KIND_DECL_TAG: return "decl_tag"; + case BTF_KIND_TYPE_TAG: return "type_tag"; + case BTF_KIND_ENUM64: return "enum64"; + default: return "unknown"; + } +} + +const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + +/* + * Fetch integer attribute of BTF map definition. Such attributes are + * represented using a pointer to an array, in which dimensionality of array + * encodes specified integer value. 
E.g., int (*type)[BPF_MAP_TYPE_ARRAY]; + * encodes `type => BPF_MAP_TYPE_ARRAY` key/value pair completely using BTF + * type definition, while using only sizeof(void *) space in ELF data section. + */ +static bool get_map_field_int(const char *map_name, const struct btf *btf, + const struct btf_member *m, __u32 *res) +{ + const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); + const char *name = btf__name_by_offset(btf, m->name_off); + const struct btf_array *arr_info; + const struct btf_type *arr_t; + + if (!btf_is_ptr(t)) { + pr_warn("map '%s': attr '%s': expected PTR, got %s.\n", + map_name, name, btf_kind_str(t)); + return false; + } + + arr_t = btf__type_by_id(btf, t->type); + if (!arr_t) { + pr_warn("map '%s': attr '%s': type [%u] not found.\n", + map_name, name, t->type); + return false; + } + if (!btf_is_array(arr_t)) { + pr_warn("map '%s': attr '%s': expected ARRAY, got %s.\n", + map_name, name, btf_kind_str(arr_t)); + return false; + } + arr_info = btf_array(arr_t); + *res = arr_info->nelems; + return true; +} + +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} + +static int build_map_pin_path(struct bpf_map *map, const char *path) +{ + char buf[PATH_MAX]; + int err; + + if (!path) + path = "/sys/fs/bpf"; + + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; + + return bpf_map__set_pin_path(map, buf); +} + +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) +{ + const struct btf_type *t; + const struct btf_member *m; + bool is_inner = inner_def == NULL; + int vlen, i; + + vlen = btf_vlen(def_t); + m = btf_members(def_t); + for (i = 0; i < vlen; i++, m++) { + const char *name = btf__name_by_offset(btf, m->name_off); + + if (!name) { + pr_warn("map '%s': invalid field #%d.\n", map_name, i); + return -EINVAL; + } + if (strcmp(name, "type") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_TYPE; + } else if (strcmp(name, "max_entries") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAX_ENTRIES; + } else if (strcmp(name, "map_flags") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_FLAGS; + } else if (strcmp(name, "numa_node") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) + return -EINVAL; + map_def->parts |= MAP_DEF_NUMA_NODE; + } else if (strcmp(name, "key_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %u.\n", + map_name, map_def->key_size, sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; + } else if (strcmp(name, "key") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': key type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if 
(!btf_is_ptr(t)) { + pr_warn("map '%s': key spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map_def->key_size, (ssize_t)sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; + } else if (strcmp(name, "value_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %u.\n", + map_name, map_def->value_size, sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; + } else if (strcmp(name, "value") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': value type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if (!btf_is_ptr(t)) { + pr_warn("map '%s': value spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map_def->value_size, (ssize_t)sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; + } + else if (strcmp(name, "values") == 0) { + bool is_map_in_map = bpf_map_type__is_map_in_map(map_def->map_type); + bool is_prog_array = map_def->map_type == BPF_MAP_TYPE_PROG_ARRAY; + const char *desc = is_map_in_map ? 
"map-in-map inner" : "prog-array value"; + char inner_map_name[128]; + int err; + + if (is_inner) { + pr_warn("map '%s': multi-level inner maps not supported.\n", + map_name); + return -ENOTSUP; + } + if (i != vlen - 1) { + pr_warn("map '%s': '%s' member should be last.\n", + map_name, name); + return -EINVAL; + } + if (!is_map_in_map && !is_prog_array) { + pr_warn("map '%s': should be map-in-map or prog-array.\n", + map_name); + return -ENOTSUP; + } + if (map_def->value_size && map_def->value_size != 4) { + pr_warn("map '%s': conflicting value size %u != 4.\n", + map_name, map_def->value_size); + return -EINVAL; + } + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': %s type [%d] not found.\n", + map_name, desc, m->type); + return -EINVAL; + } + if (!btf_is_array(t) || btf_array(t)->nelems) { + pr_warn("map '%s': %s spec is not a zero-sized array.\n", + map_name, desc); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); + if (!btf_is_ptr(t)) { + pr_warn("map '%s': %s def is of unexpected kind %s.\n", + map_name, desc, btf_kind_str(t)); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, t->type, NULL); + if (is_prog_array) { + if (!btf_is_func_proto(t)) { + pr_warn("map '%s': prog-array value def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + continue; + } + if (!btf_is_struct(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); + if (err) + return err; + + map_def->parts |= MAP_DEF_INNER_MAP; + } else if (strcmp(name, "pinning") == 0) { + __u32 val; + + if (is_inner) { + pr_warn("map '%s': inner def can't be pinned.\n", map_name); + return -EINVAL; + } + if (!get_map_field_int(map_name, btf, m, &val)) + return -EINVAL; + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { + pr_warn("map '%s': invalid pinning value %u.\n", + map_name, val); + return -EINVAL; + } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; + } else if (strcmp(name, "map_extra") == 0) { + __u32 map_extra; + + if (!get_map_field_int(map_name, btf, m, &map_extra)) + return -EINVAL; + map_def->map_extra = map_extra; + map_def->parts |= MAP_DEF_MAP_EXTRA; + } else { + if (strict) { + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); + return -ENOTSUP; + } + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); + } + } + + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); + return -EINVAL; + } + + return 0; +} + +static size_t adjust_ringbuf_sz(size_t sz) +{ + __u32 page_sz = sysconf(_SC_PAGE_SIZE); + __u32 mul; + + /* if user forgot to set any size, make sure they see error */ + if (sz == 0) + return 0; + /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be + * a power-of-2 multiple of kernel's page size. If user diligently + * satisified these conditions, pass the size through. + */ + if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) + return sz; + + /* Otherwise find closest (page_sz * power_of_2) product bigger than + * user-set size to satisfy both user size request and kernel + * requirements and substitute correct max_entries for map creation. 
+ */ + for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { + if (mul * page_sz > sz) + return mul * page_sz; + } + + /* if it's impossible to satisfy the conditions (i.e., user size is + * very close to UINT_MAX but is not a power-of-2 multiple of + * page_size) then just return original size and let kernel reject it + */ + return sz; +} + +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + map->map_extra = def->map_extra; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = 0x%x.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_MAP_EXTRA) + pr_debug("map '%s': found map_extra = 0x%llx.\n", map->name, + (unsigned long long)def->map_extra); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: return "global"; + case BTF_VAR_GLOBAL_EXTERN: return "extern"; + default: return "unknown"; + } +} + +static int bpf_object__init_user_btf_map(struct bpf_object *obj, + const struct btf_type *sec, + int var_idx, int sec_idx, + const Elf_Data *data, bool strict, + const char *pin_root_path) +{ + struct btf_map_def map_def = {}, inner_def = {}; + const struct btf_type *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const char *map_name; + struct bpf_map *map; + int err; + + vi = btf_var_secinfos(sec) + var_idx; + var = btf__type_by_id(obj->btf, vi->type); + var_extra = btf_var(var); + map_name = btf__name_by_offset(obj->btf, var->name_off); + + if (map_name == NULL || map_name[0] == '\0') { + pr_warn("map #%d: empty name.\n", var_idx); + return -EINVAL; + } + if ((__u64)vi->offset + vi->size > data->d_size) { + pr_warn("map '%s' BTF data is corrupted.\n", map_name); + return -EINVAL; + } + if 
(!btf_is_var(var)) { + pr_warn("map '%s': unexpected var kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + pr_warn("map '%s': unsupported map linkage %s.\n", + map_name, btf_var_linkage_str(var_extra->linkage)); + return -EOPNOTSUPP; + } + + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (!btf_is_struct(def)) { + pr_warn("map '%s': unexpected def kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (def->size > vi->size) { + pr_warn("map '%s': invalid def size.\n", map_name); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + map->name = strdup(map_name); + if (!map->name) { + pr_warn("map '%s': failed to alloc map name.\n", map_name); + return -ENOMEM; + } + map->libbpf_type = LIBBPF_MAP_UNSPEC; + map->def.type = BPF_MAP_TYPE_UNSPEC; + map->sec_idx = sec_idx; + map->sec_offset = vi->offset; + map->btf_var_idx = var_idx; + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", + map_name, map->sec_idx, map->sec_offset); + + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + err = map_fill_btf_type_info(obj, map); + if (err) + return err; + + return 0; +} + +static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, + const char *pin_root_path) +{ + const struct btf_type *sec = NULL; + int nr_types, i, vlen, err; + const struct btf_type *t; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + + if (obj->efile.btf_maps_shndx < 0) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx); + data = elf_sec_data(obj, scn); + if (!scn || !data) { + pr_warn("elf: failed to get %s map definitions for %s\n", + MAPS_ELF_SEC, obj->path); + return -EINVAL; + } + + nr_types = btf__type_cnt(obj->btf); + for (i = 1; i < nr_types; i++) { + t = btf__type_by_id(obj->btf, i); + if (!btf_is_datasec(t)) + continue; + name = btf__name_by_offset(obj->btf, t->name_off); + if (strcmp(name, MAPS_ELF_SEC) == 0) { + sec = t; + obj->efile.btf_maps_sec_btf_id = i; + break; + } + } + + if (!sec) { + pr_warn("DATASEC '%s' not found.\n", MAPS_ELF_SEC); + return -ENOENT; + } + + vlen = btf_vlen(sec); + for (i = 0; i < vlen; i++) { + err = bpf_object__init_user_btf_map(obj, sec, i, + obj->efile.btf_maps_shndx, + data, strict, + pin_root_path); + if (err) + return err; + } + + return 0; +} + +static int bpf_object__init_maps(struct bpf_object *obj, + const struct bpf_object_open_opts *opts) +{ + const char *pin_root_path; + bool strict; + int err = 0; + + strict = !OPTS_GET(opts, relaxed_maps, false); + pin_root_path = OPTS_GET(opts, pin_root_path, NULL); + + err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = err ?: bpf_object__init_global_data_maps(obj); + err = err ?: 
bpf_object__init_kconfig_map(obj); + err = err ?: bpf_object_init_struct_ops(obj); + + return err; +} + +static bool section_have_execinstr(struct bpf_object *obj, int idx) +{ + Elf64_Shdr *sh; + + sh = elf_sec_hdr(obj, elf_sec_by_idx(obj, idx)); + if (!sh) + return false; + + return sh->sh_flags & SHF_EXECINSTR; +} + +static bool btf_needs_sanitization(struct bpf_object *obj) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + + return !has_func || !has_datasec || !has_func_global || !has_float || + !has_decl_tag || !has_type_tag || !has_enum64; +} + +static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + int enum64_placeholder_id = 0; + struct btf_type *t; + int i, j, vlen; + + for (i = 1; i < btf__type_cnt(btf); i++) { + t = (struct btf_type *)btf__type_by_id(btf, i); + + if ((!has_datasec && btf_is_var(t)) || (!has_decl_tag && btf_is_decl_tag(t))) { + /* replace VAR/DECL_TAG with INT */ + t->info = BTF_INFO_ENC(BTF_KIND_INT, 0, 0); + /* + * using size = 1 is the safest choice, 4 will be too + * big and cause kernel BTF validation failure if + * original variable took less than 4 bytes + */ + t->size = 1; + *(int *)(t + 1) = BTF_INT_ENC(0, 0, 8); + } else if (!has_datasec && btf_is_datasec(t)) { + /* replace DATASEC with STRUCT */ + const struct btf_var_secinfo *v = btf_var_secinfos(t); + struct btf_member *m = btf_members(t); + struct btf_type *vt; + char *name; + + name = (char *)btf__name_by_offset(btf, t->name_off); + while (*name) { + if (*name == '.') + *name = '_'; + name++; + } + + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, vlen); + for (j = 0; j < vlen; j++, v++, m++) { + /* order of field assignments is important */ + m->offset = v->offset * 8; + m->type = v->type; + /* preserve variable name as member name */ + vt = (void *)btf__type_by_id(btf, v->type); + m->name_off = vt->name_off; + } + } else if (!has_func && btf_is_func_proto(t)) { + /* replace FUNC_PROTO with ENUM */ + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_ENUM, 0, vlen); + t->size = sizeof(__u32); /* kernel enforced */ + } else if (!has_func && btf_is_func(t)) { + /* replace FUNC with TYPEDEF */ + t->info = BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0); + } else if (!has_func_global && btf_is_func(t)) { + /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ + t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. 
"float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); + } else if (!has_type_tag && btf_is_type_tag(t)) { + /* replace TYPE_TAG with a CONST */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0); + } else if (!has_enum64 && btf_is_enum(t)) { + /* clear the kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t), false); + } else if (!has_enum64 && btf_is_enum64(t)) { + /* replace ENUM64 with a union */ + struct btf_member *m; + + if (enum64_placeholder_id == 0) { + enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); + if (enum64_placeholder_id < 0) + return enum64_placeholder_id; + + t = (struct btf_type *)btf__type_by_id(btf, i); + } + + m = btf_members(t); + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_UNION, 0, vlen); + for (j = 0; j < vlen; j++, m++) { + m->type = enum64_placeholder_id; + m->offset = 0; + } + } + } + + return 0; +} + +static bool libbpf_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.btf_maps_shndx >= 0 || + obj->efile.st_ops_shndx >= 0 || + obj->efile.st_ops_link_shndx >= 0 || + obj->nr_extern > 0; +} + +static bool kernel_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0; +} + +static int bpf_object__init_btf(struct bpf_object *obj, + Elf_Data *btf_data, + Elf_Data *btf_ext_data) +{ + int err = -ENOENT; + + if (btf_data) { + obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); + err = libbpf_get_error(obj->btf); + if (err) { + obj->btf = NULL; + pr_warn("Error loading ELF section %s: %d.\n", BTF_ELF_SEC, err); + goto out; + } + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + } + if (btf_ext_data) { + struct btf_ext_info *ext_segs[3]; + int seg_num, sec_num; + + if (!obj->btf) { + pr_debug("Ignore ELF section %s because its depending ELF section %s is not found.\n", + BTF_EXT_ELF_SEC, BTF_ELF_SEC); + goto out; + } + obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("Error loading ELF section %s: %d. 
Ignored and continue.\n", + BTF_EXT_ELF_SEC, err); + obj->btf_ext = NULL; + goto out; + } + + /* setup .BTF.ext to ELF section mapping */ + ext_segs[0] = &obj->btf_ext->func_info; + ext_segs[1] = &obj->btf_ext->line_info; + ext_segs[2] = &obj->btf_ext->core_relo_info; + for (seg_num = 0; seg_num < ARRAY_SIZE(ext_segs); seg_num++) { + struct btf_ext_info *seg = ext_segs[seg_num]; + const struct btf_ext_info_sec *sec; + const char *sec_name; + Elf_Scn *scn; + + if (seg->sec_cnt == 0) + continue; + + seg->sec_idxs = calloc(seg->sec_cnt, sizeof(*seg->sec_idxs)); + if (!seg->sec_idxs) { + err = -ENOMEM; + goto out; + } + + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + /* preventively increment index to avoid doing + * this before every continue below + */ + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) + continue; + scn = elf_sec_by_name(obj, sec_name); + if (!scn) + continue; + + seg->sec_idxs[sec_num - 1] = elf_ndxscn(scn); + } + } + } +out: + if (err && libbpf_needs_btf(obj)) { + pr_warn("BTF is required, but is missing or corrupted.\n"); + return err; + } + return 0; +} + +static int compare_vsi_off(const void *_a, const void *_b) +{ + const struct btf_var_secinfo *a = _a; + const struct btf_var_secinfo *b = _b; + + return a->offset - b->offset; +} + +static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, + struct btf_type *t) +{ + __u32 size = 0, i, vars = btf_vlen(t); + const char *sec_name = btf__name_by_offset(btf, t->name_off); + struct btf_var_secinfo *vsi; + bool fixup_offsets = false; + int err; + + if (!sec_name) { + pr_debug("No name found in string section for DATASEC kind.\n"); + return -ENOENT; + } + + /* Extern-backing datasecs (.ksyms, .kconfig) have their size and + * variable offsets set at the previous step. Further, not every + * extern BTF VAR has corresponding ELF symbol preserved, so we skip + * all fixups altogether for such sections and go straight to sorting + * VARs within their DATASEC. + */ + if (strcmp(sec_name, KCONFIG_SEC) == 0 || strcmp(sec_name, KSYMS_SEC) == 0) + goto sort_vars; + + /* Clang leaves DATASEC size and VAR offsets as zeroes, so we need to + * fix this up. But BPF static linker already fixes this up and fills + * all the sizes and offsets during static linking. So this step has + * to be optional. But the STV_HIDDEN handling is non-optional for any + * non-extern DATASEC, so the variable fixup loop below handles both + * functions at the same time, paying the cost of BTF VAR <-> ELF + * symbol matching just once. 
+ */ + if (t->size == 0) { + err = find_elf_sec_sz(obj, sec_name, &size); + if (err || !size) { + pr_debug("sec '%s': failed to determine size from ELF: size %u, err %d\n", + sec_name, size, err); + return -ENOENT; + } + + t->size = size; + fixup_offsets = true; + } + + for (i = 0, vsi = btf_var_secinfos(t); i < vars; i++, vsi++) { + const struct btf_type *t_var; + struct btf_var *var; + const char *var_name; + Elf64_Sym *sym; + + t_var = btf__type_by_id(btf, vsi->type); + if (!t_var || !btf_is_var(t_var)) { + pr_debug("sec '%s': unexpected non-VAR type found\n", sec_name); + return -EINVAL; + } + + var = btf_var(t_var); + if (var->linkage == BTF_VAR_STATIC || var->linkage == BTF_VAR_GLOBAL_EXTERN) + continue; + + var_name = btf__name_by_offset(btf, t_var->name_off); + if (!var_name) { + pr_debug("sec '%s': failed to find name of DATASEC's member #%d\n", + sec_name, i); + return -ENOENT; + } + + sym = find_elf_var_sym(obj, var_name); + if (IS_ERR(sym)) { + pr_debug("sec '%s': failed to find ELF symbol for VAR '%s'\n", + sec_name, var_name); + return -ENOENT; + } + + if (fixup_offsets) + vsi->offset = sym->st_value; + + /* if variable is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF VAR + * as static. This follows similar logic for functions (BPF + * subprogs) and influences libbpf's further decisions about + * whether to make global data BPF array maps as + * BPF_F_MMAPABLE. + */ + if (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL) + var->linkage = BTF_VAR_STATIC; + } + +sort_vars: + qsort(btf_var_secinfos(t), vars, sizeof(*vsi), compare_vsi_off); + return 0; +} + +static int bpf_object_fixup_btf(struct bpf_object *obj) +{ + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_type *t = btf_type_by_id(obj->btf, i); + + /* Loader needs to fix up some of the things compiler + * couldn't get its hands on while emitting BTF. This + * is section size and global variable offset. We use + * the info from the ELF itself for this purpose. 
+ */ + if (btf_is_datasec(t)) { + err = btf_fixup_datasec(obj, obj->btf, t); + if (err) + return err; + } + } + + return 0; +} + +static bool prog_needs_vmlinux_btf(struct bpf_program *prog) +{ + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_LSM) + return true; + + /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs + * also need vmlinux BTF + */ + if (prog->type == BPF_PROG_TYPE_TRACING && !prog->attach_prog_fd) + return true; + + return false; +} + +static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* CO-RE relocations need kernel BTF, only when btf_custom_path + * is not specified + */ + if (obj->btf_ext && obj->btf_ext->core_relo_info.len && !obj->btf_custom_path) + return true; + + /* Support for typed ksyms needs kernel BTF */ + for (i = 0; i < obj->nr_extern; i++) { + const struct extern_desc *ext; + + ext = &obj->externs[i]; + if (ext->type == EXT_KSYM && ext->ksym.type_id) + return true; + } + + bpf_object__for_each_program(prog, obj) { + if (!prog->autoload) + continue; + if (prog_needs_vmlinux_btf(prog)) + return true; + } + + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) +{ + int err; + + /* btf_vmlinux could be loaded earlier */ + if (obj->btf_vmlinux || obj->gen_loader) + return 0; + + if (!force && !obj_needs_vmlinux_btf(obj)) + return 0; + + obj->btf_vmlinux = btf__load_vmlinux_btf(); + err = libbpf_get_error(obj->btf_vmlinux); + if (err) { + pr_warn("Error loading vmlinux BTF: %d\n", err); + obj->btf_vmlinux = NULL; + return err; + } + return 0; +} + +static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) +{ + struct btf *kern_btf = obj->btf; + bool btf_mandatory, sanitize; + int i, err = 0; + + if (!obj->btf) + return 0; + + if (!kernel_supports(obj, FEAT_BTF)) { + if (kernel_needs_btf(obj)) { + err = -EOPNOTSUPP; + goto report; + } + pr_debug("Kernel doesn't support BTF, skipping uploading it.\n"); + return 0; + } + + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. 
+ */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__type_cnt(obj->btf); + for (j = 1; j < n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + + sanitize = btf_needs_sanitization(obj); + if (sanitize) { + const void *raw_data; + __u32 sz; + + /* clone BTF to sanitize a copy and leave the original intact */ + raw_data = btf__raw_data(obj->btf, &sz); + kern_btf = btf__new(raw_data, sz); + err = libbpf_get_error(kern_btf); + if (err) + return err; + + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + err = bpf_object__sanitize_btf(obj, kern_btf); + if (err) + return err; + } + + if (obj->gen_loader) { + __u32 raw_size = 0; + const void *raw_data = btf__raw_data(kern_btf, &raw_size); + + if (!raw_data) + return -ENOMEM; + bpf_gen__load_btf(obj->gen_loader, raw_data, raw_size); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + btf__set_fd(kern_btf, 0); + } else { + /* currently BPF_BTF_LOAD only supports log_level 1 */ + err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, + obj->log_level ? 1 : 0); + } + if (sanitize) { + if (!err) { + /* move fd to libbpf's BTF */ + btf__set_fd(obj->btf, btf__fd(kern_btf)); + btf__set_fd(kern_btf, -1); + } + btf__free(kern_btf); + } +report: + if (err) { + btf_mandatory = kernel_needs_btf(obj); + pr_warn("Error loading .BTF into kernel: %d. %s\n", err, + btf_mandatory ? "BTF is mandatory, can't proceed." 
+ : "BTF is optional, ignoring."); + if (!btf_mandatory) + err = 0; + } + return err; +} + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static const char *elf_sec_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.shstrndx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx) +{ + Elf_Scn *scn; + + scn = elf_getscn(obj->efile.elf, idx); + if (!scn) { + pr_warn("elf: failed to get section(%zu) from %s: %s\n", + idx, obj->path, elf_errmsg(-1)); + return NULL; + } + return scn; +} + +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name) +{ + Elf_Scn *scn = NULL; + Elf *elf = obj->efile.elf; + const char *sec_name; + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sec_name = elf_sec_name(obj, scn); + if (!sec_name) + return NULL; + + if (strcmp(sec_name, name) != 0) + continue; + + return scn; + } + return NULL; +} + +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf64_Shdr *shdr; + + if (!scn) + return NULL; + + shdr = elf64_getshdr(scn); + if (!shdr) { + pr_warn("elf: failed to get section(%zu) header from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return shdr; +} + +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn) +{ + const char *name; + Elf64_Shdr *sh; + + if (!scn) + return NULL; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return NULL; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) { + pr_warn("elf: failed to get section(%zu) name from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf_Data *data; + + if (!scn) + return NULL; + + data = elf_getdata(scn, 0); + if (!data) { + pr_warn("elf: failed to get section(%zu) %s data from %s: %s\n", + elf_ndxscn(scn), elf_sec_name(obj, scn) ?: "", + obj->path, elf_errmsg(-1)); + return NULL; + } + + return data; +} + +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx) +{ + if (idx >= obj->efile.symbols->d_size / sizeof(Elf64_Sym)) + return NULL; + + return (Elf64_Sym *)obj->efile.symbols->d_buf + idx; +} + +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx) +{ + if (idx >= data->d_size / sizeof(Elf64_Rel)) + return NULL; + + return (Elf64_Rel *)data->d_buf + idx; +} + +static bool is_sec_name_dwarf(const char *name) +{ + /* approximation, but the actual list is too long */ + return str_has_pfx(name, ".debug_"); +} + +static bool ignore_elf_section(Elf64_Shdr *hdr, const char *name) +{ + /* no special handling of .strtab */ + if (hdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (hdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (hdr->sh_type == SHT_PROGBITS && hdr->sh_size == 0 && + strcmp(name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_sec_name_dwarf(name)) + return 
true; + + if (str_has_pfx(name, ".rel")) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_sec_name_dwarf(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static int cmp_progs(const void *_a, const void *_b) +{ + const struct bpf_program *a = _a; + const struct bpf_program *b = _b; + + if (a->sec_idx != b->sec_idx) + return a->sec_idx < b->sec_idx ? -1 : 1; + + /* sec_insn_off can't be the same within the section */ + return a->sec_insn_off < b->sec_insn_off ? -1 : 1; +} + +static int bpf_object__elf_collect(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + Elf *elf = obj->efile.elf; + Elf_Data *btf_ext_data = NULL; + Elf_Data *btf_data = NULL; + int idx = 0, err = 0; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + Elf64_Shdr *sh; + + /* ELF section indices are 0-based, but sec #0 is special "invalid" + * section. Since section count retrieved by elf_getshdrnum() does + * include sec #0, it is already the necessary size of an array to keep + * all the sections. + */ + if (elf_getshdrnum(obj->efile.elf, &obj->efile.sec_cnt)) { + pr_warn("elf: failed to get the number of sections for %s: %s\n", + obj->path, elf_errmsg(-1)); + return -LIBBPF_ERRNO__FORMAT; + } + obj->efile.secs = calloc(obj->efile.sec_cnt, sizeof(*obj->efile.secs)); + if (!obj->efile.secs) + return -ENOMEM; + + /* a bunch of ELF parsing functionality depends on processing symbols, + * so do the first pass and find the symbol table + */ + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + if (sh->sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warn("elf: multiple symbol tables in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + idx = elf_ndxscn(scn); + + obj->efile.symbols = data; + obj->efile.symbols_shndx = idx; + obj->efile.strtabidx = sh->sh_link; + } + } + + if (!obj->efile.symbols) { + pr_warn("elf: couldn't find symbol table in %s, stripped object file?\n", + obj->path); + return -ENOENT; + } + + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + idx = elf_ndxscn(scn); + sec_desc = &obj->efile.secs[idx]; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) + return -LIBBPF_ERRNO__FORMAT; + + if (ignore_elf_section(sh, name)) + continue; + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + pr_debug("elf: section(%d) %s, size %ld, link %d, flags %lx, type=%d\n", + idx, name, (unsigned long)data->d_size, + (int)sh->sh_link, (unsigned long)sh->sh_flags, + (int)sh->sh_type); + + if (strcmp(name, "license") == 0) { + err = bpf_object__init_license(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "version") == 0) { + err = bpf_object__init_kversion(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "maps") == 0) { + pr_warn("elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+\n"); + return -ENOTSUP; + } else if (strcmp(name, MAPS_ELF_SEC) == 0) { + obj->efile.btf_maps_shndx = idx; + } else if (strcmp(name, BTF_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_data = data; + } else if (strcmp(name, 
BTF_EXT_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_ext_data = data; + } else if (sh->sh_type == SHT_SYMTAB) { + /* already processed during the first pass above */ + } else if (sh->sh_type == SHT_PROGBITS && data->d_size > 0) { + if (sh->sh_flags & SHF_EXECINSTR) { + if (strcmp(name, ".text") == 0) + obj->efile.text_shndx = idx; + err = bpf_object__add_programs(obj, data, name, idx); + if (err) + return err; + } else if (strcmp(name, DATA_SEC) == 0 || + str_has_pfx(name, DATA_SEC ".")) { + sec_desc->sec_type = SEC_DATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, RODATA_SEC) == 0 || + str_has_pfx(name, RODATA_SEC ".")) { + sec_desc->sec_type = SEC_RODATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, STRUCT_OPS_SEC) == 0) { + obj->efile.st_ops_data = data; + obj->efile.st_ops_shndx = idx; + } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) { + obj->efile.st_ops_link_data = data; + obj->efile.st_ops_link_shndx = idx; + } else { + pr_info("elf: skipping unrecognized data section(%d) %s\n", + idx, name); + } + } else if (sh->sh_type == SHT_REL) { + int targ_sec_idx = sh->sh_info; /* points to other section */ + + if (sh->sh_entsize != sizeof(Elf64_Rel) || + targ_sec_idx >= obj->efile.sec_cnt) + return -LIBBPF_ERRNO__FORMAT; + + /* Only do relo for section with exec instructions */ + if (!section_have_execinstr(obj, targ_sec_idx) && + strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) && + strcmp(name, ".rel" MAPS_ELF_SEC)) { + pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n", + idx, name, targ_sec_idx, + elf_sec_name(obj, elf_sec_by_idx(obj, targ_sec_idx)) ?: ""); + continue; + } + + sec_desc->sec_type = SEC_RELO; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (sh->sh_type == SHT_NOBITS && (strcmp(name, BSS_SEC) == 0 || + str_has_pfx(name, BSS_SEC "."))) { + sec_desc->sec_type = SEC_BSS; + sec_desc->shdr = sh; + sec_desc->data = data; + } else { + pr_info("elf: skipping section(%d) %s (size %zu)\n", idx, name, + (size_t)sh->sh_size); + } + } + + if (!obj->efile.strtabidx || obj->efile.strtabidx > idx) { + pr_warn("elf: symbol strings section missing or invalid in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + /* sort BPF programs by section name and in-section instruction offset + * for faster search + */ + if (obj->nr_programs) + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + + return bpf_object__init_btf(obj, btf_data, btf_ext_data); +} + +static bool sym_is_extern(const Elf64_Sym *sym) +{ + int bind = ELF64_ST_BIND(sym->st_info); + /* externs are symbols w/ type=NOTYPE, bind=GLOBAL|WEAK, section=UND */ + return sym->st_shndx == SHN_UNDEF && + (bind == STB_GLOBAL || bind == STB_WEAK) && + ELF64_ST_TYPE(sym->st_info) == STT_NOTYPE; +} + +static bool sym_is_subprog(const Elf64_Sym *sym, int text_shndx) +{ + int bind = ELF64_ST_BIND(sym->st_info); + int type = ELF64_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + +static int find_extern_btf_id(const struct btf *btf, const char *ext_name) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if 
(!btf_is_var(t) && !btf_is_func(t)) + continue; + + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) + continue; + + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) + return -EINVAL; + + return i; + } + + return -ENOENT; +} + +static int find_extern_sec_btf_id(struct btf *btf, int ext_btf_id) { + const struct btf_var_secinfo *vs; + const struct btf_type *t; + int i, j, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_datasec(t)) + continue; + + vs = btf_var_secinfos(t); + for (j = 0; j < btf_vlen(t); j++, vs++) { + if (vs->type == ext_btf_id) + return i; + } + } + + return -ENOENT; +} + +static enum kcfg_type find_kcfg_type(const struct btf *btf, int id, + bool *is_signed) +{ + const struct btf_type *t; + const char *name; + + t = skip_mods_and_typedefs(btf, id, NULL); + name = btf__name_by_offset(btf, t->name_off); + + if (is_signed) + *is_signed = false; + switch (btf_kind(t)) { + case BTF_KIND_INT: { + int enc = btf_int_encoding(t); + + if (enc & BTF_INT_BOOL) + return t->size == 1 ? KCFG_BOOL : KCFG_UNKNOWN; + if (is_signed) + *is_signed = enc & BTF_INT_SIGNED; + if (t->size == 1) + return KCFG_CHAR; + if (t->size < 1 || t->size > 8 || (t->size & (t->size - 1))) + return KCFG_UNKNOWN; + return KCFG_INT; + } + case BTF_KIND_ENUM: + if (t->size != 4) + return KCFG_UNKNOWN; + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ENUM64: + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ARRAY: + if (btf_array(t)->nelems == 0) + return KCFG_UNKNOWN; + if (find_kcfg_type(btf, btf_array(t)->type, NULL) != KCFG_CHAR) + return KCFG_UNKNOWN; + return KCFG_CHAR_ARR; + default: + return KCFG_UNKNOWN; + } +} + +static int cmp_externs(const void *_a, const void *_b) +{ + const struct extern_desc *a = _a; + const struct extern_desc *b = _b; + + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + if (a->type == EXT_KCFG) { + /* descending order by alignment requirements */ + if (a->kcfg.align != b->kcfg.align) + return a->kcfg.align > b->kcfg.align ? -1 : 1; + /* ascending order by size, within same alignment class */ + if (a->kcfg.sz != b->kcfg.sz) + return a->kcfg.sz < b->kcfg.sz ? -1 : 1; + } + + /* resolve ties by name */ + return strcmp(a->name, b->name); +} + +static int find_int_btf_id(const struct btf *btf) +{ + const struct btf_type *t; + int i, n; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (btf_is_int(t) && btf_int_bits(t) == 32) + return i; + } + + return 0; +} + +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + if (!btf) + return 0; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. 
*/ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + +static int bpf_object__collect_externs(struct bpf_object *obj) +{ + struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; + const struct btf_type *t; + struct extern_desc *ext; + int i, n, off, dummy_var_btf_id; + const char *ext_name, *sec_name; + Elf_Scn *scn; + Elf64_Shdr *sh; + + if (!obj->efile.symbols) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.symbols_shndx); + sh = elf_sec_hdr(obj, scn); + if (!sh || sh->sh_entsize != sizeof(Elf64_Sym)) + return -LIBBPF_ERRNO__FORMAT; + + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + + n = sh->sh_size / sh->sh_entsize; + pr_debug("looking for externs among %d symbols...\n", n); + + for (i = 0; i < n; i++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, i); + + if (!sym) + return -LIBBPF_ERRNO__FORMAT; + if (!sym_is_extern(sym)) + continue; + ext_name = elf_sym_str(obj, sym->st_name); + if (!ext_name || !ext_name[0]) + continue; + + ext = obj->externs; + ext = libbpf_reallocarray(ext, obj->nr_extern + 1, sizeof(*ext)); + if (!ext) + return -ENOMEM; + obj->externs = ext; + ext = &ext[obj->nr_extern]; + memset(ext, 0, sizeof(*ext)); + obj->nr_extern++; + + ext->btf_id = find_extern_btf_id(obj->btf, ext_name); + if (ext->btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s': %d\n", + ext_name, ext->btf_id); + return ext->btf_id; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->sym_idx = i; + ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; + + ext->sec_btf_id = find_extern_sec_btf_id(obj->btf, ext->btf_id); + if (ext->sec_btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s' [%d] section: %d\n", + ext_name, ext->btf_id, ext->sec_btf_id); + return ext->sec_btf_id; + } + sec = (void *)btf__type_by_id(obj->btf, ext->sec_btf_id); + sec_name = btf__name_by_offset(obj->btf, sec->name_off); + + if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } + kcfg_sec = sec; + ext->type = EXT_KCFG; + ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); + if (ext->kcfg.sz <= 0) { + pr_warn("failed to resolve size of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.sz); + return ext->kcfg.sz; + } + ext->kcfg.align = btf__align_of(obj->btf, t->type); + if (ext->kcfg.align <= 0) { + pr_warn("failed to determine alignment of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.align); + return -EINVAL; + } + ext->kcfg.type = find_kcfg_type(obj->btf, t->type, + &ext->kcfg.is_signed); + if (ext->kcfg.type == KCFG_UNKNOWN) { + pr_warn("extern (kcfg) '%s': type is unsupported\n", ext_name); + return -ENOTSUP; + } + } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + ksym_sec = sec; + ext->type = EXT_KSYM; + skip_mods_and_typedefs(obj->btf, t->type, + &ext->ksym.type_id); + } else { + pr_warn("unrecognized extern section '%s'\n", sec_name); + return -ENOTSUP; + } + } + pr_debug("collected %d externs total\n", obj->nr_extern); + + if (!obj->nr_extern) + return 0; + + /* sort externs by type, for kcfg ones also by (align, size, name) */ + qsort(obj->externs, obj->nr_extern, sizeof(*ext), cmp_externs); + + /* for .ksyms section, 
we need to turn all externs into allocated + * variables in BTF to pass kernel verification; we do this by + * pretending that each extern is a 8-byte variable + */ + if (ksym_sec) { + /* find existing 4-byte integer type in BTF to use for fake + * extern variables in DATASEC + */ + int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM) + continue; + pr_debug("extern (ksym) #%d: symbol %d, name %s\n", + i, ext->sym_idx, ext->name); + } + + sec = ksym_sec; + n = btf_vlen(sec); + for (i = 0, off = 0; i < n; i++, off += sizeof(int)) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + struct btf_type *vt; + + vt = (void *)btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, vt->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); + return -ESRCH; + } + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. + */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } + vs->offset = off; + vs->size = sizeof(int); + } + sec->size = off; + } + + if (kcfg_sec) { + sec = kcfg_sec; + /* for kcfg externs calculate their offsets within a .kconfig map */ + off = 0; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KCFG) + continue; + + ext->kcfg.data_off = roundup(off, ext->kcfg.align); + off = ext->kcfg.data_off + ext->kcfg.sz; + pr_debug("extern (kcfg) #%d: symbol %d, off %u, name %s\n", + i, ext->sym_idx, ext->kcfg.data_off, ext->name); + } + sec->size = off; + n = btf_vlen(sec); + for (i = 0; i < n; i++) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + + t = btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, t->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF var '%s'\n", + ext_name); + return -ESRCH; + } + btf_var(t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vs->offset = ext->kcfg.data_off; + } + } + return 0; +} + +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) +{ + return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; +} + +struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + if (prog_is_subprog(obj, prog)) + continue; + if (!strcmp(prog->name, name)) + return prog; + } + return errno = ENOENT, NULL; +} + +static bool bpf_object__shndx_is_data(const struct bpf_object *obj, + int shndx) +{ + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + case SEC_DATA: + case SEC_RODATA: + return true; + default: + 
return false; + } +} + +static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, + int shndx) +{ + return shndx == obj->efile.btf_maps_shndx; +} + +static enum libbpf_map_type +bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) +{ + if (shndx == obj->efile.symbols_shndx) + return LIBBPF_MAP_KCONFIG; + + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + return LIBBPF_MAP_BSS; + case SEC_DATA: + return LIBBPF_MAP_DATA; + case SEC_RODATA: + return LIBBPF_MAP_RODATA; + default: + return LIBBPF_MAP_UNSPEC; + } +} + +static int bpf_program__record_reloc(struct bpf_program *prog, + struct reloc_desc *reloc_desc, + __u32 insn_idx, const char *sym_name, + const Elf64_Sym *sym, const Elf64_Rel *rel) +{ + struct bpf_insn *insn = &prog->insns[insn_idx]; + size_t map_idx, nr_maps = prog->obj->nr_maps; + struct bpf_object *obj = prog->obj; + __u32 shdr_idx = sym->st_shndx; + enum libbpf_map_type type; + const char *sym_sec_name; + struct bpf_map *map; + + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { + pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", + prog->name, sym_name, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + + if (sym_is_extern(sym)) { + int sym_idx = ELF64_R_SYM(rel->r_info); + int i, n = obj->nr_extern; + struct extern_desc *ext; + + for (i = 0; i < n; i++) { + ext = &obj->externs[i]; + if (ext->sym_idx == sym_idx) + break; + } + if (i >= n) { + pr_warn("prog '%s': extern relo failed to find extern for '%s' (%d)\n", + prog->name, sym_name, sym_idx); + return -LIBBPF_ERRNO__RELOC; + } + pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", + prog->name, i, ext->name, ext->sym_idx, insn_idx); + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_CALL; + else + reloc_desc->type = RELO_EXTERN_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->ext_idx = i; + return 0; + } + + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { + pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", + prog->name, sym_name, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. 
+ */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + + /* generic map reference relocation */ + if (type == LIBBPF_MAP_UNSPEC) { + if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { + pr_warn("prog '%s': bad map relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || + map->sec_idx != sym->st_shndx || + map->sec_offset != sym->st_value) + continue; + pr_debug("prog '%s': found map %zd (%s, sec %d, off %zu) for insn #%u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': map relo failed to find map for section '%s', off %zu\n", + prog->name, sym_sec_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = 0; /* sym->st_value determines map_idx */ + return 0; + } + + /* global data map relocation */ + if (!bpf_object__shndx_is_data(obj, shdr_idx)) { + pr_warn("prog '%s': bad data relo against section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || map->sec_idx != sym->st_shndx) + continue; + pr_debug("prog '%s': found data map %zd (%s, sec %d, off %zu) for insn %u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': data relo failed to find map for section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = sym->st_value; + return 0; +} + +static bool prog_contains_insn(const struct bpf_program *prog, size_t insn_idx) +{ + return insn_idx >= prog->sec_insn_off && + insn_idx < prog->sec_insn_off + prog->sec_insn_cnt; +} + +static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, + size_t sec_idx, size_t insn_idx) +{ + int l = 0, r = obj->nr_programs - 1, m; + struct bpf_program *prog; + + if (!obj->nr_programs) + return NULL; + + while (l < r) { + m = l + (r - l + 1) / 2; + prog = &obj->programs[m]; + + if (prog->sec_idx < sec_idx || + (prog->sec_idx == sec_idx && prog->sec_insn_off <= insn_idx)) + l = m; + else + r = m - 1; + } + /* matching program could be at index l, but it still might be the + * wrong one, so we need to double check conditions for the last time + */ + prog = &obj->programs[l]; + if (prog->sec_idx == sec_idx && prog_contains_insn(prog, insn_idx)) + return prog; + return NULL; +} + +static int +bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) +{ + const char *relo_sec_name, *sec_name; + size_t sec_idx = shdr->sh_info, sym_idx; + struct bpf_program *prog; + struct reloc_desc *relos; + int err, i, nrels; + 
const char *sym_name; + __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; + Elf64_Sym *sym; + Elf64_Rel *rel; + + if (sec_idx >= obj->efile.sec_cnt) + return -EINVAL; + + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + + relo_sec_name = elf_sec_str(obj, shdr->sh_name); + sec_name = elf_sec_name(obj, scn); + if (!relo_sec_name || !sec_name) + return -EINVAL; + + pr_debug("sec '%s': collecting relocation for section(%zu) '%s'\n", + relo_sec_name, sec_idx, sec_name); + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn("sec '%s': failed to get relo #%d\n", relo_sec_name, i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym_idx = ELF64_R_SYM(rel->r_info); + sym = elf_sym_by_idx(obj, sym_idx); + if (!sym) { + pr_warn("sec '%s': symbol #%zu not found for relo #%d\n", + relo_sec_name, sym_idx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sym->st_shndx >= obj->efile.sec_cnt) { + pr_warn("sec '%s': corrupted symbol #%zu pointing to invalid section #%zu for relo #%d\n", + relo_sec_name, sym_idx, (size_t)sym->st_shndx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (rel->r_offset % BPF_INSN_SZ || rel->r_offset >= scn_data->d_size) { + pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", + relo_sec_name, (size_t)rel->r_offset, i); + return -LIBBPF_ERRNO__FORMAT; + } + + insn_idx = rel->r_offset / BPF_INSN_SZ; + /* relocations against static functions are recorded as + * relocations against the section that contains a function; + * in such case, symbol will be STT_SECTION and sym.st_name + * will point to empty string (0), so fetch section name + * instead + */ + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && sym->st_name == 0) + sym_name = elf_sec_name(obj, elf_sec_by_idx(obj, sym->st_shndx)); + else + sym_name = elf_sym_str(obj, sym->st_name); + sym_name = sym_name ?: "<?"; + + pr_debug("sec '%s': relo #%d: insn #%u against '%s'\n", + relo_sec_name, i, insn_idx, sym_name); + + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_warn("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", + relo_sec_name, i, sec_name, insn_idx); + continue; + } + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + prog->reloc_desc = relos; + + /* adjust insn_idx to local BPF program frame of reference */ + insn_idx -= prog->sec_insn_off; + err = bpf_program__record_reloc(prog, &relos[prog->nr_reloc], + insn_idx, sym_name, sym, rel); + if (err) + return err; + + prog->nr_reloc++; + } + return 0; +} + +static int map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map) +{ + int id; + + if (!obj->btf) + return -ENOENT; + + /* if it's BTF-defined map, we don't need to search for type IDs. + * For struct_ops map, it does not need btf_key_type_id and + * btf_value_type_id. + */ + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) + return 0; + + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; + + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; + + map->btf_key_type_id = 0; + map->btf_value_type_id = id; + return 0; +} + +static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) +{ + char file[PATH_MAX], buff[4096]; + FILE *fp; + __u32 val; + int err; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + memset(info, 0, sizeof(*info)); + + fp = fopen(file, "re"); + if (!fp) { + err = -errno; + pr_warn("failed to open %s: %d. 
No procfs support?\n", file, + err); + return err; + } + + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + info->type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + info->key_size = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + info->value_size = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + info->max_entries = val; + else if (sscanf(buff, "map_flags:\t%i", &val) == 1) + info->map_flags = val; + } + + fclose(fp); + + return 0; +} + +bool bpf_map__autocreate(const struct bpf_map *map) +{ + return map->autocreate; +} + +int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->autocreate = autocreate; + return 0; +} + +int bpf_map__reuse_fd(struct bpf_map *map, int fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info), name_len; + int new_fd, err; + char *new_name; + + memset(&info, 0, len); + err = bpf_map_get_info_by_fd(fd, &info, &len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(fd, &info); + if (err) + return libbpf_err(err); + + name_len = strlen(info.name); + if (name_len == BPF_OBJ_NAME_LEN - 1 && strncmp(map->name, info.name, name_len) == 0) + new_name = strdup(map->name); + else + new_name = strdup(info.name); + + if (!new_name) + return libbpf_err(-errno); + + /* + * Like dup(), but make sure new FD is >= 3 and has O_CLOEXEC set. + * This is similar to what we do in ensure_good_fd(), but without + * closing original FD. + */ + new_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (new_fd < 0) { + err = -errno; + goto err_free_new_name; + } + + err = zclose(map->fd); + if (err) { + err = -errno; + goto err_close_new_fd; + } + free(map->name); + + map->fd = new_fd; + map->name = new_name; + map->def.type = info.type; + map->def.key_size = info.key_size; + map->def.value_size = info.value_size; + map->def.max_entries = info.max_entries; + map->def.map_flags = info.map_flags; + map->btf_key_type_id = info.btf_key_type_id; + map->btf_value_type_id = info.btf_value_type_id; + map->reused = true; + map->map_extra = info.map_extra; + + return 0; + +err_close_new_fd: + close(new_fd); +err_free_new_name: + free(new_name); + return libbpf_err(err); +} + +__u32 bpf_map__max_entries(const struct bpf_map *map) +{ + return map->def.max_entries; +} + +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return errno = EINVAL, NULL; + + return map->inner_map; +} + +int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->def.max_entries = max_entries; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + return 0; +} + +static int +bpf_object__probe_loading(struct bpf_object *obj) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + if (obj->gen_loader) + return 0; + + ret = bump_rlimit_memlock(); + if (ret) + pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret); + + /* make sure basic loading works */ + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) { + 
ret = errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF " + "program. Make sure your kernel supports BPF " + "(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is " + "set to big enough value.\n", __func__, cp, ret); + return -ret; + } + close(ret); + + return 0; +} + +static int probe_fd(int fd) +{ + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static int probe_kern_prog_name(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); + + /* make sure loading with name works */ + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); + return probe_fd(ret); +} + +static int probe_kern_global_data(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + insns[0].imm = map; + + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + close(map); + return probe_fd(ret); +} + +static int probe_kern_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func_global(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_datasec(void) +{ + static const char strs[] = "\0x\0.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC val */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return 
probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_decl_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* attr */ + BTF_TYPE_DECL_TAG_ENC(1, 2, -1), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_type_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* attr */ + BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ + /* ptr */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_array_mmap(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); +} + +static int probe_kern_exp_attach_type(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + /* use any valid combination of program type and (optional) + * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) + * to see if kernel supports expected_attach_type field for + * BPF_PROG_LOAD command + */ + fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_kern_probe_read_kernel(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ + BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ + BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(fd); +} + +static int probe_prog_bind_map(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). 
Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. + */ + err = bpf_btf_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + +static int probe_perf_link(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", + insns, ARRAY_SIZE(insns), NULL); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(ret); +} + +static int probe_kern_btf_enum64(void) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_syscall_wrapper(void); + +enum kern_feature_result { + FEAT_UNKNOWN = 0, + FEAT_SUPPORTED = 1, + FEAT_MISSING = 2, +}; + +typedef int (*feature_probe_fn)(void); + +static struct kern_feature_desc { + const char *desc; + feature_probe_fn probe; + enum kern_feature_result res; +} feature_probes[__FEAT_CNT] = { + [FEAT_PROG_NAME] = { + "BPF program name", probe_kern_prog_name, + }, + [FEAT_GLOBAL_DATA] = { + "global variables", probe_kern_global_data, + }, + [FEAT_BTF] = { + "minimal BTF", probe_kern_btf, + }, + [FEAT_BTF_FUNC] = { + "BTF functions", probe_kern_btf_func, + }, + [FEAT_BTF_GLOBAL_FUNC] = { + "BTF global function", probe_kern_btf_func_global, + }, + [FEAT_BTF_DATASEC] = { + "BTF data section and variable", probe_kern_btf_datasec, + }, + [FEAT_ARRAY_MMAP] = { + "ARRAY map mmap()", probe_kern_array_mmap, + }, + [FEAT_EXP_ATTACH_TYPE] = { + "BPF_PROG_LOAD expected_attach_type attribute", + probe_kern_exp_attach_type, + }, + [FEAT_PROBE_READ_KERN] = { + "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT 
support", probe_kern_btf_float, + }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + }, + [FEAT_BTF_DECL_TAG] = { + "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, + }, + [FEAT_BTF_TYPE_TAG] = { + "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, + }, + [FEAT_MEMCG_ACCOUNT] = { + "memcg-based memory accounting", probe_memcg_account, + }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, +}; + +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) +{ + struct kern_feature_desc *feat = &feature_probes[feat_id]; + int ret; + + if (obj && obj->gen_loader) + /* To generate loader program assume the latest kernel + * to avoid doing extra prog_load, map_create syscalls. + */ + return true; + + if (READ_ONCE(feat->res) == FEAT_UNKNOWN) { + ret = feat->probe(); + if (ret > 0) { + WRITE_ONCE(feat->res, FEAT_SUPPORTED); + } else if (ret == 0) { + WRITE_ONCE(feat->res, FEAT_MISSING); + } else { + pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); + WRITE_ONCE(feat->res, FEAT_MISSING); + } + } + + return READ_ONCE(feat->res) == FEAT_SUPPORTED; +} + +static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) +{ + struct bpf_map_info map_info; + char msg[STRERR_BUFSIZE]; + __u32 map_info_len = sizeof(map_info); + int err; + + memset(&map_info, 0, map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); + if (err) { + pr_warn("failed to get map info for map FD %d: %s\n", map_fd, + libbpf_strerror_r(errno, msg, sizeof(msg))); + return false; + } + + return (map_info.type == map->def.type && + map_info.key_size == map->def.key_size && + map_info.value_size == map->def.value_size && + map_info.max_entries == map->def.max_entries && + map_info.map_flags == map->def.map_flags && + map_info.map_extra == map->map_extra); +} + +static int +bpf_object__reuse_map(struct bpf_map *map) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err, pin_fd; + + pin_fd = bpf_obj_get(map->pin_path); + if (pin_fd < 0) { + err = -errno; + if (err == -ENOENT) { + pr_debug("found no pinned map to reuse at '%s'\n", + map->pin_path); + return 0; + } + + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("couldn't retrieve pinned map '%s': %s\n", + map->pin_path, cp); + return err; + } + + if (!map_is_reuse_compat(map, pin_fd)) { + pr_warn("couldn't reuse pinned map at '%s': parameter mismatch\n", + map->pin_path); + close(pin_fd); + return -EINVAL; + } + + err = bpf_map__reuse_fd(map, pin_fd); + close(pin_fd); + if (err) + return err; + + map->pinned = true; + pr_debug("reused pinned map at '%s'\n", map->pin_path); + + return 0; +} + +static int +bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) +{ + enum libbpf_map_type map_type = map->libbpf_type; + char *cp, errmsg[STRERR_BUFSIZE]; + int err, zero = 0; + + if (obj->gen_loader) { + bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps, + map->mmaped, map->def.value_size); + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) + bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); + return 0; + } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, 
sizeof(errmsg)); + pr_warn("Error setting initial map(%s) contents: %s\n", + map->name, cp); + return err; + } + + /* Freeze .rodata and .kconfig map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) { + err = bpf_map_freeze(map->fd); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error freezing map(%s) as read-only: %s\n", + map->name, cp); + return err; + } + } + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map); + +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) +{ + LIBBPF_OPTS(bpf_map_create_opts, create_attr); + struct bpf_map_def *def = &map->def; + const char *map_name = NULL; + int err = 0; + + if (kernel_supports(obj, FEAT_PROG_NAME)) + map_name = map->name; + create_attr.map_ifindex = map->map_ifindex; + create_attr.map_flags = def->map_flags; + create_attr.numa_node = map->numa_node; + create_attr.map_extra = map->map_extra; + + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; + + if (obj->btf && btf__fd(obj->btf) >= 0) { + create_attr.btf_fd = btf__fd(obj->btf); + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; + } + + if (bpf_map_type__is_map_in_map(def->type)) { + if (map->inner_map) { + err = bpf_object__create_map(obj, map->inner_map, true); + if (err) { + pr_warn("map '%s': failed to create inner map: %d\n", + map->name, err); + return err; + } + map->inner_map_fd = bpf_map__fd(map->inner_map); + } + if (map->inner_map_fd >= 0) + create_attr.inner_map_fd = map->inner_map_fd; + } + + switch (def->type) { + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + default: + break; + } + + if (obj->gen_loader) { + bpf_gen__map_create(obj->gen_loader, def->type, map_name, + def->key_size, def->value_size, def->max_entries, + &create_attr, is_inner ? -1 : map - obj->maps); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + map->fd = 0; + } else { + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + if (map->fd < 0 && (create_attr.btf_key_type_id || + create_attr.btf_value_type_id)) { + char *cp, errmsg[STRERR_BUFSIZE]; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", + map->name, cp, err); + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + + err = map->fd < 0 ? 
-errno : 0; + + if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { + if (obj->gen_loader) + map->inner_map->fd = -1; + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + return err; +} + +static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_map *targ_map; + unsigned int i; + int fd, err = 0; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_map = map->init_slots[i]; + fd = bpf_map__fd(targ_map); + + if (obj->gen_loader) { + bpf_gen__populate_outer_map(obj->gen_loader, + map - obj->maps, i, + targ_map - obj->maps); + } else { + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + } + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, i, targ_map->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, i, targ_map->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int init_prog_array_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_program *targ_prog; + unsigned int i; + int fd, err; + + if (obj->gen_loader) + return -ENOTSUP; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_prog = map->init_slots[i]; + fd = bpf_program__fd(targ_prog); + + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to prog '%s' fd=%d: %d\n", + map->name, i, targ_prog->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to prog '%s' fd=%d\n", + map->name, i, targ_prog->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int bpf_object_init_prog_arrays(struct bpf_object *obj) +{ + struct bpf_map *map; + int i, err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!map->init_slots_sz || map->def.type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + err = init_prog_array_slots(obj, map); + if (err < 0) { + zclose(map->fd); + return err; + } + } + return 0; +} + +static int map_set_def_max_entries(struct bpf_map *map) +{ + if (map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !map->def.max_entries) { + int nr_cpus; + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warn("map '%s': failed to determine number of system CPUs: %d\n", + map->name, nr_cpus); + return nr_cpus; + } + pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); + map->def.max_entries = nr_cpus; + } + + return 0; +} + +static int +bpf_object__create_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + char *cp, errmsg[STRERR_BUFSIZE]; + unsigned int i, j; + int err; + bool retried; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + /* To support old kernels, we skip creating global data maps + * (.rodata, .data, .kconfig, etc); later on, during program + * loading, if we detect that at least one of the to-be-loaded + * programs is referencing any global data map, we'll error + * out with program name and relocation index logged. + * This approach allows to accommodate Clang emitting + * unnecessary .rodata.str1.1 sections for string literals, + * but also it allows to have CO-RE applications that use + * global variables in some of BPF programs, but not others. 
+ * If those global variable-using programs are not loaded at + * runtime due to bpf_program__set_autoload(prog, false), + * bpf_object loading will succeed just fine even on old + * kernels. + */ + if (bpf_map__is_internal(map) && !kernel_supports(obj, FEAT_GLOBAL_DATA)) + map->autocreate = false; + + if (!map->autocreate) { + pr_debug("map '%s': skipped auto-creating...\n", map->name); + continue; + } + + err = map_set_def_max_entries(map); + if (err) + goto err_out; + + retried = false; +retry: + if (map->pin_path) { + err = bpf_object__reuse_map(map); + if (err) { + pr_warn("map '%s': error reusing pinned map\n", + map->name); + goto err_out; + } + if (retried && map->fd < 0) { + pr_warn("map '%s': cannot find pinned map\n", + map->name); + err = -ENOENT; + goto err_out; + } + } + + if (map->fd >= 0) { + pr_debug("map '%s': skipping creation (preset fd=%d)\n", + map->name, map->fd); + } else { + err = bpf_object__create_map(obj, map, false); + if (err) + goto err_out; + + pr_debug("map '%s': created successfully, fd=%d\n", + map->name, map->fd); + + if (bpf_map__is_internal(map)) { + err = bpf_object__populate_internal_map(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + + if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { + err = init_map_in_map_slots(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + } + + if (map->pin_path && !map->pinned) { + err = bpf_map__pin(map, NULL); + if (err) { + zclose(map->fd); + if (!retried && err == -EEXIST) { + retried = true; + goto retry; + } + pr_warn("map '%s': failed to auto-pin at '%s': %d\n", + map->name, map->pin_path, err); + goto err_out; + } + } + } + + return 0; + +err_out: + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); + for (j = 0; j < i; j++) + zclose(obj->maps[j].fd); + return err; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +/* Given 'some_struct_name___with_flavor' return the length of a name prefix + * before last triple underscore. Struct name part after last triple + * underscore is ignored by BPF CO-RE relocation during relocation matching. 
+ */ +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} + +void bpf_core_free_cands(struct bpf_core_cand_list *cands) +{ + if (!cands) + return; + + free(cands->cands); + free(cands); +} + +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands) +{ + struct bpf_core_cand *new_cands, *cand; + const struct btf_type *t, *local_t; + const char *targ_name, *local_name; + size_t targ_essent_len; + int n, i; + + local_t = btf__type_by_id(local_cand->btf, local_cand->id); + local_name = btf__str_by_offset(local_cand->btf, local_t->name_off); + + n = btf__type_cnt(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf__type_by_id(targ_btf, i); + if (!btf_kind_core_compat(t, local_t)) + continue; + + targ_name = btf__name_by_offset(targ_btf, t->name_off); + if (str_is_empty(targ_name)) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + + if (strncmp(local_name, targ_name, local_essent_len) != 0) + continue; + + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", + local_cand->id, btf_kind_str(local_t), + local_name, i, btf_kind_str(t), targ_name, + targ_btf_name); + new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, + sizeof(*cands->cands)); + if (!new_cands) + return -ENOMEM; + + cand = &new_cands[cands->len]; + cand->btf = targ_btf; + cand->id = i; + + cands->cands = new_cands; + cands->len++; + } + return 0; +} + +static int load_module_btfs(struct bpf_object *obj) +{ + struct bpf_btf_info info; + struct module_btf *mod_btf; + struct btf *btf; + char name[64]; + __u32 id = 0, len; + int err, fd; + + if (obj->btf_modules_loaded) + return 0; + + if (obj->gen_loader) + return 0; + + /* don't do this again, even if we find no module BTFs */ + obj->btf_modules_loaded = true; + + /* kernel too old to support module BTFs */ + if (!kernel_supports(obj, FEAT_MODULE_BTF)) + return 0; + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err && errno == ENOENT) + return 0; + if (err && errno == EPERM) { + pr_debug("skipping module BTFs loading, missing privileges\n"); + return 0; + } + if (err) { + err = -errno; + pr_warn("failed to iterate BTF objects: %d\n", err); + return err; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; /* expected race: BTF was unloaded */ + err = -errno; + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); + return err; + } + + len = sizeof(info); + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + err = bpf_btf_get_info_by_fd(fd, &info, &len); + if (err) { + err = -errno; + pr_warn("failed to get BTF object #%d info: %d\n", id, err); + goto err_out; + } + + /* ignore non-module BTFs */ + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { + close(fd); + continue; + } + + btf = btf_get_from_fd(fd, obj->btf_vmlinux); + err = libbpf_get_error(btf); + if (err) { + pr_warn("failed to load module [%s]'s BTF object #%d: %d\n", + name, id, err); + goto err_out; + } + + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + if (err) + goto err_out; + + mod_btf = 
&obj->btf_modules[obj->btf_module_cnt++];
+
+		mod_btf->btf = btf;
+		mod_btf->id = id;
+		mod_btf->fd = fd;
+		mod_btf->name = strdup(name);
+		if (!mod_btf->name) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+		continue;
+
+err_out:
+		close(fd);
+		return err;
+	}
+
+	return 0;
+}
+
+static struct bpf_core_cand_list *
+bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id)
+{
+	struct bpf_core_cand local_cand = {};
+	struct bpf_core_cand_list *cands;
+	const struct btf *main_btf;
+	const struct btf_type *local_t;
+	const char *local_name;
+	size_t local_essent_len;
+	int err, i;
+
+	local_cand.btf = local_btf;
+	local_cand.id = local_type_id;
+	local_t = btf__type_by_id(local_btf, local_type_id);
+	if (!local_t)
+		return ERR_PTR(-EINVAL);
+
+	local_name = btf__name_by_offset(local_btf, local_t->name_off);
+	if (str_is_empty(local_name))
+		return ERR_PTR(-EINVAL);
+	local_essent_len = bpf_core_essential_name_len(local_name);
+
+	cands = calloc(1, sizeof(*cands));
+	if (!cands)
+		return ERR_PTR(-ENOMEM);
+
+	/* Attempt to find target candidates in vmlinux BTF first */
+	main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux;
+	err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands);
+	if (err)
+		goto err_out;
+
+	/* if vmlinux BTF has any candidate, don't go for module BTFs */
+	if (cands->len)
+		return cands;
+
+	/* if vmlinux BTF was overridden, don't attempt to load module BTFs */
+	if (obj->btf_vmlinux_override)
+		return cands;
+
+	/* now look through module BTFs, trying to still find candidates */
+	err = load_module_btfs(obj);
+	if (err)
+		goto err_out;
+
+	for (i = 0; i < obj->btf_module_cnt; i++) {
+		err = bpf_core_add_cands(&local_cand, local_essent_len,
+					 obj->btf_modules[i].btf,
+					 obj->btf_modules[i].name,
+					 btf__type_cnt(obj->btf_vmlinux),
+					 cands);
+		if (err)
+			goto err_out;
+	}
+
+	return cands;
+err_out:
+	bpf_core_free_cands(cands);
+	return ERR_PTR(err);
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follows slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ *   kind should match for local and target types (i.e., STRUCT is not
+ *   compatible with UNION);
+ * - for ENUMs, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ *   compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if the types they point to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ *   number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */ +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); +} + +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} + +static size_t bpf_core_hash_fn(const long key, void *ctx) +{ + return key; +} + +static bool bpf_core_equal_fn(const long k1, const long k2, void *ctx) +{ + return k1 == k2; +} + +static int record_relo_core(struct bpf_program *prog, + const struct bpf_core_relo *core_relo, int insn_idx) +{ + struct reloc_desc *relos, *relo; + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + relo = &relos[prog->nr_reloc]; + relo->type = RELO_CORE; + relo->insn_idx = insn_idx; + relo->core_relo = core_relo; + prog->reloc_desc = relos; + prog->nr_reloc++; + return 0; +} + +static const struct bpf_core_relo *find_relo_core(struct bpf_program *prog, int insn_idx) +{ + struct reloc_desc *relo; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + relo = &prog->reloc_desc[i]; + if (relo->type != RELO_CORE || relo->insn_idx != insn_idx) + continue; + + return relo->core_relo; + } + + return NULL; +} + +static int bpf_core_resolve_relo(struct bpf_program *prog, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct hashmap *cand_cache, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec specs_scratch[3] = {}; + struct bpf_core_cand_list *cands = NULL; + const char *prog_name = prog->name; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id = relo->type_id; + int err; + + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, local_id, &cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); + } + err = hashmap__set(cand_cache, local_id, cands, NULL, NULL); + if (err) { + bpf_core_free_cands(cands); + return err; + } + } + + return bpf_core_calc_relo_insn(prog_name, relo, relo_idx, local_btf, cands, specs_scratch, + targ_res); +} + +static int +bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) +{ + const struct btf_ext_info_sec *sec; + struct bpf_core_relo_res targ_res; + const struct bpf_core_relo *rec; + const struct btf_ext_info *seg; + struct hashmap_entry *entry; + struct hashmap *cand_cache = NULL; + struct bpf_program *prog; + struct bpf_insn *insn; + const char *sec_name; + int i, err = 0, insn_idx, sec_idx, sec_num; + + if (obj->btf_ext->core_relo_info.len == 0) + return 0; + + if (targ_btf_path) { + obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL); + err = libbpf_get_error(obj->btf_vmlinux_override); + if (err) { + pr_warn("failed to parse target BTF: %d\n", err); + return err; + } + } + + cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL); + if (IS_ERR(cand_cache)) { + err = PTR_ERR(cand_cache); + goto out; + } + + seg = 
&obj->btf_ext->core_relo_info; + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + sec_idx = seg->sec_idxs[sec_num]; + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) { + err = -EINVAL; + goto out; + } + + pr_debug("sec '%s': found %d CO-RE relocations\n", sec_name, sec->num_info); + + for_each_btf_ext_rec(seg, sec, i, rec) { + if (rec->insn_off % BPF_INSN_SZ) + return -EINVAL; + insn_idx = rec->insn_off / BPF_INSN_SZ; + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + /* When __weak subprog is "overridden" by another instance + * of the subprog from a different object file, linker still + * appends all the .BTF.ext info that used to belong to that + * eliminated subprogram. + * This is similar to what x86-64 linker does for relocations. + * So just ignore such relocations just like we ignore + * subprog instructions when discovering subprograms. + */ + pr_debug("sec '%s': skipping CO-RE relocation #%d for insn #%d belonging to eliminated weak subprogram\n", + sec_name, i, insn_idx); + continue; + } + /* no need to apply CO-RE relocation if the program is + * not going to be loaded + */ + if (!prog->autoload) + continue; + + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx >= prog->insns_cnt) + return -EINVAL; + insn = &prog->insns[insn_idx]; + + err = record_relo_core(prog, rec, insn_idx); + if (err) { + pr_warn("prog '%s': relo #%d: failed to record relocation: %d\n", + prog->name, i, err); + goto out; + } + + if (prog->obj->gen_loader) + continue; + + err = bpf_core_resolve_relo(prog, rec, i, obj->btf, cand_cache, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", + prog->name, i, err); + goto out; + } + + err = bpf_core_patch_insn(prog->name, insn, insn_idx, rec, i, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog->name, i, insn_idx, err); + goto out; + } + } + } + +out: + /* obj->btf_vmlinux and module BTFs are freed after object load */ + btf__free(obj->btf_vmlinux_override); + obj->btf_vmlinux_override = NULL; + + if (!IS_ERR_OR_NULL(cand_cache)) { + hashmap__for_each_entry(cand_cache, entry, i) { + bpf_core_free_cands(entry->pvalue); + } + hashmap__free(cand_cache); + } + return err; +} + +/* base map load ldimm64 special constant, used also for log fixup logic */ +#define POISON_LDIMM64_MAP_BASE 2001000000 +#define POISON_LDIMM64_MAP_PFX "200100" + +static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int map_idx, const struct bpf_map *map) +{ + int i; + + pr_debug("prog '%s': relo #%d: poisoning insn #%d that loads map #%d '%s'\n", + prog->name, relo_idx, insn_idx, map_idx, map->name); + + /* we turn single ldimm64 into two identical invalid calls */ + for (i = 0; i < 2; i++) { + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is map index into obj->maps[] array + */ + insn->imm = POISON_LDIMM64_MAP_BASE + map_idx; + + insn++; + } +} + +/* unresolved kfunc call special constant, used also for log fixup logic */ +#define POISON_CALL_KFUNC_BASE 2002000000 
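+/* Editorial note, illustrative only (not part of the upstream libbpf source):
+ * these poison constants embed an array index into an otherwise-invalid call
+ * immediate. For example, poison_map_ldimm64() above rewrites a load of
+ * obj->maps[3] into calls with imm = 2001000000 + 3 = 2001000003, so a
+ * verifier message mentioning "unknown#2001000003" can be traced back to map
+ * index 3. The "200100" and "2002" prefix strings are presumably what the
+ * log fixup logic matches on when rewriting such verifier messages.
+ */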
+#define POISON_CALL_KFUNC_PFX "2002" + +static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int ext_idx, const struct extern_desc *ext) +{ + pr_debug("prog '%s': relo #%d: poisoning insn #%d that calls kfunc '%s'\n", + prog->name, relo_idx, insn_idx, ext->name); + + /* we turn kfunc call into invalid helper call with identifiable constant */ + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is extern index into obj->externs[] array + */ + insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; +} + +/* Relocate data references within program code: + * - map references; + * - global variable references; + * - extern references. + */ +static int +bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) +{ + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + const struct bpf_map *map; + struct extern_desc *ext; + + switch (relo->type) { + case RELO_LD64: + map = &obj->maps[relo->map_idx]; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_FD; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_DATA: + map = &obj->maps[relo->map_idx]; + insn[1].imm = insn[0].imm + relo->sym_off; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_EXTERN_LD64: + ext = &obj->externs[relo->ext_idx]; + if (ext->type == EXT_KCFG) { + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = obj->kconfig_map_idx; + } else { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; + } + insn[1].imm = ext->kcfg.data_off; + } else /* EXT_KSYM */ { + if (ext->ksym.type_id && ext->is_set) { /* typed ksyms */ + insn[0].src_reg = BPF_PSEUDO_BTF_ID; + insn[0].imm = ext->ksym.kernel_btf_id; + insn[1].imm = ext->ksym.kernel_btf_obj_fd; + } else { /* typeless ksyms or unresolved typed ksyms */ + insn[0].imm = (__u32)ext->ksym.addr; + insn[1].imm = ext->ksym.addr >> 32; + } + } + break; + case RELO_EXTERN_CALL: + ext = &obj->externs[relo->ext_idx]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + if (ext->is_set) { + insn[0].imm = ext->ksym.kernel_btf_id; + insn[0].off = ext->ksym.btf_fd_idx; + } else { /* unresolved weak kfunc call */ + poison_kfunc_call(prog, i, relo->insn_idx, insn, + relo->ext_idx, ext); + } + break; + case RELO_SUBPROG_ADDR: + if (insn[0].src_reg != BPF_PSEUDO_FUNC) { + pr_warn("prog '%s': relo #%d: bad insn\n", + prog->name, i); + return -EINVAL; + } + /* handled already */ + break; + case RELO_CALL: + /* handled already */ + break; + case RELO_CORE: + /* will be handled by bpf_program_record_relos() */ + break; + default: + pr_warn("prog '%s': relo #%d: bad relo type %d\n", + prog->name, i, relo->type); + return -EINVAL; + } + } + + return 0; +} + +static int adjust_prog_btf_ext_info(const struct bpf_object *obj, + const 
struct bpf_program *prog, + const struct btf_ext_info *ext_info, + void **prog_info, __u32 *prog_rec_cnt, + __u32 *prog_rec_sz) +{ + void *copy_start = NULL, *copy_end = NULL; + void *rec, *rec_end, *new_prog_info; + const struct btf_ext_info_sec *sec; + size_t old_sz, new_sz; + int i, sec_num, sec_idx, off_adj; + + sec_num = 0; + for_each_btf_ext_sec(ext_info, sec) { + sec_idx = ext_info->sec_idxs[sec_num]; + sec_num++; + if (prog->sec_idx != sec_idx) + continue; + + for_each_btf_ext_rec(ext_info, sec, i, rec) { + __u32 insn_off = *(__u32 *)rec / BPF_INSN_SZ; + + if (insn_off < prog->sec_insn_off) + continue; + if (insn_off >= prog->sec_insn_off + prog->sec_insn_cnt) + break; + + if (!copy_start) + copy_start = rec; + copy_end = rec + ext_info->rec_size; + } + + if (!copy_start) + return -ENOENT; + + /* append func/line info of a given (sub-)program to the main + * program func/line info + */ + old_sz = (size_t)(*prog_rec_cnt) * ext_info->rec_size; + new_sz = old_sz + (copy_end - copy_start); + new_prog_info = realloc(*prog_info, new_sz); + if (!new_prog_info) + return -ENOMEM; + *prog_info = new_prog_info; + *prog_rec_cnt = new_sz / ext_info->rec_size; + memcpy(new_prog_info + old_sz, copy_start, copy_end - copy_start); + + /* Kernel instruction offsets are in units of 8-byte + * instructions, while .BTF.ext instruction offsets generated + * by Clang are in units of bytes. So convert Clang offsets + * into kernel offsets and adjust offset according to program + * relocated position. + */ + off_adj = prog->sub_insn_off - prog->sec_insn_off; + rec = new_prog_info + old_sz; + rec_end = new_prog_info + new_sz; + for (; rec < rec_end; rec += ext_info->rec_size) { + __u32 *insn_off = rec; + + *insn_off = *insn_off / BPF_INSN_SZ + off_adj; + } + *prog_rec_sz = ext_info->rec_size; + return 0; + } + + return -ENOENT; +} + +static int +reloc_prog_func_and_line_info(const struct bpf_object *obj, + struct bpf_program *main_prog, + const struct bpf_program *prog) +{ + int err; + + /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't + * supprot func/line info + */ + if (!obj->btf_ext || !kernel_supports(obj, FEAT_BTF_FUNC)) + return 0; + + /* only attempt func info relocation if main program's func_info + * relocation was successful + */ + if (main_prog != prog && !main_prog->func_info) + goto line_info; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->func_info, + &main_prog->func_info, + &main_prog->func_info_cnt, + &main_prog->func_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext function info: %d\n", + prog->name, err); + return err; + } + if (main_prog->func_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext function info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. 
*/ + pr_warn("prog '%s': missing .BTF.ext function info for the main program, skipping all of .BTF.ext func info.\n", + prog->name); + } + +line_info: + /* don't relocate line info if main program's relocation failed */ + if (main_prog != prog && !main_prog->line_info) + return 0; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->line_info, + &main_prog->line_info, + &main_prog->line_info_cnt, + &main_prog->line_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext line info: %d\n", + prog->name, err); + return err; + } + if (main_prog->line_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext line info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext line info for the main program, skipping all of .BTF.ext line info.\n", + prog->name); + } + return 0; +} + +static int cmp_relo_by_insn_idx(const void *key, const void *elem) +{ + size_t insn_idx = *(const size_t *)key; + const struct reloc_desc *relo = elem; + + if (insn_idx == relo->insn_idx) + return 0; + return insn_idx < relo->insn_idx ? -1 : 1; +} + +static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) +{ + if (!prog->nr_reloc) + return NULL; + return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, + sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); +} + +static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_program *subprog) +{ + int new_cnt = main_prog->nr_reloc + subprog->nr_reloc; + struct reloc_desc *relos; + int i; + + if (main_prog == subprog) + return 0; + relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (!relos && new_cnt) + return -ENOMEM; + if (subprog->nr_reloc) + memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, + sizeof(*relos) * subprog->nr_reloc); + + for (i = main_prog->nr_reloc; i < new_cnt; i++) + relos[i].insn_idx += subprog->sub_insn_off; + /* After insn_idx adjustment the 'relos' array is still sorted + * by insn_idx and doesn't break bsearch. 
+	 */
+	main_prog->reloc_desc = relos;
+	main_prog->nr_reloc = new_cnt;
+	return 0;
+}
+
+static int
+bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog,
+		       struct bpf_program *prog)
+{
+	size_t sub_insn_idx, insn_idx, new_cnt;
+	struct bpf_program *subprog;
+	struct bpf_insn *insns, *insn;
+	struct reloc_desc *relo;
+	int err;
+
+	err = reloc_prog_func_and_line_info(obj, main_prog, prog);
+	if (err)
+		return err;
+
+	for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) {
+		insn = &main_prog->insns[prog->sub_insn_off + insn_idx];
+		if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn))
+			continue;
+
+		relo = find_prog_insn_relo(prog, insn_idx);
+		if (relo && relo->type == RELO_EXTERN_CALL)
+			/* kfunc relocations will be handled later
+			 * in bpf_object__relocate_data()
+			 */
+			continue;
+		if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) {
+			pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n",
+				prog->name, insn_idx, relo->type);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+		if (relo) {
+			/* sub-program instruction index is a combination of
+			 * an offset of a symbol pointed to by relocation and
+			 * call instruction's imm field; for global functions,
+			 * call always has imm = -1, but for static functions
+			 * relocation is against STT_SECTION and insn->imm
+			 * points to a start of a static function
+			 *
+			 * for subprog addr relocation, the relo->sym_off + insn->imm is
+			 * the byte offset in the corresponding section.
+			 */
+			if (relo->type == RELO_CALL)
+				sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
+			else
+				sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ;
+		} else if (insn_is_pseudo_func(insn)) {
+			/*
+			 * RELO_SUBPROG_ADDR relo is always emitted even if both
+			 * functions are in the same section, so it shouldn't reach here.
+			 */
+			pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n",
+				prog->name, insn_idx);
+			return -LIBBPF_ERRNO__RELOC;
+		} else {
+			/* if subprogram call is to a static function within
+			 * the same ELF section, there won't be any relocation
+			 * emitted, but it also means there is no additional
+			 * offset necessary, insns->imm is relative to
+			 * instruction's original position within the section
+			 */
+			sub_insn_idx = prog->sec_insn_off + insn_idx + insn->imm + 1;
+		}
+
+		/* we enforce that sub-programs should be in .text section */
+		subprog = find_prog_by_sec_insn(obj, obj->efile.text_shndx, sub_insn_idx);
+		if (!subprog) {
+			pr_warn("prog '%s': no .text section found yet sub-program call exists\n",
+				prog->name);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+
+		/* if it's the first call instruction calling into this
+		 * subprogram (meaning this subprog hasn't been processed
+		 * yet) within the context of current main program:
+		 *   - append it at the end of main program's instructions block;
+		 *   - process it recursively, while current program is put on hold;
+		 *   - if that subprogram calls some other not yet processed
+		 *     subprogram, same thing will happen recursively until
+		 *     there are no more unprocessed subprograms left to append
+		 *     and relocate.
+ */ + if (subprog->sub_insn_off == 0) { + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + /* The subprog insns are now appended. Append its relos too. */ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + err = bpf_object__reloc_code(obj, main_prog, subprog); + if (err) + return err; + } + + /* main_prog->insns memory could have been re-allocated, so + * calculate pointer again + */ + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + /* calculate correct instruction position within current main + * prog; each main prog can have a different set of + * subprograms appended (potentially in different order as + * well), so position of any subprog can be different for + * different main programs + */ + insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; + + pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", + prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); + } + + return 0; +} + +/* + * Relocate sub-program calls. + * + * Algorithm operates as follows. Each entry-point BPF program (referred to as + * main prog) is processed separately. For each subprog (non-entry functions, + * that can be called from either entry progs or other subprogs) gets their + * sub_insn_off reset to zero. This serves as indicator that this subprogram + * hasn't been yet appended and relocated within current main prog. Once its + * relocated, sub_insn_off will point at the position within current main prog + * where given subprog was appended. This will further be used to relocate all + * the call instructions jumping into this subprog. + * + * We start with main program and process all call instructions. If the call + * is into a subprog that hasn't been processed (i.e., subprog->sub_insn_off + * is zero), subprog instructions are appended at the end of main program's + * instruction array. Then main program is "put on hold" while we recursively + * process newly appended subprogram. If that subprogram calls into another + * subprogram that hasn't been appended, new subprogram is appended again to + * the *main* prog's instructions (subprog's instructions are always left + * untouched, as they need to be in unmodified state for subsequent main progs + * and subprog instructions are always sent only as part of a main prog) and + * the process continues recursively. Once all the subprogs called from a main + * prog or any of its subprogs are appended (and relocated), all their + * positions within finalized instructions array are known, so it's easy to + * rewrite call instructions with correct relative offsets, corresponding to + * desired target subprog. + * + * Its important to realize that some subprogs might not be called from some + * main prog and any of its called/used subprogs. Those will keep their + * subprog->sub_insn_off as zero at all times and won't be appended to current + * main prog and won't be relocated within the context of current main prog. 
+ * They might still be used from other main progs later. + * + * Visually this process can be shown as below. Suppose we have two main + * programs mainA and mainB and BPF object contains three subprogs: subA, + * subB, and subC. mainA calls only subA, mainB calls only subC, but subA and + * subC both call subB: + * + * +--------+ +-------+ + * | v v | + * +--+---+ +--+-+-+ +---+--+ + * | subA | | subB | | subC | + * +--+---+ +------+ +---+--+ + * ^ ^ + * | | + * +---+-------+ +------+----+ + * | mainA | | mainB | + * +-----------+ +-----------+ + * + * We'll start relocating mainA, will find subA, append it and start + * processing sub A recursively: + * + * +-----------+------+ + * | mainA | subA | + * +-----------+------+ + * + * At this point we notice that subB is used from subA, so we append it and + * relocate (there are no further subcalls from subB): + * + * +-----------+------+------+ + * | mainA | subA | subB | + * +-----------+------+------+ + * + * At this point, we relocate subA calls, then go one level up and finish with + * relocatin mainA calls. mainA is done. + * + * For mainB process is similar but results in different order. We start with + * mainB and skip subA and subB, as mainB never calls them (at least + * directly), but we see subC is needed, so we append and start processing it: + * + * +-----------+------+ + * | mainB | subC | + * +-----------+------+ + * Now we see subC needs subB, so we go back to it, append and relocate it: + * + * +-----------+------+------+ + * | mainB | subC | subB | + * +-----------+------+------+ + * + * At this point we unwind recursion, relocate calls in subC, then in mainB. + */ +static int +bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_program *subprog; + int i, err; + + /* mark all subprogs as not relocated (yet) within the context of + * current main program + */ + for (i = 0; i < obj->nr_programs; i++) { + subprog = &obj->programs[i]; + if (!prog_is_subprog(obj, subprog)) + continue; + + subprog->sub_insn_off = 0; + } + + err = bpf_object__reloc_code(obj, prog, prog); + if (err) + return err; + + return 0; +} + +static void +bpf_object__free_relocs(struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* free up relocation descriptors */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + } +} + +static int cmp_relocs(const void *_a, const void *_b) +{ + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; + + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + return 0; +} + +static void bpf_object__sort_relos(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } +} + +static int +bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) +{ + struct bpf_program *prog; + size_t i, j; + int err; + + if (obj->btf_ext) { + err = bpf_object__relocate_core(obj, targ_btf_path); + if (err) { + pr_warn("failed to perform CO-RE relocations: %d\n", + err); + return err; + } + bpf_object__sort_relos(obj); + } + + /* Before relocating calls pre-process relocations and mark + * few ld_imm64 instructions that points to subprogs. 
+ * Otherwise bpf_object__reloc_code() later would have to consider + * all ld_imm64 insns as relocation candidates. That would + * reduce relocation speed, since amount of find_prog_insn_relo() + * would increase and most of them will fail to find a relo. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + for (j = 0; j < prog->nr_reloc; j++) { + struct reloc_desc *relo = &prog->reloc_desc[j]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + + /* mark the insn, so it's recognized by insn_is_pseudo_func() */ + if (relo->type == RELO_SUBPROG_ADDR) + insn[0].src_reg = BPF_PSEUDO_FUNC; + } + } + + /* relocate subprogram calls and append used subprograms to main + * programs; each copy of subprogram code needs to be relocated + * differently for each main program, because its code location might + * have changed. + * Append subprog relos to main programs to allow data relos to be + * processed after text is completely relocated. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + /* sub-program's sub-calls are relocated within the context of + * its main program only + */ + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + + err = bpf_object__relocate_calls(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate calls: %d\n", + prog->name, err); + return err; + } + } + /* Process data relos for main programs */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + err = bpf_object__relocate_data(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate data references: %d\n", + prog->name, err); + return err; + } + } + + return 0; +} + +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data); + +static int bpf_object__collect_map_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data) +{ + const int bpf_ptr_sz = 8, host_ptr_sz = sizeof(void *); + int i, j, nrels, new_sz; + const struct btf_var_secinfo *vi = NULL; + const struct btf_type *sec, *var, *def; + struct bpf_map *map = NULL, *targ_map = NULL; + struct bpf_program *targ_prog = NULL; + bool is_prog_array, is_map_in_map; + const struct btf_member *member; + const char *name, *mname, *type; + unsigned int moff; + Elf64_Sym *sym; + Elf64_Rel *rel; + void *tmp; + + if (!obj->efile.btf_maps_sec_btf_id || !obj->btf) + return -EINVAL; + sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id); + if (!sec) + return -EINVAL; + + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn(".maps relo #%d: failed to get ELF relo\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn(".maps relo #%d: symbol %zx not found\n", + i, (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + name = elf_sym_str(obj, sym->st_name) ?: ""; + + pr_debug(".maps relo #%d: for %zd value %zd rel->r_offset %zu name %d ('%s')\n", + i, (ssize_t)(rel->r_info >> 32), (size_t)sym->st_value, + (size_t)rel->r_offset, sym->st_name, name); + + for (j = 0; j < obj->nr_maps; j++) { + map = &obj->maps[j]; + if (map->sec_idx != obj->efile.btf_maps_shndx) + continue; + + vi = btf_var_secinfos(sec) + map->btf_var_idx; + if (vi->offset <= rel->r_offset && + rel->r_offset + bpf_ptr_sz <= vi->offset + vi->size) + break; + } + if (j == obj->nr_maps) { + 
pr_warn(".maps relo #%d: cannot find map '%s' at rel->r_offset %zu\n", + i, name, (size_t)rel->r_offset); + return -EINVAL; + } + + is_map_in_map = bpf_map_type__is_map_in_map(map->def.type); + is_prog_array = map->def.type == BPF_MAP_TYPE_PROG_ARRAY; + type = is_map_in_map ? "map" : "prog"; + if (is_map_in_map) { + if (sym->st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) { + pr_warn(".maps relo #%d: '%s' isn't a valid map reference\n", + i, name); + return -ESRCH; + } + } else if (is_prog_array) { + targ_prog = bpf_object__find_program_by_name(obj, name); + if (!targ_prog) { + pr_warn(".maps relo #%d: '%s' isn't a valid program reference\n", + i, name); + return -ESRCH; + } + if (targ_prog->sec_idx != sym->st_shndx || + targ_prog->sec_insn_off * 8 != sym->st_value || + prog_is_subprog(obj, targ_prog)) { + pr_warn(".maps relo #%d: '%s' isn't an entry-point program\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + } else { + return -EINVAL; + } + + var = btf__type_by_id(obj->btf, vi->type); + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (btf_vlen(def) == 0) + return -EINVAL; + member = btf_members(def) + btf_vlen(def) - 1; + mname = btf__name_by_offset(obj->btf, member->name_off); + if (strcmp(mname, "values")) + return -EINVAL; + + moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8; + if (rel->r_offset - vi->offset < moff) + return -EINVAL; + + moff = rel->r_offset - vi->offset - moff; + /* here we use BPF pointer size, which is always 64 bit, as we + * are parsing ELF that was built for BPF target + */ + if (moff % bpf_ptr_sz) + return -EINVAL; + moff /= bpf_ptr_sz; + if (moff >= map->init_slots_sz) { + new_sz = moff + 1; + tmp = libbpf_reallocarray(map->init_slots, new_sz, host_ptr_sz); + if (!tmp) + return -ENOMEM; + map->init_slots = tmp; + memset(map->init_slots + map->init_slots_sz, 0, + (new_sz - map->init_slots_sz) * host_ptr_sz); + map->init_slots_sz = new_sz; + } + map->init_slots[moff] = is_map_in_map ? 
(void *)targ_map : (void *)targ_prog; + + pr_debug(".maps relo #%d: map '%s' slot [%d] points to %s '%s'\n", + i, map->name, moff, type, name); + } + + return 0; +} + +static int bpf_object__collect_relos(struct bpf_object *obj) +{ + int i, err; + + for (i = 0; i < obj->efile.sec_cnt; i++) { + struct elf_sec_desc *sec_desc = &obj->efile.secs[i]; + Elf64_Shdr *shdr; + Elf_Data *data; + int idx; + + if (sec_desc->sec_type != SEC_RELO) + continue; + + shdr = sec_desc->shdr; + data = sec_desc->data; + idx = shdr->sh_info; + + if (shdr->sh_type != SHT_REL) { + pr_warn("internal error at %d\n", __LINE__); + return -LIBBPF_ERRNO__INTERNAL; + } + + if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx) + err = bpf_object__collect_st_ops_relos(obj, shdr, data); + else if (idx == obj->efile.btf_maps_shndx) + err = bpf_object__collect_map_relos(obj, shdr, data); + else + err = bpf_object__collect_prog_relos(obj, shdr, data); + if (err) + return err; + } + + bpf_object__sort_relos(obj); + return 0; +} + +static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id) +{ + if (BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == 0 && + insn->dst_reg == 0) { + *func_id = insn->imm; + return true; + } + return false; +} + +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_insn *insn = prog->insns; + enum bpf_func_id func_id; + int i; + + if (obj->gen_loader) + return 0; + + for (i = 0; i < prog->insns_cnt; i++, insn++) { + if (!insn_is_helper_call(insn, &func_id)) + continue; + + /* on kernels that don't yet support + * bpf_probe_read_{kernel,user}[_str] helpers, fall back + * to bpf_probe_read() which works well for old kernels + */ + switch (func_id) { + case BPF_FUNC_probe_read_kernel: + case BPF_FUNC_probe_read_user: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read; + break; + case BPF_FUNC_probe_read_kernel_str: + case BPF_FUNC_probe_read_user_str: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read_str; + break; + default: + break; + } + } + return 0; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id); + +/* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */ +static int libbpf_prepare_prog_load(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie) +{ + enum sec_def_flags def = cookie; + + /* old kernels might not support specifying expected_attach_type */ + if ((def & SEC_EXP_ATTACH_OPT) && !kernel_supports(prog->obj, FEAT_EXP_ATTACH_TYPE)) + opts->expected_attach_type = 0; + + if (def & SEC_SLEEPABLE) + opts->prog_flags |= BPF_F_SLEEPABLE; + + if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) + opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; + + if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { + int btf_obj_fd = 0, btf_type_id = 0, err; + const char *attach_name; + + attach_name = strchr(prog->sec_name, '/'); + if (!attach_name) { + /* if BPF program is annotated with just SEC("fentry") + * (or similar) without declaratively specifying + * target, then it is expected that target will be + * specified with bpf_program__set_attach_target() at + * runtime before BPF object load step. If not, then + * there is nothing to load into the kernel as BPF + * verifier won't be able to validate BPF program + * correctness anyways. 
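+			 * For example, SEC("fentry/do_unlinkat") names the attach
+			 * target declaratively, while plain SEC("fentry") relies on
+			 * bpf_program__set_attach_target() being called before load.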
+ */ + pr_warn("prog '%s': no BTF-based attach target is specified, use bpf_program__set_attach_target()\n", + prog->name); + return -EINVAL; + } + attach_name++; /* skip over / */ + + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id); + if (err) + return err; + + /* cache resolved BTF FD and BTF type ID in the prog */ + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_btf_id = btf_type_id; + + /* but by now libbpf common logic is not utilizing + * prog->atach_btf_obj_fd/prog->attach_btf_id anymore because + * this callback is called after opts were populated by + * libbpf, so this callback has to update opts explicitly here + */ + opts->attach_btf_obj_fd = btf_obj_fd; + opts->attach_btf_id = btf_type_id; + } + return 0; +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); + +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_attr); + const char *prog_name = NULL; + char *cp, errmsg[STRERR_BUFSIZE]; + size_t log_buf_size = 0; + char *log_buf = NULL, *tmp; + int btf_fd, ret, err; + bool own_log_buf = true; + __u32 log_level = prog->log_level; + + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* + * The program type must be set. Most likely we couldn't find a proper + * section definition at load time, and thus we didn't infer the type. + */ + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + if (!insns || !insns_cnt) + return -EINVAL; + + load_attr.expected_attach_type = prog->expected_attach_type; + if (kernel_supports(obj, FEAT_PROG_NAME)) + prog_name = prog->name; + load_attr.attach_prog_fd = prog->attach_prog_fd; + load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.kern_version = kern_version; + load_attr.prog_ifindex = prog->prog_ifindex; + + /* specify func_info/line_info only if kernel supports them */ + btf_fd = bpf_object__btf_fd(obj); + if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf_fd; + load_attr.func_info = prog->func_info; + load_attr.func_info_rec_size = prog->func_info_rec_size; + load_attr.func_info_cnt = prog->func_info_cnt; + load_attr.line_info = prog->line_info; + load_attr.line_info_rec_size = prog->line_info_rec_size; + load_attr.line_info_cnt = prog->line_info_cnt; + } + load_attr.log_level = log_level; + load_attr.prog_flags = prog->prog_flags; + load_attr.fd_array = obj->fd_array; + + /* adjust load_attr if sec_def provides custom preload callback */ + if (prog->sec_def && prog->sec_def->prog_prepare_load_fn) { + err = prog->sec_def->prog_prepare_load_fn(prog, &load_attr, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to prepare load attributes: %d\n", + prog->name, err); + return err; + } + insns = prog->insns; + insns_cnt = prog->insns_cnt; + } + + if (obj->gen_loader) { + bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name, + license, insns, insns_cnt, &load_attr, + prog - obj->programs); + *prog_fd = -1; + return 0; + } + +retry_load: + /* if log_level is zero, we don't request logs initially even if + * custom log_buf is specified; if the program load fails, then we'll + * bump log_level to 1 and use either custom log_buf or we'll allocate + * our own and retry the load to get details on what failed + */ + if 
(log_level) { + if (prog->log_buf) { + log_buf = prog->log_buf; + log_buf_size = prog->log_size; + own_log_buf = false; + } else if (obj->log_buf) { + log_buf = obj->log_buf; + log_buf_size = obj->log_size; + own_log_buf = false; + } else { + log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size * 2); + tmp = realloc(log_buf, log_buf_size); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + log_buf = tmp; + log_buf[0] = '\0'; + own_log_buf = true; + } + } + + load_attr.log_buf = log_buf; + load_attr.log_size = log_buf_size; + load_attr.log_level = log_level; + + ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); + if (ret >= 0) { + if (log_level && own_log_buf) { + pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + + if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) { + struct bpf_map *map; + int i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &prog->obj->maps[i]; + if (map->libbpf_type != LIBBPF_MAP_RODATA) + continue; + + if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to bind map '%s': %s\n", + prog->name, map->real_name, cp); + /* Don't fail hard if can't bind rodata. */ + } + } + } + + *prog_fd = ret; + ret = 0; + goto out; + } + + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* On ENOSPC, increase log buffer size and retry, unless custom + * log_buf is specified. + * Be careful to not overflow u32, though. Kernel's log buf size limit + * isn't part of UAPI so it can always be bumped to full 4GB. So don't + * multiply by 2 unless we are sure we'll fit within 32 bits. + * Currently, we'll get -EINVAL when we reach (UINT_MAX >> 2). + */ + if (own_log_buf && errno == ENOSPC && log_buf_size <= UINT_MAX / 2) + goto retry_load; + + ret = -errno; + + /* post-process verifier log to improve error descriptions */ + fixup_verifier_log(prog, log_buf, log_buf_size); + + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); + pr_perm_msg(ret); + + if (own_log_buf && log_buf && log_buf[0] != '\0') { + pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + +out: + if (own_log_buf) + free(log_buf); + return ret; +} + +static char *find_prev_line(char *buf, char *cur) +{ + char *p; + + if (cur == buf) /* end of a log buf */ + return NULL; + + p = cur - 1; + while (p - 1 >= buf && *(p - 1) != '\n') + p--; + + return p; +} + +static void patch_log(char *buf, size_t buf_sz, size_t log_sz, + char *orig, size_t orig_sz, const char *patch) +{ + /* size of the remaining log content to the right from the to-be-replaced part */ + size_t rem_sz = (buf + log_sz) - (orig + orig_sz); + size_t patch_sz = strlen(patch); + + if (patch_sz != orig_sz) { + /* If patch line(s) are longer than original piece of verifier log, + * shift log contents by (patch_sz - orig_sz) bytes to the right + * starting from after to-be-replaced part of the log. + * + * If patch line(s) are shorter than original piece of verifier log, + * shift log contents by (orig_sz - patch_sz) bytes to the left + * starting from after to-be-replaced part of the log + * + * We need to be careful about not overflowing available + * buf_sz capacity. If that's the case, we'll truncate the end + * of the original log, as necessary. 
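+		 * For example, replacing a 4-byte piece with a 10-byte patch
+		 * shifts the tail right by 6 bytes, truncating the end of the
+		 * log if those extra bytes don't fit within buf_sz.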
+ */ + if (patch_sz > orig_sz) { + if (orig + patch_sz >= buf + buf_sz) { + /* patch is big enough to cover remaining space completely */ + patch_sz -= (orig + patch_sz) - (buf + buf_sz) + 1; + rem_sz = 0; + } else if (patch_sz - orig_sz > buf_sz - log_sz) { + /* patch causes part of remaining log to be truncated */ + rem_sz -= (patch_sz - orig_sz) - (buf_sz - log_sz); + } + } + /* shift remaining log to the right by calculated amount */ + memmove(orig + patch_sz, orig + orig_sz, rem_sz); + } + + memcpy(orig, patch, patch_sz); +} + +static void fixup_log_failed_core_relo(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded CO-RE relocation: + * line1 -> 123: (85) call unknown#195896080 + * line2 -> invalid func unknown#195896080 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. We extract + * instruction index to find corresponding CO-RE relocation and + * replace this part of the log with more relevant information about + * failed CO-RE relocation. + */ + const struct bpf_core_relo *relo; + struct bpf_core_spec spec; + char patch[512], spec_buf[256]; + int insn_idx, err, spec_len; + + if (sscanf(line1, "%d: (%*d) call unknown#195896080\n", &insn_idx) != 1) + return; + + relo = find_relo_core(prog, insn_idx); + if (!relo) + return; + + err = bpf_core_parse_spec(prog->name, prog->obj->btf, relo, &spec); + if (err) + return; + + spec_len = bpf_core_format_spec(spec_buf, sizeof(spec_buf), &spec); + snprintf(patch, sizeof(patch), + "%d: \n" + "failed to resolve CO-RE relocation %s%s\n", + insn_idx, spec_buf, spec_len >= sizeof(spec_buf) ? "..." : ""); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_map_load(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded map reference: + * line1 -> 123: (85) call unknown#2001000345 + * line2 -> invalid func unknown#2001000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2001000345" is a map index in obj->maps to fetch map name. + */ + struct bpf_object *obj = prog->obj; + const struct bpf_map *map; + int insn_idx, map_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &map_idx) != 2) + return; + + map_idx -= POISON_LDIMM64_MAP_BASE; + if (map_idx < 0 || map_idx >= obj->nr_maps) + return; + map = &obj->maps[map_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "BPF map '%s' is referenced but wasn't created\n", + insn_idx, map->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_kfunc_call(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded kfunc call: + * line1 -> 123: (85) call unknown#2002000345 + * line2 -> invalid func unknown#2002000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2002000345" is an extern index in obj->externs to fetch kfunc name. 
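+	 * In practice such poisoned calls come from __weak kfunc externs
+	 * that could not be resolved against kernel/module BTF and were not
+	 * guarded by something like a bpf_ksym_exists() check.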
+ */ + struct bpf_object *obj = prog->obj; + const struct extern_desc *ext; + int insn_idx, ext_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &ext_idx) != 2) + return; + + ext_idx -= POISON_CALL_KFUNC_BASE; + if (ext_idx < 0 || ext_idx >= obj->nr_extern) + return; + ext = &obj->externs[ext_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "kfunc '%s' is referenced but wasn't resolved\n", + insn_idx, ext->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) +{ + /* look for familiar error patterns in last N lines of the log */ + const size_t max_last_line_cnt = 10; + char *prev_line, *cur_line, *next_line; + size_t log_sz; + int i; + + if (!buf) + return; + + log_sz = strlen(buf) + 1; + next_line = buf + log_sz - 1; + + for (i = 0; i < max_last_line_cnt; i++, next_line = cur_line) { + cur_line = find_prev_line(buf, next_line); + if (!cur_line) + return; + + if (str_has_pfx(cur_line, "invalid func unknown#195896080\n")) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* failed CO-RE relocation case */ + fixup_log_failed_core_relo(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_LDIMM64_MAP_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to uncreated BPF map */ + fixup_log_missing_map_load(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_CALL_KFUNC_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to unresolved kfunc */ + fixup_log_missing_kfunc_call(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } + } +} + +static int bpf_program_record_relos(struct bpf_program *prog) +{ + struct bpf_object *obj = prog->obj; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct extern_desc *ext = &obj->externs[relo->ext_idx]; + int kind; + + switch (relo->type) { + case RELO_EXTERN_LD64: + if (ext->type != EXT_KSYM) + continue; + kind = btf_is_var(btf__type_by_id(obj->btf, ext->btf_id)) ? 
+ BTF_KIND_VAR : BTF_KIND_FUNC; + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, !ext->ksym.type_id, + true, kind, relo->insn_idx); + break; + case RELO_EXTERN_CALL: + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, false, false, BTF_KIND_FUNC, + relo->insn_idx); + break; + case RELO_CORE: { + struct bpf_core_relo cr = { + .insn_off = relo->insn_idx * 8, + .type_id = relo->core_relo->type_id, + .access_str_off = relo->core_relo->access_str_off, + .kind = relo->core_relo->kind, + }; + + bpf_gen__record_relo_core(obj->gen_loader, &cr); + break; + } + default: + continue; + } + } + return 0; +} + +static int +bpf_object__load_progs(struct bpf_object *obj, int log_level) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + err = bpf_object__sanitize_prog(obj, prog); + if (err) + return err; + } + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) { + pr_debug("prog '%s': skipped loading\n", prog->name); + continue; + } + prog->log_level |= log_level; + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); + return err; + } + } + + bpf_object__free_relocs(obj); + return 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name); + +static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts) +{ + struct bpf_program *prog; + int err; + + bpf_object__for_each_program(prog, obj) { + prog->sec_def = find_sec_def(prog->sec_name); + if (!prog->sec_def) { + /* couldn't guess, but user might manually specify */ + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", + prog->name, prog->sec_name); + continue; + } + + prog->type = prog->sec_def->prog_type; + prog->expected_attach_type = prog->sec_def->expected_attach_type; + + /* sec_def can have custom callback which should be called + * after bpf_program is initialized to adjust its properties + */ + if (prog->sec_def->prog_setup_fn) { + err = prog->sec_def->prog_setup_fn(prog, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to initialize: %d\n", + prog->name, err); + return err; + } + } + } + + return 0; +} + +static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + const char *obj_name, *kconfig, *btf_tmp_path; + struct bpf_object *obj; + char tmp_name[64]; + int err; + char *log_buf; + size_t log_size; + __u32 log_level; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", + path ? 
: "(mem buf)"); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + if (!OPTS_VALID(opts, bpf_object_open_opts)) + return ERR_PTR(-EINVAL); + + obj_name = OPTS_GET(opts, object_name, NULL); + if (obj_buf) { + if (!obj_name) { + snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", + (unsigned long)obj_buf, + (unsigned long)obj_buf_sz); + obj_name = tmp_name; + } + path = obj_name; + pr_debug("loading object '%s' from buffer\n", obj_name); + } + + log_buf = OPTS_GET(opts, kernel_log_buf, NULL); + log_size = OPTS_GET(opts, kernel_log_size, 0); + log_level = OPTS_GET(opts, kernel_log_level, 0); + if (log_size > UINT_MAX) + return ERR_PTR(-EINVAL); + if (log_size && !log_buf) + return ERR_PTR(-EINVAL); + + obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); + if (IS_ERR(obj)) + return obj; + + obj->log_buf = log_buf; + obj->log_size = log_size; + obj->log_level = log_level; + + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); + if (btf_tmp_path) { + if (strlen(btf_tmp_path) >= PATH_MAX) { + err = -ENAMETOOLONG; + goto out; + } + obj->btf_custom_path = strdup(btf_tmp_path); + if (!obj->btf_custom_path) { + err = -ENOMEM; + goto out; + } + } + + kconfig = OPTS_GET(opts, kconfig, NULL); + if (kconfig) { + obj->kconfig = strdup(kconfig); + if (!obj->kconfig) { + err = -ENOMEM; + goto out; + } + } + + err = bpf_object__elf_init(obj); + err = err ? : bpf_object__check_endianness(obj); + err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__collect_externs(obj); + err = err ? : bpf_object_fixup_btf(obj); + err = err ? : bpf_object__init_maps(obj, opts); + err = err ? : bpf_object_init_progs(obj, opts); + err = err ? : bpf_object__collect_relos(obj); + if (err) + goto out; + + bpf_object__elf_finish(obj); + + return obj; +out: + bpf_object__close(obj); + return ERR_PTR(err); +} + +struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) +{ + if (!path) + return libbpf_err_ptr(-EINVAL); + + pr_debug("loading %s\n", path); + + return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); +} + +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + +struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + if (!obj_buf || obj_buf_sz == 0) + return libbpf_err_ptr(-EINVAL); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); +} + +static int bpf_object_unload(struct bpf_object *obj) +{ + size_t i; + + if (!obj) + return libbpf_err(-EINVAL); + + for (i = 0; i < obj->nr_maps; i++) { + zclose(obj->maps[i].fd); + if (obj->maps[i].st_ops) + zfree(&obj->maps[i].st_ops->kern_vdata); + } + + for (i = 0; i < obj->nr_programs; i++) + bpf_program__unload(&obj->programs[i]); + + return 0; +} + +static int bpf_object__sanitize_maps(struct bpf_object *obj) +{ + struct bpf_map *m; + + bpf_object__for_each_map(m, obj) { + if (!bpf_map__is_internal(m)) + continue; + if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) + m->def.map_flags &= ~BPF_F_MMAPABLE; + } + + return 0; +} + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +{ + char sym_type, sym_name[500]; + unsigned long long sym_addr; + int ret, err = 0; + FILE *f; + + f = fopen("/proc/kallsyms", "re"); + if (!f) { + err = -errno; + pr_warn("failed to open /proc/kallsyms: %d\n", err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %c %499s%*[^\n]\n", + &sym_addr, &sym_type, sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 3) { + 
pr_warn("failed to read kallsyms entry: %d\n", ret); + err = -EINVAL; + break; + } + + err = cb(sym_addr, sym_type, sym_name, ctx); + if (err) + break; + } + + fclose(f); + return err; +} + +static int kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct bpf_object *obj = ctx; + const struct btf_type *t; + struct extern_desc *ext; + + ext = find_extern_by_name(obj, sym_name); + if (!ext || ext->type != EXT_KSYM) + return 0; + + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + return 0; + + if (ext->is_set && ext->ksym.addr != sym_addr) { + pr_warn("extern (ksym) '%s': resolution is ambiguous: 0x%llx or 0x%llx\n", + sym_name, ext->ksym.addr, sym_addr); + return -EINVAL; + } + if (!ext->is_set) { + ext->is_set = true; + ext->ksym.addr = sym_addr; + pr_debug("extern (ksym) '%s': set to 0x%llx\n", sym_name, sym_addr); + } + return 0; +} + +static int bpf_object__read_kallsyms_file(struct bpf_object *obj) +{ + return libbpf_kallsyms_parse(kallsyms_cb, obj); +} + +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + struct module_btf **res_mod_btf) +{ + struct module_btf *mod_btf; + struct btf *btf; + int i, id, err; + + btf = obj->btf_vmlinux; + mod_btf = NULL; + id = btf__find_by_name_kind(btf, ksym_name, kind); + + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; + + for (i = 0; i < obj->btf_module_cnt; i++) { + /* we assume module_btf's BTF FD is always >0 */ + mod_btf = &obj->btf_modules[i]; + btf = mod_btf->btf; + id = btf__find_by_name_kind_own(btf, ksym_name, kind); + if (id != -ENOENT) + break; + } + } + if (id <= 0) + return -ESRCH; + + *res_btf = btf; + *res_mod_btf = mod_btf; + return id; +} + +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + struct module_btf *mod_btf = NULL; + const char *targ_var_name; + struct btf *btf = NULL; + int id, err; + + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &mod_btf); + if (id < 0) { + if (id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (var ksym) '%s': not found in kernel BTF\n", + ext->name); + return id; + } + + /* find local type_id */ + local_type_id = ext->ksym.type_id; + + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); + + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if (err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); + + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = mod_btf ? 
mod_btf->fd : 0; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + + return 0; +} + +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + struct module_btf *mod_btf = NULL; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, &kern_btf, &mod_btf); + if (kfunc_id < 0) { + if (kfunc_id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (func ksym) '%s': not found in kernel or module BTFs\n", + ext->name); + return kfunc_id; + } + + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with %s [%d]\n", + ext->name, local_func_proto_id, + mod_btf ? mod_btf->name : "vmlinux", kfunc_proto_id); + return -EINVAL; + } + + /* set index for module BTF fd in fd_array, if unset */ + if (mod_btf && !mod_btf->fd_array_idx) { + /* insn->off is s16 */ + if (obj->fd_array_cnt == INT16_MAX) { + pr_warn("extern (func ksym) '%s': module BTF fd index %d too big to fit in bpf_insn offset\n", + ext->name, mod_btf->fd_array_idx); + return -E2BIG; + } + /* Cannot use index 0 for module BTF fd */ + if (!obj->fd_array_cnt) + obj->fd_array_cnt = 1; + + ret = libbpf_ensure_mem((void **)&obj->fd_array, &obj->fd_array_cap, sizeof(int), + obj->fd_array_cnt + 1); + if (ret) + return ret; + mod_btf->fd_array_idx = obj->fd_array_cnt; + /* we assume module BTF FD is always >0 */ + obj->fd_array[obj->fd_array_cnt++] = mod_btf->fd; + } + + ext->is_set = true; + ext->ksym.kernel_btf_id = kfunc_id; + ext->ksym.btf_fd_idx = mod_btf ? mod_btf->fd_array_idx : 0; + /* Also set kernel_btf_obj_fd to make sure that bpf_object__relocate_data() + * populates FD into ld_imm64 insn when it's used to point to kfunc. + * {kernel_btf_id, btf_fd_idx} -> fixup bpf_call. + * {kernel_btf_id, kernel_btf_obj_fd} -> fixup ld_imm64. + */ + ext->ksym.kernel_btf_obj_fd = mod_btf ? mod_btf->fd : 0; + pr_debug("extern (func ksym) '%s': resolved to %s [%d]\n", + ext->name, mod_btf ? 
mod_btf->name : "vmlinux", kfunc_id); + + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + if (obj->gen_loader) { + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = 0; + ext->ksym.kernel_btf_id = 0; + continue; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); + if (err) + return err; + } + return 0; +} + +static int bpf_object__resolve_externs(struct bpf_object *obj, + const char *extra_kconfig) +{ + bool need_config = false, need_kallsyms = false; + bool need_vmlinux_btf = false; + struct extern_desc *ext; + void *kcfg_data = NULL; + int err, i; + + if (obj->nr_extern == 0) + return 0; + + if (obj->kconfig_map_idx >= 0) + kcfg_data = obj->maps[obj->kconfig_map_idx].mmaped; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (ext->type == EXT_KSYM) { + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; + continue; + } else if (ext->type == EXT_KCFG) { + void *ext_ptr = kcfg_data + ext->kcfg.data_off; + __u64 value = 0; + + /* Kconfig externs need actual /proc/config.gz */ + if (str_has_pfx(ext->name, "CONFIG_")) { + need_config = true; + continue; + } + + /* Virtual kcfg externs are customly handled by libbpf */ + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + value = get_kernel_version(); + if (!value) { + pr_warn("extern (kcfg) '%s': failed to get kernel version\n", ext->name); + return -EINVAL; + } + } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { + value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (strcmp(ext->name, "LINUX_HAS_SYSCALL_WRAPPER") == 0) { + value = kernel_supports(obj, FEAT_SYSCALL_WRAPPER); + } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { + /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed + * __kconfig externs, where LINUX_ ones are virtual and filled out + * customly by libbpf (their values don't come from Kconfig). + * If LINUX_xxx variable is not recognized by libbpf, but is marked + * __weak, it defaults to zero value, just like for CONFIG_xxx + * externs. 
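+				 * For example, LINUX_KERNEL_VERSION is computed by libbpf
+				 * itself, while a CONFIG_HZ extern is read from
+				 * /proc/config.gz (or the /boot/config-* fallback).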
+ */ + pr_warn("extern (kcfg) '%s': unrecognized virtual extern\n", ext->name); + return -EINVAL; + } + + err = set_kcfg_value_num(ext, ext_ptr, value); + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to 0x%llx\n", + ext->name, (long long)value); + } else { + pr_warn("extern '%s': unrecognized extern kind\n", ext->name); + return -EINVAL; + } + } + if (need_config && extra_kconfig) { + err = bpf_object__read_kconfig_mem(obj, extra_kconfig, kcfg_data); + if (err) + return -EINVAL; + need_config = false; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG && !ext->is_set) { + need_config = true; + break; + } + } + } + if (need_config) { + err = bpf_object__read_kconfig_file(obj, kcfg_data); + if (err) + return -EINVAL; + } + if (need_kallsyms) { + err = bpf_object__read_kallsyms_file(obj); + if (err) + return -EINVAL; + } + if (need_vmlinux_btf) { + err = bpf_object__resolve_ksyms_btf_id(obj); + if (err) + return -EINVAL; + } + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (!ext->is_set && !ext->is_weak) { + pr_warn("extern '%s' (strong): not resolved\n", ext->name); + return -ESRCH; + } else if (!ext->is_set) { + pr_debug("extern '%s' (weak): not resolved, defaulting to zero\n", + ext->name); + } + } + + return 0; +} + +static void bpf_map_prepare_vdata(const struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + __u32 i; + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } +} + +static int bpf_object_prepare_struct_ops(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_maps; i++) + if (bpf_map__is_struct_ops(&obj->maps[i])) + bpf_map_prepare_vdata(&obj->maps[i]); + + return 0; +} + +static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) +{ + int err, i; + + if (!obj) + return libbpf_err(-EINVAL); + + if (obj->loaded) { + pr_warn("object '%s': load can't be attempted twice\n", obj->name); + return libbpf_err(-EINVAL); + } + + if (obj->gen_loader) + bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps); + + err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__load_progs(obj, extra_log_level); + err = err ? : bpf_object_init_prog_arrays(obj); + err = err ? 
: bpf_object_prepare_struct_ops(obj); + + if (obj->gen_loader) { + /* reset FDs */ + if (obj->btf) + btf__set_fd(obj->btf, -1); + for (i = 0; i < obj->nr_maps; i++) + obj->maps[i].fd = -1; + if (!err) + err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); + } + + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + free(obj->btf_modules); + + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; + + obj->loaded = true; /* doesn't matter if successfully or not */ + + if (err) + goto out; + + return 0; +out: + /* unpin any maps that were auto-pinned during load */ + for (i = 0; i < obj->nr_maps; i++) + if (obj->maps[i].pinned && !obj->maps[i].reused) + bpf_map__unpin(&obj->maps[i], NULL); + + bpf_object_unload(obj); + pr_warn("failed to load object '%s'\n", obj->path); + return libbpf_err(err); +} + +int bpf_object__load(struct bpf_object *obj) +{ + return bpf_object_load(obj, 0, NULL); +} + +static int make_parent_dir(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + char *dname, *dir; + int err = 0; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (mkdir(dir, 0700) && errno != EEXIST) + err = -errno; + + free(dname); + if (err) { + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to mkdir %s: %s\n", path, cp); + } + return err; +} + +static int check_path(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct statfs st_fs; + char *dname, *dir; + int err = 0; + + if (path == NULL) + return -EINVAL; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (statfs(dir, &st_fs)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("failed to statfs %s: %s\n", dir, cp); + err = -errno; + } + free(dname); + + if (!err && st_fs.f_type != BPF_FS_MAGIC) { + pr_warn("specified path %s is not on BPF FS\n", path); + err = -EINVAL; + } + + return err; +} + +int bpf_program__pin(struct bpf_program *prog, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + + err = check_path(path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(prog->fd, path)) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); + return libbpf_err(err); + } + + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); + return 0; +} + +int bpf_program__unpin(struct bpf_program *prog, const char *path) +{ + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err) + return libbpf_err(-errno); + + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); + return 0; +} + +int bpf_map__pin(struct bpf_map *map, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' 
already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_debug("map '%s' already pinned at '%s'; not re-pinning\n", + bpf_map__name(map), map->pin_path); + return 0; + } + } else { + if (!path) { + pr_warn("missing a path to pin map '%s' at\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_warn("map '%s' already pinned\n", bpf_map__name(map)); + return libbpf_err(-EEXIST); + } + + map->pin_path = strdup(path); + if (!map->pin_path) { + err = -errno; + goto out_err; + } + } + + err = make_parent_dir(map->pin_path); + if (err) + return libbpf_err(err); + + err = check_path(map->pin_path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(map->fd, map->pin_path)) { + err = -errno; + goto out_err; + } + + map->pinned = true; + pr_debug("pinned map '%s'\n", map->pin_path); + + return 0; + +out_err: + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to pin map: %s\n", cp); + return libbpf_err(err); +} + +int bpf_map__unpin(struct bpf_map *map, const char *path) +{ + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } + path = map->pin_path; + } else if (!path) { + pr_warn("no path to unpin map '%s' from\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err != 0) + return libbpf_err(-errno); + + map->pinned = false; + pr_debug("unpinned map '%s' from '%s'\n", bpf_map__name(map), path); + + return 0; +} + +int bpf_map__set_pin_path(struct bpf_map *map, const char *path) +{ + char *new = NULL; + + if (path) { + new = strdup(path); + if (!new) + return libbpf_err(-errno); + } + + free(map->pin_path); + map->pin_path = new; + return 0; +} + +__alias(bpf_map__pin_path) +const char *bpf_map__get_pin_path(const struct bpf_map *map); + +const char *bpf_map__pin_path(const struct bpf_map *map) +{ + return map->pin_path; +} + +bool bpf_map__is_pinned(const struct bpf_map *map) +{ + return map->pinned; +} + +static void sanitize_pin_path(char *s) +{ + /* bpffs disallows periods in path names */ + while (*s) { + if (*s == '.') + *s = '_'; + s++; + } +} + +int bpf_object__pin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (!map->autocreate) + continue; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + goto err_unpin_maps; + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__pin(map, pin_path); + if (err) + goto err_unpin_maps; + } + + return 0; + +err_unpin_maps: + while ((map = bpf_object__prev_map(obj, map))) { + if (!map->pin_path) + continue; + + bpf_map__unpin(map, NULL); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + 
bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__unpin(map, pin_path); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + char buf[PATH_MAX]; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_program(prog, obj) { + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + goto err_unpin_programs; + + err = bpf_program__pin(prog, buf); + if (err) + goto err_unpin_programs; + } + + return 0; + +err_unpin_programs: + while ((prog = bpf_object__prev_program(obj, prog))) { + if (pathname_concat(buf, sizeof(buf), path, prog->name)) + continue; + + bpf_program__unpin(prog, buf); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + bpf_object__for_each_program(prog, obj) { + char buf[PATH_MAX]; + + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); + + err = bpf_program__unpin(prog, buf); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin(struct bpf_object *obj, const char *path) +{ + int err; + + err = bpf_object__pin_maps(obj, path); + if (err) + return libbpf_err(err); + + err = bpf_object__pin_programs(obj, path); + if (err) { + bpf_object__unpin_maps(obj, path); + return libbpf_err(err); + } + + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map) +{ + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + if (map->mmaped) { + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + munmap(map->mmaped, mmap_sz); + map->mmaped = NULL; + } + + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + + zfree(&map->name); + zfree(&map->real_name); + zfree(&map->pin_path); + + if (map->fd >= 0) + zclose(map->fd); +} + +void bpf_object__close(struct bpf_object *obj) +{ + size_t i; + + if (IS_ERR_OR_NULL(obj)) + return; + + usdt_manager_free(obj->usdt_man); + obj->usdt_man = NULL; + + bpf_gen__free(obj->gen_loader); + bpf_object__elf_finish(obj); + bpf_object_unload(obj); + btf__free(obj->btf); + btf_ext__free(obj->btf_ext); + + for (i = 0; i < obj->nr_maps; i++) + bpf_map__destroy(&obj->maps[i]); + + zfree(&obj->btf_custom_path); + zfree(&obj->kconfig); + zfree(&obj->externs); + obj->nr_extern = 0; + + zfree(&obj->maps); + obj->nr_maps = 0; + + if (obj->programs && obj->nr_programs) { + for (i = 0; i < obj->nr_programs; i++) + bpf_program__exit(&obj->programs[i]); + } + zfree(&obj->programs); + + free(obj); +} + +const char *bpf_object__name(const struct bpf_object *obj) +{ + return obj ? obj->name : libbpf_err_ptr(-EINVAL); +} + +unsigned int bpf_object__kversion(const struct bpf_object *obj) +{ + return obj ? obj->kern_version : 0; +} + +struct btf *bpf_object__btf(const struct bpf_object *obj) +{ + return obj ? 
obj->btf : NULL; +} + +int bpf_object__btf_fd(const struct bpf_object *obj) +{ + return obj->btf ? btf__fd(obj->btf) : -1; +} + +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return libbpf_err(-EINVAL); + + obj->kern_version = kern_version; + + return 0; +} + +int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) +{ + struct bpf_gen *gen; + + if (!opts) + return -EFAULT; + if (!OPTS_VALID(opts, gen_loader_opts)) + return -EINVAL; + gen = calloc(sizeof(*gen), 1); + if (!gen) + return -ENOMEM; + gen->opts = opts; + obj->gen_loader = gen; + return 0; +} + +static struct bpf_program * +__bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, + bool forward) +{ + size_t nr_programs = obj->nr_programs; + ssize_t idx; + + if (!nr_programs) + return NULL; + + if (!p) + /* Iter from the beginning */ + return forward ? &obj->programs[0] : + &obj->programs[nr_programs - 1]; + + if (p->obj != obj) { + pr_warn("error: program handler doesn't match object\n"); + return errno = EINVAL, NULL; + } + + idx = (p - obj->programs) + (forward ? 1 : -1); + if (idx >= obj->nr_programs || idx < 0) + return NULL; + return &obj->programs[idx]; +} + +struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) +{ + struct bpf_program *prog = prev; + + do { + prog = __bpf_program__iter(prog, obj, true); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) +{ + struct bpf_program *prog = next; + + do { + prog = __bpf_program__iter(prog, obj, false); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) +{ + prog->prog_ifindex = ifindex; +} + +const char *bpf_program__name(const struct bpf_program *prog) +{ + return prog->name; +} + +const char *bpf_program__section_name(const struct bpf_program *prog) +{ + return prog->sec_name; +} + +bool bpf_program__autoload(const struct bpf_program *prog) +{ + return prog->autoload; +} + +int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) +{ + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + prog->autoload = autoload; + return 0; +} + +bool bpf_program__autoattach(const struct bpf_program *prog) +{ + return prog->autoattach; +} + +void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach) +{ + prog->autoattach = autoattach; +} + +const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) +{ + return prog->insns; +} + +size_t bpf_program__insn_cnt(const struct bpf_program *prog) +{ + return prog->insns_cnt; +} + +int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt) +{ + struct bpf_insn *insns; + + if (prog->obj->loaded) + return -EBUSY; + + insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); + /* NULL is a valid return from reallocarray if the new count is zero */ + if (!insns && new_insn_cnt) { + pr_warn("prog '%s': failed to realloc prog code\n", prog->name); + return -ENOMEM; + } + memcpy(insns, new_insns, new_insn_cnt * sizeof(*insns)); + + prog->insns = insns; + prog->insns_cnt = new_insn_cnt; + return 0; +} + +int bpf_program__fd(const struct bpf_program *prog) +{ + if (!prog) + return libbpf_err(-EINVAL); + + if (prog->fd < 0) + return libbpf_err(-ENOENT); + + return prog->fd; +} + 
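+/* Illustrative sketch only (not part of libbpf's API surface): the typical
+ * sequence an application uses to drive the open/load logic implemented
+ * above. Only public libbpf functions defined in this file are used;
+ * "kfunc.bpf.o" is a hypothetical object file name.
+ */
+__attribute__((unused))
+static int example_open_load_and_report(void)
+{
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	int err;
+
+	/* parse the ELF and BTF, but don't load anything into the kernel yet */
+	obj = bpf_object__open_file("kfunc.bpf.o", NULL);
+	if (!obj)
+		return -errno;
+
+	/* create maps, perform relocations, load all auto-load programs */
+	err = bpf_object__load(obj);
+	if (err)
+		goto out;
+
+	/* auto-loaded programs now have a valid kernel FD */
+	bpf_object__for_each_program(prog, obj)
+		pr_debug("example: prog '%s' has fd %d\n",
+			 bpf_program__name(prog), bpf_program__fd(prog));
+out:
+	/* closing the object releases maps, programs and their FDs */
+	bpf_object__close(obj);
+	return err;
+}
+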
+__alias(bpf_program__type) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); + +enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) +{ + return prog->type; +} + +static size_t custom_sec_def_cnt; +static struct bpf_sec_def *custom_sec_defs; +static struct bpf_sec_def custom_fallback_def; +static bool has_custom_fallback_def; +static int last_custom_sec_def_handler_id; + +int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + /* if type is not changed, do nothing */ + if (prog->type == type) + return 0; + + prog->type = type; + + /* If a program type was changed, we need to reset associated SEC() + * handler, as it will be invalid now. The only exception is a generic + * fallback handler, which by definition is program type-agnostic and + * is a catch-all custom handler, optionally set by the application, + * so should be able to handle any type of BPF program. + */ + if (prog->sec_def != &custom_fallback_def) + prog->sec_def = NULL; + return 0; +} + +__alias(bpf_program__expected_attach_type) +enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); + +enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program *prog) +{ + return prog->expected_attach_type; +} + +int bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->expected_attach_type = type; + return 0; +} + +__u32 bpf_program__flags(const struct bpf_program *prog) +{ + return prog->prog_flags; +} + +int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->prog_flags = flags; + return 0; +} + +__u32 bpf_program__log_level(const struct bpf_program *prog) +{ + return prog->log_level; +} + +int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->log_level = log_level; + return 0; +} + +const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size) +{ + *log_size = prog->log_size; + return prog->log_buf; +} + +int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size) +{ + if (log_size && !log_buf) + return -EINVAL; + if (prog->log_size > UINT_MAX) + return -EINVAL; + if (prog->obj->loaded) + return -EBUSY; + + prog->log_buf = log_buf; + prog->log_size = log_size; + return 0; +} + +#define SEC_DEF(sec_pfx, ptype, atype, flags, ...) 
{ \ + .sec = (char *)sec_pfx, \ + .prog_type = BPF_PROG_TYPE_##ptype, \ + .expected_attach_type = atype, \ + .cookie = (long)(flags), \ + .prog_prepare_load_fn = libbpf_prepare_prog_load, \ + __VA_ARGS__ \ +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); + +static const struct bpf_sec_def section_defs[] = { + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), + SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), + SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), + SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), + SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, 
attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), + SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), + SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), + SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), + SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), + SEC_DEF("struct_ops+", STRUCT_OPS, 
0, SEC_NONE), + SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), + SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), +}; + +int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts) +{ + struct bpf_sec_def *sec_def; + + if (!OPTS_VALID(opts, libbpf_prog_handler_opts)) + return libbpf_err(-EINVAL); + + if (last_custom_sec_def_handler_id == INT_MAX) /* prevent overflow */ + return libbpf_err(-E2BIG); + + if (sec) { + sec_def = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt + 1, + sizeof(*sec_def)); + if (!sec_def) + return libbpf_err(-ENOMEM); + + custom_sec_defs = sec_def; + sec_def = &custom_sec_defs[custom_sec_def_cnt]; + } else { + if (has_custom_fallback_def) + return libbpf_err(-EBUSY); + + sec_def = &custom_fallback_def; + } + + sec_def->sec = sec ? strdup(sec) : NULL; + if (sec && !sec_def->sec) + return libbpf_err(-ENOMEM); + + sec_def->prog_type = prog_type; + sec_def->expected_attach_type = exp_attach_type; + sec_def->cookie = OPTS_GET(opts, cookie, 0); + + sec_def->prog_setup_fn = OPTS_GET(opts, prog_setup_fn, NULL); + sec_def->prog_prepare_load_fn = OPTS_GET(opts, prog_prepare_load_fn, NULL); + sec_def->prog_attach_fn = OPTS_GET(opts, prog_attach_fn, NULL); + + sec_def->handler_id = ++last_custom_sec_def_handler_id; + + if (sec) + custom_sec_def_cnt++; + else + has_custom_fallback_def = true; + + return sec_def->handler_id; +} + +int libbpf_unregister_prog_handler(int handler_id) +{ + struct bpf_sec_def *sec_defs; + int i; + + if (handler_id <= 0) + return libbpf_err(-EINVAL); + + if (has_custom_fallback_def && custom_fallback_def.handler_id == handler_id) { + memset(&custom_fallback_def, 0, sizeof(custom_fallback_def)); + has_custom_fallback_def = false; + return 0; + } + + for (i = 0; i < custom_sec_def_cnt; i++) { + if (custom_sec_defs[i].handler_id == handler_id) + break; + } + + if (i == custom_sec_def_cnt) + return libbpf_err(-ENOENT); + + free(custom_sec_defs[i].sec); + for (i = i + 1; i < custom_sec_def_cnt; i++) + custom_sec_defs[i - 1] = custom_sec_defs[i]; + custom_sec_def_cnt--; + + /* try to shrink the array, but it's ok if we couldn't */ + sec_defs = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt, sizeof(*sec_defs)); + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (sec_defs || custom_sec_def_cnt == 0) + custom_sec_defs = sec_defs; + + return 0; +} + +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) +{ + size_t len = strlen(sec_def->sec); + + /* "type/" always has to have proper SEC("type/extras") form */ + if (sec_def->sec[len - 1] == '/') { + if (str_has_pfx(sec_name, sec_def->sec)) + return true; + return false; + } + + /* "type+" means it can be either exact SEC("type") or + * well-formed SEC("type/extras") with proper '/' separator + */ + if (sec_def->sec[len - 1] == '+') { + len--; + /* not even a prefix */ + if (strncmp(sec_name, sec_def->sec, len) != 0) + return false; + /* exact match or has '/' separator */ + if (sec_name[len] == '\0' || sec_name[len] == '/') + return true; + return false; + } + + return strcmp(sec_name, sec_def->sec) == 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name) +{ + const struct bpf_sec_def *sec_def; 
+	int i, n;
+
+	n = custom_sec_def_cnt;
+	for (i = 0; i < n; i++) {
+		sec_def = &custom_sec_defs[i];
+		if (sec_def_matches(sec_def, sec_name))
+			return sec_def;
+	}
+
+	n = ARRAY_SIZE(section_defs);
+	for (i = 0; i < n; i++) {
+		sec_def = &section_defs[i];
+		if (sec_def_matches(sec_def, sec_name))
+			return sec_def;
+	}
+
+	if (has_custom_fallback_def)
+		return &custom_fallback_def;
+
+	return NULL;
+}
+
+#define MAX_TYPE_NAME_SIZE 32
+
+static char *libbpf_get_type_names(bool attach_type)
+{
+	int i, len = ARRAY_SIZE(section_defs) * MAX_TYPE_NAME_SIZE;
+	char *buf;
+
+	buf = malloc(len);
+	if (!buf)
+		return NULL;
+
+	buf[0] = '\0';
+	/* Forge string buf with all available names */
+	for (i = 0; i < ARRAY_SIZE(section_defs); i++) {
+		const struct bpf_sec_def *sec_def = &section_defs[i];
+
+		if (attach_type) {
+			if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load)
+				continue;
+
+			if (!(sec_def->cookie & SEC_ATTACHABLE))
+				continue;
+		}
+
+		if (strlen(buf) + strlen(section_defs[i].sec) + 2 > len) {
+			free(buf);
+			return NULL;
+		}
+		strcat(buf, " ");
+		strcat(buf, section_defs[i].sec);
+	}
+
+	return buf;
+}
+
+int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
+			     enum bpf_attach_type *expected_attach_type)
+{
+	const struct bpf_sec_def *sec_def;
+	char *type_names;
+
+	if (!name)
+		return libbpf_err(-EINVAL);
+
+	sec_def = find_sec_def(name);
+	if (sec_def) {
+		*prog_type = sec_def->prog_type;
+		*expected_attach_type = sec_def->expected_attach_type;
+		return 0;
+	}
+
+	pr_debug("failed to guess program type from ELF section '%s'\n", name);
+	type_names = libbpf_get_type_names(false);
+	if (type_names != NULL) {
+		pr_debug("supported section(type) names are:%s\n", type_names);
+		free(type_names);
+	}
+
+	return libbpf_err(-ESRCH);
+}
+
+const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t)
+{
+	if (t < 0 || t >= ARRAY_SIZE(attach_type_name))
+		return NULL;
+
+	return attach_type_name[t];
+}
+
+const char *libbpf_bpf_link_type_str(enum bpf_link_type t)
+{
+	if (t < 0 || t >= ARRAY_SIZE(link_type_name))
+		return NULL;
+
+	return link_type_name[t];
+}
+
+const char *libbpf_bpf_map_type_str(enum bpf_map_type t)
+{
+	if (t < 0 || t >= ARRAY_SIZE(map_type_name))
+		return NULL;
+
+	return map_type_name[t];
+}
+
+const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t)
+{
+	if (t < 0 || t >= ARRAY_SIZE(prog_type_name))
+		return NULL;
+
+	return prog_type_name[t];
+}
+
+static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj,
+						     int sec_idx,
+						     size_t offset)
+{
+	struct bpf_map *map;
+	size_t i;
+
+	for (i = 0; i < obj->nr_maps; i++) {
+		map = &obj->maps[i];
+		if (!bpf_map__is_struct_ops(map))
+			continue;
+		if (map->sec_idx == sec_idx &&
+		    map->sec_offset <= offset &&
+		    offset - map->sec_offset < map->def.value_size)
+			return map;
+	}
+
+	return NULL;
+}
+
+/* Collect the reloc from ELF and populate the st_ops->progs[] */
+static int bpf_object__collect_st_ops_relos(struct bpf_object *obj,
+					    Elf64_Shdr *shdr, Elf_Data *data)
+{
+	const struct btf_member *member;
+	struct bpf_struct_ops *st_ops;
+	struct bpf_program *prog;
+	unsigned int shdr_idx;
+	const struct btf *btf;
+	struct bpf_map *map;
+	unsigned int moff, insn_idx;
+	const char *name;
+	__u32 member_idx;
+	Elf64_Sym *sym;
+	Elf64_Rel *rel;
+	int i, nrels;
+
+	btf = obj->btf;
+	nrels = shdr->sh_size / shdr->sh_entsize;
+	for (i = 0; i < nrels; i++) {
+		rel = elf_rel_by_idx(data, i);
+		if (!rel) {
+			pr_warn("struct_ops reloc: failed to get %d reloc\n", i);
+			return -LIBBPF_ERRNO__FORMAT;
+ } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn("struct_ops reloc: symbol %zx not found\n", + (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + + name = elf_sym_str(obj, sym->st_name) ?: ""; + map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset); + if (!map) { + pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n", + (size_t)rel->r_offset); + return -EINVAL; + } + + moff = rel->r_offset - map->sec_offset; + shdr_idx = sym->st_shndx; + st_ops = map->st_ops; + pr_debug("struct_ops reloc %s: for %lld value %lld shdr_idx %u rel->r_offset %zu map->sec_offset %zu name %d (\'%s\')\n", + map->name, + (long long)(rel->r_info >> 32), + (long long)sym->st_value, + shdr_idx, (size_t)rel->r_offset, + map->sec_offset, sym->st_name, name); + + if (shdr_idx >= SHN_LORESERVE) { + pr_warn("struct_ops reloc %s: rel->r_offset %zu shdr_idx %u unsupported non-static function\n", + map->name, (size_t)rel->r_offset, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", + map->name, (unsigned long long)sym->st_value); + return -LIBBPF_ERRNO__FORMAT; + } + insn_idx = sym->st_value / BPF_INSN_SZ; + + member = find_member_by_offset(st_ops->type, moff * 8); + if (!member) { + pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", + map->name, moff); + return -EINVAL; + } + member_idx = member - btf_members(st_ops->type); + name = btf__name_by_offset(btf, member->name_off); + + if (!resolve_func_ptr(btf, member->type, NULL)) { + pr_warn("struct_ops reloc %s: cannot relocate non func ptr %s\n", + map->name, name); + return -EINVAL; + } + + prog = find_prog_by_sec_insn(obj, shdr_idx, insn_idx); + if (!prog) { + pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", + map->name, shdr_idx, name); + return -EINVAL; + } + + /* prevent the use of BPF prog with invalid type */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("struct_ops reloc %s: prog %s is not struct_ops BPF program\n", + map->name, prog->name); + return -EINVAL; + } + + /* if we haven't yet processed this BPF program, record proper + * attach_btf_id and member_idx + */ + if (!prog->attach_btf_id) { + prog->attach_btf_id = st_ops->type_id; + prog->expected_attach_type = member_idx; + } + + /* struct_ops BPF prog can be re-used between multiple + * .struct_ops & .struct_ops.link as long as it's the + * same struct_ops struct definition and the same + * function pointer field + */ + if (prog->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != member_idx) { + pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", + map->name, prog->name, prog->sec_name, prog->type, + prog->attach_btf_id, prog->expected_attach_type, name); + return -EINVAL; + } + + st_ops->progs[member_idx] = prog; + } + + return 0; +} + +#define BTF_TRACE_PREFIX "btf_trace_" +#define BTF_LSM_PREFIX "bpf_lsm_" +#define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_MAX_NAME_SIZE 128 + +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind) +{ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + *prefix = BTF_TRACE_PREFIX; + *kind = BTF_KIND_TYPEDEF; + break; + case BPF_LSM_MAC: + case BPF_LSM_CGROUP: + *prefix = BTF_LSM_PREFIX; + *kind = BTF_KIND_FUNC; + break; + case BPF_TRACE_ITER: + *prefix = BTF_ITER_PREFIX; + *kind = 
BTF_KIND_FUNC; + break; + default: + *prefix = ""; + *kind = BTF_KIND_FUNC; + } +} + +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind) +{ + char btf_type_name[BTF_MAX_NAME_SIZE]; + int ret; + + ret = snprintf(btf_type_name, sizeof(btf_type_name), + "%s%s", prefix, name); + /* snprintf returns the number of characters written excluding the + * terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * indicates truncation. + */ + if (ret < 0 || ret >= sizeof(btf_type_name)) + return -ENAMETOOLONG; + return btf__find_by_name_kind(btf, btf_type_name, kind); +} + +static inline int find_attach_btf_id(struct btf *btf, const char *name, + enum bpf_attach_type attach_type) +{ + const char *prefix; + int kind; + + btf_get_kernel_prefix_kind(attach_type, &prefix, &kind); + return find_btf_by_prefix_kind(btf, prefix, name, kind); +} + +int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type) +{ + struct btf *btf; + int err; + + btf = btf__load_vmlinux_btf(); + err = libbpf_get_error(btf); + if (err) { + pr_warn("vmlinux BTF is not found\n"); + return libbpf_err(err); + } + + err = find_attach_btf_id(btf, name, attach_type); + if (err <= 0) + pr_warn("%s is not found in vmlinux BTF\n", name); + + btf__free(btf); + return libbpf_err(err); +} + +static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + struct btf *btf; + int err; + + memset(&info, 0, info_len); + err = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len); + if (err) { + pr_warn("failed bpf_prog_get_info_by_fd for FD %d: %d\n", + attach_prog_fd, err); + return err; + } + + err = -EINVAL; + if (!info.btf_id) { + pr_warn("The target program doesn't have BTF\n"); + goto out; + } + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); + if (err) { + pr_warn("Failed to get BTF %d of the program: %d\n", info.btf_id, err); + goto out; + } + err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + btf__free(btf); + if (err <= 0) { + pr_warn("%s is not found in prog's BTF\n", name); + goto out; + } +out: + return err; +} + +static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, + enum bpf_attach_type attach_type, + int *btf_obj_fd, int *btf_type_id) +{ + int ret, i; + + ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; + + ret = load_module_btfs(obj); + if (ret) + return ret; + + for (i = 0; i < obj->btf_module_cnt; i++) { + const struct module_btf *mod = &obj->btf_modules[i]; + + ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = mod->fd; + *btf_type_id = ret; + return 0; + } + if (ret == -ENOENT) + continue; + + return ret; + } + + return -ESRCH; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id) +{ + enum bpf_attach_type attach_type = prog->expected_attach_type; + __u32 attach_prog_fd = prog->attach_prog_fd; + int err = 0; + + /* BPF program's BTF ID */ + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); + if (err < 0) { + pr_warn("prog '%s': failed to find BPF 
program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); + return err; + } + *btf_obj_fd = 0; + *btf_type_id = err; + return 0; + } + + /* kernel/module BTF ID */ + if (prog->obj->gen_loader) { + bpf_gen__record_attach_target(prog->obj->gen_loader, attach_name, attach_type); + *btf_obj_fd = 0; + *btf_type_id = 1; + } else { + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + } + if (err) { + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); + return err; + } + return 0; +} + +int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type) +{ + char *type_names; + const struct bpf_sec_def *sec_def; + + if (!name) + return libbpf_err(-EINVAL); + + sec_def = find_sec_def(name); + if (!sec_def) { + pr_debug("failed to guess attach type based on ELF section name '%s'\n", name); + type_names = libbpf_get_type_names(true); + if (type_names != NULL) { + pr_debug("attachable section(type) names are:%s\n", type_names); + free(type_names); + } + + return libbpf_err(-EINVAL); + } + + if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load) + return libbpf_err(-EINVAL); + if (!(sec_def->cookie & SEC_ATTACHABLE)) + return libbpf_err(-EINVAL); + + *attach_type = sec_def->expected_attach_type; + return 0; +} + +int bpf_map__fd(const struct bpf_map *map) +{ + return map ? map->fd : libbpf_err(-EINVAL); +} + +static bool map_uses_real_name(const struct bpf_map *map) +{ + /* Since libbpf started to support custom .data.* and .rodata.* maps, + * their user-visible name differs from kernel-visible name. Users see + * such map's corresponding ELF section name as a map name. + * This check distinguishes .data/.rodata from .data.* and .rodata.* + * maps to know which name has to be returned to the user. 
+ */ + if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0) + return true; + if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0) + return true; + return false; +} + +const char *bpf_map__name(const struct bpf_map *map) +{ + if (!map) + return NULL; + + if (map_uses_real_name(map)) + return map->real_name; + + return map->name; +} + +enum bpf_map_type bpf_map__type(const struct bpf_map *map) +{ + return map->def.type; +} + +int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.type = type; + return 0; +} + +__u32 bpf_map__map_flags(const struct bpf_map *map) +{ + return map->def.map_flags; +} + +int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.map_flags = flags; + return 0; +} + +__u64 bpf_map__map_extra(const struct bpf_map *map) +{ + return map->map_extra; +} + +int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_extra = map_extra; + return 0; +} + +__u32 bpf_map__numa_node(const struct bpf_map *map) +{ + return map->numa_node; +} + +int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->numa_node = numa_node; + return 0; +} + +__u32 bpf_map__key_size(const struct bpf_map *map) +{ + return map->def.key_size; +} + +int bpf_map__set_key_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.key_size = size; + return 0; +} + +__u32 bpf_map__value_size(const struct bpf_map *map) +{ + return map->def.value_size; +} + +static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) +{ + struct btf *btf; + struct btf_type *datasec_type, *var_type; + struct btf_var_secinfo *var; + const struct btf_type *array_type; + const struct btf_array *array; + int vlen, element_sz, new_array_id; + __u32 nr_elements; + + /* check btf existence */ + btf = bpf_object__btf(map->obj); + if (!btf) + return -ENOENT; + + /* verify map is datasec */ + datasec_type = btf_type_by_id(btf, bpf_map__btf_value_type_id(map)); + if (!btf_is_datasec(datasec_type)) { + pr_warn("map '%s': cannot be resized, map value type is not a datasec\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify datasec has at least one var */ + vlen = btf_vlen(datasec_type); + if (vlen == 0) { + pr_warn("map '%s': cannot be resized, map value datasec is empty\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify last var in the datasec is an array */ + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + array_type = skip_mods_and_typedefs(btf, var_type->type, NULL); + if (!btf_is_array(array_type)) { + pr_warn("map '%s': cannot be resized, last var must be an array\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify request size aligns with array */ + array = btf_array(array_type); + element_sz = btf__resolve_size(btf, array->type); + if (element_sz <= 0 || (size - var->offset) % element_sz != 0) { + pr_warn("map '%s': cannot be resized, element size (%d) doesn't align with new total size (%u)\n", + bpf_map__name(map), element_sz, size); + return -EINVAL; + } + + /* create a new array based on the existing array, but with new length */ + nr_elements = (size - var->offset) / element_sz; + new_array_id = btf__add_array(btf, array->index_type, array->type, nr_elements); + if 
(new_array_id < 0) + return new_array_id; + + /* adding a new btf type invalidates existing pointers to btf objects, + * so refresh pointers before proceeding + */ + datasec_type = btf_type_by_id(btf, map->btf_value_type_id); + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + + /* finally update btf info */ + datasec_type->size = size; + var->size = size - var->offset; + var_type->type = new_array_id; + + return 0; +} + +int bpf_map__set_value_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + + if (map->mmaped) { + int err; + size_t mmap_old_sz, mmap_new_sz; + + mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries); + err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); + if (err) { + pr_warn("map '%s': failed to resize memory-mapped region: %d\n", + bpf_map__name(map), err); + return err; + } + err = map_btf_datasec_resize(map, size); + if (err && err != -ENOENT) { + pr_warn("map '%s': failed to adjust resized BTF, clearing BTF key/value info: %d\n", + bpf_map__name(map), err); + map->btf_value_type_id = 0; + map->btf_key_type_id = 0; + } + } + + map->def.value_size = size; + return 0; +} + +__u32 bpf_map__btf_key_type_id(const struct bpf_map *map) +{ + return map ? map->btf_key_type_id : 0; +} + +__u32 bpf_map__btf_value_type_id(const struct bpf_map *map) +{ + return map ? map->btf_value_type_id : 0; +} + +int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size) +{ + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || + size != map->def.value_size || map->fd >= 0) + return libbpf_err(-EINVAL); + + memcpy(map->mmaped, data, size); + return 0; +} + +void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +{ + if (!map->mmaped) + return NULL; + *psize = map->def.value_size; + return map->mmaped; +} + +bool bpf_map__is_internal(const struct bpf_map *map) +{ + return map->libbpf_type != LIBBPF_MAP_UNSPEC; +} + +__u32 bpf_map__ifindex(const struct bpf_map *map) +{ + return map->map_ifindex; +} + +int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_ifindex = ifindex; + return 0; +} + +int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) { + pr_warn("error: unsupported map type\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map_fd != -1) { + pr_warn("error: inner_map_fd already specified\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + map->inner_map_fd = fd; + return 0; +} + +static struct bpf_map * +__bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) +{ + ssize_t idx; + struct bpf_map *s, *e; + + if (!obj || !obj->maps) + return errno = EINVAL, NULL; + + s = obj->maps; + e = obj->maps + obj->nr_maps; + + if ((m < s) || (m >= e)) { + pr_warn("error in %s: map handler doesn't belong to object\n", + __func__); + return errno = EINVAL, NULL; + } + + idx = (m - obj->maps) + i; + if (idx >= obj->nr_maps || idx < 0) + return NULL; + return &obj->maps[idx]; +} + +struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) +{ + if (prev == NULL) + return obj->maps; + + return __bpf_map__iter(prev, obj, 1); +} + +struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map 
*next) +{ + if (next == NULL) { + if (!obj->nr_maps) + return NULL; + return obj->maps + obj->nr_maps - 1; + } + + return __bpf_map__iter(next, obj, -1); +} + +struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name) +{ + struct bpf_map *pos; + + bpf_object__for_each_map(pos, obj) { + /* if it's a special internal map name (which always starts + * with dot) then check if that special name matches the + * real map name (ELF section name) + */ + if (name[0] == '.') { + if (pos->real_name && strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + /* otherwise map name has to be an exact match */ + if (map_uses_real_name(pos)) { + if (strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + if (strcmp(pos->name, name) == 0) + return pos; + } + return errno = ENOENT, NULL; +} + +int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) +{ + return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); +} + +static int validate_map_op(const struct bpf_map *map, size_t key_sz, + size_t value_sz, bool check_value_sz) +{ + if (map->fd <= 0) + return -ENOENT; + + if (map->def.key_size != key_sz) { + pr_warn("map '%s': unexpected key size %zu provided, expected %u\n", + map->name, key_sz, map->def.key_size); + return -EINVAL; + } + + if (!check_value_sz) + return 0; + + switch (map->def.type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: { + int num_cpu = libbpf_num_possible_cpus(); + size_t elem_sz = roundup(map->def.value_size, 8); + + if (value_sz != num_cpu * elem_sz) { + pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", + map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); + return -EINVAL; + } + break; + } + default: + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + return 0; +} + +int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_update_elem(map->fd, key, value, flags); +} + +int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return libbpf_err(err); + + return bpf_map_delete_elem_flags(map->fd, key, flags); +} + +int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_and_delete_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return 
libbpf_err(err); + + return bpf_map_get_next_key(map->fd, cur_key, next_key); +} + +long libbpf_get_error(const void *ptr) +{ + if (!IS_ERR_OR_NULL(ptr)) + return 0; + + if (IS_ERR(ptr)) + errno = -PTR_ERR(ptr); + + /* If ptr == NULL, then errno should be already set by the failing + * API, because libbpf never returns NULL on success and it now always + * sets errno on error. So no extra errno handling for ptr == NULL + * case. + */ + return -errno; +} + +/* Replace link's underlying BPF program with the new one */ +int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) +{ + int ret; + + ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + return libbpf_err_errno(ret); +} + +/* Release "ownership" of underlying BPF resource (typically, BPF program + * attached to some BPF hook, e.g., tracepoint, kprobe, etc). Disconnected + * link, when destructed through bpf_link__destroy() call won't attempt to + * detach/unregisted that BPF resource. This is useful in situations where, + * say, attached BPF program has to outlive userspace program that attached it + * in the system. Depending on type of BPF program, though, there might be + * additional steps (like pinning BPF program in BPF FS) necessary to ensure + * exit of userspace program doesn't trigger automatic detachment and clean up + * inside the kernel. + */ +void bpf_link__disconnect(struct bpf_link *link) +{ + link->disconnected = true; +} + +int bpf_link__destroy(struct bpf_link *link) +{ + int err = 0; + + if (IS_ERR_OR_NULL(link)) + return 0; + + if (!link->disconnected && link->detach) + err = link->detach(link); + if (link->pin_path) + free(link->pin_path); + if (link->dealloc) + link->dealloc(link); + else + free(link); + + return libbpf_err(err); +} + +int bpf_link__fd(const struct bpf_link *link) +{ + return link->fd; +} + +const char *bpf_link__pin_path(const struct bpf_link *link) +{ + return link->pin_path; +} + +static int bpf_link__detach_fd(struct bpf_link *link) +{ + return libbpf_err_errno(close(link->fd)); +} + +struct bpf_link *bpf_link__open(const char *path) +{ + struct bpf_link *link; + int fd; + + fd = bpf_obj_get(path); + if (fd < 0) { + fd = -errno; + pr_warn("failed to open link at %s: %d\n", path, fd); + return libbpf_err_ptr(fd); + } + + link = calloc(1, sizeof(*link)); + if (!link) { + close(fd); + return libbpf_err_ptr(-ENOMEM); + } + link->detach = &bpf_link__detach_fd; + link->fd = fd; + + link->pin_path = strdup(path); + if (!link->pin_path) { + bpf_link__destroy(link); + return libbpf_err_ptr(-ENOMEM); + } + + return link; +} + +int bpf_link__detach(struct bpf_link *link) +{ + return bpf_link_detach(link->fd) ? 
-errno : 0; +} + +int bpf_link__pin(struct bpf_link *link, const char *path) +{ + int err; + + if (link->pin_path) + return libbpf_err(-EBUSY); + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + err = check_path(path); + if (err) + return libbpf_err(err); + + link->pin_path = strdup(path); + if (!link->pin_path) + return libbpf_err(-ENOMEM); + + if (bpf_obj_pin(link->fd, link->pin_path)) { + err = -errno; + zfree(&link->pin_path); + return libbpf_err(err); + } + + pr_debug("link fd=%d: pinned at %s\n", link->fd, link->pin_path); + return 0; +} + +int bpf_link__unpin(struct bpf_link *link) +{ + int err; + + if (!link->pin_path) + return libbpf_err(-EINVAL); + + err = unlink(link->pin_path); + if (err != 0) + return -errno; + + pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path); + zfree(&link->pin_path); + return 0; +} + +struct bpf_link_perf { + struct bpf_link link; + int perf_event_fd; + /* legacy kprobe support: keep track of probe identifier and type */ + char *legacy_probe_name; + bool legacy_is_kprobe; + bool legacy_is_retprobe; +}; + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe); +static int remove_uprobe_event_legacy(const char *probe_name, bool retprobe); + +static int bpf_link_perf_detach(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + int err = 0; + + if (ioctl(perf_link->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0) < 0) + err = -errno; + + if (perf_link->perf_event_fd != link->fd) + close(perf_link->perf_event_fd); + close(link->fd); + + /* legacy uprobe/kprobe needs to be removed after perf event fd closure */ + if (perf_link->legacy_probe_name) { + if (perf_link->legacy_is_kprobe) { + err = remove_kprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } else { + err = remove_uprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } + } + + return err; +} + +static void bpf_link_perf_dealloc(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + free(perf_link->legacy_probe_name); + free(perf_link); +} + +struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link_perf *link; + int prog_fd, link_fd = -1, err; + bool force_ioctl_attach; + + if (!OPTS_VALID(opts, bpf_perf_event_opts)) + return libbpf_err_ptr(-EINVAL); + + if (pfd < 0) { + pr_warn("prog '%s': invalid perf event FD %d\n", + prog->name, pfd); + return libbpf_err_ptr(-EINVAL); + } + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->link.detach = &bpf_link_perf_detach; + link->link.dealloc = &bpf_link_perf_dealloc; + link->perf_event_fd = pfd; + + force_ioctl_attach = OPTS_GET(opts, force_ioctl_attach, false); + if (kernel_supports(prog->obj, FEAT_PERF_LINK) && !force_ioctl_attach) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, + .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); + + link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n", + prog->name, pfd, + err, 
libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link->link.fd = link_fd; + } else { + if (OPTS_GET(opts, bpf_cookie, 0)) { + pr_warn("prog '%s': user context value is not supported\n", prog->name); + err = -EOPNOTSUPP; + goto err_out; + } + + if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + if (err == -EPROTO) + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); + goto err_out; + } + link->link.fd = pfd; + } + if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + return &link->link; +err_out: + if (link_fd >= 0) + close(link_fd); + free(link); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd) +{ + return bpf_program__attach_perf_event_opts(prog, pfd, NULL); +} + +/* + * this function is expected to parse integer in the range of [0, 2^31-1] from + * given file using scanf format string fmt. If actual parsed value is + * negative, the result might be indistinguishable from error + */ +static int parse_uint_from_file(const char *file, const char *fmt) +{ + char buf[STRERR_BUFSIZE]; + int err, ret; + FILE *f; + + f = fopen(file, "re"); + if (!f) { + err = -errno; + pr_debug("failed to open '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + return err; + } + err = fscanf(f, fmt, &ret); + if (err != 1) { + err = err == EOF ? -EIO : -errno; + pr_debug("failed to parse '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + fclose(f); + return err; + } + fclose(f); + return ret; +} + +static int determine_kprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_uprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_kprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +static int determine_uprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +#define PERF_UPROBE_REF_CTR_OFFSET_BITS 32 +#define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 + +static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, + uint64_t offset, int pid, size_t ref_ctr_off) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd; + + if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) + return -EINVAL; + + memset(&attr, 0, attr_sz); + + type = uprobe ? determine_uprobe_perf_type() + : determine_kprobe_perf_type(); + if (type < 0) { + pr_warn("failed to determine %s perf type: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(type, errmsg, sizeof(errmsg))); + return type; + } + if (retprobe) { + int bit = uprobe ? 
determine_uprobe_retprobe_bit() + : determine_kprobe_retprobe_bit(); + + if (bit < 0) { + pr_warn("failed to determine %s retprobe bit: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(bit, errmsg, sizeof(errmsg))); + return bit; + } + attr.config |= 1 << bit; + } + attr.size = attr_sz; + attr.type = type; + attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ + attr.config2 = offset; /* kprobe_addr or probe_offset */ + + /* pid filter is meaningful only for uprobes */ + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid /* pid */, + pid == -1 ? 0 : -1 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + return pfd >= 0 ? pfd : -errno; +} + +static int append_to_file(const char *file, const char *fmt, ...) +{ + int fd, n, err = 0; + va_list ap; + char buf[1024]; + + va_start(ap, fmt); + n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + if (n < 0 || n >= sizeof(buf)) + return -EINVAL; + + fd = open(file, O_WRONLY | O_APPEND | O_CLOEXEC, 0); + if (fd < 0) + return -errno; + + if (write(fd, buf, n) < 0) + err = -errno; + + close(fd); + return err; +} + +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) +{ + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) +{ + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_kprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/kprobe_events" : TRACEFS"/kprobe_events"; +} + +static const char *tracefs_uprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; +} + +static const char *tracefs_available_filter_functions(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions" + : TRACEFS"/available_filter_functions"; +} + +static const char *tracefs_available_filter_functions_addrs(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions_addrs" + : TRACEFS"/available_filter_functions_addrs"; +} + +static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *kfunc_name, size_t offset) +{ + static int index = 0; + int i; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, + __sync_fetch_and_add(&index, 1)); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } +} + +static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset) +{ + return append_to_file(tracefs_kprobe_events(), "%c:%s/%s %s+0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "kretprobes" : "kprobes", + probe_name, kfunc_name, offset); +} + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_kprobe_events(), "-:%s/%s", + retprobe ? "kretprobes" : "kprobes", probe_name); +} + +static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[256]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? 
"kretprobes" : "kprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd, err; + + err = add_kprobe_event_legacy(probe_name, retprobe, kfunc_name, offset); + if (err < 0) { + pr_warn("failed to add legacy kprobe event for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + type = determine_kprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy kprobe perf_event_open() failed: %s\n", + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; +} + +static const char *arch_specific_syscall_pfx(void) +{ +#if defined(__x86_64__) + return "x64"; +#elif defined(__i386__) + return "ia32"; +#elif defined(__s390x__) + return "s390x"; +#elif defined(__s390__) + return "s390"; +#elif defined(__arm__) + return "arm"; +#elif defined(__aarch64__) + return "arm64"; +#elif defined(__mips__) + return "mips"; +#elif defined(__riscv) + return "riscv"; +#elif defined(__powerpc__) + return "powerpc"; +#elif defined(__powerpc64__) + return "powerpc64"; +#else + return NULL; +#endif +} + +static int probe_kern_syscall_wrapper(void) +{ + char syscall_name[64]; + const char *ksys_pfx; + + ksys_pfx = arch_specific_syscall_pfx(); + if (!ksys_pfx) + return 0; + + snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx); + + if (determine_kprobe_perf_type() >= 0) { + int pfd; + + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + if (pfd >= 0) + close(pfd); + + return pfd >= 0 ? 
1 : 0; + } else { /* legacy mode */ + char probe_name[128]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0); + if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0) + return 0; + + (void)remove_kprobe_event_legacy(probe_name, false); + return 1; + } +} + +struct bpf_link * +bpf_program__attach_kprobe_opts(const struct bpf_program *prog, + const char *func_name, + const struct bpf_kprobe_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char errmsg[STRERR_BUFSIZE]; + char *legacy_probe = NULL; + struct bpf_link *link; + size_t offset; + bool retprobe, legacy; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_kprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); + retprobe = OPTS_GET(opts, retprobe, false); + offset = OPTS_GET(opts, offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + legacy = determine_kprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + + if (!legacy) { + pfd = perf_event_open_probe(false /* uprobe */, retprobe, + func_name, offset, + -1 /* pid */, 0 /* ref_ctr_off */); + } else { + char probe_name[256]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), + func_name, offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_kprobe_open_legacy(legacy_probe, retprobe, func_name, + offset, -1 /* pid */); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s+0x%zx' perf event: %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s+0x%zx': %s\n", + prog->name, retprobe ? 
"kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = true; + perf_link->legacy_is_retprobe = retprobe; + } + + return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog, + bool retprobe, + const char *func_name) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe, + ); + + return bpf_program__attach_kprobe_opts(prog, func_name, &opts); +} + +struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts) +{ + LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts); + char func_name[128]; + + if (!OPTS_VALID(opts, bpf_ksyscall_opts)) + return libbpf_err_ptr(-EINVAL); + + if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) { + /* arch_specific_syscall_pfx() should never return NULL here + * because it is guarded by kernel_supports(). However, since + * compiler does not know that we have an explicit conditional + * as well. + */ + snprintf(func_name, sizeof(func_name), "__%s_sys_%s", + arch_specific_syscall_pfx() ? : "", syscall_name); + } else { + snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name); + } + + kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false); + kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts); +} + +/* Adapted from perf/util/string.c */ +static bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*pat == '?') { /* Matches any single character */ + str++; + pat++; + continue; + } + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_match(str++, pat)) + return true; + } + return !*str && !*pat; +} + +struct kprobe_multi_resolve { + const char *pattern; + unsigned long *addrs; + size_t cap; + size_t cnt; +}; + +struct avail_kallsyms_data { + char **syms; + size_t cnt; + struct kprobe_multi_resolve *res; +}; + +static int avail_func_cmp(const void *a, const void *b) +{ + return strcmp(*(const char **)a, *(const char **)b); +} + +static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct avail_kallsyms_data *data = ctx; + struct kprobe_multi_resolve *res = data->res; + int err; + + if (!bsearch(&sym_name, data->syms, data->cnt, sizeof(*data->syms), avail_func_cmp)) + return 0; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, sizeof(*res->addrs), res->cnt + 1); + if (err) + return err; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + return 0; +} + +static int libbpf_available_kallsyms_parse(struct kprobe_multi_resolve *res) +{ + const char *available_functions_file = tracefs_available_filter_functions(); + struct avail_kallsyms_data data; + char sym_name[500]; + FILE *f; + int err = 0, ret, i; + char **syms = NULL; + size_t cap = 0, cnt = 0; + + f = fopen(available_functions_file, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", 
available_functions_file, err); + return err; + } + + while (true) { + char *name; + + ret = fscanf(f, "%499s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 1) { + pr_warn("failed to parse available_filter_functions entry: %d\n", ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&syms, &cap, sizeof(*syms), cnt + 1); + if (err) + goto cleanup; + + name = strdup(sym_name); + if (!name) { + err = -errno; + goto cleanup; + } + + syms[cnt++] = name; + } + + /* no entries found, bail out */ + if (cnt == 0) { + err = -ENOENT; + goto cleanup; + } + + /* sort available functions */ + qsort(syms, cnt, sizeof(*syms), avail_func_cmp); + + data.syms = syms; + data.res = res; + data.cnt = cnt; + libbpf_kallsyms_parse(avail_kallsyms_cb, &data); + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + for (i = 0; i < cnt; i++) + free((char *)syms[i]); + free(syms); + + fclose(f); + return err; +} + +static bool has_available_filter_functions_addrs(void) +{ + return access(tracefs_available_filter_functions_addrs(), R_OK) != -1; +} + +static int libbpf_available_kprobes_parse(struct kprobe_multi_resolve *res) +{ + const char *available_path = tracefs_available_filter_functions_addrs(); + char sym_name[500]; + FILE *f; + int ret, err = 0; + unsigned long long sym_addr; + + f = fopen(available_path, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_path, err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %499s%*[^\n]\n", &sym_addr, sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 2) { + pr_warn("failed to parse available_filter_functions_addrs entry: %d\n", + ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, + sizeof(*res->addrs), res->cnt + 1); + if (err) + goto cleanup; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + } + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + fclose(f); + return err; +} + +struct bpf_link * +bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, + const char *pattern, + const struct bpf_kprobe_multi_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct kprobe_multi_resolve res = { + .pattern = pattern, + }; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + const unsigned long *addrs; + int err, link_fd, prog_fd; + const __u64 *cookies; + const char **syms; + bool retprobe; + size_t cnt; + + if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + syms = OPTS_GET(opts, syms, false); + addrs = OPTS_GET(opts, addrs, false); + cnt = OPTS_GET(opts, cnt, false); + cookies = OPTS_GET(opts, cookies, false); + + if (!pattern && !addrs && !syms) + return libbpf_err_ptr(-EINVAL); + if (pattern && (addrs || syms || cookies || cnt)) + return libbpf_err_ptr(-EINVAL); + if (!pattern && !cnt) + return libbpf_err_ptr(-EINVAL); + if (addrs && syms) + return libbpf_err_ptr(-EINVAL); + + if (pattern) { + if (has_available_filter_functions_addrs()) + err = libbpf_available_kprobes_parse(&res); + else + err = libbpf_available_kallsyms_parse(&res); + if (err) + goto error; + addrs = res.addrs; + cnt = res.cnt; + } + + retprobe = OPTS_GET(opts, retprobe, false); + + lopts.kprobe_multi.syms = syms; + lopts.kprobe_multi.addrs = addrs; + lopts.kprobe_multi.cookies = cookies; + lopts.kprobe_multi.cnt = cnt; + lopts.kprobe_multi.flags = retprobe ? 
BPF_F_KPROBE_MULTI_RETURN : 0; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + prog_fd = bpf_program__fd(prog); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto error; + } + link->fd = link_fd; + free(res.addrs); + return link; + +error: + free(link); + free(res.addrs); + return libbpf_err_ptr(err); +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + unsigned long offset = 0; + const char *func_name; + char *func; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe") and SEC("kretprobe") */ + if (strcmp(prog->sec_name, "kprobe") == 0 || strcmp(prog->sec_name, "kretprobe") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe/"); + if (opts.retprobe) + func_name = prog->sec_name + sizeof("kretprobe/") - 1; + else + func_name = prog->sec_name + sizeof("kprobe/") - 1; + + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); + if (n < 1) { + pr_warn("kprobe name is invalid: %s\n", func_name); + return -EINVAL; + } + if (opts.retprobe && offset != 0) { + free(func); + pr_warn("kretprobes do not support offset specification\n"); + return -EINVAL; + } + + opts.offset = offset; + *link = bpf_program__attach_kprobe_opts(prog, func, &opts); + free(func); + return libbpf_get_error(*link); +} + +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_ksyscall_opts, opts); + const char *syscall_name; + + *link = NULL; + + /* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */ + if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/"); + if (opts.retprobe) + syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1; + else + syscall_name = prog->sec_name + sizeof("ksyscall/") - 1; + + *link = bpf_program__attach_ksyscall(prog, syscall_name, &opts); + return *link ? 
0 : -errno; +} + +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.multi") and SEC("kretprobe.multi") */ + if (strcmp(prog->sec_name, "kprobe.multi") == 0 || + strcmp(prog->sec_name, "kretprobe.multi") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe.multi/"); + if (opts.retprobe) + spec = prog->sec_name + sizeof("kretprobe.multi/") - 1; + else + spec = prog->sec_name + sizeof("kprobe.multi/") - 1; + + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return libbpf_get_error(*link); +} + +static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *binary_path, uint64_t offset) +{ + int i; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), binary_path, (size_t)offset); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } +} + +static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset) +{ + return append_to_file(tracefs_uprobe_events(), "%c:%s/%s %s:0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "uretprobes" : "uprobes", + probe_name, binary_path, offset); +} + +static inline int remove_uprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_uprobe_events(), "-:%s/%s", + retprobe ? "uretprobes" : "uprobes", probe_name); +} + +static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[512]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "uretprobes" : "uprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + int type, pfd, err; + + err = add_uprobe_event_legacy(probe_name, retprobe, binary_path, offset); + if (err < 0) { + pr_warn("failed to add legacy uprobe event for %s:0x%zx: %d\n", + binary_path, (size_t)offset, err); + return err; + } + type = determine_uprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", + binary_path, offset, err); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. 
*/ +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} + +/* Find offset of function name in the provided ELF object. "binary_path" is + * the path to the ELF binary represented by "elf", and only used for error + * reporting matters. "name" matches symbol name or name@@LIB for library + * functions. + */ +static long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name) +{ + int i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + bool is_shared_lib, is_name_qualified; + long ret = -ENOENT; + size_t name_len; + GElf_Ehdr ehdr; + + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; + + name_len = strlen(name); + /* Does name specify "@@LIB"? */ + is_name_qualified = strstr(name, "@@") != NULL; + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. + */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + size_t nr_syms, strtabidx, idx; + Elf_Data *symbols = NULL; + Elf_Scn *scn = NULL; + int last_bind = -1; + const char *sname; + GElf_Shdr sh; + + scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); + if (!scn) { + pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", + binary_path); + continue; + } + if (!gelf_getshdr(scn, &sh)) + continue; + strtabidx = sh.sh_link; + symbols = elf_getdata(scn, 0); + if (!symbols) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + nr_syms = symbols->d_size / sh.sh_entsize; + + for (idx = 0; idx < nr_syms; idx++) { + int curr_bind; + GElf_Sym sym; + Elf_Scn *sym_scn; + GElf_Shdr sym_sh; + + if (!gelf_getsym(symbols, idx, &sym)) + continue; + + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; + + sname = elf_strptr(elf, strtabidx, sym.st_name); + if (!sname) + continue; + + curr_bind = GELF_ST_BIND(sym.st_info); + + /* User can specify func, func@@LIB or func@@LIB_VERSION. */ + if (strncmp(sname, name, name_len) != 0) + continue; + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') + continue; + + if (ret >= 0) { + /* handle multiple matches */ + if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sname, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (curr_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + + /* Transform symbol's virtual address (absolute for + * binaries and relative for shared libs) into file + * offset, which is what kernel is expecting for + * uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more + * details. 
+ * This is done by looking up symbol's containing + * section's header and using it's virtual address + * (sh_addr) and corresponding file offset (sh_offset) + * to transform sym.st_value (virtual address) into + * desired final file offset. + */ + sym_scn = elf_getscn(elf, sym.st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &sym_sh)) + continue; + + ret = sym.st_value - sym_sh.sh_addr + sym_sh.sh_offset; + last_bind = curr_bind; + } + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + return ret; +} + +/* Find offset of function name in ELF object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +static long elf_find_func_offset_from_file(const char *binary_path, const char *name) +{ + char errmsg[STRERR_BUFSIZE]; + long ret = -ENOENT; + Elf *elf; + int fd; + + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + + ret = elf_find_func_offset(elf, binary_path, name); + elf_end(elf); + close(fd); + return ret; +} + +/* Find offset of function name in archive specified by path. Currently + * supported are .zip files that do not compress their contents, as used on + * Android in the form of APKs, for example. "file_name" is the name of the ELF + * file inside the archive. "func_name" matches symbol name or name@@LIB for + * library functions. 
+ * + * An overview of the APK format specifically provided here: + * https://en.wikipedia.org/w/index.php?title=Apk_(file_format)&oldid=1139099120#Package_contents + */ +static long elf_find_func_offset_from_archive(const char *archive_path, const char *file_name, + const char *func_name) +{ + struct zip_archive *archive; + struct zip_entry entry; + long ret; + Elf *elf; + + archive = zip_archive_open(archive_path); + if (IS_ERR(archive)) { + ret = PTR_ERR(archive); + pr_warn("zip: failed to open %s: %ld\n", archive_path, ret); + return ret; + } + + ret = zip_archive_find_entry(archive, file_name, &entry); + if (ret) { + pr_warn("zip: could not find archive member %s in %s: %ld\n", file_name, + archive_path, ret); + goto out; + } + pr_debug("zip: found entry for %s in %s at 0x%lx\n", file_name, archive_path, + (unsigned long)entry.data_offset); + + if (entry.compression) { + pr_warn("zip: entry %s of %s is compressed and cannot be handled\n", file_name, + archive_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + + elf = elf_memory((void *)entry.data, entry.data_length); + if (!elf) { + pr_warn("elf: could not read elf file %s from %s: %s\n", file_name, archive_path, + elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__LIBELF; + goto out; + } + + ret = elf_find_func_offset(elf, file_name, func_name); + if (ret > 0) { + pr_debug("elf: symbol address match for %s of %s in %s: 0x%x + 0x%lx = 0x%lx\n", + func_name, file_name, archive_path, entry.data_offset, ret, + ret + entry.data_offset); + ret += entry.data_offset; + } + elf_end(elf); + +out: + zip_archive_close(archive); + return ret; +} + +static const char *arch_specific_lib_paths(void) +{ + /* + * Based on https://packages.debian.org/sid/libc6. + * + * Assume that the traced program is built for the same architecture + * as libbpf, which should cover the vast majority of cases. + */ +#if defined(__x86_64__) + return "/lib/x86_64-linux-gnu"; +#elif defined(__i386__) + return "/lib/i386-linux-gnu"; +#elif defined(__s390x__) + return "/lib/s390x-linux-gnu"; +#elif defined(__s390__) + return "/lib/s390-linux-gnu"; +#elif defined(__arm__) && defined(__SOFTFP__) + return "/lib/arm-linux-gnueabi"; +#elif defined(__arm__) && !defined(__SOFTFP__) + return "/lib/arm-linux-gnueabihf"; +#elif defined(__aarch64__) + return "/lib/aarch64-linux-gnu"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 64 + return "/lib/mips64el-linux-gnuabi64"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 32 + return "/lib/mipsel-linux-gnu"; +#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return "/lib/powerpc64le-linux-gnu"; +#elif defined(__sparc__) && defined(__arch64__) + return "/lib/sparc64-linux-gnu"; +#elif defined(__riscv) && __riscv_xlen == 64 + return "/lib/riscv64-linux-gnu"; +#else + return NULL; +#endif +} + +/* Get full path to program/shared library. 
*/ +static int resolve_full_path(const char *file, char *result, size_t result_sz) +{ + const char *search_paths[3] = {}; + int i, perm; + + if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { + search_paths[0] = getenv("LD_LIBRARY_PATH"); + search_paths[1] = "/usr/lib64:/usr/lib"; + search_paths[2] = arch_specific_lib_paths(); + perm = R_OK; + } else { + search_paths[0] = getenv("PATH"); + search_paths[1] = "/usr/bin:/usr/sbin"; + perm = R_OK | X_OK; + } + + for (i = 0; i < ARRAY_SIZE(search_paths); i++) { + const char *s; + + if (!search_paths[i]) + continue; + for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { + char *next_path; + int seg_len; + + if (s[0] == ':') + s++; + next_path = strchr(s, ':'); + seg_len = next_path ? next_path - s : strlen(s); + if (!seg_len) + continue; + snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); + /* ensure it has required permissions */ + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) + continue; + pr_debug("resolved '%s' to '%s'\n", file, result); + return 0; + } + } + return -ENOENT; +} + +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts) +{ + const char *archive_path = NULL, *archive_sep = NULL; + char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char full_path[PATH_MAX]; + struct bpf_link *link; + size_t ref_ctr_off; + int pfd, err; + bool retprobe, legacy; + const char *func_name; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); + retprobe = OPTS_GET(opts, retprobe, false); + ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + /* Check if "binary_path" refers to an archive. 
*/ + archive_sep = strstr(binary_path, "!/"); + if (archive_sep) { + full_path[0] = '\0'; + libbpf_strlcpy(full_path, binary_path, + min(sizeof(full_path), (size_t)(archive_sep - binary_path + 1))); + archive_path = full_path; + binary_path = archive_sep + 2; + } else if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_path, sizeof(full_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = full_path; + } + func_name = OPTS_GET(opts, func_name, NULL); + if (func_name) { + long sym_off; + + if (archive_path) { + sym_off = elf_find_func_offset_from_archive(archive_path, binary_path, + func_name); + binary_path = archive_path; + } else { + sym_off = elf_find_func_offset_from_file(binary_path, func_name); + } + if (sym_off < 0) + return libbpf_err_ptr(sym_off); + func_offset += sym_off; + } + + legacy = determine_uprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + + if (!legacy) { + pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, + func_offset, pid, ref_ctr_off); + } else { + char probe_name[PATH_MAX + 64]; + + if (ref_ctr_off) + return libbpf_err_ptr(-EINVAL); + + gen_uprobe_legacy_event_name(probe_name, sizeof(probe_name), + binary_path, func_offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_uprobe_open_legacy(legacy_probe, retprobe, + binary_path, func_offset, pid); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = false; + perf_link->legacy_is_retprobe = retprobe; + } + return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +/* Format of u[ret]probe section definition supporting auto-attach: + * u[ret]probe/binary:function[+offset] + * + * binary can be an absolute/relative path or a filename; the latter is resolved to a + * full binary path via bpf_program__attach_uprobe_opts. + * + * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be + * specified (and auto-attach is not possible) or the above format is specified for + * auto-attach. 
+ */ +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + int n, ret = -EINVAL; + long offset = 0; + + *link = NULL; + + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", + &probe_type, &binary_path, &func_name, &offset); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 2: + pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n", + prog->name, prog->sec_name); + break; + case 3: + case 4: + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; + if (opts.retprobe && offset != 0) { + pr_warn("prog '%s': uretprobes do not support offset specification\n", + prog->name); + break; + } + opts.func_name = func_name; + *link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, + prog->sec_name); + break; + } + free(probe_type); + free(binary_path); + free(func_name); + + return ret; +} + +struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog, + bool retprobe, pid_t pid, + const char *binary_path, + size_t func_offset) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts, .retprobe = retprobe); + + return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts); +} + +struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts) +{ + char resolved_path[512]; + struct bpf_object *obj = prog->obj; + struct bpf_link *link; + __u64 usdt_cookie; + int err; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = resolved_path; + } + + /* USDT manager is instantiated lazily on first USDT attach. It will + * be destroyed together with BPF object in bpf_object__close(). 
 + */ + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + if (!obj->usdt_man) { + obj->usdt_man = usdt_manager_new(obj); + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + } + + usdt_cookie = OPTS_GET(opts, usdt_cookie, 0); + link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path, + usdt_provider, usdt_name, usdt_cookie); + err = libbpf_get_error(link); + if (err) + return libbpf_err_ptr(err); + return link; +} + +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *path = NULL, *provider = NULL, *name = NULL; + const char *sec_name; + int n, err; + + sec_name = bpf_program__section_name(prog); + if (strcmp(sec_name, "usdt") == 0) { + /* no auto-attach for just SEC("usdt") */ + *link = NULL; + return 0; + } + + n = sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name); + if (n != 3) { + pr_warn("invalid section '%s', expected SEC(\"usdt/<path>:<provider>:<name>\")\n", + sec_name); + err = -EINVAL; + } else { + *link = bpf_program__attach_usdt(prog, -1 /* any process */, path, + provider, name, NULL); + err = libbpf_get_error(*link); + } + free(path); + free(provider); + free(name); + return err; +} + +static int determine_tracepoint_id(const char *tp_category, + const char *tp_name) +{ + char file[PATH_MAX]; + int ret; + + ret = snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), tp_category, tp_name); + if (ret < 0) + return -errno; + if (ret >= sizeof(file)) { + pr_debug("tracepoint %s/%s path is too long\n", + tp_category, tp_name); + return -E2BIG; + } + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_open_tracepoint(const char *tp_category, + const char *tp_name) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int tp_id, pfd, err; + + tp_id = determine_tracepoint_id(tp_category, tp_name); + if (tp_id < 0) { + pr_warn("failed to determine tracepoint '%s/%s' perf event ID: %s\n", + tp_category, tp_name, + libbpf_strerror_r(tp_id, errmsg, sizeof(errmsg))); + return tp_id; + } + + memset(&attr, 0, attr_sz); + attr.type = PERF_TYPE_TRACEPOINT; + attr.size = attr_sz; + attr.config = tp_id; + + pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("tracepoint '%s/%s' perf_event_open() failed: %s\n", + tp_category, tp_name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + return pfd; +} + +struct bpf_link *bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + pfd = perf_event_open_tracepoint(tp_category, tp_name); + if (pfd < 0) { + pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(err, errmsg,
sizeof(errmsg))); + return libbpf_err_ptr(err); + } + return link; +} + +struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name) +{ + return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); +} + +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *sec_name, *tp_cat, *tp_name; + + *link = NULL; + + /* no auto-attach for SEC("tp") or SEC("tracepoint") */ + if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + return 0; + + sec_name = strdup(prog->sec_name); + if (!sec_name) + return -ENOMEM; + + /* extract "tp//" or "tracepoint//" */ + if (str_has_pfx(prog->sec_name, "tp/")) + tp_cat = sec_name + sizeof("tp/") - 1; + else + tp_cat = sec_name + sizeof("tracepoint/") - 1; + tp_name = strchr(tp_cat, '/'); + if (!tp_name) { + free(sec_name); + return -EINVAL; + } + *tp_name = '\0'; + tp_name++; + + *link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name); + free(sec_name); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n", + prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + static const char *const prefixes[] = { + "raw_tp", + "raw_tracepoint", + "raw_tp.w", + "raw_tracepoint.w", + }; + size_t i; + const char *tp_name = NULL; + + *link = NULL; + + for (i = 0; i < ARRAY_SIZE(prefixes); i++) { + size_t pfx_len; + + if (!str_has_pfx(prog->sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + /* no auto-attach case of, e.g., SEC("raw_tp") */ + if (prog->sec_name[pfx_len] == '\0') + return 0; + + if (prog->sec_name[pfx_len] != '/') + continue; + + tp_name = prog->sec_name + pfx_len + 1; + break; + } + + if (!tp_name) { + pr_warn("prog '%s': invalid section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + *link = bpf_program__attach_raw_tracepoint(prog, tp_name); + return libbpf_get_error(*link); +} + +/* Common logic for all BPF program types that attach to a btf_id */ +static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + if (!OPTS_VALID(opts, bpf_trace_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + /* libbpf is smart enough to redirect to BPF_RAW_TRACEPOINT_OPEN on old kernels */ + 
link_opts.tracing.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +struct bpf_link *bpf_program__attach_trace(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +struct bpf_link *bpf_program__attach_trace_opts(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + return bpf_program__attach_btf_id(prog, opts); +} + +struct bpf_link *bpf_program__attach_lsm(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_trace(prog); + return libbpf_get_error(*link); +} + +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_lsm(prog); + return libbpf_get_error(*link); +} + +static struct bpf_link * +bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id, + const char *target_name) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .target_btf_id = btf_id); + enum bpf_attach_type attach_type; + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + attach_type = bpf_program__expected_attach_type(prog); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to %s: %s\n", + prog->name, target_name, + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program__attach_fd(prog, cgroup_fd, 0, "cgroup"); +} + +struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) +{ + return bpf_program__attach_fd(prog, netns_fd, 0, "netns"); +} + +struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) +{ + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program__attach_fd(prog, ifindex, 0, "xdp"); +} + +struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, + const char *attach_func_name) +{ + int btf_id; + + if (!!target_fd != !!attach_func_name) { + pr_warn("prog '%s': supply none or both of target_fd and attach_func_name\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (prog->type != BPF_PROG_TYPE_EXT) { + pr_warn("prog '%s': only BPF_PROG_TYPE_EXT can attach as freplace", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (target_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd); + if (btf_id < 0) + return libbpf_err_ptr(btf_id); + + return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace"); + } else { + /* no target, so use raw_tracepoint_open for compatibility + * with old kernels + */ + return 
bpf_program__attach_trace(prog); + } +} + +struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + __u32 target_fd = 0; + + if (!OPTS_VALID(opts, bpf_iter_attach_opts)) + return libbpf_err_ptr(-EINVAL); + + link_create_opts.iter_info = OPTS_GET(opts, link_info, (void *)0); + link_create_opts.iter_info_len = OPTS_GET(opts, link_info_len, 0); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER, + &link_create_opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to iterator: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_iter(prog, NULL); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct bpf_link *link; + int prog_fd, link_fd; + + if (!OPTS_VALID(opts, bpf_netfilter_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + + link->detach = &bpf_link__detach_fd; + + lopts.netfilter.pf = OPTS_GET(opts, pf, 0); + lopts.netfilter.hooknum = OPTS_GET(opts, hooknum, 0); + lopts.netfilter.priority = OPTS_GET(opts, priority, 0); + lopts.netfilter.flags = OPTS_GET(opts, flags, 0); + + link_fd = bpf_link_create(prog_fd, 0, BPF_NETFILTER, &lopts); + if (link_fd < 0) { + char errmsg[STRERR_BUFSIZE]; + + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to netfilter: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + + return link; +} + +struct bpf_link *bpf_program__attach(const struct bpf_program *prog) +{ + struct bpf_link *link = NULL; + int err; + + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + return libbpf_err_ptr(-EOPNOTSUPP); + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); + if (err) + return libbpf_err_ptr(err); + + /* When calling bpf_program__attach() explicitly, auto-attach support + * is expected to work, so NULL returned link is considered an error. + * This is different for skeleton's attach, see comment in + * bpf_object__attach_skeleton(). 
+ */ + if (!link) + return libbpf_err_ptr(-EOPNOTSUPP); + + return link; +} + +struct bpf_link_struct_ops { + struct bpf_link link; + int map_fd; +}; + +static int bpf_link__detach_struct_ops(struct bpf_link *link) +{ + struct bpf_link_struct_ops *st_link; + __u32 zero = 0; + + st_link = container_of(link, struct bpf_link_struct_ops, link); + + if (st_link->map_fd < 0) + /* w/o a real link */ + return bpf_map_delete_elem(link->fd, &zero); + + return close(link->fd); +} + +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + struct bpf_link_struct_ops *link; + __u32 zero = 0; + int err, fd; + + if (!bpf_map__is_struct_ops(map) || map->fd == -1) + return libbpf_err_ptr(-EINVAL); + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-EINVAL); + + /* kern_vdata should be prepared during the loading phase. */ + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) { + free(link); + return libbpf_err_ptr(err); + } + + link->link.detach = bpf_link__detach_struct_ops; + + if (!(map->def.map_flags & BPF_F_LINK)) { + /* w/o a real link */ + link->link.fd = map->fd; + link->map_fd = -1; + return &link->link; + } + + fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + if (fd < 0) { + free(link); + return libbpf_err_ptr(fd); + } + + link->link.fd = fd; + link->map_fd = map->fd; + + return &link->link; +} + +/* + * Swap the back struct_ops of a link with a new struct_ops map. + */ +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) +{ + struct bpf_link_struct_ops *st_ops_link; + __u32 zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map) || map->fd < 0) + return -EINVAL; + + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); + /* Ensure the type of a link is correct */ + if (st_ops_link->map_fd < 0) + return -EINVAL; + + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. 
+ */ + if (err && err != -EBUSY) + return err; + + err = bpf_link_update(link->fd, map->fd, NULL); + if (err < 0) + return err; + + st_ops_link->map_fd = map->fd; + + return 0; +} + +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + +static enum bpf_perf_event_ret +perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) +{ + struct perf_event_mmap_page *header = mmap_mem; + __u64 data_head = ring_buffer_read_head(header); + __u64 data_tail = header->data_tail; + void *base = ((__u8 *)header) + page_size; + int ret = LIBBPF_PERF_EVENT_CONT; + struct perf_event_header *ehdr; + size_t ehdr_size; + + while (data_head != data_tail) { + ehdr = base + (data_tail & (mmap_size - 1)); + ehdr_size = ehdr->size; + + if (((void *)ehdr) + ehdr_size > base + mmap_size) { + void *copy_start = ehdr; + size_t len_first = base + mmap_size - copy_start; + size_t len_secnd = ehdr_size - len_first; + + if (*copy_size < ehdr_size) { + free(*copy_mem); + *copy_mem = malloc(ehdr_size); + if (!*copy_mem) { + *copy_size = 0; + ret = LIBBPF_PERF_EVENT_ERROR; + break; + } + *copy_size = ehdr_size; + } + + memcpy(*copy_mem, copy_start, len_first); + memcpy(*copy_mem + len_first, base, len_secnd); + ehdr = *copy_mem; + } + + ret = fn(ehdr, private_data); + data_tail += ehdr_size; + if (ret != LIBBPF_PERF_EVENT_CONT) + break; + } + + ring_buffer_write_tail(header, data_tail); + return libbpf_err(ret); +} + +struct perf_buffer; + +struct perf_buffer_params { + struct perf_event_attr *attr; + /* if event_cb is specified, it takes precendence */ + perf_buffer_event_fn event_cb; + /* sample_cb and lost_cb are higher-level common-case callbacks */ + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; + int cpu_cnt; + int *cpus; + int *map_keys; +}; + +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void perf_buffer__free_cpu_buf(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + if (!cpu_buf) + return; + if (cpu_buf->base && + munmap(cpu_buf->base, pb->mmap_size + pb->page_size)) + pr_warn("failed to munmap cpu_buf #%d\n", cpu_buf->cpu); + if (cpu_buf->fd >= 0) { + ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0); + close(cpu_buf->fd); + } + free(cpu_buf->buf); + free(cpu_buf); +} + +void perf_buffer__free(struct perf_buffer *pb) +{ + int i; + + if (IS_ERR_OR_NULL(pb)) + return; + if (pb->cpu_bufs) { + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); + perf_buffer__free_cpu_buf(pb, cpu_buf); + } + free(pb->cpu_bufs); + } + if (pb->epoll_fd >= 0) + close(pb->epoll_fd); + free(pb->events); + free(pb); +} + +static struct perf_cpu_buf * +perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, + int cpu, int 
map_key) +{ + struct perf_cpu_buf *cpu_buf; + char msg[STRERR_BUFSIZE]; + int err; + + cpu_buf = calloc(1, sizeof(*cpu_buf)); + if (!cpu_buf) + return ERR_PTR(-ENOMEM); + + cpu_buf->pb = pb; + cpu_buf->cpu = cpu; + cpu_buf->map_key = map_key; + + cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu, + -1, PERF_FLAG_FD_CLOEXEC); + if (cpu_buf->fd < 0) { + err = -errno; + pr_warn("failed to open perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu_buf->fd, 0); + if (cpu_buf->base == MAP_FAILED) { + cpu_buf->base = NULL; + err = -errno; + pr_warn("failed to mmap perf buffer on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("failed to enable perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + return cpu_buf; + +error: + perf_buffer__free_cpu_buf(pb, cpu_buf); + return (struct perf_cpu_buf *)ERR_PTR(err); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p); + +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_buffer_params p = {}; + struct perf_event_attr attr; + __u32 sample_period; + + if (!OPTS_VALID(opts, perf_buffer_opts)) + return libbpf_err_ptr(-EINVAL); + + sample_period = OPTS_GET(opts, sample_period, 1); + if (!sample_period) + sample_period = 1; + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = PERF_COUNT_SW_BPF_OUTPUT; + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = sample_period; + attr.wakeup_events = sample_period; + + p.attr = &attr; + p.sample_cb = sample_cb; + p.lost_cb = lost_cb; + p.ctx = ctx; + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) +{ + struct perf_buffer_params p = {}; + + if (!attr) + return libbpf_err_ptr(-EINVAL); + + if (!OPTS_VALID(opts, perf_buffer_raw_opts)) + return libbpf_err_ptr(-EINVAL); + + p.attr = attr; + p.event_cb = event_cb; + p.ctx = ctx; + p.cpu_cnt = OPTS_GET(opts, cpu_cnt, 0); + p.cpus = OPTS_GET(opts, cpus, NULL); + p.map_keys = OPTS_GET(opts, map_keys, NULL); + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p) +{ + const char *online_cpus_file = "/sys/devices/system/cpu/online"; + struct bpf_map_info map; + char msg[STRERR_BUFSIZE]; + struct perf_buffer *pb; + bool *online = NULL; + __u32 map_info_len; + int err, i, j, n; + + if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) { + pr_warn("page count should be power of two, but is %zu\n", + page_cnt); + return ERR_PTR(-EINVAL); + } + + /* best-effort sanity checks */ + memset(&map, 0, sizeof(map)); + map_info_len = sizeof(map); + err = bpf_map_get_info_by_fd(map_fd, &map, &map_info_len); + if (err) { + err = -errno; + /* if BPF_OBJ_GET_INFO_BY_FD is supported, will return + * -EBADFD, 
-EFAULT, or -E2BIG on real error + */ + if (err != -EINVAL) { + pr_warn("failed to get map info for map FD %d: %s\n", + map_fd, libbpf_strerror_r(err, msg, sizeof(msg))); + return ERR_PTR(err); + } + pr_debug("failed to get map info for FD %d; API not supported? Ignoring...\n", + map_fd); + } else { + if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_warn("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map.name); + return ERR_PTR(-EINVAL); + } + } + + pb = calloc(1, sizeof(*pb)); + if (!pb) + return ERR_PTR(-ENOMEM); + + pb->event_cb = p->event_cb; + pb->sample_cb = p->sample_cb; + pb->lost_cb = p->lost_cb; + pb->ctx = p->ctx; + + pb->page_size = getpagesize(); + pb->mmap_size = pb->page_size * page_cnt; + pb->map_fd = map_fd; + + pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (pb->epoll_fd < 0) { + err = -errno; + pr_warn("failed to create epoll instance: %s\n", + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (p->cpu_cnt > 0) { + pb->cpu_cnt = p->cpu_cnt; + } else { + pb->cpu_cnt = libbpf_num_possible_cpus(); + if (pb->cpu_cnt < 0) { + err = pb->cpu_cnt; + goto error; + } + if (map.max_entries && map.max_entries < pb->cpu_cnt) + pb->cpu_cnt = map.max_entries; + } + + pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events)); + if (!pb->events) { + err = -ENOMEM; + pr_warn("failed to allocate events: out of memory\n"); + goto error; + } + pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs)); + if (!pb->cpu_bufs) { + err = -ENOMEM; + pr_warn("failed to allocate buffers: out of memory\n"); + goto error; + } + + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warn("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf; + int cpu, map_key; + + cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; + map_key = p->cpu_cnt > 0 ? 
p->map_keys[i] : i; + + /* in case user didn't explicitly requested particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); + if (IS_ERR(cpu_buf)) { + err = PTR_ERR(cpu_buf); + goto error; + } + + pb->cpu_bufs[j] = cpu_buf; + + err = bpf_map_update_elem(pb->map_fd, &map_key, + &cpu_buf->fd, 0); + if (err) { + err = -errno; + pr_warn("failed to set cpu #%d, key %d -> perf FD %d: %s\n", + cpu, map_key, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; + if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, + &pb->events[j]) < 0) { + err = -errno; + pr_warn("failed to epoll_ctl cpu #%d perf FD %d: %s\n", + cpu, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + j++; + } + pb->cpu_cnt = j; + free(online); + + return pb; + +error: + free(online); + if (pb) + perf_buffer__free(pb); + return ERR_PTR(err); +} + +struct perf_sample_raw { + struct perf_event_header header; + uint32_t size; + char data[]; +}; + +struct perf_sample_lost { + struct perf_event_header header; + uint64_t id; + uint64_t lost; + uint64_t sample_id; +}; + +static enum bpf_perf_event_ret +perf_buffer__process_record(struct perf_event_header *e, void *ctx) +{ + struct perf_cpu_buf *cpu_buf = ctx; + struct perf_buffer *pb = cpu_buf->pb; + void *data = e; + + /* user wants full control over parsing perf event */ + if (pb->event_cb) + return pb->event_cb(pb->ctx, cpu_buf->cpu, e); + + switch (e->type) { + case PERF_RECORD_SAMPLE: { + struct perf_sample_raw *s = data; + + if (pb->sample_cb) + pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size); + break; + } + case PERF_RECORD_LOST: { + struct perf_sample_lost *s = data; + + if (pb->lost_cb) + pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost); + break; + } + default: + pr_warn("unknown perf sample type %d\n", e->type); + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static int perf_buffer__process_records(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + enum bpf_perf_event_ret ret; + + ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); + if (ret != LIBBPF_PERF_EVENT_CONT) + return ret; + return 0; +} + +int perf_buffer__epoll_fd(const struct perf_buffer *pb) +{ + return pb->epoll_fd; +} + +int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) +{ + int i, cnt, err; + + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); + if (cnt < 0) + return -errno; + + for (i = 0; i < cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("error while processing records: %d\n", err); + return libbpf_err(err); + } + } + return cnt; +} + +/* Return number of PERF_EVENT_ARRAY map slots set up by this perf_buffer + * manager. + */ +size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb) +{ + return pb->cpu_cnt; +} + +/* + * Return perf_event FD of a ring buffer in *buf_idx* slot of + * PERF_EVENT_ARRAY BPF map. This FD can be polled for new data using + * select()/poll()/epoll() Linux syscalls. 
+ */ +int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return cpu_buf->fd; +} + +int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, size_t *buf_size) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + *buf = cpu_buf->base; + *buf_size = pb->mmap_size; + return 0; +} + +/* + * Consume data from perf ring buffer corresponding to slot *buf_idx* in + * PERF_EVENT_ARRAY BPF map without waiting/polling. If there is no data to + * consume, do nothing and return success. + * Returns: + * - 0 on success; + * - <0 on failure. + */ +int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return perf_buffer__process_records(pb, cpu_buf); +} + +int perf_buffer__consume(struct perf_buffer *pb) +{ + int i, err; + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("perf_buffer: failed to process records in buffer #%d: %d\n", i, err); + return libbpf_err(err); + } + } + return 0; +} + +int bpf_program__set_attach_target(struct bpf_program *prog, + int attach_prog_fd, + const char *attach_func_name) +{ + int btf_obj_fd = 0, btf_id = 0, err; + + if (!prog || attach_prog_fd < 0) + return libbpf_err(-EINVAL); + + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + if (attach_prog_fd && !attach_func_name) { + /* remember attach_prog_fd and let bpf_program__load() find + * BTF ID during the program load + */ + prog->attach_prog_fd = attach_prog_fd; + return 0; + } + + if (attach_prog_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, + attach_prog_fd); + if (btf_id < 0) + return libbpf_err(btf_id); + } else { + if (!attach_func_name) + return libbpf_err(-EINVAL); + + /* load btf_vmlinux, if not yet */ + err = bpf_object__load_vmlinux_btf(prog->obj, true); + if (err) + return libbpf_err(err); + err = find_kernel_btf_id(prog->obj, attach_func_name, + prog->expected_attach_type, + &btf_obj_fd, &btf_id); + if (err) + return libbpf_err(err); + } + + prog->attach_btf_id = btf_id; + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_prog_fd = attach_prog_fd; + return 0; +} + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) +{ + int err = 0, n, len, start, end = -1; + bool *tmp; + + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warn("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warn("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + 
s += len; + } + if (!*mask_sz) { + pr_warn("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; + + fd = open(fcpu, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; + } + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len <= 0) { + err = len ? -errno : -EINVAL; + pr_warn("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; + } + if (len >= sizeof(buf)) { + pr_warn("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; + } + buf[len] = '\0'; + + return parse_cpu_mask_str(buf, mask, mask_sz); +} + +int libbpf_num_possible_cpus(void) +{ + static const char *fcpu = "/sys/devices/system/cpu/possible"; + static int cpus; + int err, n, i, tmp_cpus; + bool *mask; + + tmp_cpus = READ_ONCE(cpus); + if (tmp_cpus > 0) + return tmp_cpus; + + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return libbpf_err(err); + + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; + } + free(mask); + + WRITE_ONCE(cpus, tmp_cpus); + return tmp_cpus; +} + +static int populate_skeleton_maps(const struct bpf_object *obj, + struct bpf_map_skeleton *maps, + size_t map_cnt) +{ + int i; + + for (i = 0; i < map_cnt; i++) { + struct bpf_map **map = maps[i].map; + const char *name = maps[i].name; + void **mmaped = maps[i].mmaped; + + *map = bpf_object__find_map_by_name(obj, name); + if (!*map) { + pr_warn("failed to find skeleton map '%s'\n", name); + return -ESRCH; + } + + /* externs shouldn't be pre-setup from user code */ + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_KCONFIG) + *mmaped = (*map)->mmaped; + } + return 0; +} + +static int populate_skeleton_progs(const struct bpf_object *obj, + struct bpf_prog_skeleton *progs, + size_t prog_cnt) +{ + int i; + + for (i = 0; i < prog_cnt; i++) { + struct bpf_program **prog = progs[i].prog; + const char *name = progs[i].name; + + *prog = bpf_object__find_program_by_name(obj, name); + if (!*prog) { + pr_warn("failed to find skeleton program '%s'\n", name); + return -ESRCH; + } + } + return 0; +} + +int bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, + .object_name = s->name, + ); + struct bpf_object *obj; + int err; + + /* Attempt to preserve opts->object_name, unless overriden by user + * explicitly. Overwriting object name for skeletons is discouraged, + * as it breaks global data maps, because they contain object name + * prefix as their own map name prefix. When skeleton is generated, + * bpftool is making an assumption that this name will stay the same. 
+ */ + if (opts) { + memcpy(&skel_opts, opts, sizeof(*opts)); + if (!opts->object_name) + skel_opts.object_name = s->name; + } + + obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); + err = libbpf_get_error(obj); + if (err) { + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", + s->name, err); + return libbpf_err(err); + } + + *s->obj = obj; + err = populate_skeleton_maps(obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate skeleton maps for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate skeleton progs for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__open_subskeleton(struct bpf_object_subskeleton *s) +{ + int err, len, var_idx, i; + const char *var_name; + const struct bpf_map *map; + struct btf *btf; + __u32 map_type_id; + const struct btf_type *map_type, *var_type; + const struct bpf_var_skeleton *var_skel; + struct btf_var_secinfo *var; + + if (!s->obj) + return libbpf_err(-EINVAL); + + btf = bpf_object__btf(s->obj); + if (!btf) { + pr_warn("subskeletons require BTF at runtime (object %s)\n", + bpf_object__name(s->obj)); + return libbpf_err(-errno); + } + + err = populate_skeleton_maps(s->obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(s->obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + for (var_idx = 0; var_idx < s->var_cnt; var_idx++) { + var_skel = &s->vars[var_idx]; + map = *var_skel->map; + map_type_id = bpf_map__btf_value_type_id(map); + map_type = btf__type_by_id(btf, map_type_id); + + if (!btf_is_datasec(map_type)) { + pr_warn("type for map '%1$s' is not a datasec: %2$s", + bpf_map__name(map), + __btf_kind_str(btf_kind(map_type))); + return libbpf_err(-EINVAL); + } + + len = btf_vlen(map_type); + var = btf_var_secinfos(map_type); + for (i = 0; i < len; i++, var++) { + var_type = btf__type_by_id(btf, var->type); + var_name = btf__name_by_offset(btf, var_type->name_off); + if (strcmp(var_name, var_skel->name) == 0) { + *var_skel->addr = map->mmaped + var->offset; + break; + } + } + } + return 0; +} + +void bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s) +{ + if (!s) + return; + free(s->maps); + free(s->progs); + free(s->vars); + free(s); +} + +int bpf_object__load_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + err = bpf_object__load(*s->obj); + if (err) { + pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map *map = *s->maps[i].map; + size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + int prot, map_fd = bpf_map__fd(map); + void **mmaped = s->maps[i].mmaped; + + if (!mmaped) + continue; + + if (!(map->def.map_flags & BPF_F_MMAPABLE)) { + *mmaped = NULL; + continue; + } + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. 
This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure, so we don't need to worry + * about it from skeleton's clean up perspective. + */ + *mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0); + if (*mmaped == MAP_FAILED) { + err = -errno; + *mmaped = NULL; + pr_warn("failed to re-mmap() map '%s': %d\n", + bpf_map__name(map), err); + return libbpf_err(err); + } + } + + return 0; +} + +int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program *prog = *s->progs[i].prog; + struct bpf_link **link = s->progs[i].link; + + if (!prog->autoload || !prog->autoattach) + continue; + + /* auto-attaching not supported for this program */ + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + continue; + + /* if user already set the link manually, don't attempt auto-attach */ + if (*link) + continue; + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link); + if (err) { + pr_warn("prog '%s': failed to auto-attach: %d\n", + bpf_program__name(prog), err); + return libbpf_err(err); + } + + /* It's possible that for some SEC() definitions auto-attach + * is supported in some cases (e.g., if definition completely + * specifies target information), but is not in other cases. + * SEC("uprobe") is one such case. If user specified target + * binary and function name, such BPF program can be + * auto-attached. But if not, it shouldn't trigger skeleton's + * attach to fail. It should just be skipped. + * attach_fn signals such case with returning 0 (no error) and + * setting link to NULL. + */ + } + + return 0; +} + +void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_link **link = s->progs[i].link; + + bpf_link__destroy(*link); + *link = NULL; + } +} + +void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) +{ + if (!s) + return; + + if (s->progs) + bpf_object__detach_skeleton(s); + if (s->obj) + bpf_object__close(*s->obj); + free(s->maps); + free(s->progs); + free(s); +} diff --git a/third_party/bpftool/libbpf/src/libbpf.h b/third_party/bpftool/libbpf/src/libbpf.h new file mode 100644 index 0000000..10642ad --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf.h @@ -0,0 +1,1685 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. 
+ */
+#ifndef __LIBBPF_LIBBPF_H
+#define __LIBBPF_LIBBPF_H
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h> // for size_t
+#include <linux/bpf.h>
+
+#include "libbpf_common.h"
+#include "libbpf_legacy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBBPF_API __u32 libbpf_major_version(void);
+LIBBPF_API __u32 libbpf_minor_version(void);
+LIBBPF_API const char *libbpf_version_string(void);
+
+enum libbpf_errno {
+ __LIBBPF_ERRNO__START = 4000,
+
+ /* Something wrong in libelf */
+ LIBBPF_ERRNO__LIBELF = __LIBBPF_ERRNO__START,
+ LIBBPF_ERRNO__FORMAT, /* BPF object format invalid */
+ LIBBPF_ERRNO__KVERSION, /* Incorrect or no 'version' section */
+ LIBBPF_ERRNO__ENDIAN, /* Endian mismatch */
+ LIBBPF_ERRNO__INTERNAL, /* Internal error in libbpf */
+ LIBBPF_ERRNO__RELOC, /* Relocation failed */
+ LIBBPF_ERRNO__LOAD, /* Load program failure for unknown reason */
+ LIBBPF_ERRNO__VERIFY, /* Kernel verifier blocks program loading */
+ LIBBPF_ERRNO__PROG2BIG, /* Program too big */
+ LIBBPF_ERRNO__KVER, /* Incorrect kernel version */
+ LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */
+ LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */
+ LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */
+ LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */
+ __LIBBPF_ERRNO__END,
+};
+
+LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size);
+
+/**
+ * @brief **libbpf_bpf_attach_type_str()** converts the provided attach type
+ * value into a textual representation.
+ * @param t The attach type.
+ * @return Pointer to a static string identifying the attach type. NULL is
+ * returned for unknown **bpf_attach_type** values.
+ */
+LIBBPF_API const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t);
+
+/**
+ * @brief **libbpf_bpf_link_type_str()** converts the provided link type value
+ * into a textual representation.
+ * @param t The link type.
+ * @return Pointer to a static string identifying the link type. NULL is
+ * returned for unknown **bpf_link_type** values.
+ */
+LIBBPF_API const char *libbpf_bpf_link_type_str(enum bpf_link_type t);
+
+/**
+ * @brief **libbpf_bpf_map_type_str()** converts the provided map type value
+ * into a textual representation.
+ * @param t The map type.
+ * @return Pointer to a static string identifying the map type. NULL is
+ * returned for unknown **bpf_map_type** values.
+ */
+LIBBPF_API const char *libbpf_bpf_map_type_str(enum bpf_map_type t);
+
+/**
+ * @brief **libbpf_bpf_prog_type_str()** converts the provided program type
+ * value into a textual representation.
+ * @param t The program type.
+ * @return Pointer to a static string identifying the program type. NULL is
+ * returned for unknown **bpf_prog_type** values.
+ */
+LIBBPF_API const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t);
+
+enum libbpf_print_level {
+ LIBBPF_WARN,
+ LIBBPF_INFO,
+ LIBBPF_DEBUG,
+};
+
+typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level,
+ const char *, va_list ap);
+
+/**
+ * @brief **libbpf_set_print()** sets user-provided log callback function to
+ * be used for libbpf warnings and informational messages.
+ * @param fn The log print function. If NULL, libbpf won't print anything.
+ * @return Pointer to old print function.
+ *
+ * This function is thread-safe.
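+ *
+ * A minimal usage sketch of installing a custom print callback (the
+ * callback name here is illustrative):
+ *
+ *    static int my_print(enum libbpf_print_level level,
+ *                        const char *fmt, va_list args)
+ *    {
+ *        if (level == LIBBPF_DEBUG)
+ *            return 0;
+ *        return vfprintf(stderr, fmt, args);
+ *    }
+ *
+ *    libbpf_set_print(my_print);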
+ */
+LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn);
+
+/* Hide internal to user */
+struct bpf_object;
+
+struct bpf_object_open_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* object name override, if provided:
+ * - for object open from file, this will override setting object
+ * name from file path's base name;
+ * - for object open from memory buffer, this will specify an object
+ * name and will override default "-" name;
+ */
+ const char *object_name;
+ /* parse map definitions non-strictly, allowing extra attributes/data */
+ bool relaxed_maps;
+ /* maps that set the 'pinning' attribute in their definition will have
+ * their pin_path attribute set to a file in this directory, and be
+ * auto-pinned to that path on load; defaults to "/sys/fs/bpf".
+ */
+ const char *pin_root_path;
+
+ __u32 :32; /* stub out now removed attach_prog_fd */
+
+ /* Additional kernel config content that augments and overrides
+ * system Kconfig for CONFIG_xxx externs.
+ */
+ const char *kconfig;
+ /* Path to the custom BTF to be used for BPF CO-RE relocations.
+ * This custom BTF completely replaces the use of vmlinux BTF
+ * for the purpose of CO-RE relocations.
+ * NOTE: any other BPF feature (e.g., fentry/fexit programs,
+ * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux.
+ */
+ const char *btf_custom_path;
+ /* Pointer to a buffer for storing kernel logs for applicable BPF
+ * commands. Valid kernel_log_size has to be specified as well and both
+ * are passed-through to bpf() syscall. Keep in mind that kernel might
+ * fail operation with -ENOSPC error if provided buffer is too small
+ * to contain entire log output.
+ * See the comment below for kernel_log_level for interaction between
+ * log_buf and log_level settings.
+ *
+ * If specified, this log buffer will be passed for:
+ * - each BPF program load (BPF_PROG_LOAD) attempt, unless overridden
+ * with bpf_program__set_log_buf() on per-program level, to get
+ * BPF verifier log output.
+ * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get
+ * BTF sanity checking log.
+ *
+ * Each BPF command (BPF_BTF_LOAD or BPF_PROG_LOAD) will overwrite
+ * previous contents, so if you need more fine-grained control, set
+ * per-program buffer with bpf_program__set_log_buf() to preserve each
+ * individual program's verification log. Keep using kernel_log_buf
+ * for BTF verification log, if necessary.
+ */
+ char *kernel_log_buf;
+ size_t kernel_log_size;
+ /*
+ * Log level can be set independently from log buffer. log_level=0
+ * means that libbpf will attempt loading BTF or program without any
+ * logging requested, but will retry with either its own or custom log
+ * buffer, if provided, and log_level=1 on any error.
+ * And vice versa, setting log_level>0 will request BTF or prog
+ * loading with verbose log from the first attempt (and as such also
+ * for successfully loaded BTF or program), and the actual log buffer
+ * could be either libbpf's own auto-allocated log buffer, if
+ * kernel_log_buf is NULL, or user-provided custom kernel_log_buf.
+ * If user didn't provide custom log buffer, libbpf will emit captured
+ * logs through its print callback.
+ */
+ __u32 kernel_log_level;
+
+ size_t :0;
+};
+#define bpf_object_open_opts__last_field kernel_log_level
+
+/**
+ * @brief **bpf_object__open()** creates a bpf_object by opening
+ * the BPF ELF object file pointed to by the passed path and loading it
+ * into memory.
+ * @param path BPF object file path.
+ * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object *bpf_object__open(const char *path); + +/** + * @brief **bpf_object__open_file()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path + * @param opts options for how to load the bpf object, this parameter is + * optional and can be set to NULL + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__open_mem()** creates a bpf_object by reading + * the BPF objects raw bytes from a memory buffer containing a valid + * BPF ELF object file. + * @param obj_buf pointer to the buffer containing ELF file bytes + * @param obj_buf_sz number of bytes in the buffer + * @param opts options for how to load the bpf object + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__load()** loads BPF object into kernel. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** APIs + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); + +/** + * @brief **bpf_object__close()** closes a BPF object and releases all + * resources. + * @param obj Pointer to a valid BPF object + */ +LIBBPF_API void bpf_object__close(struct bpf_object *obj); + +/** + * @brief **bpf_object__pin_maps()** pins each map contained within + * the BPF object at the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where maps should be pinned. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__pin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. + */ +LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path); + +/** + * @brief **bpf_object__unpin_maps()** unpins each map contained within + * the BPF object found in the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where pinned maps should be searched for. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__unpin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. 
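+ *
+ * A minimal usage sketch of map pinning (the pin directory below is
+ * illustrative):
+ *
+ *    err = bpf_object__pin_maps(obj, "/sys/fs/bpf/myapp");
+ *    if (err)
+ *        goto cleanup;
+ *    // ... pinned maps now outlive this process ...
+ *    err = bpf_object__unpin_maps(obj, "/sys/fs/bpf/myapp");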
+ */ +LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); + +LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); +LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); + +struct btf; +LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); +LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); + +LIBBPF_API struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name); + +LIBBPF_API int +libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type); +LIBBPF_API int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type); +LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type); + +/* Accessors of bpf_program */ +struct bpf_program; + +LIBBPF_API struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); + +#define bpf_object__for_each_program(pos, obj) \ + for ((pos) = bpf_object__next_program((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_program((obj), (pos))) + +LIBBPF_API struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); + +LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, + __u32 ifindex); + +LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); +LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); +LIBBPF_API bool bpf_program__autoattach(const struct bpf_program *prog); +LIBBPF_API void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach); + +struct bpf_insn; + +/** + * @brief **bpf_program__insns()** gives read-only access to BPF program's + * underlying BPF instructions. + * @param prog BPF program for which to return instructions + * @return a pointer to an array of BPF instructions that belong to the + * specified BPF program + * + * Returned pointer is always valid and not NULL. Number of `struct bpf_insn` + * pointed to can be fetched using **bpf_program__insn_cnt()** API. + * + * Keep in mind, libbpf can modify and append/delete BPF program's + * instructions as it processes BPF object file and prepares everything for + * uploading into the kernel. So depending on the point in BPF object + * lifetime, **bpf_program__insns()** can return different sets of + * instructions. As an example, during BPF object load phase BPF program + * instructions will be CO-RE-relocated, BPF subprograms instructions will be + * appended, ldimm64 instructions will have FDs embedded, etc. So instructions + * returned before **bpf_object__load()** and after it might be quite + * different. + */ +LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_insns()** can set BPF program's underlying + * BPF instructions. 
+ * + * WARNING: This is a very advanced libbpf API and users need to know + * what they are doing. This should be used from prog_prepare_load_fn + * callback only. + * + * @param prog BPF program for which to return instructions + * @param new_insns a pointer to an array of BPF instructions + * @param new_insn_cnt number of `struct bpf_insn`'s that form + * specified BPF program + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt); + +/** + * @brief **bpf_program__insn_cnt()** returns number of `struct bpf_insn`'s + * that form specified BPF program. + * @param prog BPF program for which to return number of BPF instructions + * + * See **bpf_program__insns()** documentation for notes on how libbpf can + * change instructions and their count during different phases of + * **bpf_object** lifetime. + */ +LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); + +LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); + +/** + * @brief **bpf_program__pin()** pins the BPF program to a file + * in the BPF FS specified by a path. This increments the programs + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param prog BPF program to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); + +/** + * @brief **bpf_program__unpin()** unpins the BPF program from a file + * in the BPFFS specified by a path. This decrements the programs + * reference count. + * + * The file pinning the BPF program can also be unlinked by a different + * process in which case this function will return an error. + * + * @param prog BPF program to unpin + * @param path file path to the pin in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__unpin(struct bpf_program *prog, const char *path); +LIBBPF_API void bpf_program__unload(struct bpf_program *prog); + +struct bpf_link; + +LIBBPF_API struct bpf_link *bpf_link__open(const char *path); +LIBBPF_API int bpf_link__fd(const struct bpf_link *link); +LIBBPF_API const char *bpf_link__pin_path(const struct bpf_link *link); +/** + * @brief **bpf_link__pin()** pins the BPF link to a file + * in the BPF FS specified by a path. This increments the links + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param link BPF link to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ + +LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); + +/** + * @brief **bpf_link__unpin()** unpins the BPF link from a file + * in the BPFFS specified by a path. This decrements the links + * reference count. + * + * The file pinning the BPF link can also be unlinked by a different + * process in which case this function will return an error. 
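+ *
+ * A minimal usage sketch (the pin path is illustrative):
+ *
+ *    err = bpf_link__pin(link, "/sys/fs/bpf/my_prog_link");
+ *    if (err)
+ *        goto cleanup;
+ *    // ... later, drop the pin and the link itself ...
+ *    err = bpf_link__unpin(link);
+ *    bpf_link__destroy(link);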
+ *
+ * @param link BPF link to unpin
+ * @return 0, on success; negative error code, otherwise
+ */
+LIBBPF_API int bpf_link__unpin(struct bpf_link *link);
+LIBBPF_API int bpf_link__update_program(struct bpf_link *link,
+ struct bpf_program *prog);
+LIBBPF_API void bpf_link__disconnect(struct bpf_link *link);
+LIBBPF_API int bpf_link__detach(struct bpf_link *link);
+LIBBPF_API int bpf_link__destroy(struct bpf_link *link);
+
+/**
+ * @brief **bpf_program__attach()** is a generic function for attaching
+ * a BPF program based on auto-detection of program type, attach type,
+ * and extra parameters, where applicable.
+ *
+ * @param prog BPF program to attach
+ * @return Reference to the newly created BPF link; or NULL is returned on error,
+ * error code is stored in errno
+ *
+ * This is supported for:
+ * - kprobe/kretprobe (depends on SEC() definition)
+ * - uprobe/uretprobe (depends on SEC() definition)
+ * - tracepoint
+ * - raw tracepoint
+ * - tracing programs (typed raw TP/fentry/fexit/fmod_ret)
+ */
+LIBBPF_API struct bpf_link *
+bpf_program__attach(const struct bpf_program *prog);
+
+struct bpf_perf_event_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+ __u64 bpf_cookie;
+ /* don't use BPF link when attaching BPF program */
+ bool force_ioctl_attach;
+ size_t :0;
+};
+#define bpf_perf_event_opts__last_field force_ioctl_attach
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd);
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd,
+ const struct bpf_perf_event_opts *opts);
+
+/**
+ * enum probe_attach_mode - the mode to attach kprobe/uprobe
+ *
+ * force libbpf to attach kprobe/uprobe in a specific mode; -ENOTSUP will
+ * be returned if it is not supported by the kernel.
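+ *
+ * A minimal usage sketch of forcing an attach mode (the kernel function
+ * name is illustrative):
+ *
+ *    LIBBPF_OPTS(bpf_kprobe_opts, opts,
+ *                .attach_mode = PROBE_ATTACH_MODE_LEGACY);
+ *    struct bpf_link *link;
+ *
+ *    link = bpf_program__attach_kprobe_opts(prog, "do_unlinkat", &opts);
+ *    if (!link)
+ *        goto cleanup; // errno holds the error code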
+ */
+enum probe_attach_mode {
+ /* attach probe in latest supported mode by kernel */
+ PROBE_ATTACH_MODE_DEFAULT = 0,
+ /* attach probe in legacy mode, using debugfs/tracefs */
+ PROBE_ATTACH_MODE_LEGACY,
+ /* create perf event with perf_event_open() syscall */
+ PROBE_ATTACH_MODE_PERF,
+ /* attach probe with BPF link */
+ PROBE_ATTACH_MODE_LINK,
+};
+
+struct bpf_kprobe_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+ __u64 bpf_cookie;
+ /* function's offset to install kprobe to */
+ size_t offset;
+ /* kprobe is return probe */
+ bool retprobe;
+ /* kprobe attach mode */
+ enum probe_attach_mode attach_mode;
+ size_t :0;
+};
+#define bpf_kprobe_opts__last_field attach_mode
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe,
+ const char *func_name);
+LIBBPF_API struct bpf_link *
+bpf_program__attach_kprobe_opts(const struct bpf_program *prog,
+ const char *func_name,
+ const struct bpf_kprobe_opts *opts);
+
+struct bpf_kprobe_multi_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* array of function symbols to attach */
+ const char **syms;
+ /* array of function addresses to attach */
+ const unsigned long *addrs;
+ /* array of user-provided values fetchable through bpf_get_attach_cookie */
+ const __u64 *cookies;
+ /* number of elements in syms/addrs/cookies arrays */
+ size_t cnt;
+ /* create return kprobes */
+ bool retprobe;
+ size_t :0;
+};
+
+#define bpf_kprobe_multi_opts__last_field retprobe
+
+LIBBPF_API struct bpf_link *
+bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
+ const char *pattern,
+ const struct bpf_kprobe_multi_opts *opts);
+
+struct bpf_ksyscall_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+ __u64 bpf_cookie;
+ /* attach as return probe? */
+ bool retprobe;
+ size_t :0;
+};
+#define bpf_ksyscall_opts__last_field retprobe
+
+/**
+ * @brief **bpf_program__attach_ksyscall()** attaches a BPF program
+ * to kernel syscall handler of a specified syscall. Optionally it's possible
+ * to request to install retprobe that will be triggered at syscall exit. It's
+ * also possible to associate BPF cookie (through options).
+ *
+ * Libbpf automatically will determine correct full kernel function name,
+ * which depending on system architecture and kernel version/configuration
+ * could be of the form __<arch>_sys_<syscall> or __se_sys_<syscall>, and will
+ * attach specified program using kprobe/kretprobe mechanism.
+ *
+ * **bpf_program__attach_ksyscall()** is an API counterpart of declarative
+ * **SEC("ksyscall/<syscall>")** annotation of BPF programs.
+ *
+ * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do
+ * not handle all the calling convention quirks for mmap(), clone() and compat
+ * syscalls. It also only attaches to "native" syscall interfaces. If host
+ * system supports compat syscalls or defines 32-bit syscalls in 64-bit
+ * kernel, such syscall interfaces won't be attached to by libbpf.
+ *
+ * These limitations may or may not change in the future. Therefore it is
+ * recommended to use SEC("kprobe") for these syscalls or if working with
+ * compat and 32-bit interfaces is required.
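+ *
+ * A minimal usage sketch (the syscall name and cookie value are
+ * illustrative):
+ *
+ *    LIBBPF_OPTS(bpf_ksyscall_opts, opts, .bpf_cookie = 123);
+ *    struct bpf_link *link;
+ *
+ *    link = bpf_program__attach_ksyscall(prog, "unlinkat", &opts);
+ *    if (!link)
+ *        goto cleanup; // errno holds the error code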
+ *
+ * @param prog BPF program to attach
+ * @param syscall_name Symbolic name of the syscall (e.g., "bpf")
+ * @param opts Additional options (see **struct bpf_ksyscall_opts**)
+ * @return Reference to the newly created BPF link; or NULL is returned on
+ * error, error code is stored in errno
+ */
+LIBBPF_API struct bpf_link *
+bpf_program__attach_ksyscall(const struct bpf_program *prog,
+ const char *syscall_name,
+ const struct bpf_ksyscall_opts *opts);
+
+struct bpf_uprobe_opts {
+ /* size of this struct, for forward/backward compatibility */
+ size_t sz;
+ /* offset of kernel reference counted USDT semaphore, added in
+ * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe")
+ */
+ size_t ref_ctr_offset;
+ /* custom user-provided value fetchable through bpf_get_attach_cookie() */
+ __u64 bpf_cookie;
+ /* uprobe is return probe, invoked at function return time */
+ bool retprobe;
+ /* Function name to attach to. Could be an unqualified ("abc") or library-qualified
+ * "abc@LIBXYZ" name. To specify function entry, func_name should be set while
+ * func_offset argument to bpf_program__attach_uprobe_opts() should be 0. To trace an
+ * offset within a function, specify func_name and use func_offset argument to specify
+ * offset within the function. Shared library functions must specify the shared library
+ * binary_path.
+ */
+ const char *func_name;
+ /* uprobe attach mode */
+ enum probe_attach_mode attach_mode;
+ size_t :0;
+};
+#define bpf_uprobe_opts__last_field attach_mode
+
+/**
+ * @brief **bpf_program__attach_uprobe()** attaches a BPF program
+ * to the userspace function which is found by binary path and
+ * offset. You can optionally specify a particular process to attach
+ * to. You can also optionally attach the program to the function
+ * exit instead of entry.
+ *
+ * @param prog BPF program to attach
+ * @param retprobe Attach to function exit
+ * @param pid Process ID to attach the uprobe to, 0 for self (own process),
+ * -1 for all processes
+ * @param binary_path Path to binary that contains the function symbol
+ * @param func_offset Offset within the binary of the function symbol
+ * @return Reference to the newly created BPF link; or NULL is returned on error,
+ * error code is stored in errno
+ */
+LIBBPF_API struct bpf_link *
+bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe,
+ pid_t pid, const char *binary_path,
+ size_t func_offset);
+
+/**
+ * @brief **bpf_program__attach_uprobe_opts()** is just like
+ * bpf_program__attach_uprobe() except with an options struct
+ * for various configurations.
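+ *
+ * A minimal usage sketch (the library and function names are illustrative):
+ *
+ *    LIBBPF_OPTS(bpf_uprobe_opts, opts,
+ *                .func_name = "malloc",
+ *                .retprobe = true);
+ *    struct bpf_link *link;
+ *
+ *    link = bpf_program__attach_uprobe_opts(prog, -1,
+ *                                           "libc.so.6", 0, &opts);
+ *    if (!link)
+ *        goto cleanup; // pid -1 means all processes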
+ * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts); + +struct bpf_usdt_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value accessible through usdt_cookie() */ + __u64 usdt_cookie; + size_t :0; +}; +#define bpf_usdt_opts__last_field usdt_cookie + +/** + * @brief **bpf_program__attach_usdt()** is just like + * bpf_program__attach_uprobe_opts() except it covers USDT (User-space + * Statically Defined Tracepoint) attachment, instead of attaching to + * user-space function entry or exit. + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains provided USDT probe + * @param usdt_provider USDT provider name + * @param usdt_name USDT probe name + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts); + +struct bpf_tracepoint_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_tracepoint_opts__last_field bpf_cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name); + +struct bpf_trace_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 cookie; +}; +#define bpf_trace_opts__last_field cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_trace(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_trace_opts(const struct bpf_program *prog, const struct bpf_trace_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_lsm(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); +LIBBPF_API struct bpf_link * +bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, const 
char *attach_func_name); + +struct bpf_netfilter_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; +}; +#define bpf_netfilter_opts__last_field flags + +LIBBPF_API struct bpf_link * +bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts); + +struct bpf_map; + +LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map); +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map); + +struct bpf_iter_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + union bpf_iter_link_info *link_info; + __u32 link_info_len; +}; +#define bpf_iter_attach_opts__last_field link_info_len + +LIBBPF_API struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts); + +LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_type()** sets the program + * type of the passed BPF program. + * @param prog BPF program to set the program type for + * @param type program type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int bpf_program__set_type(struct bpf_program *prog, + enum bpf_prog_type type); + +LIBBPF_API enum bpf_attach_type +bpf_program__expected_attach_type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_expected_attach_type()** sets the + * attach type of the passed BPF program. This is used for + * auto-detection of attachment when programs are loaded. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int +bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type); + +LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); + +/* Per-program log level and log buffer getters/setters. + * See bpf_object_open_opts comments regarding log_level and log_buf + * interactions. + */ +LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level); +LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size); +LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size); + +/** + * @brief **bpf_program__set_attach_target()** sets BTF-based attach target + * for supported BPF program types: + * - BTF-aware raw tracepoints (tp_btf); + * - fentry/fexit/fmod_ret; + * - lsm; + * - freplace. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error occurred. 
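+ *
+ * A minimal usage sketch for a fentry program (the kernel function name is
+ * illustrative):
+ *
+ *    bpf_program__set_type(prog, BPF_PROG_TYPE_TRACING);
+ *    bpf_program__set_expected_attach_type(prog, BPF_TRACE_FENTRY);
+ *    err = bpf_program__set_attach_target(prog, 0, "tcp_v4_connect");
+ *    if (err)
+ *        goto cleanup;
+ *    err = bpf_object__load(obj); // target must be set before load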
+ */ +LIBBPF_API int +bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, + const char *attach_func_name); + +/** + * @brief **bpf_object__find_map_by_name()** returns BPF map of + * the given name, if it exists within the passed BPF object + * @param obj BPF object + * @param name name of the BPF map + * @return BPF map instance, if such map exists within the BPF object; + * or NULL otherwise. + */ +LIBBPF_API struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); + +#define bpf_object__for_each_map(pos, obj) \ + for ((pos) = bpf_object__next_map((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_map((obj), (pos))) +#define bpf_map__for_each bpf_object__for_each_map + +LIBBPF_API struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); + +/** + * @brief **bpf_map__set_autocreate()** sets whether libbpf has to auto-create + * BPF map during BPF object load phase. + * @param map the BPF map instance + * @param autocreate whether to create BPF map during BPF object load + * @return 0 on success; -EBUSY if BPF object was already loaded + * + * **bpf_map__set_autocreate()** allows to opt-out from libbpf auto-creating + * BPF map. By default, libbpf will attempt to create every single BPF map + * defined in BPF object file using BPF_MAP_CREATE command of bpf() syscall + * and fill in map FD in BPF instructions. + * + * This API allows to opt-out of this process for specific map instance. This + * can be useful if host kernel doesn't support such BPF map type or used + * combination of flags and user application wants to avoid creating such + * a map in the first place. User is still responsible to make sure that their + * BPF-side code that expects to use such missing BPF map is recognized by BPF + * verifier as dead code, otherwise BPF verifier will reject such BPF program. 
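+ *
+ * A minimal usage sketch (the map name is illustrative):
+ *
+ *    struct bpf_map *map = bpf_object__find_map_by_name(obj, "optional_map");
+ *
+ *    if (map)
+ *        bpf_map__set_autocreate(map, false); // must happen before load
+ *    err = bpf_object__load(obj);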
+ */ +LIBBPF_API int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate); +LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); + +/** + * @brief **bpf_map__fd()** gets the file descriptor of the passed + * BPF map + * @param map the BPF map instance + * @return the file descriptor; or -EINVAL in case of an error + */ +LIBBPF_API int bpf_map__fd(const struct bpf_map *map); +LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +/* get map name */ +LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); +/* get/set map type */ +LIBBPF_API enum bpf_map_type bpf_map__type(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); +/* get/set map size (max_entries) */ +LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); +/* get/set map flags */ +LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); +/* get/set map NUMA node */ +LIBBPF_API __u32 bpf_map__numa_node(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node); +/* get/set map key size */ +LIBBPF_API __u32 bpf_map__key_size(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_key_size(struct bpf_map *map, __u32 size); +/* get map value size */ +LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); +/** + * @brief **bpf_map__set_value_size()** sets map value size. + * @param map the BPF map instance + * @return 0, on success; negative error, otherwise + * + * There is a special case for maps with associated memory-mapped regions, like + * the global data section maps (bss, data, rodata). When this function is used + * on such a map, the mapped region is resized. Afterward, an attempt is made to + * adjust the corresponding BTF info. This attempt is best-effort and can only + * succeed if the last variable of the data section map is an array. The array + * BTF type is replaced by a new BTF array type with a different length. + * Any previously existing pointers returned from bpf_map__initial_value() or + * corresponding data section skeleton pointer must be reinitialized. 
+ */ +LIBBPF_API int bpf_map__set_value_size(struct bpf_map *map, __u32 size); +/* get map key/value BTF type IDs */ +LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); +/* get/set map if_index */ +LIBBPF_API __u32 bpf_map__ifindex(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +/* get/set map map_extra flags */ +LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); + +LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size); +LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); + +/** + * @brief **bpf_map__is_internal()** tells the caller whether or not the + * passed map is a special map created by libbpf automatically for things like + * global variables, __ksym externs, Kconfig values, etc + * @param map the bpf_map + * @return true, if the map is an internal map; false, otherwise + */ +LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); + +/** + * @brief **bpf_map__set_pin_path()** sets the path attribute that tells where the + * BPF map should be pinned. This does not actually create the 'pin'. + * @param map The bpf_map + * @param path The path + * @return 0, on success; negative error, otherwise + */ +LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__pin_path()** gets the path attribute that tells where the + * BPF map should be pinned. + * @param map The bpf_map + * @return The path string; which can be NULL + */ +LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); + +/** + * @brief **bpf_map__is_pinned()** tells the caller whether or not the + * passed map has been pinned via a 'pin' file. + * @param map The bpf_map + * @return true, if the map is pinned; false, otherwise + */ +LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); + +/** + * @brief **bpf_map__pin()** creates a file that serves as a 'pin' + * for the BPF map. This increments the reference count on the + * BPF map which will keep the BPF map loaded even after the + * userspace process which loaded it has exited. + * @param map The bpf_map to pin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * If `path` is NULL the maps `pin_path` attribute will be used. If this is + * also NULL, an error will be returned and the map will not be pinned. + */ +LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__unpin()** removes the file that serves as a + * 'pin' for the BPF map. + * @param map The bpf_map to unpin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * The `path` parameter can be NULL, in which case the `pin_path` + * map attribute is unpinned. If both the `path` parameter and + * `pin_path` map attribute are set, they must be equal. + */ +LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); + +LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); + +/** + * @brief **bpf_map__lookup_elem()** allows to lookup BPF map value + * corresponding to provided key. 
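+ *
+ * A minimal usage sketch (the key/value types are illustrative):
+ *
+ *    __u32 key = 0;
+ *    __u64 value;
+ *
+ *    err = bpf_map__lookup_elem(map, &key, sizeof(key),
+ *                               &value, sizeof(value), 0);
+ *    if (err)
+ *        goto cleanup;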
+ * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_elem()** is high-level equivalent of + * **bpf_map_lookup_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__update_elem()** allows to insert or update value in BPF + * map that corresponds to provided key. + * @param map BPF map to insert to or update element in + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory containing bytes of the value + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__update_elem()** is high-level equivalent of + * **bpf_map_update_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__delete_elem()** allows to delete element in BPF map that + * corresponds to provided key. + * @param map BPF map to delete element from + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__delete_elem()** is high-level equivalent of + * **bpf_map_delete_elem()** API with added check for key size. + */ +LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags); + +/** + * @brief **bpf_map__lookup_and_delete_elem()** allows to lookup BPF map value + * corresponding to provided key and atomically delete it afterwards. 
+ * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of + * **bpf_map_lookup_and_delete_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__get_next_key()** allows to iterate BPF map keys by + * fetching next key that follows current key. + * @param map BPF map to fetch next key from + * @param cur_key pointer to memory containing bytes of current key or NULL to + * fetch the first key + * @param next_key pointer to memory to write next key into + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @return 0, on success; -ENOENT if **cur_key** is the last key in BPF map; + * negative error, otherwise + * + * **bpf_map__get_next_key()** is high-level equivalent of + * **bpf_map_get_next_key()** API with added check for key size. 
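+ *
+ * A minimal key-iteration sketch (the key type is illustrative):
+ *
+ *    __u32 cur, next;
+ *    int err;
+ *
+ *    err = bpf_map__get_next_key(map, NULL, &next, sizeof(next));
+ *    while (!err) {
+ *        cur = next;
+ *        // ... use cur ...
+ *        err = bpf_map__get_next_key(map, &cur, &next, sizeof(next));
+ *    }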
+ */ +LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz); + +struct bpf_xdp_set_link_opts { + size_t sz; + int old_fd; + size_t :0; +}; +#define bpf_xdp_set_link_opts__last_field old_fd + +struct bpf_xdp_attach_opts { + size_t sz; + int old_prog_fd; + size_t :0; +}; +#define bpf_xdp_attach_opts__last_field old_prog_fd + +struct bpf_xdp_query_opts { + size_t sz; + __u32 prog_id; /* output */ + __u32 drv_prog_id; /* output */ + __u32 hw_prog_id; /* output */ + __u32 skb_prog_id; /* output */ + __u8 attach_mode; /* output */ + __u64 feature_flags; /* output */ + size_t :0; +}; +#define bpf_xdp_query_opts__last_field feature_flags + +LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_detach(int ifindex, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_query(int ifindex, int flags, struct bpf_xdp_query_opts *opts); +LIBBPF_API int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id); + +/* TC related API */ +enum bpf_tc_attach_point { + BPF_TC_INGRESS = 1 << 0, + BPF_TC_EGRESS = 1 << 1, + BPF_TC_CUSTOM = 1 << 2, +}; + +#define BPF_TC_PARENT(a, b) \ + ((((a) << 16) & 0xFFFF0000U) | ((b) & 0x0000FFFFU)) + +enum bpf_tc_flags { + BPF_TC_F_REPLACE = 1 << 0, +}; + +struct bpf_tc_hook { + size_t sz; + int ifindex; + enum bpf_tc_attach_point attach_point; + __u32 parent; + size_t :0; +}; +#define bpf_tc_hook__last_field parent + +struct bpf_tc_opts { + size_t sz; + int prog_fd; + __u32 flags; + __u32 prog_id; + __u32 handle; + __u32 priority; + size_t :0; +}; +#define bpf_tc_opts__last_field priority + +LIBBPF_API int bpf_tc_hook_create(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_hook_destroy(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_attach(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); + +/* Ring buffer APIs */ +struct ring_buffer; +struct user_ring_buffer; + +typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); + +struct ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define ring_buffer_opts__last_field sz + +LIBBPF_API struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts); +LIBBPF_API void ring_buffer__free(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx); +LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); +LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); + +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/** + * @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. 
+ */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/** + * @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/** + * @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. 
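+ *
+ * A minimal producer sketch (struct my_sample and the 1000 ms timeout are
+ * illustrative):
+ *
+ *    struct my_sample *s;
+ *
+ *    s = user_ring_buffer__reserve_blocking(rb, sizeof(*s), 1000);
+ *    if (!s)
+ *        goto cleanup; // errno is E2BIG, ENOSPC, etc.
+ *    s->value = 42;
+ *    user_ring_buffer__submit(rb, s);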
+ */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/** + * @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + +/* Perf buffer APIs */ +struct perf_buffer; + +typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, + void *data, __u32 size); +typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); + +/* common use perf buffer options */ +struct perf_buffer_opts { + size_t sz; + __u32 sample_period; + size_t :0; +}; +#define perf_buffer_opts__last_field sample_period + +/** + * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified + * BPF_PERF_EVENT_ARRAY map + * @param map_fd FD of BPF_PERF_EVENT_ARRAY BPF map that will be used by BPF + * code to send data over to user-space + * @param page_cnt number of memory pages allocated for each per-CPU buffer + * @param sample_cb function called on each received data record + * @param lost_cb function called when record loss has occurred + * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb* + * @return a new instance of struct perf_buffer on success, NULL on error with + * *errno* containing an error code + */ +LIBBPF_API struct perf_buffer * +perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, + const struct perf_buffer_opts *opts); + +enum bpf_perf_event_ret { + LIBBPF_PERF_EVENT_DONE = 0, + LIBBPF_PERF_EVENT_ERROR = -1, + LIBBPF_PERF_EVENT_CONT = -2, +}; + +struct perf_event_header; + +typedef enum bpf_perf_event_ret +(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event); + +/* raw perf buffer options, giving most power and control */ +struct perf_buffer_raw_opts { + size_t sz; + long :0; + long :0; + /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of + * max_entries of given PERF_EVENT_ARRAY map) + */ + int cpu_cnt; + /* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */ + int *cpus; + /* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */ + int *map_keys; +}; +#define perf_buffer_raw_opts__last_field map_keys + +struct perf_event_attr; + +LIBBPF_API struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts); + +LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); +LIBBPF_API int 
perf_buffer__consume(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx); +LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); +/** + * @brief **perf_buffer__buffer()** returns the per-cpu raw mmap()'ed underlying + * memory region of the ring buffer. + * This ring buffer can be used to implement a custom events consumer. + * The ring buffer starts with the *struct perf_event_mmap_page*, which + * holds the ring buffer managment fields, when accessing the header + * structure it's important to be SMP aware. + * You can refer to *perf_event_read_simple* for a simple example. + * @param pb the perf buffer structure + * @param buf_idx the buffer index to retreive + * @param buf (out) gets the base pointer of the mmap()'ed memory + * @param buf_size (out) gets the size of the mmap()'ed region + * @return 0 on success, negative error code for failure + */ +LIBBPF_API int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, + size_t *buf_size); + +struct bpf_prog_linfo; +struct bpf_prog_info; + +LIBBPF_API void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo); +LIBBPF_API struct bpf_prog_linfo * +bpf_prog_linfo__new(const struct bpf_prog_info *info); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip); + +/* + * Probe for supported system features + * + * Note that running many of these probes in a short amount of time can cause + * the kernel to reach the maximal size of lockable memory allowed for the + * user, causing subsequent probes to fail. In this case, the caller may want + * to adjust that limit with setrlimit(). + */ + +/** + * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports + * BPF programs of a given type. + * @param prog_type BPF program type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given program type is supported; 0, if given program type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_map_type()** detects if host kernel supports + * BPF maps of a given type. + * @param map_type BPF map type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given map type is supported; 0, if given map type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_helper()** detects if host kernel supports the + * use of a given BPF helper from specified BPF program type. 
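+ *
+ * A minimal usage sketch:
+ *
+ *    bool supported;
+ *
+ *    supported = libbpf_probe_bpf_helper(BPF_PROG_TYPE_KPROBE,
+ *                                        BPF_FUNC_get_current_pid_tgid,
+ *                                        NULL) == 1;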
+ * @param prog_type BPF program type used to check the support of BPF helper + * @param helper_id BPF helper ID (enum bpf_func_id) to check support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given combination of program type and helper is supported; 0, + * if the combination is not supported; negative error code if feature + * detection for provided input arguments failed or can't be performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, + enum bpf_func_id helper_id, const void *opts); + +/** + * @brief **libbpf_num_possible_cpus()** is a helper function to get the + * number of possible CPUs that the host kernel supports and expects. + * @return number of possible CPUs; or error code on failure + * + * Example usage: + * + * int ncpus = libbpf_num_possible_cpus(); + * if (ncpus < 0) { + * // error handling + * } + * long values[ncpus]; + * bpf_map_lookup_elem(per_cpu_map_fd, key, values); + */ +LIBBPF_API int libbpf_num_possible_cpus(void); + +struct bpf_map_skeleton { + const char *name; + struct bpf_map **map; + void **mmaped; +}; + +struct bpf_prog_skeleton { + const char *name; + struct bpf_program **prog; + struct bpf_link **link; +}; + +struct bpf_object_skeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const char *name; + const void *data; + size_t data_sz; + + struct bpf_object **obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; +}; + +LIBBPF_API int +bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts); +LIBBPF_API int bpf_object__load_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); + +struct bpf_var_skeleton { + const char *name; + struct bpf_map **map; + void **addr; +}; + +struct bpf_object_subskeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const struct bpf_object *obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; + + int var_cnt; + int var_skel_sz; /* sizeof(struct bpf_var_skeleton) */ + struct bpf_var_skeleton *vars; +}; + +LIBBPF_API int +bpf_object__open_subskeleton(struct bpf_object_subskeleton *s); +LIBBPF_API void +bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s); + +struct gen_loader_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ + const char *data; + const char *insns; + __u32 data_sz; + __u32 insns_sz; +}; + +#define gen_loader_opts__last_field insns_sz +LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, + struct gen_loader_opts *opts); + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker_file_opts { + /* size of this struct, for 
forward/backward compatibility */ + size_t sz; +}; +#define bpf_linker_file_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, + const char *filename, + const struct bpf_linker_file_opts *opts); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + +/* + * Custom handling of BPF program's SEC() definitions + */ + +struct bpf_prog_load_opts; /* defined in bpf.h */ + +/* Called during bpf_object__open() for each recognized BPF program. Callback + * can use various bpf_program__set_*() setters to adjust whatever properties + * are necessary. + */ +typedef int (*libbpf_prog_setup_fn_t)(struct bpf_program *prog, long cookie); + +/* Called right before libbpf performs bpf_prog_load() to load BPF program + * into the kernel. Callback can adjust opts as necessary. + */ +typedef int (*libbpf_prog_prepare_load_fn_t)(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie); + +/* Called during skeleton attach or through bpf_program__attach(). If + * auto-attach is not supported, callback should return 0 and set link to + * NULL (it's not considered an error during skeleton attach, but it will be + * an error for bpf_program__attach() calls). On error, error should be + * returned directly and link set to NULL. On success, return 0 and set link + * to a valid struct bpf_link. + */ +typedef int (*libbpf_prog_attach_fn_t)(const struct bpf_program *prog, long cookie, + struct bpf_link **link); + +struct libbpf_prog_handler_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* User-provided value that is passed to prog_setup_fn, + * prog_prepare_load_fn, and prog_attach_fn callbacks. Allows user to + * register one set of callbacks for multiple SEC() definitions and + * still be able to distinguish them, if necessary. For example, + * libbpf itself is using this to pass necessary flags (e.g., + * sleepable flag) to a common internal SEC() handler. + */ + long cookie; + /* BPF program initialization callback (see libbpf_prog_setup_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_setup_fn_t prog_setup_fn; + /* BPF program loading callback (see libbpf_prog_prepare_load_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + /* BPF program attach callback (see libbpf_prog_attach_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_attach_fn_t prog_attach_fn; +}; +#define libbpf_prog_handler_opts__last_field prog_attach_fn + +/** + * @brief **libbpf_register_prog_handler()** registers a custom BPF program + * SEC() handler. + * @param sec section prefix for which custom handler is registered + * @param prog_type BPF program type associated with specified section + * @param exp_attach_type Expected BPF attach type associated with specified section + * @param opts optional cookie, callbacks, and other extra options + * @return Non-negative handler ID is returned on success. This handler ID has + * to be passed to *libbpf_unregister_prog_handler()* to unregister such + * custom handler. Negative error code is returned on error. + * + * *sec* defines which SEC() definitions are handled by this custom handler + * registration. 
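The bpf_linker__*() declarations above cover the whole static-linking flow; as a rough sketch, linking two BPF object files into one output ELF (the paths and the wrapper function are placeholders) looks like this:

    #include <errno.h>
    #include <bpf/libbpf.h>

    int link_objects(const char *out_path, const char *obj1, const char *obj2)
    {
            struct bpf_linker *linker;
            int err;

            linker = bpf_linker__new(out_path, NULL);
            if (!linker)
                    return -errno;

            err = bpf_linker__add_file(linker, obj1, NULL);
            if (!err)
                    err = bpf_linker__add_file(linker, obj2, NULL);
            if (!err)
                    err = bpf_linker__finalize(linker); /* writes the combined ELF */

            bpf_linker__free(linker);
            return err;
    }

This is essentially the flow behind `bpftool gen object`.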
*sec* can have few different forms: + * - if *sec* is just a plain string (e.g., "abc"), it will match only + * SEC("abc"). If BPF program specifies SEC("abc/whatever") it will result + * in an error; + * - if *sec* is of the form "abc/", proper SEC() form is + * SEC("abc/something"), where acceptable "something" should be checked by + * *prog_init_fn* callback, if there are additional restrictions; + * - if *sec* is of the form "abc+", it will successfully match both + * SEC("abc") and SEC("abc/whatever") forms; + * - if *sec* is NULL, custom handler is registered for any BPF program that + * doesn't match any of the registered (custom or libbpf's own) SEC() + * handlers. There could be only one such generic custom handler registered + * at any given time. + * + * All custom handlers (except the one with *sec* == NULL) are processed + * before libbpf's own SEC() handlers. It is allowed to "override" libbpf's + * SEC() handlers by registering custom ones for the same section prefix + * (i.e., it's possible to have custom SEC("perf_event/LLC-load-misses") + * handler). + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. + */ +LIBBPF_API int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts); +/** + * @brief *libbpf_unregister_prog_handler()* unregisters previously registered + * custom BPF program SEC() handler. + * @param handler_id handler ID returned by *libbpf_register_prog_handler()* + * after successful registration + * @return 0 on success, negative error code if handler isn't found + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. 
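Tying the registration API above together, a hedged sketch of registering a handler for a hypothetical SEC("myprog")/SEC("myprog/...") prefix (the "myprog" names are invented for illustration):

    #include <bpf/libbpf.h>

    /* auto-attach is not supported for this hypothetical section type, so the
     * callback reports "no link" as described above
     */
    static int myprog_attach(const struct bpf_program *prog, long cookie,
                             struct bpf_link **link)
    {
            *link = NULL;
            return 0;
    }

    int register_myprog_handler(void)
    {
            LIBBPF_OPTS(libbpf_prog_handler_opts, opts,
                    .cookie = 42,
                    .prog_attach_fn = myprog_attach,
            );

            /* "myprog+" matches both SEC("myprog") and SEC("myprog/whatever") */
            return libbpf_register_prog_handler("myprog+", BPF_PROG_TYPE_KPROBE,
                                                0 /* no expected attach type */, &opts);
    }

The returned handler ID would later be passed to libbpf_unregister_prog_handler().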
+ */ +LIBBPF_API int libbpf_unregister_prog_handler(int handler_id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LIBBPF_H */ diff --git a/third_party/bpftool/libbpf/src/libbpf.map b/third_party/bpftool/libbpf/src/libbpf.map new file mode 100644 index 0000000..d9ec440 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf.map @@ -0,0 +1,399 @@ +LIBBPF_0.0.1 { + global: + bpf_btf_get_fd_by_id; + bpf_map__btf_key_type_id; + bpf_map__btf_value_type_id; + bpf_map__fd; + bpf_map__name; + bpf_map__pin; + bpf_map__reuse_fd; + bpf_map__set_ifindex; + bpf_map__set_inner_map_fd; + bpf_map__unpin; + bpf_map_delete_elem; + bpf_map_get_fd_by_id; + bpf_map_get_next_id; + bpf_map_get_next_key; + bpf_map_lookup_and_delete_elem; + bpf_map_lookup_elem; + bpf_map_update_elem; + bpf_obj_get; + bpf_obj_get_info_by_fd; + bpf_obj_pin; + bpf_object__btf_fd; + bpf_object__close; + bpf_object__find_map_by_name; + bpf_object__kversion; + bpf_object__load; + bpf_object__name; + bpf_object__open; + bpf_object__pin; + bpf_object__pin_maps; + bpf_object__pin_programs; + bpf_object__unpin_maps; + bpf_object__unpin_programs; + bpf_prog_attach; + bpf_prog_detach; + bpf_prog_detach2; + bpf_prog_get_fd_by_id; + bpf_prog_get_next_id; + bpf_prog_query; + bpf_program__fd; + bpf_program__pin; + bpf_program__set_expected_attach_type; + bpf_program__set_ifindex; + bpf_program__set_type; + bpf_program__unload; + bpf_program__unpin; + bpf_prog_linfo__free; + bpf_prog_linfo__new; + bpf_prog_linfo__lfind_addr_func; + bpf_prog_linfo__lfind; + bpf_raw_tracepoint_open; + bpf_task_fd_query; + btf__fd; + btf__find_by_name; + btf__free; + btf__name_by_offset; + btf__new; + btf__resolve_size; + btf__resolve_type; + btf__type_by_id; + libbpf_attach_type_by_name; + libbpf_get_error; + libbpf_prog_type_by_name; + libbpf_set_print; + libbpf_strerror; + local: + *; +}; + +LIBBPF_0.0.2 { + global: + bpf_map_lookup_elem_flags; + bpf_object__btf; + bpf_object__find_map_fd_by_name; + btf__get_raw_data; + btf_ext__free; + btf_ext__get_raw_data; + btf_ext__new; +} LIBBPF_0.0.1; + +LIBBPF_0.0.3 { + global: + bpf_map__is_internal; + bpf_map_freeze; +} LIBBPF_0.0.2; + +LIBBPF_0.0.4 { + global: + bpf_link__destroy; + bpf_program__attach_kprobe; + bpf_program__attach_perf_event; + bpf_program__attach_raw_tracepoint; + bpf_program__attach_tracepoint; + bpf_program__attach_uprobe; + btf_dump__dump_type; + btf_dump__free; + btf__parse_elf; + libbpf_num_possible_cpus; + perf_buffer__free; + perf_buffer__poll; +} LIBBPF_0.0.3; + +LIBBPF_0.0.5 { + global: + bpf_btf_get_next_id; +} LIBBPF_0.0.4; + +LIBBPF_0.0.6 { + global: + bpf_map__get_pin_path; + bpf_map__is_pinned; + bpf_map__set_pin_path; + bpf_object__open_file; + bpf_object__open_mem; + bpf_program__attach_trace; + bpf_program__get_expected_attach_type; + bpf_program__get_type; + btf__find_by_name_kind; + libbpf_find_vmlinux_btf_id; +} LIBBPF_0.0.5; + +LIBBPF_0.0.7 { + global: + btf_dump__emit_type_decl; + bpf_link__disconnect; + bpf_map__attach_struct_ops; + bpf_map_delete_batch; + bpf_map_lookup_and_delete_batch; + bpf_map_lookup_batch; + bpf_map_update_batch; + bpf_object__find_program_by_name; + bpf_object__attach_skeleton; + bpf_object__destroy_skeleton; + bpf_object__detach_skeleton; + bpf_object__load_skeleton; + bpf_object__open_skeleton; + bpf_program__attach; + bpf_program__name; + btf__align_of; + libbpf_find_kernel_btf; +} LIBBPF_0.0.6; + +LIBBPF_0.0.8 { + global: + bpf_link__fd; + bpf_link__open; + bpf_link__pin; + bpf_link__pin_path; + bpf_link__unpin; + 
bpf_link__update_program; + bpf_link_create; + bpf_link_update; + bpf_map__set_initial_value; + bpf_prog_attach_opts; + bpf_program__attach_cgroup; + bpf_program__attach_lsm; + bpf_program__set_attach_target; +} LIBBPF_0.0.7; + +LIBBPF_0.0.9 { + global: + bpf_enable_stats; + bpf_iter_create; + bpf_link_get_fd_by_id; + bpf_link_get_next_id; + bpf_program__attach_iter; + bpf_program__attach_netns; + perf_buffer__consume; + ring_buffer__add; + ring_buffer__consume; + ring_buffer__free; + ring_buffer__new; + ring_buffer__poll; +} LIBBPF_0.0.8; + +LIBBPF_0.1.0 { + global: + bpf_link__detach; + bpf_link_detach; + bpf_map__ifindex; + bpf_map__key_size; + bpf_map__map_flags; + bpf_map__max_entries; + bpf_map__numa_node; + bpf_map__set_key_size; + bpf_map__set_map_flags; + bpf_map__set_max_entries; + bpf_map__set_numa_node; + bpf_map__set_type; + bpf_map__set_value_size; + bpf_map__type; + bpf_map__value_size; + bpf_program__attach_xdp; + bpf_program__autoload; + bpf_program__set_autoload; + btf__parse; + btf__parse_raw; + btf__pointer_size; + btf__set_fd; + btf__set_pointer_size; +} LIBBPF_0.0.9; + +LIBBPF_0.2.0 { + global: + bpf_prog_bind_map; + bpf_prog_test_run_opts; + bpf_program__attach_freplace; + bpf_program__section_name; + btf__add_array; + btf__add_const; + btf__add_enum; + btf__add_enum_value; + btf__add_datasec; + btf__add_datasec_var_info; + btf__add_field; + btf__add_func; + btf__add_func_param; + btf__add_func_proto; + btf__add_fwd; + btf__add_int; + btf__add_ptr; + btf__add_restrict; + btf__add_str; + btf__add_struct; + btf__add_typedef; + btf__add_union; + btf__add_var; + btf__add_volatile; + btf__endianness; + btf__find_str; + btf__new_empty; + btf__set_endianness; + btf__str_by_offset; + perf_buffer__buffer_cnt; + perf_buffer__buffer_fd; + perf_buffer__epoll_fd; + perf_buffer__consume_buffer; +} LIBBPF_0.1.0; + +LIBBPF_0.3.0 { + global: + btf__base_btf; + btf__parse_elf_split; + btf__parse_raw_split; + btf__parse_split; + btf__new_empty_split; + btf__new_split; + ring_buffer__epoll_fd; +} LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; + btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; + bpf_map__inner_map; + bpf_object__set_kversion; + bpf_tc_attach; + bpf_tc_detach; + bpf_tc_hook_create; + bpf_tc_hook_destroy; + bpf_tc_query; +} LIBBPF_0.3.0; + +LIBBPF_0.5.0 { + global: + bpf_map__initial_value; + bpf_map__pin_path; + bpf_map_lookup_and_delete_elem_flags; + bpf_program__attach_kprobe_opts; + bpf_program__attach_perf_event_opts; + bpf_program__attach_tracepoint_opts; + bpf_program__attach_uprobe_opts; + bpf_object__gen_loader; + btf__load_from_kernel_by_id; + btf__load_from_kernel_by_id_split; + btf__load_into_kernel; + btf__load_module_btf; + btf__load_vmlinux_btf; + btf_dump__dump_type_data; + libbpf_set_strict_mode; +} LIBBPF_0.4.0; + +LIBBPF_0.6.0 { + global: + bpf_map__map_extra; + bpf_map__set_map_extra; + bpf_map_create; + bpf_object__next_map; + bpf_object__next_program; + bpf_object__prev_map; + bpf_object__prev_program; + bpf_prog_load; + bpf_program__flags; + bpf_program__insn_cnt; + bpf_program__insns; + bpf_program__set_flags; + btf__add_btf; + btf__add_decl_tag; + btf__add_type_tag; + btf__dedup; + btf__raw_data; + btf__type_cnt; + btf_dump__new; + libbpf_major_version; + libbpf_minor_version; + libbpf_version_string; + perf_buffer__new; + perf_buffer__new_raw; +} LIBBPF_0.5.0; + +LIBBPF_0.7.0 { + global: + bpf_btf_load; + bpf_program__expected_attach_type; + bpf_program__log_buf; + 
bpf_program__log_level; + bpf_program__set_log_buf; + bpf_program__set_log_level; + bpf_program__type; + bpf_xdp_attach; + bpf_xdp_detach; + bpf_xdp_query; + bpf_xdp_query_id; + btf_ext__raw_data; + libbpf_probe_bpf_helper; + libbpf_probe_bpf_map_type; + libbpf_probe_bpf_prog_type; + libbpf_set_memlock_rlim; +} LIBBPF_0.6.0; + +LIBBPF_0.8.0 { + global: + bpf_map__autocreate; + bpf_map__get_next_key; + bpf_map__delete_elem; + bpf_map__lookup_and_delete_elem; + bpf_map__lookup_elem; + bpf_map__set_autocreate; + bpf_map__update_elem; + bpf_map_delete_elem_flags; + bpf_object__destroy_subskeleton; + bpf_object__open_subskeleton; + bpf_program__attach_kprobe_multi_opts; + bpf_program__attach_trace_opts; + bpf_program__attach_usdt; + bpf_program__set_insns; + libbpf_register_prog_handler; + libbpf_unregister_prog_handler; +} LIBBPF_0.7.0; + +LIBBPF_1.0.0 { + global: + bpf_obj_get_opts; + bpf_prog_query_opts; + bpf_program__attach_ksyscall; + bpf_program__autoattach; + bpf_program__set_autoattach; + btf__add_enum64; + btf__add_enum64_value; + libbpf_bpf_attach_type_str; + libbpf_bpf_link_type_str; + libbpf_bpf_map_type_str; + libbpf_bpf_prog_type_str; + perf_buffer__buffer; +} LIBBPF_0.8.0; + +LIBBPF_1.1.0 { + global: + bpf_btf_get_fd_by_id_opts; + bpf_link_get_fd_by_id_opts; + bpf_map_get_fd_by_id_opts; + bpf_prog_get_fd_by_id_opts; + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; + +LIBBPF_1.2.0 { + global: + bpf_btf_get_info_by_fd; + bpf_link__update_map; + bpf_link_get_info_by_fd; + bpf_map_get_info_by_fd; + bpf_prog_get_info_by_fd; +} LIBBPF_1.1.0; + +LIBBPF_1.3.0 { + global: + bpf_obj_pin_opts; + bpf_program__attach_netfilter; +} LIBBPF_1.2.0; diff --git a/third_party/bpftool/libbpf/src/libbpf.pc.template b/third_party/bpftool/libbpf/src/libbpf.pc.template new file mode 100644 index 0000000..b45ed53 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf.pc.template @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +prefix=@PREFIX@ +libdir=@LIBDIR@ +includedir=${prefix}/include + +Name: libbpf +Description: BPF library +Version: @VERSION@ +Libs: -L${libdir} -lbpf +Requires.private: libelf zlib +Cflags: -I${includedir} diff --git a/third_party/bpftool/libbpf/src/libbpf_common.h b/third_party/bpftool/libbpf/src/libbpf_common.h new file mode 100644 index 0000000..9a7937f --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_common.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common user-facing libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_COMMON_H +#define __LIBBPF_LIBBPF_COMMON_H + +#include +#include "libbpf_version.h" + +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + +#define LIBBPF_DEPRECATED(msg) __attribute__((deprecated(msg))) + +/* Mark a symbol as deprecated when libbpf version is >= {major}.{minor} */ +#define LIBBPF_DEPRECATED_SINCE(major, minor, msg) \ + __LIBBPF_MARK_DEPRECATED_ ## major ## _ ## minor \ + (LIBBPF_DEPRECATED("libbpf v" # major "." # minor "+: " msg)) + +#define __LIBBPF_CURRENT_VERSION_GEQ(major, minor) \ + (LIBBPF_MAJOR_VERSION > (major) || \ + (LIBBPF_MAJOR_VERSION == (major) && LIBBPF_MINOR_VERSION >= (minor))) + +/* Add checks for other versions below when planning deprecation of API symbols + * with the LIBBPF_DEPRECATED_SINCE macro. 
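To make the deprecation machinery just below concrete: a hypothetical declaration (bpf_old_thing/bpf_new_thing do not exist in libbpf) would be annotated roughly like this, producing a compiler warning for callers once the library version reaches 1.0:

    /* hypothetical API, flagged as deprecated starting with libbpf v1.0 */
    LIBBPF_DEPRECATED_SINCE(1, 0, "use bpf_new_thing() instead")
    LIBBPF_API int bpf_old_thing(int fd);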
+ */ +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X +#else +#define __LIBBPF_MARK_DEPRECATED_1_0(X) +#endif + +/* This set of internal macros allows to do "function overloading" based on + * number of arguments provided by used in backwards-compatible way during the + * transition to libbpf 1.0 + * It's ugly but necessary evil that will be cleaned up when we get to 1.0. + * See bpf_prog_load() overload for example. + */ +#define ___libbpf_cat(A, B) A ## B +#define ___libbpf_select(NAME, NUM) ___libbpf_cat(NAME, NUM) +#define ___libbpf_nth(_1, _2, _3, _4, _5, _6, N, ...) N +#define ___libbpf_cnt(...) ___libbpf_nth(__VA_ARGS__, 6, 5, 4, 3, 2, 1) +#define ___libbpf_overload(NAME, ...) ___libbpf_select(NAME, ___libbpf_cnt(__VA_ARGS__))(__VA_ARGS__) + +/* Helper macro to declare and initialize libbpf options struct + * + * This dance with uninitialized declaration, followed by memset to zero, + * followed by assignment using compound literal syntax is done to preserve + * ability to use a nice struct field initialization syntax and **hopefully** + * have all the padding bytes initialized to zero. It's not guaranteed though, + * when copying literal, that compiler won't copy garbage in literal's padding + * bytes, but that's the best way I've found and it seems to work in practice. + * + * Macro declares opts struct of given type and name, zero-initializes, + * including any extra padding, it with memset() and then assigns initial + * values provided by users in struct initializer-syntax as varargs. + */ +#define LIBBPF_OPTS(TYPE, NAME, ...) \ + struct TYPE NAME = ({ \ + memset(&NAME, 0, sizeof(struct TYPE)); \ + (struct TYPE) { \ + .sz = sizeof(struct TYPE), \ + __VA_ARGS__ \ + }; \ + }) + +#endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/third_party/bpftool/libbpf/src/libbpf_errno.c b/third_party/bpftool/libbpf/src/libbpf_errno.c new file mode 100644 index 0000000..6b18017 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_errno.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. 
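As an illustration of the LIBBPF_OPTS() convenience macro defined above, here is how a caller might build a perf_buffer_opts (declared earlier in libbpf.h) without touching .sz by hand; the wrapper function is a placeholder:

    #include <bpf/libbpf.h>

    struct perf_buffer *open_pb(int map_fd, perf_buffer_sample_fn sample_cb, void *ctx)
    {
            /* .sz is filled in by the macro; unset fields stay zero */
            LIBBPF_OPTS(perf_buffer_opts, opts,
                    .sample_period = 8, /* wake the consumer up every 8 samples */
            );

            return perf_buffer__new(map_fd, 64, sample_cb, NULL, ctx, &opts);
    }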
+ */ + +#undef _GNU_SOURCE +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START) +#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c) +#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START) + +static const char *libbpf_strerror_table[NR_ERRNO] = { + [ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf", + [ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid", + [ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost", + [ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch", + [ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf", + [ERRCODE_OFFSET(RELOC)] = "Relocation failed", + [ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading", + [ERRCODE_OFFSET(PROG2BIG)] = "Program too big", + [ERRCODE_OFFSET(KVER)] = "Incorrect kernel version", + [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", + [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", + [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", +}; + +int libbpf_strerror(int err, char *buf, size_t size) +{ + int ret; + + if (!buf || !size) + return libbpf_err(-EINVAL); + + err = err > 0 ? err : -err; + + if (err < __LIBBPF_ERRNO__START) { + ret = strerror_r(err, buf, size); + buf[size - 1] = '\0'; + return libbpf_err_errno(ret); + } + + if (err < __LIBBPF_ERRNO__END) { + const char *msg; + + msg = libbpf_strerror_table[ERRNO_OFFSET(err)]; + ret = snprintf(buf, size, "%s", msg); + buf[size - 1] = '\0'; + /* The length of the buf and msg is positive. + * A negative number may be returned only when the + * size exceeds INT_MAX. Not likely to appear. + */ + if (ret >= size) + return libbpf_err(-ERANGE); + return 0; + } + + ret = snprintf(buf, size, "Unknown libbpf error %d", err); + buf[size - 1] = '\0'; + if (ret >= size) + return libbpf_err(-ERANGE); + return libbpf_err(-ENOENT); +} diff --git a/third_party/bpftool/libbpf/src/libbpf_internal.h b/third_party/bpftool/libbpf/src/libbpf_internal.h new file mode 100644 index 0000000..e4d0566 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_internal.h @@ -0,0 +1,580 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Internal libbpf helpers. 
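A small usage sketch for libbpf_strerror() as implemented above — it accepts both positive and negative error codes and falls back to strerror_r() for plain system errors (the reporting helper is a placeholder):

    #include <stdio.h>
    #include <bpf/libbpf.h>

    void report_error(const char *what, int err)
    {
            char msg[128];

            libbpf_strerror(err, msg, sizeof(msg));
            fprintf(stderr, "%s failed: %s (err=%d)\n", what, msg, err);
    }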
+ * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_INTERNAL_H +#define __LIBBPF_LIBBPF_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include "relo_core.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +#include "libbpf.h" +#include "btf.h" + +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_ABS64 +#define R_BPF_64_ABS64 2 +#endif +#ifndef R_BPF_64_ABS32 +#define R_BPF_64_ABS32 3 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif + +/* Older libelf all end up in this expression, for both 32 and 64 bit */ +#ifndef ELF64_ST_VISIBILITY +#define ELF64_ST_VISIBILITY(o) ((o) & 0x03) +#endif + +#define BTF_INFO_ENC(kind, kind_flag, vlen) \ + ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) +#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) +#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \ + ((encoding) << 24 | (bits_offset) << 16 | (nr_bits)) +#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ + BTF_INT_ENC(encoding, bits_offset, bits) +#define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) +#define BTF_PARAM_ENC(name, type) (name), (type) +#define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) +#define BTF_TYPE_DECL_TAG_ENC(value, type, component_idx) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_DECL_TAG, 0, 0), type), (component_idx) +#define BTF_TYPE_TYPE_TAG_ENC(value, type) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_TYPE_TAG, 0, 0), type) + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef min +# define min(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef max +# define max(x, y) ((x) < (y) ? (y) : (x)) +#endif +#ifndef offsetofend +# define offsetofend(TYPE, FIELD) \ + (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) +#endif +#ifndef __alias +#define __alias(symbol) __attribute__((alias(#symbol))) +#endif + +/* Check whether a string `str` has prefix `pfx`, regardless if `pfx` is + * a string literal known at compilation time or char * pointer known only at + * runtime. + */ +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) + +/* suffix check */ +static inline bool str_has_sfx(const char *str, const char *sfx) +{ + size_t str_len = strlen(str); + size_t sfx_len = strlen(sfx); + + if (sfx_len > str_len) + return false; + return strcmp(str + str_len - sfx_len, sfx) == 0; +} + +/* Symbol versioning is different between static and shared library. + * Properly versioned symbols are needed for shared library, but + * only the symbol of the new version is needed for static library. + * Starting with GNU C 10, use symver attribute instead of .symver assembler + * directive, which works better with GCC LTO builds. 
+ */ +#if defined(SHARED) && defined(__GNUC__) && __GNUC__ >= 10 + +#define DEFAULT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@@" #version))) +#define COMPAT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@" #version))) + +#elif defined(SHARED) + +#define COMPAT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@" #version); +#define DEFAULT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@@" #version); + +#else /* !SHARED */ + +#define COMPAT_VERSION(internal_name, api_name, version) +#define DEFAULT_VERSION(internal_name, api_name, version) \ + extern typeof(internal_name) api_name \ + __attribute__((alias(#internal_name))); + +#endif + +extern void libbpf_print(enum libbpf_print_level level, + const char *format, ...) + __attribute__((format(printf, 2, 3))); + +#define __pr(level, fmt, ...) \ +do { \ + libbpf_print(level, "libbpf: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_warn(fmt, ...) __pr(LIBBPF_WARN, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +struct bpf_link { + int (*detach)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); + char *pin_path; /* NULL, if not pinned */ + int fd; /* hook FD, -1 if not applicable */ + bool disconnected; +}; + +/* + * Re-implement glibc's reallocarray() for libbpf internal-only use. + * reallocarray(), unfortunately, is not available in all versions of glibc, + * so requires extra feature detection and using reallocarray() stub from + * and COMPAT_NEED_REALLOCARRAY. All this complicates + * build of libbpf unnecessarily and is just a maintenance burden. Instead, + * it's trivial to implement libbpf-specific internal version and use it + * throughout libbpf. + */ +static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) +{ + size_t total; + +#if __has_builtin(__builtin_mul_overflow) + if (unlikely(__builtin_mul_overflow(nmemb, size, &total))) + return NULL; +#else + if (size == 0 || nmemb > ULONG_MAX / size) + return NULL; + total = nmemb * size; +#endif + return realloc(ptr, total); +} + +/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst + * is zero-terminated string no matter what (unless sz == 0, in which case + * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs + * in what is returned. Given this is internal helper, it's trivial to extend + * this, when necessary. Use this instead of strncpy inside libbpf source code. 
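For illustration, the prefix/suffix helpers defined above in use (the wrapper functions are hypothetical):

    static bool is_kprobe_sec(const char *sec_name)
    {
            return str_has_pfx(sec_name, "kprobe/"); /* e.g. SEC("kprobe/do_unlinkat") */
    }

    static bool is_bpf_object_file(const char *path)
    {
            return str_has_sfx(path, ".bpf.o");
    }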
+ */ +static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) +{ + size_t i; + + if (sz == 0) + return; + + sz--; + for (i = 0; i < sz && src[i]; i++) + dst[i] = src[i]; + dst[i] = '\0'; +} + +__u32 get_kernel_version(void); + +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + MAP_DEF_MAP_EXTRA = 0x400, + + MAP_DEF_ALL = 0x7ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; + __u64 map_extra; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); + +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); + +static inline bool libbpf_is_mem_zeroed(const char *p, ssize_t len) +{ + while (len > 0) { + if (*p) + return false; + p++; + len--; + } + return true; +} + +static inline bool libbpf_validate_opts(const char *opts, + size_t opts_sz, size_t user_sz, + const char *type_name) +{ + if (user_sz < sizeof(size_t)) { + pr_warn("%s size (%zu) is too small\n", type_name, user_sz); + return false; + } + if (!libbpf_is_mem_zeroed(opts + opts_sz, (ssize_t)user_sz - opts_sz)) { + pr_warn("%s has non-zero extra bytes\n", type_name); + return false; + } + return true; +} + +#define OPTS_VALID(opts, type) \ + (!(opts) || libbpf_validate_opts((const char *)opts, \ + offsetofend(struct type, \ + type##__last_field), \ + (opts)->sz, #type)) +#define OPTS_HAS(opts, field) \ + ((opts) && opts->sz >= offsetofend(typeof(*(opts)), field)) +#define OPTS_GET(opts, field, fallback_value) \ + (OPTS_HAS(opts, field) ? (opts)->field : fallback_value) +#define OPTS_SET(opts, field, value) \ + do { \ + if (OPTS_HAS(opts, field)) \ + (opts)->field = value; \ + } while (0) + +#define OPTS_ZEROED(opts, last_nonzero_field) \ +({ \ + ssize_t __off = offsetofend(typeof(*(opts)), last_nonzero_field); \ + !(opts) || libbpf_is_mem_zeroed((const void *)opts + __off, \ + (opts)->sz - __off); \ +}) + +enum kern_feature_id { + /* v4.14: kernel support for program & map names. */ + FEAT_PROG_NAME, + /* v5.2: kernel support for global data sections. 
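A hedged sketch of the opts-validation pattern the OPTS_* macros above implement, using a hypothetical bpf_example_opts/example_api pair (neither exists in libbpf):

    struct bpf_example_opts {
            size_t sz;   /* must come first; set by the caller or by LIBBPF_OPTS */
            __u32 flags;
            size_t :0;
    };
    #define bpf_example_opts__last_field flags

    int example_api(int fd, const struct bpf_example_opts *opts)
    {
            __u32 flags;

            /* reject opts structs with non-zero bytes past the fields we know about */
            if (!OPTS_VALID(opts, bpf_example_opts))
                    return -EINVAL;

            /* falls back to 0 if the caller's struct predates the flags field */
            flags = OPTS_GET(opts, flags, 0);
            if (flags != 0)
                    return -EINVAL; /* no flags defined yet */

            (void)fd;
            return 0;
    }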
*/ + FEAT_GLOBAL_DATA, + /* BTF support */ + FEAT_BTF, + /* BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO support */ + FEAT_BTF_FUNC, + /* BTF_KIND_VAR and BTF_KIND_DATASEC support */ + FEAT_BTF_DATASEC, + /* BTF_FUNC_GLOBAL is supported */ + FEAT_BTF_GLOBAL_FUNC, + /* BPF_F_MMAPABLE is supported for arrays */ + FEAT_ARRAY_MMAP, + /* kernel support for expected_attach_type in BPF_PROG_LOAD */ + FEAT_EXP_ATTACH_TYPE, + /* bpf_probe_read_{kernel,user}[_str] helpers */ + FEAT_PROBE_READ_KERN, + /* BPF_PROG_BIND_MAP is supported */ + FEAT_PROG_BIND_MAP, + /* Kernel support for module BTFs */ + FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, + /* BPF perf link support */ + FEAT_PERF_LINK, + /* BTF_KIND_DECL_TAG support */ + FEAT_BTF_DECL_TAG, + /* BTF_KIND_TYPE_TAG support */ + FEAT_BTF_TYPE_TAG, + /* memcg-based accounting for BPF maps and progs */ + FEAT_MEMCG_ACCOUNT, + /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ + FEAT_BPF_COOKIE, + /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ + FEAT_BTF_ENUM64, + /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ + FEAT_SYSCALL_WRAPPER, + __FEAT_CNT, +}; + +int probe_memcg_account(void); +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id); +int bump_rlimit_memlock(void); + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len); +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level); + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind); + +struct btf_ext_info { + /* + * info points to the individual info section (e.g. func_info and + * line_info) from the .BTF.ext. It does not include the __u32 rec_size. + */ + void *info; + __u32 rec_size; + __u32 len; + /* optional (maintained internally by libbpf) mapping between .BTF.ext + * section and corresponding ELF section. This is used to join + * information like CO-RE relocation records with corresponding BPF + * programs defined in ELF sections + */ + __u32 *sec_idxs; + int sec_cnt; +}; + +#define for_each_btf_ext_sec(seg, sec) \ + for (sec = (seg)->info; \ + (void *)sec < (seg)->info + (seg)->len; \ + sec = (void *)sec + sizeof(struct btf_ext_info_sec) + \ + (seg)->rec_size * sec->num_info) + +#define for_each_btf_ext_rec(seg, sec, i, rec) \ + for (i = 0, rec = (void *)&(sec)->data; \ + i < (sec)->num_info; \ + i++, rec = (void *)rec + (seg)->rec_size) + +/* + * The .BTF.ext ELF section layout defined as + * struct btf_ext_header + * func_info subsection + * + * The func_info subsection layout: + * record size for struct bpf_func_info in the func_info subsection + * struct btf_sec_func_info for section #1 + * a list of bpf_func_info records for section #1 + * where struct bpf_func_info mimics one in include/uapi/linux/bpf.h + * but may not be identical + * struct btf_sec_func_info for section #2 + * a list of bpf_func_info records for section #2 + * ...... + * + * Note that the bpf_func_info record size in .BTF.ext may not + * be the same as the one defined in include/uapi/linux/bpf.h. + * The loader should ensure that record_size meets minimum + * requirement and pass the record as is to the kernel. The + * kernel will handle the func_info properly based on its contents. 
+ */ +struct btf_ext_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; + + /* optional part of .BTF.ext header */ + __u32 core_relo_off; + __u32 core_relo_len; +}; + +struct btf_ext { + union { + struct btf_ext_header *hdr; + void *data; + }; + struct btf_ext_info func_info; + struct btf_ext_info line_info; + struct btf_ext_info core_relo_info; + __u32 data_size; +}; + +struct btf_ext_info_sec { + __u32 sec_name_off; + __u32 num_info; + /* Followed by num_info * record_size number of bytes */ + __u8 data[]; +}; + +/* The minimum bpf_func_info checked by the loader */ +struct bpf_func_info_min { + __u32 insn_off; + __u32 type_id; +}; + +/* The minimum bpf_line_info checked by the loader */ +struct bpf_line_info_min { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + + +typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind); + +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); + +/* handle direct returned errors */ +static inline int libbpf_err(int ret) +{ + if (ret < 0) + errno = -ret; + return ret; +} + +/* handle errno-based (e.g., syscall or libc) errors according to libbpf's + * strict mode settings + */ +static inline int libbpf_err_errno(int ret) +{ + /* errno is already assumed to be set on error */ + return ret < 0 ? -errno : ret; +} + +/* handle error for pointer-returning APIs, err is assumed to be < 0 always */ +static inline void *libbpf_err_ptr(int err) +{ + /* set errno on error, this doesn't break anything */ + errno = -err; + return NULL; +} + +/* handle pointer-returning APIs' error handling */ +static inline void *libbpf_ptr(void *ret) +{ + /* set errno on error, this doesn't break anything */ + if (IS_ERR(ret)) + errno = -PTR_ERR(ret); + + return IS_ERR(ret) ? NULL : ret; +} + +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +/* if fd is stdin, stdout, or stderr, dup to a fd greater than 2 + * Takes ownership of the fd passed in, and closes it if calling + * fcntl(fd, F_DUPFD_CLOEXEC, 3). 
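The for_each_btf_ext_sec()/for_each_btf_ext_rec() iterators and the structs above are how .BTF.ext data gets walked; a hedged sketch of that pattern (dump_func_info is a hypothetical helper, reading each record through the minimal bpf_func_info_min view):

    static void dump_func_info(const struct btf *btf, struct btf_ext *btf_ext)
    {
            struct btf_ext_info *seg = &btf_ext->func_info;
            struct btf_ext_info_sec *sec;
            struct bpf_func_info_min *rec;
            __u32 i;

            for_each_btf_ext_sec(seg, sec) {
                    const char *sec_name = btf__name_by_offset(btf, sec->sec_name_off);

                    for_each_btf_ext_rec(seg, sec, i, rec)
                            pr_debug("%s: insn_off %u -> BTF type %u\n",
                                     sec_name, rec->insn_off, rec->type_id);
            }
    }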
+ */ +static inline int ensure_good_fd(int fd) +{ + int old_fd = fd, saved_errno; + + if (fd < 0) + return fd; + if (fd < 3) { + fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + saved_errno = errno; + close(old_fd); + errno = saved_errno; + if (fd < 0) { + pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); + errno = saved_errno; + } + } + return fd; +} + +/* The following two functions are exposed to bpftool */ +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands); +void bpf_core_free_cands(struct bpf_core_cand_list *cands); + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj); +void usdt_manager_free(struct usdt_manager *man); +struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, + const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie); + +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +#define PROG_LOAD_ATTEMPTS 5 +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); + +#endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/third_party/bpftool/libbpf/src/libbpf_legacy.h b/third_party/bpftool/libbpf/src/libbpf_legacy.h new file mode 100644 index 0000000..1e1be46 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_legacy.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Libbpf legacy APIs (either discouraged or deprecated, as mentioned in [0]) + * + * [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY + * + * Copyright (C) 2021 Facebook + */ +#ifndef __LIBBPF_LEGACY_BPF_H +#define __LIBBPF_LEGACY_BPF_H + +#include +#include +#include +#include +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_struct_mode have + * no effect. But they are left in libbpf_legacy.h so that applications that + * prepared for libbpf 1.0 before final release by using + * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes. + */ +enum libbpf_strict_mode { + /* Turn on all supported strict features of libbpf to simulate libbpf + * v1.0 behavior. + * This will be the default behavior in libbpf v1.0. + */ + LIBBPF_STRICT_ALL = 0xffffffff, + + /* + * Disable any libbpf 1.0 behaviors. This is the default before libbpf + * v1.0. It won't be supported anymore in v1.0, please update your + * code so that it handles LIBBPF_STRICT_ALL mode before libbpf v1.0. + */ + LIBBPF_STRICT_NONE = 0x00, + /* + * Return NULL pointers on error, not ERR_PTR(err). + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_CLEAN_PTRS = 0x01, + /* + * Return actual error codes from low-level APIs directly, not just -1. + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_DIRECT_ERRS = 0x02, + /* + * Enforce strict BPF program section (SEC()) names. + * E.g., while prefiously SEC("xdp_whatever") or SEC("perf_event_blah") were + * allowed, with LIBBPF_STRICT_SEC_PREFIX this will become + * unrecognized by libbpf and would have to be just SEC("xdp") and + * SEC("xdp") and SEC("perf_event"). + * + * Note, in this mode the program pin path will be based on the + * function name instead of section name. 
+ * + * Additionally, routines in the .text section are always considered + * sub-programs. Legacy behavior allows for a single routine in .text + * to be a program. + */ + LIBBPF_STRICT_SEC_NAME = 0x04, + /* + * Disable the global 'bpf_objects_list'. Maintaining this list adds + * a race condition to bpf_object__open() and bpf_object__close(). + * Clients can maintain it on their own if it is valuable for them. + */ + LIBBPF_STRICT_NO_OBJECT_LIST = 0x08, + /* + * Automatically bump RLIMIT_MEMLOCK using setrlimit() before the + * first BPF program or map creation operation. This is done only if + * kernel is too old to support memcg-based memory accounting for BPF + * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, + * but it can be overriden with libbpf_set_memlock_rlim() API. + * Note that libbpf_set_memlock_rlim() needs to be called before + * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() + * operation. + */ + LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK = 0x10, + /* + * Error out on any SEC("maps") map definition, which are deprecated + * in favor of BTF-defined map definitions in SEC(".maps"). + */ + LIBBPF_STRICT_MAP_DEFINITIONS = 0x20, + + __LIBBPF_STRICT_LAST, +}; + +LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); + +/** + * @brief **libbpf_get_error()** extracts the error code from the passed + * pointer + * @param ptr pointer returned from libbpf API function + * @return error code; or 0 if no error occured + * + * Note, as of libbpf 1.0 this function is not necessary and not recommended + * to be used. Libbpf doesn't return error code embedded into the pointer + * itself. Instead, NULL is returned on error and error code is passed through + * thread-local errno variable. **libbpf_get_error()** is just returning -errno + * value if it receives NULL, which is correct only if errno hasn't been + * modified between libbpf API call and corresponding **libbpf_get_error()** + * call. Prefer to check return for NULL and use errno directly. + * + * This API is left in libbpf 1.0 to allow applications that were 1.0-ready + * before final libbpf 1.0 without needing to change them. + */ +LIBBPF_API long libbpf_get_error(const void *ptr); + +#define DECLARE_LIBBPF_OPTS LIBBPF_OPTS + +/* "Discouraged" APIs which don't follow consistent libbpf naming patterns. + * They are normally a trivial aliases or wrappers for proper APIs and are + * left to minimize unnecessary disruption for users of libbpf. But they + * shouldn't be used going forward. + */ + +struct bpf_program; +struct bpf_map; +struct btf; +struct btf_ext; + +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); +LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); +LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); +LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LEGACY_BPF_H */ diff --git a/third_party/bpftool/libbpf/src/libbpf_probes.c b/third_party/bpftool/libbpf/src/libbpf_probes.c new file mode 100644 index 0000000..9c4db90 --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_probes.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. 
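To make the note on libbpf_get_error() above concrete, a sketch contrasting the libbpf 1.0 NULL-plus-errno convention with the legacy pattern (bpf_object__open_file() is part of the public API exported in libbpf.map; the wrapper is a placeholder):

    #include <errno.h>
    #include <stdio.h>
    #include <bpf/libbpf.h>

    struct bpf_object *open_obj(const char *path)
    {
            struct bpf_object *obj = bpf_object__open_file(path, NULL);

            if (!obj) {
                    /* libbpf 1.0+: NULL is returned and errno holds the error */
                    fprintf(stderr, "failed to open %s: %d\n", path, -errno);
                    return NULL;
            }

            /* legacy equivalent, still accepted but discouraged:
             *     long err = libbpf_get_error(obj);
             *     if (err) { ... }
             */
            return obj;
    }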
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, + * but Ubuntu provides /proc/version_signature file, as described at + * https://ubuntu.com/kernel, with an example contents below, which we + * can use to get a proper LINUX_VERSION_CODE. + * + * Ubuntu 5.4.0-12.15-generic 5.4.8 + * + * In the above, 5.4.8 is what kernel is actually expecting, while + * uname() call will return 5.4.0 in info.release. + */ +static __u32 get_ubuntu_kernel_version(void) +{ + const char *ubuntu_kver_file = "/proc/version_signature"; + __u32 major, minor, patch; + int ret; + FILE *f; + + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) != 0) + return 0; + + f = fopen(ubuntu_kver_file, "re"); + if (!f) + return 0; + + ret = fscanf(f, "%*s %*s %u.%u.%u\n", &major, &minor, &patch); + fclose(f); + if (ret != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +/* On Debian LINUX_VERSION_CODE doesn't correspond to info.release. + * Instead, it is provided in info.version. An example content of + * Debian 10 looks like the below. + * + * utsname::release 4.19.0-22-amd64 + * utsname::version #1 SMP Debian 4.19.260-1 (2022-09-29) + * + * In the above, 4.19.260 is what kernel is actually expecting, while + * uname() call will return 4.19.0 in info.release. + */ +static __u32 get_debian_kernel_version(struct utsname *info) +{ + __u32 major, minor, patch; + char *p; + + p = strstr(info->version, "Debian "); + if (!p) { + /* This is not a Debian kernel. */ + return 0; + } + + if (sscanf(p, "Debian %u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +__u32 get_kernel_version(void) +{ + __u32 major, minor, patch, version; + struct utsname info; + + /* Check if this is an Ubuntu kernel. */ + version = get_ubuntu_kernel_version(); + if (version != 0) + return version; + + uname(&info); + + /* Check if this is a Debian kernel. */ + version = get_debian_kernel_version(&info); + if (version != 0) + return version; + + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +static int probe_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, size_t insns_cnt, + char *log_buf, size_t log_buf_sz) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = log_buf_sz, + .log_level = log_buf ? 
1 : 0, + ); + int fd, err, exp_err = 0; + const char *exp_msg = NULL; + char buf[4096]; + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + opts.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + break; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + opts.expected_attach_type = BPF_CGROUP_GETSOCKOPT; + break; + case BPF_PROG_TYPE_SK_LOOKUP: + opts.expected_attach_type = BPF_SK_LOOKUP; + break; + case BPF_PROG_TYPE_KPROBE: + opts.kern_version = get_kernel_version(); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + opts.expected_attach_type = BPF_LIRC_MODE2; + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + if (prog_type == BPF_PROG_TYPE_TRACING) + opts.expected_attach_type = BPF_TRACE_FENTRY; + else + opts.expected_attach_type = BPF_MODIFY_RETURN; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "attach_btf_id 1 is not a function"; + break; + case BPF_PROG_TYPE_EXT: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "Cannot replace kernel functions"; + break; + case BPF_PROG_TYPE_SYSCALL: + opts.prog_flags = BPF_F_SLEEPABLE; + break; + case BPF_PROG_TYPE_STRUCT_OPS: + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_PROG_TYPE_UNSPEC: + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + break; + case BPF_PROG_TYPE_NETFILTER: + opts.expected_attach_type = BPF_NETFILTER; + break; + default: + return -EOPNOTSUPP; + } + + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); + err = -errno; + if (fd >= 0) + close(fd); + if (exp_err) { + if (fd >= 0 || err != exp_err) + return 0; + if (exp_msg && !strstr(buf, exp_msg)) + return 0; + return 1; + } + return fd >= 0 ? 
1 : 0; +} + +int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); + return libbpf_err(ret); +} + +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len) +{ + struct btf_header hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_len = types_len, + .str_off = types_len, + .str_len = str_len, + }; + int btf_fd, btf_len; + __u8 *raw_btf; + + btf_len = hdr.hdr_len + hdr.type_len + hdr.str_len; + raw_btf = malloc(btf_len); + if (!raw_btf) + return -ENOMEM; + + memcpy(raw_btf, &hdr, sizeof(hdr)); + memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); + memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); + + btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); + + free(raw_btf); + return btf_fd; +} + +static int load_local_storage_btf(void) +{ + const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l"; + /* struct bpf_spin_lock { + * int val; + * }; + * struct val { + * int cnt; + * struct bpf_spin_lock l; + * }; + */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* struct bpf_spin_lock */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), + BTF_MEMBER_ENC(15, 1, 0), /* int val; */ + /* struct val */ /* [3] */ + BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8), + BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */ + BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */ + }; + + return libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs)); +} + +static int probe_map_create(enum bpf_map_type map_type) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int key_size, value_size, max_entries; + __u32 btf_key_type_id = 0, btf_value_type_id = 0; + int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err = 0; + + key_size = sizeof(__u32); + value_size = sizeof(__u32); + max_entries = 1; + + switch (map_type) { + case BPF_MAP_TYPE_STACK_TRACE: + value_size = sizeof(__u64); + break; + case BPF_MAP_TYPE_LPM_TRIE: + key_size = sizeof(__u64); + value_size = sizeof(__u64); + opts.map_flags = BPF_F_NO_PREALLOC; + break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + key_size = sizeof(struct bpf_cgroup_storage_key); + value_size = sizeof(__u64); + max_entries = 0; + break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + key_size = 0; + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + btf_key_type_id = 1; + btf_value_type_id = 3; + value_size = 8; + max_entries = 0; + opts.map_flags = BPF_F_NO_PREALLOC; + btf_fd = load_local_storage_btf(); + if (btf_fd < 0) + return btf_fd; + break; + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + key_size = 0; + value_size = 0; + max_entries = sysconf(_SC_PAGE_SIZE); + break; + case BPF_MAP_TYPE_STRUCT_OPS: + /* we'll get -ENOTSUPP for invalid BTF type ID for struct_ops */ + opts.btf_vmlinux_value_type_id = 1; + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_MAP_TYPE_BLOOM_FILTER: + key_size = 0; + max_entries = 1; + break; + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case 
BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + break; + case BPF_MAP_TYPE_UNSPEC: + default: + return -EOPNOTSUPP; + } + + if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, + sizeof(__u32), sizeof(__u32), 1, NULL); + if (fd_inner < 0) + goto cleanup; + + opts.inner_map_fd = fd_inner; + } + + if (btf_fd >= 0) { + opts.btf_fd = btf_fd; + opts.btf_key_type_id = btf_key_type_id; + opts.btf_value_type_id = btf_value_type_id; + } + + fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); + err = -errno; + +cleanup: + if (fd >= 0) + close(fd); + if (fd_inner >= 0) + close(fd_inner); + if (btf_fd >= 0) + close(btf_fd); + + if (exp_err) + return fd < 0 && err == exp_err ? 1 : 0; + else + return fd >= 0 ? 1 : 0; +} + +int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) +{ + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_map_create(map_type); + return libbpf_err(ret); +} + +int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, + const void *opts) +{ + struct bpf_insn insns[] = { + BPF_EMIT_CALL((__u32)helper_id), + BPF_EXIT_INSN(), + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + char buf[4096]; + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + /* we can't successfully load all prog types to check for BPF helper + * support, so bail out with -EOPNOTSUPP error + */ + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + return -EOPNOTSUPP; + default: + break; + } + + buf[0] = '\0'; + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); + if (ret < 0) + return libbpf_err(ret); + + /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) + * at all, it will emit something like "invalid func unknown#181". + * If BPF verifier recognizes BPF helper but it's not supported for + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * In both cases, provided combination of BPF program type and BPF + * helper is not supported by the kernel. + * In all other cases, probe_prog_load() above will either succeed (e.g., + * because BPF helper happens to accept no input arguments or it + * accepts one input argument and initial PTR_TO_CTX is fine for + * that), or we'll get some more specific BPF verifier error about + * some unsatisfied conditions. 
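A user-side sketch of how these probe APIs are typically consumed (1 means supported, 0 not supported, a negative value means the probe itself failed; the reporting function is a placeholder):

    #include <stdio.h>
    #include <bpf/libbpf.h>

    void report_kernel_features(void)
    {
            if (libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_XDP, NULL) == 1)
                    printf("XDP programs: supported\n");

            if (libbpf_probe_bpf_map_type(BPF_MAP_TYPE_RINGBUF, NULL) == 1)
                    printf("BPF ring buffer maps: supported\n");

            switch (libbpf_probe_bpf_helper(BPF_PROG_TYPE_XDP, BPF_FUNC_redirect_map, NULL)) {
            case 1:
                    printf("bpf_redirect_map() usable from XDP\n");
                    break;
            case 0:
                    printf("bpf_redirect_map() not available for XDP\n");
                    break;
            default:
                    printf("helper probe failed\n");
            }
    }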
+ */ + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + return 0; + return 1; /* assume supported */ +} diff --git a/third_party/bpftool/libbpf/src/libbpf_version.h b/third_party/bpftool/libbpf/src/libbpf_version.h new file mode 100644 index 0000000..290411d --- /dev/null +++ b/third_party/bpftool/libbpf/src/libbpf_version.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2021 Facebook */ +#ifndef __LIBBPF_VERSION_H +#define __LIBBPF_VERSION_H + +#define LIBBPF_MAJOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 3 + +#endif /* __LIBBPF_VERSION_H */ diff --git a/third_party/bpftool/libbpf/src/linker.c b/third_party/bpftool/libbpf/src/linker.c new file mode 100644 index 0000000..5ced96d --- /dev/null +++ b/third_party/bpftool/libbpf/src/linker.c @@ -0,0 +1,2905 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +#define BTF_EXTERN_SEC ".extern" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is omitted from the final ELF file */ + bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. 
*/ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; +}; + +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; + + /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; +}; + +#define pr_warn_elf(fmt, ...) 
\ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); + } + free(linker->secs); + + free(linker->glob_syms); + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return errno = EINVAL, NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return errno = EINVAL, NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return errno = ENOMEM, NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return errno = -err, NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? 
linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr) { + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = 
elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts) +{ + struct src_obj obj = {}; + int err = 0; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = err ?: linker_load_obj_file(linker, filename, opts, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); + + /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return libbpf_err(err); +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? 
obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY | O_CLOEXEC); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 
0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); + + return err; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec; + int i, err; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; + break; + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case SHT_NOBITS: + break; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); + return -EINVAL; + } + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym 
#%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); + + return -EINVAL; + } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } + + return 0; +} + +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel -> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32 && + sym_type != R_BPF_64_ABS64 && sym_type != R_BPF_64_ABS32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + + return 0; +} + +static int 
check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id >= btf__type_cnt(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) + return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; + + /* ephemeral sections are just thin section shells lacking most parts */ + if (src_sec->ephemeral) + return 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -ENOMEM; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->ephemeral || src->ephemeral) + return true; + + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, 
src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align, src_align; + size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + /* If dst_align_sz == 0, realloc() behaves in a special way: + * 1. When dst->raw_data is NULL it returns: + * "either NULL or a pointer suitable to be passed to free()" [1]. + * 2. When dst->raw_data is not-NULL it frees dst->raw_data and returns NULL, + * thus invalidating any "pointer suitable to be passed to free()" obtained + * at step (1). + * + * The dst_align_sz > 0 check avoids error exit after (2), otherwise + * dst->raw_data would be freed again in bpf_linker__free(). + * + * [1] man 3 realloc + */ + if (!tmp && dst_align_sz > 0) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped || sec->ephemeral) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, 
"license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; + int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; + } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? 
"union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible 
number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() make this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "key type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } + + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf, 
t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n, m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) + continue; + + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) + continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + 
return -EINVAL; + } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; + } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } + + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} + +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for extern in one file has less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different. 
+ */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= ~0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. 
+ */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } + + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); + + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; + } + + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. 
.extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("sections %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx, dst_sym_idx, sym_type; + Elf64_Sym *src_sym; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. 
Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec *sec; + int i, j, n, m; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. 
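+ * Any extern left unresolved keeps its extern VAR/FUNC linkage in the
+ * output BTF, so a later bpf_linker run (or libbpf itself) can still
+ * resolve it.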
+ */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + const char *name; + + if (!obj->btf) + return 0; + + start_id = btf__type_cnt(linker->btf); + n = btf__type_cnt(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i < n; i++) { + struct glob_sym *glob_sym = NULL; + + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } + } + + /* remap all the types except DATASECs */ + n = btf__type_cnt(linker->btf); + for (i = start_id; i < n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 
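+ /* a negative underlying_btf_id still refers to the source object's
+  * BTF numbering; translate it through btf_type_map now that all of
+  * this object's types have been appended */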
0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. + */ + dst_sec->has_btf = true; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. 
+ */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine 
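+ * (it packs raw line and column numbers and has no section or string
+ * offsets that would need adjusting)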
*/ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = finalize_btf(linker); + if (err) + return libbpf_err(err); + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return libbpf_err(err); + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return libbpf_err(err); + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + LIBBPF_OPTS(btf_dedup_opts, opts); + struct btf *btf = linker->btf; + 
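+ /* Overall flow: add one DATASEC type per output section that carries
+  * BTF, generate the merged .BTF.ext, dedup the combined BTF, then
+  * emit both blobs as the .BTF and .BTF.ext ELF sections. */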
const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__type_cnt(linker->btf) == 1) + return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->has_btf) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + opts.btf_ext = linker->btf_ext; + err = btf__dedup(linker->btf, &opts); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + raw_data = btf__raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != 
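+ /* .BTF.ext encodes one record size per info kind for the whole
+  * object (written once as a __u32 prefix below), so records from all
+  * sections must agree on it */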
sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + goto out; + } + +out: + free(data); + return err; +} diff --git a/third_party/bpftool/libbpf/src/netlink.c b/third_party/bpftool/libbpf/src/netlink.c new file mode 100644 index 0000000..84dd5fa --- /dev/null +++ b/third_party/bpftool/libbpf/src/netlink.c @@ -0,0 +1,917 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); + +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, + void *cookie); + +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + +struct xdp_id_md { + int ifindex; + __u32 flags; + struct xdp_link_info info; + __u64 feature_flags; +}; + +struct xdp_features_md { + int ifindex; + __u64 flags; +}; + +static int libbpf_netlink_open(__u32 *nl_pid, int proto) +{ + struct sockaddr_nl sa; + socklen_t 
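+ /* bind() + getsockname() below yield the kernel-assigned netlink port
+  * id in sa.nl_pid; it is handed back via *nl_pid so that
+  * libbpf_netlink_recv() can check replies are addressed to this socket */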
addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + pr_warn("Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static void libbpf_netlink_close(int sock) +{ + close(sock); +} + +enum { + NL_CONT, + NL_NEXT, + NL_DONE, +}; + +static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags) +{ + int len; + + do { + len = recvmsg(sock, mhdr, flags); + } while (len < 0 && (errno == EINTR || errno == EAGAIN)); + + if (len < 0) + return -errno; + return len; +} + +static int alloc_iov(struct iovec *iov, int len) +{ + void *nbuf; + + nbuf = realloc(iov->iov_base, len); + if (!nbuf) + return -ENOMEM; + + iov->iov_base = nbuf; + iov->iov_len = len; + return 0; +} + +static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct iovec iov = {}; + struct msghdr mhdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + int len, ret; + + ret = alloc_iov(&iov, 4096); + if (ret) + goto done; + + while (multipart) { +start: + multipart = false; + len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC); + if (len < 0) { + ret = len; + goto done; + } + + if (len > iov.iov_len) { + ret = alloc_iov(&iov, len); + if (ret) + goto done; + } + + len = netlink_recvmsg(sock, &mhdr, 0); + if (len < 0) { + ret = len; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + libbpf_nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + ret = 0; + goto done; + default: + break; + } + if (_fn) { + ret = _fn(nh, fn, cookie); + switch (ret) { + case NL_CONT: + break; + case NL_NEXT: + goto start; + case NL_DONE: + ret = 0; + goto done; + default: + goto done; + } + } + } + } + ret = 0; +done: + free(iov.iov_base); + return ret; +} + +static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, + int proto, __dump_nlmsg_t parse_msg, + libbpf_dump_nlmsg_t parse_attr, + void *cookie) +{ + __u32 nl_pid = 0; + int sock, ret; + + sock = libbpf_netlink_open(&nl_pid, proto); + if (sock < 0) + return sock; + + req->nh.nlmsg_pid = 0; + req->nh.nlmsg_seq = time(NULL); + + if (send(sock, req, req->nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto out; + } + + ret = libbpf_netlink_recv(sock, nl_pid, req->nh.nlmsg_seq, + parse_msg, parse_attr, cookie); +out: + libbpf_netlink_close(sock); + return ret; +} + +static int parse_genl_family_id(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = 
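+ /* in a generic netlink reply the genlmsghdr follows the nlmsghdr, and
+  * the CTRL_ATTR_* attribute stream starts GENL_HDRLEN after that; the
+  * u16 family id being resolved is carried in CTRL_ATTR_FAMILY_ID */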
NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[CTRL_ATTR_FAMILY_ID + 1]; + __u16 *id = cookie; + + libbpf_nla_parse(tb, CTRL_ATTR_FAMILY_ID, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + if (!tb[CTRL_ATTR_FAMILY_ID]) + return NL_CONT; + + *id = libbpf_nla_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]); + return NL_DONE; +} + +static int libbpf_netlink_resolve_genl_family_id(const char *name, + __u16 len, __u16 *id) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN), + .nh.nlmsg_type = GENL_ID_CTRL, + .nh.nlmsg_flags = NLM_F_REQUEST, + .gnl.cmd = CTRL_CMD_GETFAMILY, + .gnl.version = 2, + }; + int err; + + err = nlattr_add(&req, CTRL_ATTR_FAMILY_NAME, name, len); + if (err < 0) + return err; + + return libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_genl_family_id, NULL, id); +} + +static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, + __u32 flags) +{ + struct nlattr *nla; + int ret; + struct libbpf_nla_req req; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + nla = nlattr_begin_nested(&req, IFLA_XDP); + if (!nla) + return -EMSGSIZE; + ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + if (flags) { + ret = nlattr_add(&req, IFLA_XDP_FLAGS, &flags, sizeof(flags)); + if (ret < 0) + return ret; + } + if (flags & XDP_FLAGS_REPLACE) { + ret = nlattr_add(&req, IFLA_XDP_EXPECTED_FD, &old_fd, + sizeof(old_fd)); + if (ret < 0) + return ret; + } + nlattr_end_nested(&req, nla); + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + int old_prog_fd, err; + + if (!OPTS_VALID(opts, bpf_xdp_attach_opts)) + return libbpf_err(-EINVAL); + + old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + if (old_prog_fd) + flags |= XDP_FLAGS_REPLACE; + else + old_prog_fd = -1; + + err = __bpf_set_link_xdp_fd_replace(ifindex, prog_fd, old_prog_fd, flags); + return libbpf_err(err); +} + +int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + return bpf_xdp_attach(ifindex, -1, flags, opts); +} + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + + if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb) +{ + struct nlattr *xdp_tb[IFLA_XDP_MAX + 1]; + struct xdp_id_md *xdp_id = cookie; + struct ifinfomsg *ifinfo = msg; + int ret; + + if (xdp_id->ifindex && xdp_id->ifindex != ifinfo->ifi_index) + return 0; + + if (!tb[IFLA_XDP]) + return 0; + + ret = libbpf_nla_parse_nested(xdp_tb, IFLA_XDP_MAX, tb[IFLA_XDP], NULL); + if (ret) + return ret; + + if (!xdp_tb[IFLA_XDP_ATTACHED]) + return 0; + + xdp_id->info.attach_mode = libbpf_nla_getattr_u8( + xdp_tb[IFLA_XDP_ATTACHED]); + + if (xdp_id->info.attach_mode == XDP_ATTACHED_NONE) + return 0; + + if (xdp_tb[IFLA_XDP_PROG_ID]) + xdp_id->info.prog_id = 
libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_PROG_ID]); + + if (xdp_tb[IFLA_XDP_SKB_PROG_ID]) + xdp_id->info.skb_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_SKB_PROG_ID]); + + if (xdp_tb[IFLA_XDP_DRV_PROG_ID]) + xdp_id->info.drv_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_DRV_PROG_ID]); + + if (xdp_tb[IFLA_XDP_HW_PROG_ID]) + xdp_id->info.hw_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_HW_PROG_ID]); + + return 0; +} + +static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[NETDEV_CMD_MAX + 1]; + struct xdp_features_md *md = cookie; + __u32 ifindex; + + libbpf_nla_parse(tb, NETDEV_CMD_MAX, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + + if (!tb[NETDEV_A_DEV_IFINDEX] || !tb[NETDEV_A_DEV_XDP_FEATURES]) + return NL_CONT; + + ifindex = libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_IFINDEX]); + if (ifindex != md->ifindex) + return NL_CONT; + + md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + return NL_DONE; +} + +int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifinfo.ifi_family = AF_PACKET, + }; + struct xdp_id_md xdp_id = {}; + struct xdp_features_md md = { + .ifindex = ifindex, + }; + __u16 id; + int err; + + if (!OPTS_VALID(opts, bpf_xdp_query_opts)) + return libbpf_err(-EINVAL); + + if (xdp_flags & ~XDP_FLAGS_MASK) + return libbpf_err(-EINVAL); + + /* Check whether the single {HW,DRV,SKB} mode is set */ + xdp_flags &= XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE; + if (xdp_flags & (xdp_flags - 1)) + return libbpf_err(-EINVAL); + + xdp_id.ifindex = ifindex; + xdp_id.flags = xdp_flags; + + err = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, __dump_link_nlmsg, + get_xdp_info, &xdp_id); + if (err) + return libbpf_err(err); + + OPTS_SET(opts, prog_id, xdp_id.info.prog_id); + OPTS_SET(opts, drv_prog_id, xdp_id.info.drv_prog_id); + OPTS_SET(opts, hw_prog_id, xdp_id.info.hw_prog_id); + OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id); + OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode); + + if (!OPTS_HAS(opts, feature_flags)) + return 0; + + err = libbpf_netlink_resolve_genl_family_id("netdev", sizeof("netdev"), &id); + if (err < 0) { + if (err == -ENOENT) { + opts->feature_flags = 0; + goto skip_feature_flags; + } + return libbpf_err(err); + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = id; + req.gnl.cmd = NETDEV_CMD_DEV_GET; + req.gnl.version = 2; + + err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex)); + if (err < 0) + return libbpf_err(err); + + err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_xdp_features, NULL, &md); + if (err) + return libbpf_err(err); + + opts->feature_flags = md.flags; + +skip_feature_flags: + return 0; +} + +int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int ret; + + ret = bpf_xdp_query(ifindex, flags, &opts); + if (ret) + return libbpf_err(ret); + + flags &= XDP_FLAGS_MODES; + + if (opts.attach_mode != XDP_ATTACHED_MULTI && !flags) + *prog_id = opts.prog_id; + else if (flags & XDP_FLAGS_DRV_MODE) + *prog_id = opts.drv_prog_id; + else if (flags & 
XDP_FLAGS_HW_MODE) + *prog_id = opts.hw_prog_id; + else if (flags & XDP_FLAGS_SKB_MODE) + *prog_id = opts.skb_prog_id; + else + *prog_id = 0; + + return 0; +} + + +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); + +static int clsact_config(struct libbpf_nla_req *req) +{ + req->tc.tcm_parent = TC_H_CLSACT; + req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + + return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); +} + +static int attach_point_to_config(struct bpf_tc_hook *hook, + qdisc_config_t *config) +{ + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + case BPF_TC_INGRESS | BPF_TC_EGRESS: + if (OPTS_GET(hook, parent, 0)) + return -EINVAL; + *config = &clsact_config; + return 0; + case BPF_TC_CUSTOM: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int tc_get_tcm_parent(enum bpf_tc_attach_point attach_point, + __u32 *parent) +{ + switch (attach_point) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + if (*parent) + return -EINVAL; + *parent = TC_H_MAKE(TC_H_CLSACT, + attach_point == BPF_TC_INGRESS ? + TC_H_MIN_INGRESS : TC_H_MIN_EGRESS); + break; + case BPF_TC_CUSTOM: + if (!*parent) + return -EINVAL; + break; + default: + return -EINVAL; + } + return 0; +} + +static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) +{ + qdisc_config_t config; + int ret; + struct libbpf_nla_req req; + + ret = attach_point_to_config(hook, &config); + if (ret < 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; + req.nh.nlmsg_type = cmd; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); + + ret = config(&req); + if (ret < 0) + return ret; + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_NEWQDISC, NLM_F_CREATE | NLM_F_EXCL); +} + +static int tc_qdisc_delete(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_DELQDISC, 0); +} + +int bpf_tc_hook_create(struct bpf_tc_hook *hook) +{ + int ret; + + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + ret = tc_qdisc_create_excl(hook); + return libbpf_err(ret); +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush); + +int bpf_tc_hook_destroy(struct bpf_tc_hook *hook) +{ + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + return libbpf_err(__bpf_tc_detach(hook, NULL, true)); + case BPF_TC_INGRESS | BPF_TC_EGRESS: + return libbpf_err(tc_qdisc_delete(hook)); + case BPF_TC_CUSTOM: + return libbpf_err(-EOPNOTSUPP); + default: + return libbpf_err(-EINVAL); + } +} + +struct bpf_cb_ctx { + struct bpf_tc_opts *opts; + bool processed; +}; + +static int __get_tc_info(void *cookie, struct tcmsg *tc, struct nlattr **tb, + bool unicast) +{ + struct nlattr *tbb[TCA_BPF_MAX + 1]; + struct bpf_cb_ctx *info = cookie; + + if (!info || !info->opts) + return -EINVAL; + if (unicast && info->processed) + return -EINVAL; + if (!tb[TCA_OPTIONS]) + return NL_CONT; + + libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL); + if (!tbb[TCA_BPF_ID]) + return -EINVAL; + + OPTS_SET(info->opts, prog_id, 
libbpf_nla_getattr_u32(tbb[TCA_BPF_ID])); + OPTS_SET(info->opts, handle, tc->tcm_handle); + OPTS_SET(info->opts, priority, TC_H_MAJ(tc->tcm_info) >> 16); + + info->processed = true; + return unicast ? NL_NEXT : NL_DONE; +} + +static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct tcmsg *tc = NLMSG_DATA(nh); + struct nlattr *tb[TCA_MAX + 1]; + + libbpf_nla_parse(tb, TCA_MAX, + (struct nlattr *)((void *)tc + NLMSG_ALIGN(sizeof(*tc))), + NLMSG_PAYLOAD(nh, sizeof(*tc)), NULL); + if (!tb[TCA_KIND]) + return NL_CONT; + return __get_tc_info(cookie, tc, tb, nh->nlmsg_flags & NLM_F_ECHO); +} + +static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + char name[256]; + int len, ret; + + memset(&info, 0, info_len); + ret = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (ret < 0) + return ret; + + ret = nlattr_add(req, TCA_BPF_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + len = snprintf(name, sizeof(name), "%s:[%u]", info.name, info.id); + if (len < 0) + return -errno; + if (len >= sizeof(name)) + return -ENAMETOOLONG; + return nlattr_add(req, TCA_BPF_NAME, name, len + 1); +} + +int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, bpf_flags, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + struct nlattr *nla; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || !prog_fd || prog_id) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + if (flags & ~BPF_TC_F_REPLACE) + return libbpf_err(-EINVAL); + + flags = (flags & BPF_TC_F_REPLACE) ? 
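+ /* BPF_TC_F_REPLACE maps to NLM_F_REPLACE (overwrite a filter with the
+  * same handle/priority); otherwise NLM_F_EXCL makes the kernel reject
+  * the request with EEXIST if such a filter already exists */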
NLM_F_REPLACE : NLM_F_EXCL; + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | + NLM_F_ECHO | flags; + req.nh.nlmsg_type = RTM_NEWTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + nla = nlattr_begin_nested(&req, TCA_OPTIONS); + if (!nla) + return libbpf_err(-EMSGSIZE); + ret = tc_add_fd_and_name(&req, prog_fd); + if (ret < 0) + return libbpf_err(ret); + bpf_flags = TCA_BPF_FLAG_ACT_DIRECT; + ret = nlattr_add(&req, TCA_BPF_FLAGS, &bpf_flags, sizeof(bpf_flags)); + if (ret < 0) + return libbpf_err(ret); + nlattr_end_nested(&req, nla); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush) +{ + __u32 protocol = 0, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct libbpf_nla_req req; + + if (!hook || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return -EINVAL; + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id) + return -EINVAL; + if (priority > UINT16_MAX) + return -EINVAL; + if (!flush) { + if (!handle || !priority) + return -EINVAL; + protocol = ETH_P_ALL; + } else { + if (handle || priority) + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_DELTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + if (!flush) { + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + } + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return ret; + req.tc.tcm_parent = parent; + + if (!flush) { + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return ret; + } + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts) +{ + int ret; + + if (!opts) + return libbpf_err(-EINVAL); + + ret = __bpf_tc_detach(hook, opts, false); + return libbpf_err(ret); +} + +int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 
0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id || + !handle || !priority) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = RTM_GETTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} diff --git a/third_party/bpftool/libbpf/src/nlattr.c b/third_party/bpftool/libbpf/src/nlattr.c new file mode 100644 index 0000000..975e265 --- /dev/null +++ b/third_party/bpftool/libbpf/src/nlattr.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#include +#include +#include +#include +#include "nlattr.h" +#include "libbpf_internal.h" + +static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = { + [LIBBPF_NLA_U8] = sizeof(uint8_t), + [LIBBPF_NLA_U16] = sizeof(uint16_t), + [LIBBPF_NLA_U32] = sizeof(uint32_t), + [LIBBPF_NLA_U64] = sizeof(uint64_t), + [LIBBPF_NLA_STRING] = 1, + [LIBBPF_NLA_FLAG] = 0, +}; + +static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) +{ + int totlen = NLA_ALIGN(nla->nla_len); + + *remaining -= totlen; + return (struct nlattr *)((void *)nla + totlen); +} + +static int nla_ok(const struct nlattr *nla, int remaining) +{ + return remaining >= (int)sizeof(*nla) && + nla->nla_len >= sizeof(*nla) && + nla->nla_len <= remaining; +} + +static int nla_type(const struct nlattr *nla) +{ + return nla->nla_type & NLA_TYPE_MASK; +} + +static int validate_nla(struct nlattr *nla, int maxtype, + struct libbpf_nla_policy *policy) +{ + struct libbpf_nla_policy *pt; + unsigned int minlen = 0; + int type = nla_type(nla); + + if (type < 0 || type > maxtype) + return 0; + + pt = &policy[type]; + + if (pt->type > LIBBPF_NLA_TYPE_MAX) + return 0; + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != LIBBPF_NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (libbpf_nla_len(nla) < minlen) + return -1; + + if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen) + return -1; + + if (pt->type == LIBBPF_NLA_STRING) { + char *data = libbpf_nla_data(nla); + + if (data[libbpf_nla_len(nla) - 1] != '\0') + return -1; + } + + return 0; +} + +static inline int nlmsg_len(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_len - NLMSG_HDRLEN; +} + +/** + * Create attribute index based on a stream of attributes. + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg head Head of attribute stream. + * @arg len Length of attribute stream. + * @arg policy Attribute validation policy. 
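+ * May be NULL to skip validation.
+ *
+ * Illustrative usage (mirrors __dump_link_nlmsg() in netlink.c):
+ *
+ *	struct nlattr *tb[IFLA_MAX + 1];
+ *
+ *	if (libbpf_nla_parse(tb, IFLA_MAX, attrs, len, NULL) != 0)
+ *		return -LIBBPF_ERRNO__NLPARSE;
+ *	if (tb[IFLA_XDP])
+ *		... nested IFLA_XDP_* attributes are present ...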
+ * + * Iterates over the stream of attributes and stores a pointer to each + * attribute in the index array using the attribute type as index to + * the array. Attribute with a type greater than the maximum type + * specified will be silently ignored in order to maintain backwards + * compatibility. If \a policy is not NULL, the attribute will be + * validated using the specified policy. + * + * @see nla_validate + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + libbpf_nla_for_each_attr(nla, head, len, rem) { + int type = nla_type(nla); + + if (type > maxtype) + continue; + + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + if (tb[type]) + pr_warn("Attribute of type %#x found multiple times in message, " + "previous attribute is being ignored.\n", type); + + tb[type] = nla; + } + + err = 0; +errout: + return err; +} + +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to libbpf_nla_parse(). + * + * @see libbpf_nla_parse + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy) +{ + return libbpf_nla_parse(tb, maxtype, libbpf_nla_data(nla), + libbpf_nla_len(nla), policy); +} + +/* dump netlink extended ack error message */ +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) +{ + struct libbpf_nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { + [NLMSGERR_ATTR_MSG] = { .type = LIBBPF_NLA_STRING }, + [NLMSGERR_ATTR_OFFS] = { .type = LIBBPF_NLA_U32 }, + }; + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr; + struct nlmsgerr *err; + char *errmsg = NULL; + int hlen, alen; + + /* no TLVs, nothing to do here */ + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return 0; + + err = (struct nlmsgerr *)NLMSG_DATA(nlh); + hlen = sizeof(*err); + + /* if NLM_F_CAPPED is set then the inner err msg was capped */ + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + hlen += nlmsg_len(&err->msg); + + attr = (struct nlattr *) ((void *) err + hlen); + alen = (void *)nlh + nlh->nlmsg_len - (void *)attr; + + if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, + extack_policy) != 0) { + pr_warn("Failed to parse extended error attributes\n"); + return 0; + } + + if (tb[NLMSGERR_ATTR_MSG]) + errmsg = (char *) libbpf_nla_data(tb[NLMSGERR_ATTR_MSG]); + + pr_warn("Kernel error message: %s\n", errmsg); + + return 0; +} diff --git a/third_party/bpftool/libbpf/src/nlattr.h b/third_party/bpftool/libbpf/src/nlattr.h new file mode 100644 index 0000000..d92d1c1 --- /dev/null +++ b/third_party/bpftool/libbpf/src/nlattr.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#ifndef __LIBBPF_NLATTR_H +#define __LIBBPF_NLATTR_H + +#include +#include +#include +#include +#include +#include + +/* avoid multiple definition of netlink features */ +#define __LINUX_NETLINK_H + +/** + * Standard attribute types to specify validation policy + */ +enum { + 
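+	/* values below index nla_attr_minlen[] in nlattr.c; keep both in sync */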
LIBBPF_NLA_UNSPEC, /**< Unspecified type, binary data chunk */ + LIBBPF_NLA_U8, /**< 8 bit integer */ + LIBBPF_NLA_U16, /**< 16 bit integer */ + LIBBPF_NLA_U32, /**< 32 bit integer */ + LIBBPF_NLA_U64, /**< 64 bit integer */ + LIBBPF_NLA_STRING, /**< NUL terminated character string */ + LIBBPF_NLA_FLAG, /**< Flag */ + LIBBPF_NLA_MSECS, /**< Micro seconds (64bit) */ + LIBBPF_NLA_NESTED, /**< Nested attributes */ + __LIBBPF_NLA_TYPE_MAX, +}; + +#define LIBBPF_NLA_TYPE_MAX (__LIBBPF_NLA_TYPE_MAX - 1) + +/** + * @ingroup attr + * Attribute validation policy. + * + * See section @core_doc{core_attr_parse,Attribute Parsing} for more details. + */ +struct libbpf_nla_policy { + /** Type of attribute or LIBBPF_NLA_UNSPEC */ + uint16_t type; + + /** Minimal length of payload required */ + uint16_t minlen; + + /** Maximal length of payload allowed */ + uint16_t maxlen; +}; + +struct libbpf_nla_req { + struct nlmsghdr nh; + union { + struct ifinfomsg ifinfo; + struct tcmsg tc; + struct genlmsghdr gnl; + }; + char buf[128]; +}; + +/** + * @ingroup attr + * Iterate over a stream of attributes + * @arg pos loop counter, set to current attribute + * @arg head head of attribute stream + * @arg len length of attribute stream + * @arg rem initialized to len, holds bytes currently remaining in stream + */ +#define libbpf_nla_for_each_attr(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nla_ok(pos, rem); \ + pos = nla_next(pos, &(rem))) + +/** + * libbpf_nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *libbpf_nla_data(const struct nlattr *nla) +{ + return (void *)nla + NLA_HDRLEN; +} + +static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)libbpf_nla_data(nla); +} + +static inline uint16_t libbpf_nla_getattr_u16(const struct nlattr *nla) +{ + return *(uint16_t *)libbpf_nla_data(nla); +} + +static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)libbpf_nla_data(nla); +} + +static inline uint64_t libbpf_nla_getattr_u64(const struct nlattr *nla) +{ + return *(uint64_t *)libbpf_nla_data(nla); +} + +static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)libbpf_nla_data(nla); +} + +/** + * libbpf_nla_len - length of payload + * @nla: netlink attribute + */ +static inline int libbpf_nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy); +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy); + +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); + +static inline struct nlattr *nla_data(struct nlattr *nla) +{ + return (struct nlattr *)((void *)nla + NLA_HDRLEN); +} + +static inline struct nlattr *req_tail(struct libbpf_nla_req *req) +{ + return (struct nlattr *)((void *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); +} + +static inline int nlattr_add(struct libbpf_nla_req *req, int type, + const void *data, int len) +{ + struct nlattr *nla; + + if (NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > sizeof(*req)) + return -EMSGSIZE; + if (!!data != !!len) + return -EINVAL; + + nla = req_tail(req); + nla->nla_type = type; + nla->nla_len = NLA_HDRLEN + len; + if (data) + memcpy(nla_data(nla), data, len); + req->nh.nlmsg_len = NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(nla->nla_len); + return 0; +} + +static inline struct nlattr 
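+/* Opens a nested attribute in req. Add child attributes with nlattr_add()
+ * and close with nlattr_end_nested(), which patches the parent's nla_len
+ * to cover everything appended in between. Illustrative usage (mirrors
+ * __bpf_set_link_xdp_fd_replace() in netlink.c):
+ *
+ *	nla = nlattr_begin_nested(&req, IFLA_XDP);
+ *	if (!nla)
+ *		return -EMSGSIZE;
+ *	ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd));
+ *	if (ret < 0)
+ *		return ret;
+ *	nlattr_end_nested(&req, nla);
+ */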
*nlattr_begin_nested(struct libbpf_nla_req *req, int type) +{ + struct nlattr *tail; + + tail = req_tail(req); + if (nlattr_add(req, type | NLA_F_NESTED, NULL, 0)) + return NULL; + return tail; +} + +static inline void nlattr_end_nested(struct libbpf_nla_req *req, + struct nlattr *tail) +{ + tail->nla_len = (void *)req_tail(req) - (void *)tail; +} + +#endif /* __LIBBPF_NLATTR_H */ diff --git a/third_party/bpftool/libbpf/src/relo_core.c b/third_party/bpftool/libbpf/src/relo_core.c new file mode 100644 index 0000000..a26b2f5 --- /dev/null +++ b/third_party/bpftool/libbpf/src/relo_core.c @@ -0,0 +1,1687 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Facebook */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include "relo_core.h" + +static const char *btf_kind_str(const struct btf_type *t) +{ + return btf_type_str(t); +} + +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) +{ + return btf_type_skip_modifiers(btf, id, res_id); +} + +static const char *btf__name_by_offset(const struct btf *btf, u32 offset) +{ + return btf_name_by_offset(btf, offset); +} + +static s64 btf__resolve_size(const struct btf *btf, u32 type_id) +{ + const struct btf_type *t; + int size; + + t = btf_type_by_id(btf, type_id); + t = btf_resolve_size(btf, t, &size); + if (IS_ERR(t)) + return PTR_ERR(t); + return size; +} + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +#undef pr_warn +#undef pr_info +#undef pr_debug +#define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_debug(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define libbpf_print(level, fmt, ...) 
bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) +#else +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#endif + +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf_type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + +static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_CORE_FIELD_EXISTS: return "field_exists"; + case BPF_CORE_FIELD_SIGNED: return "signed"; + case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; + case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; + case BPF_CORE_TYPE_SIZE: return "type_size"; + case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; + default: return "unknown"; + } +} + +static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_EXISTS: + case BPF_CORE_FIELD_SIGNED: + case BPF_CORE_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: + return true; + default: + return false; + } +} + +static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_TYPE_ID_LOCAL: + case BPF_CORE_TYPE_ID_TARGET: + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + case BPF_CORE_TYPE_SIZE: + return true; + default: + return false; + } +} + +static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_VALUE: + return true; + default: + return false; + } +} + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = 
targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + +/* + * Turn bpf_core_relo into a low- and high-level spec representation, + * validating correctness along the way, as well as calculating resulting + * field bit offset, specified by accessor string. Low-level spec captures + * every single level of nestedness, including traversing anonymous + * struct/union members. High-level one only captures semantically meaningful + * "turning points": named fields and array indicies. + * E.g., for this case: + * + * struct sample { + * int __unimportant; + * struct { + * int __1; + * int __2; + * int a[7]; + * }; + * }; + * + * struct sample *s = ...; + * + * int x = &s->a[3]; // access string = '0:1:2:3' + * + * Low-level spec has 1:1 mapping with each element of access string (it's + * just a parsed access string representation): [0, 1, 2, 3]. + * + * High-level spec will capture only 3 points: + * - initial zero-index access by pointer (&s->... is the same as &s[0]...); + * - field 'a' access (corresponds to '2' in low-level spec); + * - array element #3 access (corresponds to '3' in low-level spec). + * + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, + * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their + * spec and raw_spec are kept empty. + * + * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access + * string to specify enumerator's value index that need to be relocated. 
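+ *
+ * For the 'struct sample' example above, the parsed spec would look roughly
+ * like this (assuming 4-byte ints and no extra padding):
+ *
+ *   raw_spec   = [0, 1, 2, 3], raw_len = 4
+ *   spec       = [pointer/index-0 access, field 'a', array index 3], len = 3
+ *   bit_offset = 0 + 32 + 64 + 3 * 32 = 192, i.e. byte offset 24
+ *
+ * and bpf_core_format_spec() would render it (for a byte offset relocation)
+ * along the lines of "<byte_off> [N] struct sample.a[3] (0:1:2:3 @ offset 24)",
+ * with N being the BTF type ID of 'struct sample'.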
+ */ +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec) +{ + int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; + const struct btf_type *t; + const char *name, *spec_str; + __u32 id, name_off; + __s64 sz; + + spec_str = btf__name_by_offset(btf, relo->access_str_off); + if (str_is_empty(spec_str) || *spec_str == ':') + return -EINVAL; + + memset(spec, 0, sizeof(*spec)); + spec->btf = btf; + spec->root_type_id = relo->type_id; + spec->relo_kind = relo->kind; + + /* type-based relocations don't have a field access string */ + if (core_relo_is_type_based(relo->kind)) { + if (strcmp(spec_str, "0")) + return -EINVAL; + return 0; + } + + /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ + while (*spec_str) { + if (*spec_str == ':') + ++spec_str; + if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) + return -EINVAL; + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + spec_str += parsed_len; + spec->raw_spec[spec->raw_len++] = access_idx; + } + + if (spec->raw_len == 0) + return -EINVAL; + + t = skip_mods_and_typedefs(btf, relo->type_id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[0]; + acc = &spec->spec[0]; + acc->type_id = id; + acc->idx = access_idx; + spec->len++; + + if (core_relo_is_enumval_based(relo->kind)) { + if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + return -EINVAL; + + /* record enumerator name in a first accessor */ + name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off + : btf_enum64(t)[access_idx].name_off; + acc->name = btf__name_by_offset(btf, name_off); + return 0; + } + + if (!core_relo_is_field_based(relo->kind)) + return -EINVAL; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset = access_idx * sz * 8; + + for (i = 1; i < spec->raw_len; i++) { + t = skip_mods_and_typedefs(btf, id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; + + if (btf_is_composite(t)) { + const struct btf_member *m; + __u32 bit_offset; + + if (access_idx >= btf_vlen(t)) + return -EINVAL; + + bit_offset = btf_member_bit_offset(t, access_idx); + spec->bit_offset += bit_offset; + + m = btf_members(t) + access_idx; + if (m->name_off) { + name = btf__name_by_offset(btf, m->name_off); + if (str_is_empty(name)) + return -EINVAL; + + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; + spec->len++; + } + + id = m->type; + } else if (btf_is_array(t)) { + const struct btf_array *a = btf_array(t); + bool flex; + + t = skip_mods_and_typedefs(btf, a->type, &id); + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) + return -EINVAL; + + spec->spec[spec->len].type_id = id; + spec->spec[spec->len].idx = access_idx; + spec->len++; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset += access_idx * sz * 8; + } else { + pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + prog_name, relo->type_id, spec_str, i, id, btf_kind_str(t)); + return -EINVAL; + } + } + + return 0; +} + +/* Check two types for compatibility for the purpose of field access + * relocation. 
const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +static int bpf_core_fields_are_compat(const struct btf *local_btf, + __u32 local_id, + const struct btf *targ_btf, + __u32 targ_id) +{ + const struct btf_type *local_type, *targ_type; + +recur: + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_is_composite(local_type) && btf_is_composite(targ_type)) + return 1; + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_PTR: + case BTF_KIND_FLOAT: + return 1; + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + case BTF_KIND_ENUM: { + const char *local_name, *targ_name; + size_t local_len, targ_len; + + local_name = btf__name_by_offset(local_btf, + local_type->name_off); + targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); + local_len = bpf_core_essential_name_len(local_name); + targ_len = bpf_core_essential_name_len(targ_name); + /* one of them is anonymous or both w/ same flavor-less names */ + return local_len == 0 || targ_len == 0 || + (local_len == targ_len && + strncmp(local_name, targ_name, local_len) == 0); + } + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && + btf_int_offset(targ_type) == 0; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + default: + return 0; + } +} + +/* + * Given single high-level named field accessor in local type, find + * corresponding high-level accessor for a target type. Along the way, + * maintain low-level spec for target as well. Also keep updating target + * bit offset. + * + * Searching is performed through recursive exhaustive enumeration of all + * fields of a struct/union. If there are any anonymous (embedded) + * structs/unions, they are recursively searched as well. If field with + * desired name is found, check compatibility between local and target types, + * before returning result. + * + * 1 is returned, if field is found. + * 0 is returned if no compatible field is found. + * <0 is returned on error. 
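+ *
+ * For example, if the local accessor names a field 'x' and the target type is
+ * a hypothetical
+ *
+ *   struct target {
+ *       struct { int x; };   (anonymous member, descended into recursively)
+ *       int y;
+ *   };
+ *
+ * then the anonymous member is searched, 'x' is matched by name, its type is
+ * checked with bpf_core_fields_are_compat(), and the target's low-level spec
+ * and bit offset are extended to record the extra nesting level.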
+ */ +static int bpf_core_match_member(const struct btf *local_btf, + const struct bpf_core_accessor *local_acc, + const struct btf *targ_btf, + __u32 targ_id, + struct bpf_core_spec *spec, + __u32 *next_targ_id) +{ + const struct btf_type *local_type, *targ_type; + const struct btf_member *local_member, *m; + const char *local_name, *targ_name; + __u32 local_id; + int i, n, found; + + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!targ_type) + return -EINVAL; + if (!btf_is_composite(targ_type)) + return 0; + + local_id = local_acc->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_member = btf_members(local_type) + local_acc->idx; + local_name = btf__name_by_offset(local_btf, local_member->name_off); + + n = btf_vlen(targ_type); + m = btf_members(targ_type); + for (i = 0; i < n; i++, m++) { + __u32 bit_offset; + + bit_offset = btf_member_bit_offset(targ_type, i); + + /* too deep struct/union/array nesting */ + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + /* speculate this member will be the good one */ + spec->bit_offset += bit_offset; + spec->raw_spec[spec->raw_len++] = i; + + targ_name = btf__name_by_offset(targ_btf, m->name_off); + if (str_is_empty(targ_name)) { + /* embedded struct/union, we need to go deeper */ + found = bpf_core_match_member(local_btf, local_acc, + targ_btf, m->type, + spec, next_targ_id); + if (found) /* either found or error */ + return found; + } else if (strcmp(local_name, targ_name) == 0) { + /* matching named field */ + struct bpf_core_accessor *targ_acc; + + targ_acc = &spec->spec[spec->len++]; + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + + *next_targ_id = m->type; + found = bpf_core_fields_are_compat(local_btf, + local_member->type, + targ_btf, m->type); + if (!found) + spec->len--; /* pop accessor */ + return found; + } + /* member turned out not to be what we looked for */ + spec->bit_offset -= bit_offset; + spec->raw_len--; + } + + return 0; +} + +/* + * Try to match local spec to a target type and, if successful, produce full + * target spec (high-level, low-level + bit offset). 
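+ *
+ * The target's low-level spec does not have to mirror the local one. E.g., if
+ * a target flavor defines field 'a' directly in 'struct sample' rather than
+ * inside an anonymous struct, the local access '0:1:2:3' from the example
+ * above may resolve to a target access like '0:1:3'; only the named fields
+ * and array indices (the high-level spec) have to line up.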
+ */ +static int bpf_core_spec_match(struct bpf_core_spec *local_spec, + const struct btf *targ_btf, __u32 targ_id, + struct bpf_core_spec *targ_spec) +{ + const struct btf_type *targ_type; + const struct bpf_core_accessor *local_acc; + struct bpf_core_accessor *targ_acc; + int i, sz, matched; + __u32 name_off; + + memset(targ_spec, 0, sizeof(*targ_spec)); + targ_spec->btf = targ_btf; + targ_spec->root_type_id = targ_id; + targ_spec->relo_kind = local_spec->relo_kind; + + if (core_relo_is_type_based(local_spec->relo_kind)) { + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + } + + local_acc = &local_spec->spec[0]; + targ_acc = &targ_spec->spec[0]; + + if (core_relo_is_enumval_based(local_spec->relo_kind)) { + size_t local_essent_len, targ_essent_len; + const char *targ_name; + + /* has to resolve to an enum */ + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); + if (!btf_is_any_enum(targ_type)) + return 0; + + local_essent_len = bpf_core_essential_name_len(local_acc->name); + + for (i = 0; i < btf_vlen(targ_type); i++) { + if (btf_is_enum(targ_type)) + name_off = btf_enum(targ_type)[i].name_off; + else + name_off = btf_enum64(targ_type)[i].name_off; + + targ_name = btf__name_by_offset(targ_spec->btf, name_off); + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + return 1; + } + } + return 0; + } + + if (!core_relo_is_field_based(local_spec->relo_kind)) + return -EINVAL; + + for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, + &targ_id); + if (!targ_type) + return -EINVAL; + + if (local_acc->name) { + matched = bpf_core_match_member(local_spec->btf, + local_acc, + targ_btf, targ_id, + targ_spec, &targ_id); + if (matched <= 0) + return matched; + } else { + /* for i=0, targ_id is already treated as array element + * type (because it's the original struct), for others + * we should find array element type first + */ + if (i > 0) { + const struct btf_array *a; + bool flex; + + if (!btf_is_array(targ_type)) + return 0; + + a = btf_array(targ_type); + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) + return 0; + if (!skip_mods_and_typedefs(targ_btf, a->type, + &targ_id)) + return -EINVAL; + } + + /* too deep struct/union/array nesting */ + if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + targ_acc->type_id = targ_id; + targ_acc->idx = local_acc->idx; + targ_acc->name = NULL; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + + sz = btf__resolve_size(targ_btf, targ_id); + if (sz < 0) + return sz; + targ_spec->bit_offset += local_acc->idx * sz * 8; + } + } + + return 1; +} + +static int bpf_core_calc_field_relo(const char *prog_name, + const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) +{ + const struct bpf_core_accessor *acc; + const struct btf_type *t; + __u32 byte_off, byte_sz, 
bit_off, bit_sz, field_type_id; + const struct btf_member *m; + const struct btf_type *mt; + bool bitfield; + __s64 sz; + + *field_sz = 0; + + if (relo->kind == BPF_CORE_FIELD_EXISTS) { + *val = spec ? 1 : 0; + return 0; + } + + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + + acc = &spec->spec[spec->len - 1]; + t = btf_type_by_id(spec->btf, acc->type_id); + + /* a[n] accessor needs special handling */ + if (!acc->name) { + if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) { + *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; + } else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) { + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + } else { + pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", + prog_name, relo->kind, relo->insn_off / 8); + return -EINVAL; + } + if (validate) + *validate = true; + return 0; + } + + m = btf_members(t) + acc->idx; + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); + bit_off = spec->bit_offset; + bit_sz = btf_member_bitfield_size(t, acc->idx); + + bitfield = bit_sz > 0; + if (bitfield) { + byte_sz = mt->size; + byte_off = bit_off / 8 / byte_sz * byte_sz; + /* figure out smallest int size necessary for bitfield load */ + while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { + if (byte_sz >= 8) { + /* bitfield can't be read with 64-bit read */ + pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", + prog_name, relo->kind, relo->insn_off / 8); + return -E2BIG; + } + byte_sz *= 2; + byte_off = bit_off / 8 / byte_sz * byte_sz; + } + } else { + sz = btf__resolve_size(spec->btf, field_type_id); + if (sz < 0) + return -EINVAL; + byte_sz = sz; + byte_off = spec->bit_offset / 8; + bit_sz = byte_sz * 8; + } + + /* for bitfields, all the relocatable aspects are ambiguous and we + * might disagree with compiler, so turn off validation of expected + * value, except for signedness + */ + if (validate) + *validate = !bitfield; + + switch (relo->kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } + break; + case BPF_CORE_FIELD_BYTE_SIZE: + *val = byte_sz; + break; + case BPF_CORE_FIELD_SIGNED: + *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || + (btf_int_encoding(mt) & BTF_INT_SIGNED); + if (validate) + *validate = true; /* signedness is never ambiguous */ + break; + case BPF_CORE_FIELD_LSHIFT_U64: +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *val = 64 - (bit_off + bit_sz - byte_off * 8); +#else + *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); +#endif + break; + case BPF_CORE_FIELD_RSHIFT_U64: + *val = 64 - bit_sz; + if (validate) + *validate = true; /* right shift is never ambiguous */ + break; + case BPF_CORE_FIELD_EXISTS: + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, bool *validate) +{ + __s64 sz; + + /* by default, always check expected value in bpf_insn */ + if (validate) + *validate = true; + + /* type-based relos return zero when target type is not found */ + if (!spec) { + *val = 0; + return 0; + } + + switch (relo->kind) { + case BPF_CORE_TYPE_ID_TARGET: + *val = spec->root_type_id; + /* type ID, embedded in bpf_insn, might change during linking, + * so 
enforcing it is pointless + */ + if (validate) + *validate = false; + break; + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + *val = 1; + break; + case BPF_CORE_TYPE_SIZE: + sz = btf__resolve_size(spec->btf, spec->root_type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + break; + case BPF_CORE_TYPE_ID_LOCAL: + /* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val) +{ + const struct btf_type *t; + + switch (relo->kind) { + case BPF_CORE_ENUMVAL_EXISTS: + *val = spec ? 1 : 0; + break; + case BPF_CORE_ENUMVAL_VALUE: + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + t = btf_type_by_id(spec->btf, spec->spec[0].type_id); + if (btf_is_enum(t)) + *val = btf_enum(t)[spec->spec[0].idx].val; + else + *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* Calculate original and target relocation values, given local and target + * specs and relocation kind. These values are calculated for each candidate. + * If there are multiple candidates, resulting values should all be consistent + * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. + * If instruction has to be poisoned, *poison will be set to true. + */ +static int bpf_core_calc_relo(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct bpf_core_spec *local_spec, + const struct bpf_core_spec *targ_spec, + struct bpf_core_relo_res *res) +{ + int err = -EOPNOTSUPP; + + res->orig_val = 0; + res->new_val = 0; + res->poison = false; + res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; + + if (core_relo_is_field_based(relo->kind)) { + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. + */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf_type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. 
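+ *
+ * As a rough illustration: if the local type declares an unsigned 'u32 cnt'
+ * field (hypothetical name) that the target kernel has since widened to
+ * 'u64 cnt', the 4-byte load emitted by the compiler can safely be turned
+ * into an 8-byte load, since zero-extension preserves the value. If either
+ * side were a signed int, widening could change the observed value, so the
+ * relocated load would be poisoned instead of silently resized.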
+ */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } + } else if (core_relo_is_type_based(relo->kind)) { + err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val, &res->validate); + err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val, NULL); + } else if (core_relo_is_enumval_based(relo->kind)) { + err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); + } + +done: + if (err == -EUCLEAN) { + /* EUCLEAN is used to signal instruction poisoning request */ + res->poison = true; + err = 0; + } else if (err == -EOPNOTSUPP) { + /* EOPNOTSUPP means unknown/unsupported relocation */ + pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); + } + + return err; +} + +/* + * Turn instruction for which CO_RE relocation failed into invalid one with + * distinct signature. + */ +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, + int insn_idx, struct bpf_insn *insn) +{ + pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", + prog_name, relo_idx, insn_idx); + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with the following message: + * invalid func unknown#195896080 + */ + insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ +} + +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* + * Patch relocatable BPF instruction. + * + * Patched value is determined by relocation kind and target specification. + * For existence relocations target spec will be NULL if field/type is not found. + * Expected insn->imm value is determined using relocation kind and local + * spec, and is checked before patching instruction. If actual insn->imm value + * is wrong, bail out with error. + * + * Currently supported classes of BPF instruction are: + * 1. rX = (assignment with immediate operand); + * 2. rX += (arithmetic operations with immediate operand); + * 3. rX = (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + ), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. 
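+ *
+ * As a rough example with made-up offsets: for a BPF_CORE_FIELD_BYTE_OFFSET
+ * relocation, a field load compiled against local BTF as
+ *
+ *   r1 = *(u32 *)(r2 + 16);
+ *
+ * has its 16-bit offset rewritten to the offset found in target BTF (say 24),
+ * and, if the resolved field size differs, the memory access size in the
+ * opcode is adjusted as well, as handled in the LDX/ST/STX case below.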
+ */ +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) +{ + __u64 orig_val, new_val; + __u8 class; + + class = BPF_CLASS(insn->code); + + if (res->poison) { +poison: + /* poison second part of ldimm64 to avoid confusing error from + * verifier about "unknown opcode 00" + */ + if (is_ldimm64_insn(insn)) + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); + return 0; + } + + orig_val = res->orig_val; + new_val = res->new_val; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + if (res->validate && insn->imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, insn->imm, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + orig_val = insn->imm; + insn->imm = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)orig_val, (unsigned long long)new_val); + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (res->validate && insn->off != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)new_val); + return -ERANGE; + } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog_name, relo_idx, insn_idx); + goto poison; + } + + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, + (unsigned long long)new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog_name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } + break; + case BPF_LD: { + __u64 imm; + + if (!is_ldimm64_insn(insn) || + insn[0].src_reg != 0 || insn[0].off != 0 || + insn[1].code != 0 || insn[1].dst_reg != 0 || + insn[1].src_reg != 0 || insn[1].off != 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", + prog_name, relo_idx, insn_idx); + return -EINVAL; + } + + imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); + if (res->validate && imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, (unsigned long long)imm, + (unsigned long long)orig_val, (unsigned long long)new_val); + return -EINVAL; + } + + insn[0].imm = new_val; + insn[1].imm = new_val >> 32; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)imm, (unsigned long long)new_val); + break; + } + default: + pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", + prog_name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); + return -EINVAL; + } + + return 0; +} + +/* Output spec definition in the format: + * [] () + => @, + * where is a C-syntax view of recorded field access, e.g.: x.a[3].b + */ +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) +{ + const struct btf_type *t; + const char *s; + __u32 type_id; + int i, len = 0; + +#define append_buf(fmt, args...) \ + ({ \ + int r; \ + r = snprintf(buf, buf_sz, fmt, ##args); \ + len += r; \ + if (r >= buf_sz) \ + r = buf_sz; \ + buf += r; \ + buf_sz -= r; \ + }) + + type_id = spec->root_type_id; + t = btf_type_by_id(spec->btf, type_id); + s = btf__name_by_offset(spec->btf, t->name_off); + + append_buf("<%s> [%u] %s %s", + core_relo_kind_str(spec->relo_kind), + type_id, btf_kind_str(t), str_is_empty(s) ? "" : s); + + if (core_relo_is_type_based(spec->relo_kind)) + return len; + + if (core_relo_is_enumval_based(spec->relo_kind)) { + t = skip_mods_and_typedefs(spec->btf, type_id, NULL); + if (btf_is_enum(t)) { + const struct btf_enum *e; + const char *fmt_str; + + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? 
"::%s = %d" : "::%s = %u"; + append_buf(fmt_str, s, e->val); + } else { + const struct btf_enum64 *e; + const char *fmt_str; + + e = btf_enum64(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; + append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); + } + return len; + } + + if (core_relo_is_field_based(spec->relo_kind)) { + for (i = 0; i < spec->len; i++) { + if (spec->spec[i].name) + append_buf(".%s", spec->spec[i].name); + else if (i > 0 || spec->spec[i].idx > 0) + append_buf("[%u]", spec->spec[i].idx); + } + + append_buf(" ("); + for (i = 0; i < spec->raw_len; i++) + append_buf("%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); + + if (spec->bit_offset % 8) + append_buf(" @ offset %u.%u)", spec->bit_offset / 8, spec->bit_offset % 8); + else + append_buf(" @ offset %u)", spec->bit_offset / 8); + return len; + } + + return len; +#undef append_buf +} + +/* + * Calculate CO-RE relocation target result. + * + * The outline and important points of the algorithm: + * 1. For given local type, find corresponding candidate target types. + * Candidate type is a type with the same "essential" name, ignoring + * everything after last triple underscore (___). E.g., `sample`, + * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates + * for each other. Names with triple underscore are referred to as + * "flavors" and are useful, among other things, to allow to + * specify/support incompatible variations of the same kernel struct, which + * might differ between different kernel versions and/or build + * configurations. + * + * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C + * converter, when deduplicated BTF of a kernel still contains more than + * one different types with the same name. In that case, ___2, ___3, etc + * are appended starting from second name conflict. But start flavors are + * also useful to be defined "locally", in BPF program, to extract same + * data from incompatible changes between different kernel + * versions/configurations. For instance, to handle field renames between + * kernel versions, one can use two flavors of the struct name with the + * same common name and use conditional relocations to extract that field, + * depending on target kernel version. + * 2. For each candidate type, try to match local specification to this + * candidate target type. Matching involves finding corresponding + * high-level spec accessors, meaning that all named fields should match, + * as well as all array accesses should be within the actual bounds. Also, + * types should be compatible (see bpf_core_fields_are_compat for details). + * 3. It is supported and expected that there might be multiple flavors + * matching the spec. As long as all the specs resolve to the same set of + * offsets across all candidates, there is no error. If there is any + * ambiguity, CO-RE relocation will fail. This is necessary to accommodate + * imperfection of BTF deduplication, which can cause slight duplication of + * the same BTF type, if some directly or indirectly referenced (by + * pointer) type gets resolved to different actual types in different + * object files. If such a situation occurs, deduplicated BTF will end up + * with two (or more) structurally identical types, which differ only in + * types they refer to through pointer. This should be OK in most cases and + * is not an error. + * 4. 
Candidate types search is performed by linearly scanning through all + * types in target BTF. It is anticipated that this is overall more + * efficient memory-wise and not significantly worse (if not better) + * CPU-wise compared to prebuilding a map from all local type names to + * a list of candidate type names. It's also sped up by caching resolved + * list of matching candidates per each local "root" type ID, that has at + * least one bpf_core_relo associated with it. This list is shared + * between multiple relocations for the same type ID and is updated as some + * of the candidates are pruned due to structural incompatibility. + */ +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec *local_spec = &specs_scratch[0]; + struct bpf_core_spec *cand_spec = &specs_scratch[1]; + struct bpf_core_spec *targ_spec = &specs_scratch[2]; + struct bpf_core_relo_res cand_res; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id; + char spec_buf[256]; + int i, j, err; + + local_id = relo->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + err = bpf_core_parse_spec(prog_name, local_btf, relo, local_spec); + if (err) { + const char *spec_str; + + spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + str_is_empty(local_name) ? "" : local_name, + spec_str ?: "", err); + return -EINVAL; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), local_spec); + pr_debug("prog '%s': relo #%d: %s\n", prog_name, relo_idx, spec_buf); + + /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ + if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { + /* bpf_insn's imm value could get out of sync during linking */ + memset(targ_res, 0, sizeof(*targ_res)); + targ_res->validate = false; + targ_res->poison = false; + targ_res->orig_val = local_spec->root_type_id; + targ_res->new_val = local_spec->root_type_id; + return 0; + } + + /* libbpf doesn't support candidate search for anonymous types */ + if (str_is_empty(local_name)) { + pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + return -EOPNOTSUPP; + } + + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(local_spec, cands->cands[i].btf, + cands->cands[i].id, cand_spec); + if (err < 0) { + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n ", + prog_name, relo_idx, i, spec_buf, err); + return err; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_debug("prog '%s': relo #%d: %s candidate #%d %s\n", prog_name, + relo_idx, err == 0 ? 
"non-matching" : "matching", i, spec_buf); + + if (err == 0) + continue; + + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); + if (err) + return err; + + if (j == 0) { + *targ_res = cand_res; + *targ_spec = *cand_spec; + } else if (cand_spec->bit_offset != targ_spec->bit_offset) { + /* if there are many field relo candidates, they + * should all resolve to the same bit offset + */ + pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", + prog_name, relo_idx, cand_spec->bit_offset, + targ_spec->bit_offset); + return -EINVAL; + } else if (cand_res.poison != targ_res->poison || + cand_res.new_val != targ_res->new_val) { + /* all candidates should result in the same relocation + * decision and value, otherwise it's dangerous to + * proceed due to ambiguity + */ + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", + prog_name, relo_idx, + cand_res.poison ? "failure" : "success", + (unsigned long long)cand_res.new_val, + targ_res->poison ? "failure" : "success", + (unsigned long long)targ_res->new_val); + return -EINVAL; + } + + cands->cands[j++] = cands->cands[i]; + } + + /* + * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field + * existence checks or kernel version/config checks, it's expected + * that we might not find any candidates. In this case, if field + * wasn't found in any candidate, the list of candidates shouldn't + * change at all, we'll just handle relocating appropriately, + * depending on relo's kind. + */ + if (j > 0) + cands->len = j; + + /* + * If no candidates were found, it might be both a programmer error, + * as well as expected case, depending whether instruction w/ + * relocation is guarded in some way that makes it unreachable (dead + * code) if relocation can't be resolved. This is handled in + * bpf_core_patch_insn() uniformly by replacing that instruction with + * BPF helper call insn (using invalid helper ID). If that instruction + * is indeed unreachable, then it will be ignored and eliminated by + * verifier. If it was an error, then verifier will complain and point + * to a specific instruction number in its log. 
+ */ + if (j == 0) { + pr_debug("prog '%s': relo #%d: no matching targets found\n", + prog_name, relo_idx); + + /* calculate single target relo result explicitly */ + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, targ_res); + if (err) + return err; + } + + return 0; +} + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". This function assumes that root types were + * already checked for name match. + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. 
+ * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } 
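+ /* To illustrate the matching rules documented above (sketch only): a local
+ *
+ *   enum pid_type { PIDTYPE_PID, PIDTYPE_TGID };
+ *
+ * matches the kernel's larger 'enum pid_type' (which also defines
+ * PIDTYPE_PGID and PIDTYPE_SID), because every local enumerator has a
+ * same-named counterpart in the target and the sizes agree; numeric values
+ * are not compared (see bpf_core_enums_match() above).
+ */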
+ case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/third_party/bpftool/libbpf/src/relo_core.h b/third_party/bpftool/libbpf/src/relo_core.h new file mode 100644 index 0000000..1c0566d --- /dev/null +++ b/third_party/bpftool/libbpf/src/relo_core.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __RELO_CORE_H +#define __RELO_CORE_H + +#include + +struct bpf_core_cand { + const struct btf *btf; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct bpf_core_cand_list { + struct bpf_core_cand *cands; + int len; +}; + +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + +struct bpf_core_relo_res { + /* expected value in the instruction, unless validate == false */ + __u64 orig_val; + /* new value that needs to be patched up to */ + __u64 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + ) = rY + * rX = *(T *)(rY + ), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. 
+ */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); + +size_t bpf_core_essential_name_len(const char *name); + +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res); + +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res); + +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec); + +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec); + +#endif diff --git a/third_party/bpftool/libbpf/src/ringbuf.c b/third_party/bpftool/libbpf/src/ringbuf.c new file mode 100644 index 0000000..0219936 --- /dev/null +++ b/third_party/bpftool/libbpf/src/ringbuf.c @@ -0,0 +1,587 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Ring buffer operations. + * + * Copyright (C) 2020 Facebook, Inc. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" +#include "bpf.h" + +struct ring { + ring_buffer_sample_fn sample_cb; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + +static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +{ + if (r->consumer_pos) { + munmap(r->consumer_pos, rb->page_size); + r->consumer_pos = NULL; + } + if (r->producer_pos) { + munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); + r->producer_pos = NULL; + } +} + +/* Add extra RINGBUF maps to this ring buffer manager */ +int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + struct epoll_event *e; + struct ring *r; + __u64 mmap_sz; + void *tmp; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_map_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + if (info.type != BPF_MAP_TYPE_RINGBUF) { + pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", + map_fd); + return libbpf_err(-EINVAL); + } + + tmp = 
libbpf_reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->rings = tmp; + + tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->events = tmp; + + r = &rb->rings[rb->ring_cnt]; + memset(r, 0, sizeof(*r)); + + r->map_fd = map_fd; + r->sample_cb = sample_cb; + r->ctx = ctx; + r->mask = info.max_entries - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. We map twice as big + * data size to allow simple reading of samples that wrap around the + * end of a ring buffer. See kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); + return libbpf_err(-E2BIG); + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->producer_pos = tmp; + r->data = tmp + rb->page_size; + + e = &rb->events[rb->ring_cnt]; + memset(e, 0, sizeof(*e)); + + e->events = EPOLLIN; + e->data.fd = rb->ring_cnt; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + rb->ring_cnt++; + return 0; +} + +void ring_buffer__free(struct ring_buffer *rb) +{ + int i; + + if (!rb) + return; + + for (i = 0; i < rb->ring_cnt; ++i) + ringbuf_unmap_ring(rb, &rb->rings[i]); + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb->events); + free(rb->rings); + free(rb); +} + +struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts) +{ + struct ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = ring_buffer__add(rb, map_fd, sample_cb, ctx); + if (err) + goto err_out; + + return rb; + +err_out: + ring_buffer__free(rb); + return errno = -err, NULL; +} + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits (discard and busy, if set) */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += BPF_RINGBUF_HDR_SZ; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static int64_t ringbuf_process_ring(struct ring *r) +{ + int *len_ptr, len, err; + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt = 0; + unsigned long cons_pos, prod_pos; + bool got_new_data; + void *sample; + + cons_pos = smp_load_acquire(r->consumer_pos); + do { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = 
smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + goto done; + + got_new_data = true; + cons_pos += roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err = r->sample_cb(r->ctx, sample, len); + if (err < 0) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + cons_pos); + return err; + } + cnt++; + } + + smp_store_release(r->consumer_pos, cons_pos); + } + } while (got_new_data); +done: + return cnt; +} + +/* Consume available ring buffer(s) data without event polling. + * Returns number of records consumed across all registered ring buffers (or + * INT_MAX, whichever is less), or negative number if any of the callbacks + * return error. + */ +int ring_buffer__consume(struct ring_buffer *rb) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = &rb->rings[i]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Poll for available data and consume records, if any are available. + * Returns number of records consumed (or INT_MAX, whichever is less), or + * negative number, if any of the registered callbacks returned error. + */ +int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) +{ + int i, cnt; + int64_t err, res = 0; + + cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + if (cnt < 0) + return libbpf_err(-errno); + + for (i = 0; i < cnt; i++) { + __u32 ring_id = rb->events[i].data.fd; + struct ring *ring = &rb->rings[ring_id]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Get an fd that can be used to sleep until data is available in the ring(s) */ +int ring_buffer__epoll_fd(const struct ring_buffer *rb) +{ + return rb->epoll_fd; +} + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + __u64 mmap_sz; + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_map_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. 
We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("user ringbuf: ring buf size (%u) is too big\n", info.max_entries); + return -E2BIG; + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* The top two bits are used as special flags */ + if (size & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT)) + return errno = E2BIG, NULL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. 
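+	 * E.g., assuming the 8-byte BPF_RINGBUF_HDR_SZ record header, a 100-byte
+	 * sample needs 100 + 8 = 108 bytes, which rounds up to total_size == 112.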
*/ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/third_party/bpftool/libbpf/src/skel_internal.h b/third_party/bpftool/libbpf/src/skel_internal.h new file mode 100644 index 0000000..1e82ab0 --- /dev/null +++ b/third_party/bpftool/libbpf/src/skel_internal.h @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __SKEL_INTERNAL_H +#define __SKEL_INTERNAL_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include "bpf.h" +#endif + +#ifndef __NR_bpf +# if defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# endif +#endif + +/* This file is a base header for auto-generated *.lskel.h files. + * Its contents will change and may become part of auto-generation in the future. 
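+ * (For orientation only; the skeleton name used here is hypothetical. A header
+ * generated with "bpftool gen skeleton -L" exposes the same entry points as a
+ * regular skeleton, e.g. myprog_bpf__open(), myprog_bpf__load() and
+ * myprog_bpf__attach(), with the load step ultimately handled by
+ * bpf_load_and_run() defined further below.)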
+ * + * The layout of bpf_[map|prog]_desc and bpf_loader_ctx is feature dependent + * and will change from one version of libbpf to another and features + * requested during loader program generation. + */ +struct bpf_map_desc { + /* output of the loader prog */ + int map_fd; + /* input for the loader prog */ + __u32 max_entries; + __aligned_u64 initial_value; +}; +struct bpf_prog_desc { + int prog_fd; +}; + +enum { + BPF_SKEL_KERNEL = (1ULL << 0), +}; + +struct bpf_loader_ctx { + __u32 sz; + __u32 flags; + __u32 log_level; + __u32 log_size; + __u64 log_buf; +}; + +struct bpf_load_and_run_opts { + struct bpf_loader_ctx *ctx; + const void *data; + const void *insns; + __u32 data_sz; + __u32 insns_sz; + const char *errstr; +}; + +long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); + +static inline int skel_sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ +#ifdef __KERNEL__ + return kern_sys_bpf(cmd, attr, size); +#else + return syscall(__NR_bpf, cmd, attr, size); +#endif +} + +#ifdef __KERNEL__ +static inline int close(int fd) +{ + return close_fd(fd); +} + +static inline void *skel_alloc(size_t size) +{ + struct bpf_loader_ctx *ctx = kzalloc(size, GFP_KERNEL); + + if (!ctx) + return NULL; + ctx->flags |= BPF_SKEL_KERNEL; + return ctx; +} + +static inline void skel_free(const void *p) +{ + kfree(p); +} + +/* skel->bss/rodata maps are populated the following way: + * + * For kernel use: + * skel_prep_map_data() allocates kernel memory that kernel module can directly access. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform probe_read_kernel() from maps.rodata.initial_value. + * skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and + * does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree + * is not nessary. + * + * For user space: + * skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform copy_from_user() from maps.rodata.initial_value. + * skel_finalize_map_data() remaps bpf array map value from the kernel memory into + * skel->rodata address. + * + * The "bpftool gen skeleton -L" command generates lskel.h that is suitable for + * both kernel and user space. The generated loader program does + * either bpf_probe_read_kernel() or bpf_copy_from_user() from initial_value + * depending on bpf_loader_ctx->flags. + */ +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + if (addr != ~0ULL) + kvfree(p); + /* When addr == ~0ULL the 'p' points to + * ((struct bpf_array *)map)->value. See skel_finalize_map_data. + */ +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = kvmalloc(val_sz, GFP_KERNEL); + if (!addr) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + struct bpf_map *map; + void *addr = NULL; + + kvfree((void *) (long) *init_val); + *init_val = ~0ULL; + + /* At this point bpf_load_and_run() finished without error and + * 'fd' is a valid bpf map FD. All sanity checks below should succeed. 
+ */ + map = bpf_map_get(fd); + if (IS_ERR(map)) + return NULL; + if (map->map_type != BPF_MAP_TYPE_ARRAY) + goto out; + addr = ((struct bpf_array *)map)->value; + /* the addr stays valid, since FD is not closed */ +out: + bpf_map_put(map); + return addr; +} + +#else + +static inline void *skel_alloc(size_t size) +{ + return calloc(1, size); +} + +static inline void skel_free(void *p) +{ + free(p); +} + +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + munmap(p, sz); +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (addr == (void *) -1) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + void *addr; + + addr = mmap((void *) (long) *init_val, mmap_sz, flags, MAP_SHARED | MAP_FIXED, fd, 0); + if (addr == (void *) -1) + return NULL; + return addr; +} +#endif + +static inline int skel_closenz(int fd) +{ + if (fd > 0) + return close(fd); + return -EINVAL; +} + +#ifndef offsetofend +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) +#endif + +static inline int skel_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + + attr.map_type = map_type; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + return skel_sys_bpf(BPF_MAP_CREATE, &attr, attr_sz); +} + +static inline int skel_map_update_elem(int fd, const void *key, + const void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long) key; + attr.value = (long) value; + attr.flags = flags; + + return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + +static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = (long) name; + attr.raw_tracepoint.prog_fd = prog_fd; + + return skel_sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); +} + +static inline int skel_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create.iter_info_len); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + + return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); +} + +#ifdef __KERNEL__ 
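+/* There is no errno in the kernel: kern_sys_bpf() returns the error code
+ * directly, so set_err expands to nothing and err keeps the return value of
+ * the failed call. In user space the bpf() syscall returns -1 with the real
+ * error in errno, so set_err captures it as a negative errno value.
+ */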
+#define set_err +#else +#define set_err err = -errno +#endif + +static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) +{ + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); + int map_fd = -1, prog_fd = -1, key = 0, err; + union bpf_attr attr; + + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + if (map_fd < 0) { + opts->errstr = "failed to create loader map"; + set_err; + goto out; + } + + err = skel_map_update_elem(map_fd, &key, opts->data, 0); + if (err < 0) { + opts->errstr = "failed to update loader map"; + set_err; + goto out; + } + + memset(&attr, 0, prog_load_attr_sz); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = (long) opts->insns; + attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); + attr.license = (long) "Dual BSD/GPL"; + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + attr.fd_array = (long) &map_fd; + attr.log_level = opts->ctx->log_level; + attr.log_size = opts->ctx->log_size; + attr.log_buf = opts->ctx->log_buf; + attr.prog_flags = BPF_F_SLEEPABLE; + err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, prog_load_attr_sz); + if (prog_fd < 0) { + opts->errstr = "failed to load loader prog"; + set_err; + goto out; + } + + memset(&attr, 0, test_run_attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.ctx_in = (long) opts->ctx; + attr.test.ctx_size_in = opts->ctx->sz; + err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz); + if (err < 0 || (int)attr.test.retval < 0) { + opts->errstr = "failed to execute loader prog"; + if (err < 0) { + set_err; + } else { + err = (int)attr.test.retval; +#ifndef __KERNEL__ + errno = -err; +#endif + } + goto out; + } + err = 0; +out: + if (map_fd >= 0) + close(map_fd); + if (prog_fd >= 0) + close(prog_fd); + return err; +} + +#endif diff --git a/third_party/bpftool/libbpf/src/str_error.c b/third_party/bpftool/libbpf/src/str_error.c new file mode 100644 index 0000000..146da01 --- /dev/null +++ b/third_party/bpftool/libbpf/src/str_error.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#undef _GNU_SOURCE +#include +#include +#include "str_error.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. + */ +char *libbpf_strerror_r(int err, char *dst, int len) +{ + int ret = strerror_r(err < 0 ? 
-err : err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/third_party/bpftool/libbpf/src/str_error.h b/third_party/bpftool/libbpf/src/str_error.h new file mode 100644 index 0000000..a139334 --- /dev/null +++ b/third_party/bpftool/libbpf/src/str_error.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBBPF_STR_ERROR_H +#define __LIBBPF_STR_ERROR_H + +char *libbpf_strerror_r(int err, char *dst, int len); +#endif /* __LIBBPF_STR_ERROR_H */ diff --git a/third_party/bpftool/libbpf/src/strset.c b/third_party/bpftool/libbpf/src/strset.c new file mode 100644 index 0000000..2464bcb --- /dev/null +++ b/third_party/bpftool/libbpf/src/strset.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include "hashmap.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(long key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + key; + + return str_hash(str); +} + +static bool strset_equal_fn(long key1, long key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + key1; + const char *str2 = s->strs_data + key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, off, off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); + return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); + free(set); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. 
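+ * An illustrative usage pattern (hypothetical strings): after
+ * off = strset__add_str(set, "foo"), a later strset__find_str(set, "foo")
+ * returns the same off, while a string that was never added yields -ENOENT.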
+ */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, new_off, &old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. + */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(set->strs_hash, new_off, new_off, + HASHMAP_ADD, &old_off, NULL); + if (err == -EEXIST) + return old_off; /* duplicated string, return existing offset */ + if (err) + return err; + + set->strs_data_len += len; /* new unique string, adjust data length */ + return new_off; +} diff --git a/third_party/bpftool/libbpf/src/strset.h b/third_party/bpftool/libbpf/src/strset.h new file mode 100644 index 0000000..b6ddf77 --- /dev/null +++ b/third_party/bpftool/libbpf/src/strset.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* Copyright (c) 2021 Facebook */ +#ifndef __LIBBPF_STRSET_H +#define __LIBBPF_STRSET_H + +#include +#include + +struct strset; + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz); +void strset__free(struct strset *set); + +const char *strset__data(const struct strset *set); +size_t strset__data_size(const struct strset *set); + +int strset__find_str(struct strset *set, const char *s); +int strset__add_str(struct strset *set, const char *s); + +#endif /* __LIBBPF_STRSET_H */ diff --git a/third_party/bpftool/libbpf/src/usdt.bpf.h b/third_party/bpftool/libbpf/src/usdt.bpf.h new file mode 100644 index 0000000..0bd4c13 --- /dev/null +++ b/third_party/bpftool/libbpf/src/usdt.bpf.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef __USDT_BPF_H__ +#define __USDT_BPF_H__ + +#include +#include +#include + +/* Below types and maps are internal implementation details of libbpf's USDT + * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should + * be considered an unstable API as well and might be adjusted based on user + * feedback from using libbpf's USDT support in production. + */ + +/* User can override BPF_USDT_MAX_SPEC_CNT to change default size of internal + * map that keeps track of USDT argument specifications. 
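+ * The override has to be defined before this header is included, e.g. an
+ * illustrative "#define BPF_USDT_MAX_SPEC_CNT 4096" in the user's .bpf.c file.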
This might be + * necessary if there are a lot of USDT attachments. + */ +#ifndef BPF_USDT_MAX_SPEC_CNT +#define BPF_USDT_MAX_SPEC_CNT 256 +#endif +/* User can override BPF_USDT_MAX_IP_CNT to change default size of internal + * map that keeps track of IP (memory address) mapping to USDT argument + * specification. + * Note, if kernel supports BPF cookies, this map is not used and could be + * resized all the way to 1 to save a bit of memory. + */ +#ifndef BPF_USDT_MAX_IP_CNT +#define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) +#endif + +enum __bpf_usdt_arg_type { + BPF_USDT_ARG_CONST, + BPF_USDT_ARG_REG, + BPF_USDT_ARG_REG_DEREF, +}; + +struct __bpf_usdt_arg_spec { + /* u64 scalar interpreted depending on arg_type, see below */ + __u64 val_off; + /* arg location case, see bpf_udst_arg() for details */ + enum __bpf_usdt_arg_type arg_type; + /* offset of referenced register within struct pt_regs */ + short reg_off; + /* whether arg should be interpreted as signed value */ + bool arg_signed; + /* number of bits that need to be cleared and, optionally, + * sign-extended to cast arguments that are 1, 2, or 4 bytes + * long into final 8-byte u64/s64 value returned to user + */ + char arg_bitshift; +}; + +/* should match USDT_MAX_ARG_CNT in usdt.c exactly */ +#define BPF_USDT_MAX_ARG_CNT 12 +struct __bpf_usdt_spec { + struct __bpf_usdt_arg_spec args[BPF_USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, BPF_USDT_MAX_SPEC_CNT); + __type(key, int); + __type(value, struct __bpf_usdt_spec); +} __bpf_usdt_specs SEC(".maps") __weak; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, BPF_USDT_MAX_IP_CNT); + __type(key, long); + __type(value, __u32); +} __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; + +extern const _Bool LINUX_HAS_BPF_COOKIE __kconfig; + +static __always_inline +int __bpf_usdt_spec_id(struct pt_regs *ctx) +{ + if (!LINUX_HAS_BPF_COOKIE) { + long ip = PT_REGS_IP(ctx); + int *spec_id_ptr; + + spec_id_ptr = bpf_map_lookup_elem(&__bpf_usdt_ip_to_spec_id, &ip); + return spec_id_ptr ? *spec_id_ptr : -ESRCH; + } + + return bpf_get_attach_cookie(ctx); +} + +/* Return number of USDT arguments defined for currently traced USDT. */ +__weak __hidden +int bpf_usdt_arg_cnt(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + return spec->arg_cnt; +} + +/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res. + * Returns 0 on success; negative error, otherwise. + * On error *res is guaranteed to be set to zero. + */ +__weak __hidden +int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) +{ + struct __bpf_usdt_spec *spec; + struct __bpf_usdt_arg_spec *arg_spec; + unsigned long val; + int err, spec_id; + + *res = 0; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + if (arg_num >= BPF_USDT_MAX_ARG_CNT) + return -ENOENT; + barrier_var(arg_num); + if (arg_num >= spec->arg_cnt) + return -ENOENT; + + arg_spec = &spec->args[arg_num]; + switch (arg_spec->arg_type) { + case BPF_USDT_ARG_CONST: + /* Arg is just a constant ("-4@$-9" in USDT arg spec). + * value is recorded in arg_spec->val_off directly. 
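+		 * E.g., a "-4@$5" fragment stores 5 here; the 4-byte width and
+		 * signedness are applied by the shared truncation/sign-extension
+		 * logic at the end of this function.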
+ */ + val = arg_spec->val_off; + break; + case BPF_USDT_ARG_REG: + /* Arg is in a register (e.g, "8@%rax" in USDT arg spec), + * so we read the contents of that register directly from + * struct pt_regs. To keep things simple user-space parts + * record offsetof(struct pt_regs, ) in arg_spec->reg_off. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + break; + case BPF_USDT_ARG_REG_DEREF: + /* Arg is in memory addressed by register, plus some offset + * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is + * identified like with BPF_USDT_ARG_REG case, and the offset + * is in arg_spec->val_off. We first fetch register contents + * from pt_regs, then do another user-space probe read to + * fetch argument value itself. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; +#endif + break; + default: + return -EINVAL; + } + + /* cast arg from 1, 2, or 4 bytes to final 8 byte size clearing + * necessary upper arg_bitshift bits, with sign extension if argument + * is signed + */ + val <<= arg_spec->arg_bitshift; + if (arg_spec->arg_signed) + val = ((long)val) >> arg_spec->arg_bitshift; + else + val = val >> arg_spec->arg_bitshift; + *res = val; + return 0; +} + +/* Retrieve user-specified cookie value provided during attach as + * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie + * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself + * utilizing BPF cookies internally, so user can't use BPF cookie directly + * for USDT programs and has to use bpf_usdt_cookie() API instead. + */ +__weak __hidden +long bpf_usdt_cookie(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return 0; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return 0; + + return spec->usdt_cookie; +} + +/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ +#define ___bpf_usdt_args0() ctx +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) +#define ___bpf_usdt_args11(x, args...) 
___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) + +/* + * BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for + * tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes. + * Original struct pt_regs * context is preserved as 'ctx' argument. + */ +#define BPF_USDT(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_usdt_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#endif /* __USDT_BPF_H__ */ diff --git a/third_party/bpftool/libbpf/src/usdt.c b/third_party/bpftool/libbpf/src/usdt.c new file mode 100644 index 0000000..37455d0 --- /dev/null +++ b/third_party/bpftool/libbpf/src/usdt.c @@ -0,0 +1,1558 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* s8 will be marked as poison while it's a reg of riscv */ +#if defined(__riscv) +#define rv_s8 s8 +#endif + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_common.h" +#include "libbpf_internal.h" +#include "hashmap.h" + +/* libbpf's USDT support consists of BPF-side state/code and user-space + * state/code working together in concert. BPF-side parts are defined in + * usdt.bpf.h header library. User-space state is encapsulated by struct + * usdt_manager and all the supporting code centered around usdt_manager. + * + * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map + * and IP-to-spec-ID map, which is auxiliary map necessary for kernels that + * don't support BPF cookie (see below). These two maps are implicitly + * embedded into user's end BPF object file when user's code included + * usdt.bpf.h. This means that libbpf doesn't do anything special to create + * these USDT support maps. They are created by normal libbpf logic of + * instantiating BPF maps when opening and loading BPF object. + * + * As such, libbpf is basically unaware of the need to do anything + * USDT-related until the very first call to bpf_program__attach_usdt(), which + * can be called by user explicitly or happen automatically during skeleton + * attach (or, equivalently, through generic bpf_program__attach() call). At + * this point, libbpf will instantiate and initialize struct usdt_manager and + * store it in bpf_object. USDT manager is per-BPF object construct, as each + * independent BPF object might or might not have USDT programs, and thus all + * the expected USDT-related state. There is no coordination between two + * bpf_object in parts of USDT attachment, they are oblivious of each other's + * existence and libbpf is just oblivious, dealing with bpf_object-specific + * USDT state. + * + * Quick crash course on USDTs. + * + * From user-space application's point of view, USDT is essentially just + * a slightly special function call that normally has zero overhead, unless it + * is being traced by some external entity (e.g, BPF-based tool). 
Here's how + * a typical application can trigger USDT probe: + * + * #include // provided by systemtap-sdt-devel package + * // folly also provide similar functionality in folly/tracing/StaticTracepoint.h + * + * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y); + * + * USDT is identified by it's : pair of names. Each + * individual USDT has a fixed number of arguments (3 in the above example) + * and specifies values of each argument as if it was a function call. + * + * USDT call is actually not a function call, but is instead replaced by + * a single NOP instruction (thus zero overhead, effectively). But in addition + * to that, those USDT macros generate special SHT_NOTE ELF records in + * .note.stapsdt ELF section. Here's an example USDT definition as emitted by + * `readelf -n `: + * + * stapsdt 0x00000089 NT_STAPSDT (SystemTap probe descriptors) + * Provider: test + * Name: usdt12 + * Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e + * Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil + * + * In this case we have USDT test:usdt12 with 12 arguments. + * + * Location and base are offsets used to calculate absolute IP address of that + * NOP instruction that kernel can replace with an interrupt instruction to + * trigger instrumentation code (BPF program for all that we care about). + * + * Semaphore above is and optional feature. It records an address of a 2-byte + * refcount variable (normally in '.probes' ELF section) used for signaling if + * there is anything that is attached to USDT. This is useful for user + * applications if, for example, they need to prepare some arguments that are + * passed only to USDTs and preparation is expensive. By checking if USDT is + * "activated", an application can avoid paying those costs unnecessarily. + * Recent enough kernel has built-in support for automatically managing this + * refcount, which libbpf expects and relies on. If USDT is defined without + * associated semaphore, this value will be zero. See selftests for semaphore + * examples. + * + * Arguments is the most interesting part. This USDT specification string is + * providing information about all the USDT arguments and their locations. The + * part before @ sign defined byte size of the argument (1, 2, 4, or 8) and + * whether the argument is signed or unsigned (negative size means signed). + * The part after @ sign is assembly-like definition of argument location + * (see [0] for more details). Technically, assembler can provide some pretty + * advanced definitions, but libbpf is currently supporting three most common + * cases: + * 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@-9); + * 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer + * whose value is in register %rdx"; + * 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which + * specifies signed 32-bit integer stored at offset -1204 bytes from + * memory address stored in %rbp. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * + * During attachment, libbpf parses all the relevant USDT specifications and + * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side + * code through spec map. This allows BPF applications to quickly fetch the + * actual value at runtime using a simple BPF-side code. + * + * With basics out of the way, let's go over less immediately obvious aspects + * of supporting USDTs. 
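+ * (For reference, the BPF-side counterpart of the STAP_PROBE3() call above,
+ * with made-up handler/section names, is an ordinary BPF program built on the
+ * usdt.bpf.h helpers:
+ *
+ *    SEC("usdt")
+ *    int BPF_USDT(handle_my_probe, int a, long b, void *c)
+ *    {
+ *            bpf_printk("a=%d", a);
+ *            return 0;
+ *    }
+ *
+ * attached either explicitly via bpf_program__attach_usdt() or, if the SEC()
+ * name is spelled "usdt/<path>:<provider>:<name>", via skeleton auto-attach.)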
+ * + * First, there is no special USDT BPF program type. It is actually just + * a uprobe BPF program (which for kernel, at least currently, is just a kprobe + * program, so BPF_PROG_TYPE_KPROBE program type). With the only difference + * that uprobe is usually attached at the function entry, while USDT will + * normally will be somewhere inside the function. But it should always be + * pointing to NOP instruction, which makes such uprobes the fastest uprobe + * kind. + * + * Second, it's important to realize that such STAP_PROBEn(provider, name, ...) + * macro invocations can end up being inlined many-many times, depending on + * specifics of each individual user application. So single conceptual USDT + * (identified by provider:name pair of identifiers) is, generally speaking, + * multiple uprobe locations (USDT call sites) in different places in user + * application. Further, again due to inlining, each USDT call site might end + * up having the same argument #N be located in a different place. In one call + * site it could be a constant, in another will end up in a register, and in + * yet another could be some other register or even somewhere on the stack. + * + * As such, "attaching to USDT" means (in general case) attaching the same + * uprobe BPF program to multiple target locations in user application, each + * potentially having a completely different USDT spec associated with it. + * To wire all this up together libbpf allocates a unique integer spec ID for + * each unique USDT spec. Spec IDs are allocated as sequential small integers + * so that they can be used as keys in array BPF map (for performance reasons). + * Spec ID allocation and accounting is big part of what usdt_manager is + * about. This state has to be maintained per-BPF object and coordinate + * between different USDT attachments within the same BPF object. + * + * Spec ID is the key in spec BPF map, value is the actual USDT spec layed out + * as struct usdt_spec. Each invocation of BPF program at runtime needs to + * know its associated spec ID. It gets it either through BPF cookie, which + * libbpf sets to spec ID during attach time, or, if kernel is too old to + * support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such + * case. The latter means that some modes of operation can't be supported + * without BPF cookie. Such mode is attaching to shared library "generically", + * without specifying target process. In such case, it's impossible to + * calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode + * is not supported without BPF cookie support. + * + * Note that libbpf is using BPF cookie functionality for its own internal + * needs, so user itself can't rely on BPF cookie feature. To that end, libbpf + * provides conceptually equivalent USDT cookie support. It's still u64 + * user-provided value that can be associated with USDT attachment. Note that + * this will be the same value for all USDT call sites within the same single + * *logical* USDT attachment. This makes sense because to user attaching to + * USDT is a single BPF program triggered for singular USDT probe. The fact + * that this is done at multiple actual locations is a mostly hidden + * implementation details. This USDT cookie value can be fetched with + * bpf_usdt_cookie(ctx) API provided by usdt.bpf.h + * + * Lastly, while single USDT can have tons of USDT call sites, it doesn't + * necessarily have that many different USDT specs. 
It very well might be + * that 1000 USDT call sites only need 5 different USDT specs, because all the + * arguments are typically contained in a small set of registers or stack + * locations. As such, it's wasteful to allocate as many USDT spec IDs as + * there are USDT call sites. So libbpf tries to be frugal and performs + * on-the-fly deduplication during a single USDT attachment to only allocate + * the minimal required amount of unique USDT specs (and thus spec IDs). This + * is trivially achieved by using USDT spec string (Arguments string from USDT + * note) as a lookup key in a hashmap. USDT spec string uniquely defines + * everything about how to fetch USDT arguments, so two USDT call sites + * sharing USDT spec string can safely share the same USDT spec and spec ID. + * Note, this spec string deduplication is happening only during the same USDT + * attachment, so each USDT spec shares the same USDT cookie value. This is + * not generally true for other USDT attachments within the same BPF object, + * as even if USDT spec string is the same, USDT cookie value can be + * different. It was deemed excessive to try to deduplicate across independent + * USDT attachments by taking into account USDT spec string *and* USDT cookie + * value, which would complicated spec ID accounting significantly for little + * gain. + */ + +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + +struct usdt_target { + long abs_ip; + long rel_ip; + long sema_off; + struct usdt_spec spec; + const char *spec_str; +}; + +struct usdt_manager { + struct bpf_map *specs_map; + struct bpf_map *ip_to_spec_id_map; + + int *free_spec_ids; + size_t free_spec_cnt; + size_t next_free_spec_id; + + bool has_bpf_cookie; + bool has_sema_refcnt; +}; + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj) +{ + static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; + struct usdt_manager *man; + struct bpf_map *specs_map, *ip_to_spec_id_map; + + specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); + ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); + if (!specs_map || !ip_to_spec_id_map) { + pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); + return ERR_PTR(-ESRCH); + } + + man = calloc(1, sizeof(*man)); + if (!man) + return ERR_PTR(-ENOMEM); + + man->specs_map = specs_map; + man->ip_to_spec_id_map = ip_to_spec_id_map; + + /* Detect if BPF cookie is supported for kprobes. 
+ * We don't need IP-to-ID mapping if we can use BPF cookies. + * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") + */ + man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); + + /* Detect kernel support for automatic refcounting of USDT semaphore. + * If this is not supported, USDTs with semaphores will not be supported. + * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; + + return man; +} + +void usdt_manager_free(struct usdt_manager *man) +{ + if (IS_ERR_OR_NULL(man)) + return; + + free(man->free_spec_ids); + free(man); +} + +static int sanity_check_usdt_elf(Elf *elf, const char *path) +{ + GElf_Ehdr ehdr; + int endianness; + + if (elf_kind(elf) != ELF_K_ELF) { + pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); + return -EBADF; + } + + switch (gelf_getclass(elf)) { + case ELFCLASS64: + if (sizeof(void *) != 8) { + pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + case ELFCLASS32: + if (sizeof(void *) != 4) { + pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + default: + pr_warn("usdt: unsupported ELF class for '%s'\n", path); + return -EBADF; + } + + if (!gelf_getehdr(elf, &ehdr)) + return -EINVAL; + + if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { + pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", + path, ehdr.e_type); + return -EBADF; + } + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + endianness = ELFDATA2MSB; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (endianness != ehdr.e_ident[EI_DATA]) { + pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); + return -EBADF; + } + + return 0; +} + +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? 
-1 : 1; +} + +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) +{ + GElf_Phdr phdr; + size_t n; + int i, err; + struct elf_seg *seg; + void *tmp; + + *seg_cnt = 0; + + if (elf_getphdrnum(elf, &n)) { + err = -errno; + return err; + } + + for (i = 0; i < n; i++) { + if (!gelf_getphdr(elf, i, &phdr)) { + err = -errno; + return err; + } + + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, + (long)phdr.p_type, (long)phdr.p_flags); + if (phdr.p_type != PT_LOAD) + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) + return -ENOMEM; + + *segs = tmp; + seg = *segs + *seg_cnt; + (*seg_cnt)++; + + seg->start = phdr.p_vaddr; + seg->end = phdr.p_vaddr + phdr.p_memsz; + seg->offset = phdr.p_offset; + seg->is_exec = phdr.p_flags & PF_X; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); + return -ESRCH; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + return 0; +} + +static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +{ + char path[PATH_MAX], line[PATH_MAX], mode[16]; + size_t seg_start, seg_end, seg_off; + struct elf_seg *seg; + int tmp_pid, i, err; + FILE *f; + + *seg_cnt = 0; + + /* Handle containerized binaries only accessible from + * /proc//root/. They will be reported as just / in + * /proc//maps. + */ + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + goto proceed; + + if (!realpath(lib_path, path)) { + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", + lib_path, -errno); + libbpf_strlcpy(path, lib_path, sizeof(path)); + } + +proceed: + sprintf(line, "/proc/%d/maps", pid); + f = fopen(line, "re"); + if (!f) { + err = -errno; + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", + line, lib_path, err); + return err; + } + + /* We need to handle lines with no path at the end: + * + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + */ + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + void *tmp; + + /* to handle no path case (see above) we need to capture line + * without skipping any whitespaces. 
So we need to strip + * leading whitespaces manually here + */ + i = 0; + while (isblank(line[i])) + i++; + if (strcmp(line + i, path) != 0) + continue; + + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", + path, seg_start, seg_end, mode, seg_off); + + /* ignore non-executable sections for shared libs */ + if (mode[2] != 'x') + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + + *segs = tmp; + seg = *segs + *seg_cnt; + *seg_cnt += 1; + + seg->start = seg_start; + seg->end = seg_end; + seg->offset = seg_off; + seg->is_exec = true; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", + lib_path, path, pid); + err = -ESRCH; + goto err_out; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + err = 0; +err_out: + fclose(f); + return err; +} + +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long virtaddr) +{ + struct elf_seg *seg; + int i; + + /* for ELF binaries (both executables and shared libraries), we are + * given virtual address (absolute for executables, relative for + * libraries) which should match address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= virtaddr && virtaddr < seg->end) + return seg; + } + return NULL; +} + +static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long offset) +{ + struct elf_seg *seg; + int i; + + /* for VMA segments from /proc//maps file, provided "address" is + * actually a file offset, so should be fall within logical + * offset-based range of [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start)) + return seg; + } + return NULL; +} + +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *usdt_note); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); + +static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, + const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie, + struct usdt_target **out_targets, size_t *out_target_cnt) +{ + size_t off, name_off, desc_off, seg_cnt = 0, vma_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *vma_segs = NULL; + struct usdt_target *targets = NULL, *target; + long base_addr = 0; + Elf_Scn *notes_scn, *base_scn; + GElf_Shdr base_shdr, notes_shdr; + GElf_Ehdr ehdr; + GElf_Nhdr nhdr; + Elf_Data *data; + int err; + + *out_targets = NULL; + *out_target_cnt = 0; + + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, ¬es_shdr, ¬es_scn); + if (err) { + pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); + return err; + } + + if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { + pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); + return -EINVAL; + } + + err = parse_elf_segs(elf, path, &segs, &seg_cnt); + if (err) { + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); + goto err_out; + } + + /* .stapsdt.base ELF section is optional, but is used for prelink + * offset compensation (see a big comment further below) + */ + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) + base_addr = 
base_shdr.sh_addr; + + data = elf_getdata(notes_scn, 0); + off = 0; + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; + struct usdt_note note; + struct elf_seg *seg = NULL; + void *tmp; + + err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); + if (err) + goto err_out; + + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) + continue; + + /* We need to compensate "prelink effect". See [0] for details, + * relevant parts quoted here: + * + * Each SDT probe also expands into a non-allocated ELF note. You can + * find this by looking at SHT_NOTE sections and decoding the format; + * see below for details. Because the note is non-allocated, it means + * there is no runtime cost, and also preserved in both stripped files + * and .debug files. + * + * However, this means that prelink won't adjust the note's contents + * for address offsets. Instead, this is done via the .stapsdt.base + * section. This is a special section that is added to the text. We + * will only ever have one of these sections in a final link and it + * will only ever be one byte long. Nothing about this section itself + * matters, we just use it as a marker to detect prelink address + * adjustments. + * + * Each probe note records the link-time address of the .stapsdt.base + * section alongside the probe PC address. The decoder compares the + * base address stored in the note with the .stapsdt.base section's + * sh_addr. Initially these are the same, but the section header will + * be adjusted by prelink. So the decoder applies the difference to + * the probe PC address to get the correct prelinked PC address; the + * same adjustment is applied to the semaphore address, if any. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + */ + usdt_abs_ip = note.loc_addr; + if (base_addr) + usdt_abs_ip += base_addr - note.base_addr; + + /* When attaching uprobes (which is what USDTs basically are) + * kernel expects file offset to be specified, not a relative + * virtual address, so we need to translate virtual address to + * file offset, for both ET_EXEC and ET_DYN binaries. + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + /* translate from virtual address to file offset */ + usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset; + + if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) { + /* If we don't have BPF cookie support but need to + * attach to a shared library, we'll need to know and + * record absolute addresses of attach points due to + * the need to lookup USDT spec by absolute IP of + * triggered uprobe. Doing this resolution is only + * possible when we have a specific PID of the process + * that's using specified shared library. BPF cookie + * removes the absolute address limitation as we don't + * need to do this lookup (we just use BPF cookie as + * an index of USDT spec), so for newer kernels with + * BPF cookie support libbpf supports USDT attachment + * to shared libraries with no PID filter. 
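+			 * (Illustratively, with example probe names: a call like
+			 * bpf_program__attach_usdt(prog, -1, "libc.so.6", "libc", "setjmp", NULL)
+			 * can therefore only succeed on kernels with BPF cookie support.)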
+ */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* vma_segs are lazily initialized only if necessary */ + if (vma_seg_cnt == 0) { + err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_vma_seg(vma_segs, vma_seg_cnt, usdt_rel_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip; + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a file offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - seg->start + seg->offset; + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? 
"exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes.args references strings from ELF itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(vma_segs); + if (err < 0) + free(targets); + return err; +} + +struct bpf_link_usdt { + struct bpf_link link; + + struct usdt_manager *usdt_man; + + size_t spec_cnt; + int *spec_ids; + + size_t uprobe_cnt; + struct { + long abs_ip; + struct bpf_link *link; + } *uprobes; +}; + +static int bpf_link_usdt_detach(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + struct usdt_manager *man = usdt_link->usdt_man; + int i; + + for (i = 0; i < usdt_link->uprobe_cnt; i++) { + /* detach underlying uprobe link */ + bpf_link__destroy(usdt_link->uprobes[i].link); + /* there is no need to update specs map because it will be + * unconditionally overwritten on subsequent USDT attaches, + * but if BPF cookies are not used we need to remove entry + * from ip_to_spec_id map, otherwise we'll run into false + * conflicting IP errors + */ + if (!man->has_bpf_cookie) { + /* not much we can do about errors here */ + (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), + &usdt_link->uprobes[i].abs_ip); + } + } + + /* try to return the list of previously used spec IDs to usdt_manager + * for future reuse for subsequent USDT attaches + */ + if (!man->free_spec_ids) { + /* if there were no free spec IDs yet, just transfer our IDs */ + man->free_spec_ids = usdt_link->spec_ids; + man->free_spec_cnt = usdt_link->spec_cnt; + usdt_link->spec_ids = NULL; + } else { + /* otherwise concat IDs */ + size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; + int *new_free_ids; + + new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, + sizeof(*new_free_ids)); + /* If we couldn't resize free_spec_ids, we'll just leak + * a bunch of free IDs; this is very unlikely to happen and if + * system is so exhausted on memory, it's the least of user's + * concerns, probably. + * So just do our best here to return those IDs to usdt_manager. + * Another edge case when we can legitimately get NULL is when + * new_cnt is zero, which can happen in some edge cases, so we + * need to be careful about that. 
+ */ + if (new_free_ids || new_cnt == 0) { + memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, + usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); + man->free_spec_ids = new_free_ids; + man->free_spec_cnt = new_cnt; + } + } + + return 0; +} + +static void bpf_link_usdt_dealloc(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + + free(usdt_link->spec_ids); + free(usdt_link->uprobes); + free(usdt_link); +} + +static size_t specs_hash_fn(long key, void *ctx) +{ + return str_hash((char *)key); +} + +static bool specs_equal_fn(long key1, long key2, void *ctx) +{ + return strcmp((char *)key1, (char *)key2) == 0; +} + +static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, + struct bpf_link_usdt *link, struct usdt_target *target, + int *spec_id, bool *is_new) +{ + long tmp; + void *new_ids; + int err; + + /* check if we already allocated spec ID for this spec string */ + if (hashmap__find(specs_hash, target->spec_str, &tmp)) { + *spec_id = tmp; + *is_new = false; + return 0; + } + + /* otherwise it's a new ID that needs to be set up in specs map and + * returned back to usdt_manager when USDT link is detached + */ + new_ids = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!new_ids) + return -ENOMEM; + link->spec_ids = new_ids; + + /* get next free spec ID, giving preference to free list, if not empty */ + if (man->free_spec_cnt) { + *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->free_spec_cnt--; + } else { + /* don't allocate spec ID bigger than what fits in specs map */ + if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) + return -E2BIG; + + *spec_id = man->next_free_spec_id; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->next_free_spec_id++; + } + + /* remember new spec ID in the link for later return back to free list on detach */ + link->spec_ids[link->spec_cnt] = *spec_id; + link->spec_cnt++; + *is_new = true; + return 0; +} + +struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie) +{ + int i, fd, err, spec_map_fd, ip_map_fd; + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct hashmap *specs_hash = NULL; + struct bpf_link_usdt *link = NULL; + struct usdt_target *targets = NULL; + size_t target_cnt; + Elf *elf; + + spec_map_fd = bpf_map__fd(man->specs_map); + ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + return libbpf_err_ptr(err); + } + + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + err = -EBADF; + pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); + goto err_out; + } + + err = sanity_check_usdt_elf(elf, path); + if (err) + goto err_out; + + /* normalize PID filter */ + if (pid < 0) + pid = -1; + else if (pid == 0) + pid = getpid(); + + /* discover USDT in given binary, optionally limiting + * activations to a given PID, if pid > 0 + */ + err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + 
usdt_cookie, &targets, &target_cnt); + if (err <= 0) { + err = (err == 0) ? -ENOENT : err; + goto err_out; + } + + specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); + if (IS_ERR(specs_hash)) { + err = PTR_ERR(specs_hash); + goto err_out; + } + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } + + link->usdt_man = man; + link->link.detach = &bpf_link_usdt_detach; + link->link.dealloc = &bpf_link_usdt_dealloc; + + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < target_cnt; i++) { + struct usdt_target *target = &targets[i]; + struct bpf_link *uprobe_link; + bool is_new; + int spec_id; + + /* Spec ID can be either reused or newly allocated. If it is + * newly allocated, we'll need to fill out spec map, otherwise + * entire spec should be valid and can be just used by a new + * uprobe. We reuse spec when USDT arg spec is identical. We + * also never share specs between two different USDT + * attachments ("links"), so all the reused specs already + * share USDT cookie value implicitly. + */ + err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); + if (err) + goto err_out; + + if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { + err = -errno; + pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", + spec_id, usdt_provider, usdt_name, path, err); + goto err_out; + } + if (!man->has_bpf_cookie && + bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { + err = -errno; + if (err == -EEXIST) { + pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", + spec_id, usdt_provider, usdt_name, path); + } else { + pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", + target->abs_ip, spec_id, usdt_provider, usdt_name, + path, err); + } + goto err_out; + } + + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + + free(targets); + hashmap__free(specs_hash); + elf_end(elf); + close(fd); + + return &link->link; + +err_out: + if (link) + bpf_link__destroy(&link->link); + free(targets); + hashmap__free(specs_hash); + if (elf) + elf_end(elf); + close(fd); + return libbpf_err_ptr(err); +} + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. 
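+ *
+ * The note descriptor starts with three native-word addresses: the
+ * probe location, the link-time address of .stapsdt.base, and the
+ * semaphore address (zero if the probe has no semaphore). They are
+ * followed by three zero-terminated strings: provider, probe name,
+ * and the argument spec that parse_usdt_spec() consumes afterwards.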
+ */ +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie) +{ + struct usdt_arg_spec *arg; + const char *s; + int arg_sz, len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + arg = &spec->args[spec->arg_cnt]; + len = parse_usdt_arg(s, spec->arg_cnt, arg, &arg_sz); + if (len < 0) + return len; + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + spec->arg_cnt, s, arg_sz); + return -EINVAL; + } + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +/* Architecture-specific logic for parsing USDT argument location specs */ + +#if defined(__x86_64__) || defined(__i386__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *names[4]; + size_t pt_regs_off; + } reg_map[] = { +#ifdef __x86_64__ +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) +#else +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) +#endif + { {"rip", "eip", "", ""}, reg_off(rip, eip) }, + { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, + { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, + { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, + { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, + { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, + { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, + { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, + { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, +#undef reg_off +#ifdef __x86_64__ + { {"r8", "r8d", "r8w", "r8b"}, 
offsetof(struct pt_regs, r8) }, + { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, + { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, + { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, + { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, + { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) }, + { {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) }, + { {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) }, +#endif + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) { + if (strcmp(reg_name, reg_map[i].names[j]) == 0) + return reg_map[i].pt_regs_off; + } + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -4@-20(%rbp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case without offset, e.g., 8@(%rsp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -4@%eax */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ $%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@$71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__s390x__) + +/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. 
*/ + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + unsigned int reg; + int len; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", arg_sz, &off, ®, &len) == 3) { + /* Memory dereference case, e.g., -2@-28(%r15) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %%r%u %n", arg_sz, ®, &len) == 2) { + /* Register read case, e.g., -8@%r0 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__aarch64__) + +static int calc_pt_regs_off(const char *reg_name) +{ + int reg_num; + + if (sscanf(reg_name, "x%d", ®_num) == 1) { + if (reg_num >= 0 && reg_num < 31) + return offsetof(struct user_pt_regs, regs[reg_num]); + } else if (strcmp(reg_name, "sp") == 0) { + return offsetof(struct user_pt_regs, sp); + } + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , %ld ] %n", arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[sp, 96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@x4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__riscv) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "ra", offsetof(struct user_regs_struct, ra) }, + { "sp", offsetof(struct user_regs_struct, sp) }, + { "gp", offsetof(struct user_regs_struct, gp) }, + { "tp", offsetof(struct user_regs_struct, tp) }, + { "a0", offsetof(struct user_regs_struct, a0) }, + { "a1", offsetof(struct user_regs_struct, a1) }, + { "a2", offsetof(struct user_regs_struct, a2) }, + { "a3", offsetof(struct user_regs_struct, a3) }, + { "a4", offsetof(struct user_regs_struct, a4) }, + { "a5", offsetof(struct user_regs_struct, 
a5) }, + { "a6", offsetof(struct user_regs_struct, a6) }, + { "a7", offsetof(struct user_regs_struct, a7) }, + { "s0", offsetof(struct user_regs_struct, s0) }, + { "s1", offsetof(struct user_regs_struct, s1) }, + { "s2", offsetof(struct user_regs_struct, s2) }, + { "s3", offsetof(struct user_regs_struct, s3) }, + { "s4", offsetof(struct user_regs_struct, s4) }, + { "s5", offsetof(struct user_regs_struct, s5) }, + { "s6", offsetof(struct user_regs_struct, s6) }, + { "s7", offsetof(struct user_regs_struct, s7) }, + { "s8", offsetof(struct user_regs_struct, rv_s8) }, + { "s9", offsetof(struct user_regs_struct, s9) }, + { "s10", offsetof(struct user_regs_struct, s10) }, + { "s11", offsetof(struct user_regs_struct, s11) }, + { "t0", offsetof(struct user_regs_struct, t0) }, + { "t1", offsetof(struct user_regs_struct, t1) }, + { "t2", offsetof(struct user_regs_struct, t2) }, + { "t3", offsetof(struct user_regs_struct, t3) }, + { "t4", offsetof(struct user_regs_struct, t4) }, + { "t5", offsetof(struct user_regs_struct, t5) }, + { "t6", offsetof(struct user_regs_struct, t6) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -8@-88(s0) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@a1 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__arm__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "r0", offsetof(struct pt_regs, uregs[0]) }, + { "r1", offsetof(struct pt_regs, uregs[1]) }, + { "r2", offsetof(struct pt_regs, uregs[2]) }, + { "r3", offsetof(struct pt_regs, uregs[3]) }, + { "r4", offsetof(struct pt_regs, uregs[4]) }, + { "r5", offsetof(struct pt_regs, uregs[5]) }, + { "r6", offsetof(struct pt_regs, uregs[6]) }, + { "r7", offsetof(struct pt_regs, uregs[7]) }, + { "r8", offsetof(struct pt_regs, uregs[8]) }, + { "r9", offsetof(struct pt_regs, uregs[9]) }, + { "r10", offsetof(struct pt_regs, uregs[10]) }, + { "fp", offsetof(struct pt_regs, uregs[11]) }, + { "ip", offsetof(struct pt_regs, uregs[12]) }, + { "sp", offsetof(struct pt_regs, uregs[13]) }, + { "lr", offsetof(struct pt_regs, uregs[14]) }, + { "pc", offsetof(struct pt_regs, uregs[15]) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int 
parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , #%ld ] %n", + arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[fp, #96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ #%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@#5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@r4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#else + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +} + +#endif diff --git a/third_party/bpftool/libbpf/src/zip.c b/third_party/bpftool/libbpf/src/zip.c new file mode 100644 index 0000000..3f26d62 --- /dev/null +++ b/third_party/bpftool/libbpf/src/zip.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Routines for dealing with .zip archives. + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf_internal.h" +#include "zip.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" + +/* Specification of ZIP file format can be found here: + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + * For a high level overview of the structure of a ZIP file see + * sections 4.3.1 - 4.3.6. + * + * Data structures appearing in ZIP files do not contain any + * padding and they might be misaligned. To allow us to safely + * operate on pointers to such structures and their members, we + * declare the types as packed. + */ + +#define END_OF_CD_RECORD_MAGIC 0x06054b50 + +/* See section 4.3.16 of the spec. */ +struct end_of_cd_record { + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ + __u32 magic; + + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. + * Zip archive might span multiple files (disks). + */ + __u16 this_disk; + + /* Number of the file containing the beginning of the central directory or + * 0xFFFF if ZIP64 archive. + */ + __u16 cd_disk; + + /* Number of central directory records on this disk or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records; + + /* Number of central directory records on all disks or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records_total; + + /* Size of the central directory record or 0xFFFFFFFF if ZIP64 archive. */ + __u32 cd_size; + + /* Offset of the central directory from the beginning of the archive or + * 0xFFFFFFFF if ZIP64 archive. 
+ */ + __u32 cd_offset; + + /* Length of comment data following end of central directory record. */ + __u16 comment_length; + + /* Up to 64k of arbitrary bytes. */ + /* uint8_t comment[comment_length] */ +} __attribute__((packed)); + +#define CD_FILE_HEADER_MAGIC 0x02014b50 +#define FLAG_ENCRYPTED (1 << 0) +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) + +/* See section 4.3.12 of the spec. */ +struct cd_file_header { + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ + __u32 magic; + __u16 version; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; + __u16 file_comment_length; + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ + __u16 disk; + __u16 internal_attributes; + __u32 external_attributes; + /* Offset from the start of the disk containing the local file header to the + * start of the local file header. + */ + __u32 offset; +} __attribute__((packed)); + +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 + +/* See section 4.3.7 of the spec. */ +struct local_file_header { + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ + __u32 magic; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; +} __attribute__((packed)); + +#pragma GCC diagnostic pop + +struct zip_archive { + void *data; + __u32 size; + __u32 cd_offset; + __u32 cd_records; +}; + +static void *check_access(struct zip_archive *archive, __u32 offset, __u32 size) +{ + if (offset + size > archive->size || offset > offset + size) + return NULL; + + return archive->data + offset; +} + +/* Returns 0 on success, -EINVAL on error and -ENOTSUP if the eocd indicates the + * archive uses features which are not supported. + */ +static int try_parse_end_of_cd(struct zip_archive *archive, __u32 offset) +{ + __u16 comment_length, cd_records; + struct end_of_cd_record *eocd; + __u32 cd_offset, cd_size; + + eocd = check_access(archive, offset, sizeof(*eocd)); + if (!eocd || eocd->magic != END_OF_CD_RECORD_MAGIC) + return -EINVAL; + + comment_length = eocd->comment_length; + if (offset + sizeof(*eocd) + comment_length != archive->size) + return -EINVAL; + + cd_records = eocd->cd_records; + if (eocd->this_disk != 0 || eocd->cd_disk != 0 || eocd->cd_records_total != cd_records) + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ + return -ENOTSUP; + + cd_offset = eocd->cd_offset; + cd_size = eocd->cd_size; + if (!check_access(archive, cd_offset, cd_size)) + return -EINVAL; + + archive->cd_offset = cd_offset; + archive->cd_records = cd_records; + return 0; +} + +static int find_cd(struct zip_archive *archive) +{ + int64_t limit, offset; + int rc = -EINVAL; + + if (archive->size <= sizeof(struct end_of_cd_record)) + return -EINVAL; + + /* Because the end of central directory ends with a variable length array of + * up to 0xFFFF bytes we can't know exactly where it starts and need to + * search for it at the end of the file, scanning the (limit, offset] range. 
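+ * Concretely, the loop below starts at size - sizeof(struct
+ * end_of_cd_record) and walks backwards by at most 64 KiB (the
+ * maximum comment length) until a record with the right magic
+ * parses successfully.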
+ */ + offset = archive->size - sizeof(struct end_of_cd_record); + limit = (int64_t)offset - (1 << 16); + + for (; offset >= 0 && offset > limit && rc != 0; offset--) { + rc = try_parse_end_of_cd(archive, offset); + if (rc == -ENOTSUP) + break; + } + return rc; +} + +struct zip_archive *zip_archive_open(const char *path) +{ + struct zip_archive *archive; + int err, fd; + off_t size; + void *data; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return ERR_PTR(-errno); + + size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1 || size > UINT32_MAX) { + close(fd); + return ERR_PTR(-EINVAL); + } + + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + err = -errno; + close(fd); + + if (data == MAP_FAILED) + return ERR_PTR(err); + + archive = malloc(sizeof(*archive)); + if (!archive) { + munmap(data, size); + return ERR_PTR(-ENOMEM); + }; + + archive->data = data; + archive->size = size; + + err = find_cd(archive); + if (err) { + munmap(data, size); + free(archive); + return ERR_PTR(err); + } + + return archive; +} + +void zip_archive_close(struct zip_archive *archive) +{ + munmap(archive->data, archive->size); + free(archive); +} + +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, + __u32 offset) +{ + struct local_file_header *lfh; + + lfh = check_access(archive, offset, sizeof(*lfh)); + if (!lfh || lfh->magic != LOCAL_FILE_HEADER_MAGIC) + return NULL; + + return lfh; +} + +static int get_entry_at_offset(struct zip_archive *archive, __u32 offset, struct zip_entry *out) +{ + struct local_file_header *lfh; + __u32 compressed_size; + const char *name; + void *data; + + lfh = local_file_header_at_offset(archive, offset); + if (!lfh) + return -EINVAL; + + offset += sizeof(*lfh); + if ((lfh->flags & FLAG_ENCRYPTED) || (lfh->flags & FLAG_HAS_DATA_DESCRIPTOR)) + return -EINVAL; + + name = check_access(archive, offset, lfh->file_name_length); + if (!name) + return -EINVAL; + + offset += lfh->file_name_length; + if (!check_access(archive, offset, lfh->extra_field_length)) + return -EINVAL; + + offset += lfh->extra_field_length; + compressed_size = lfh->compressed_size; + data = check_access(archive, offset, compressed_size); + if (!data) + return -EINVAL; + + out->compression = lfh->compression; + out->name_length = lfh->file_name_length; + out->name = name; + out->data = data; + out->data_length = compressed_size; + out->data_offset = offset; + + return 0; +} + +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, + struct zip_entry *out) +{ + size_t file_name_length = strlen(file_name); + __u32 i, offset = archive->cd_offset; + + for (i = 0; i < archive->cd_records; ++i) { + __u16 cdfh_name_length, cdfh_flags; + struct cd_file_header *cdfh; + const char *cdfh_name; + + cdfh = check_access(archive, offset, sizeof(*cdfh)); + if (!cdfh || cdfh->magic != CD_FILE_HEADER_MAGIC) + return -EINVAL; + + offset += sizeof(*cdfh); + cdfh_name_length = cdfh->file_name_length; + cdfh_name = check_access(archive, offset, cdfh_name_length); + if (!cdfh_name) + return -EINVAL; + + cdfh_flags = cdfh->flags; + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && + file_name_length == cdfh_name_length && + memcmp(file_name, archive->data + offset, file_name_length) == 0) { + return get_entry_at_offset(archive, cdfh->offset, out); + } + + offset += cdfh_name_length; + offset += cdfh->extra_field_length; + offset += cdfh->file_comment_length; + } + + return -ENOENT; +} diff --git 
a/third_party/bpftool/libbpf/src/zip.h b/third_party/bpftool/libbpf/src/zip.h new file mode 100644 index 0000000..1c1bb21 --- /dev/null +++ b/third_party/bpftool/libbpf/src/zip.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LIBBPF_ZIP_H +#define __LIBBPF_ZIP_H + +#include + +/* Represents an open zip archive. + * Only basic ZIP files are supported, in particular the following are not + * supported: + * - encryption + * - streaming + * - multi-part ZIP files + * - ZIP64 + */ +struct zip_archive; + +/* Carries information on name, compression method, and data corresponding to a + * file in a zip archive. + */ +struct zip_entry { + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ + __u16 compression; + + /* Non-null terminated name of the file. */ + const char *name; + /* Length of the file name. */ + __u16 name_length; + + /* Pointer to the file data. */ + const void *data; + /* Length of the file data. */ + __u32 data_length; + /* Offset of the file data within the archive. */ + __u32 data_offset; +}; + +/* Open a zip archive. Returns NULL in case of an error. */ +struct zip_archive *zip_archive_open(const char *path); + +/* Close a zip archive and release resources. */ +void zip_archive_close(struct zip_archive *archive); + +/* Look up an entry corresponding to a file in given zip archive. */ +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); + +#endif diff --git a/third_party/bpftool/scripts/gh-label-release-assets.sh b/third_party/bpftool/scripts/gh-label-release-assets.sh new file mode 100755 index 0000000..4143a1e --- /dev/null +++ b/third_party/bpftool/scripts/gh-label-release-assets.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + +# Use this script to add labels to GitHub release assets for a given release. 
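+# Typical invocation (the release tag below is only an example):
+#   ./gh-label-release-assets.sh v7.2.0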
+# +# Based on the following console workflow: +# +# gh api \ +# '/repos/qmonnet/bpftool/releases/tags/v7.2.0-snapshot.0' \ +# --jq '.id' +# gh api \ +# '/repos/qmonnet/bpftool/releases/96330927/assets' \ +# --jq '.[] | select(.name == "bpftool-amd64.tar.gz").id' +# gh api \ +# --method PATCH \ +# -H "Accept: application/vnd.github+json" \ +# -H "X-GitHub-Api-Version: 2022-11-28" \ +# '/repos/qmonnet/bpftool/releases/assets/100280866' \ +# -f name='bpftool-arm64.tar.gz' \ +# -f label='Compressed binary (arm64)' + +REPO="libbpf/bpftool" + +usage() { + echo "Update asset labels for bpftool releases" + echo "Usage:" + echo " $0 [options] " + echo "" + echo "OPTIONS" + echo " -h display this help" + exit "$1" +} + +OPTIND=1 +while getopts "h" opt; do + case "$opt" in + h) + usage 0 + ;; + *) + usage 1 + ;; + esac +done +shift $((OPTIND-1)) +[[ "${1:-}" = "--" ]] && shift + +# Get release tag from command line +if [[ "$#" -lt 1 ]]; then + echo "error: missing release tag" + usage 1 +fi +release_tag="$1" +echo "repo: ${REPO}, release tag: ${release_tag}" + +# Add labels to set for given asset names here: +declare -A assets_labels=( + ["bpftool-libbpf-${release_tag}-sources.tar.gz"]="Source code, including libbpf submodule (tar.gz)" +) + +# Get release ID +release_id="$(gh api "/repos/${REPO}/releases/tags/${release_tag}" --jq '.id')" +echo " found release ID ${release_id}" + +# For each label to set, get asset ID, prompt user for confirmation, set label +for asset_name in "${!assets_labels[@]}"; do + asset_id="$(gh api "/repos/${REPO}/releases/${release_id}/assets" \ + --jq ".[] | select(.name == \"${asset_name}\").id")" + echo " found asset ID ${asset_id}" + + echo "asset '${asset_name}': add label '${assets_labels[${asset_name}]}'" + answer="" + read -rp 'proceed? 
[y/N]: ' answer + + case "${answer}" in + y|yes|Y|Yes|YES) + gh api \ + --method PATCH \ + -H 'Accept: application/vnd.github+json' \ + -H 'X-GitHub-Api-Version: 2022-11-28' \ + "/repos/${REPO}/releases/assets/${asset_id}" \ + -f label="${assets_labels[${asset_name}]}" + ;; + *) + echo "cancelled" + ;; + esac +done diff --git a/third_party/bpftool/scripts/sync-kernel-expected-diff.patch b/third_party/bpftool/scripts/sync-kernel-expected-diff.patch new file mode 100644 index 0000000..769801a --- /dev/null +++ b/third_party/bpftool/scripts/sync-kernel-expected-diff.patch @@ -0,0 +1,152 @@ +--- docs/Makefile ++++ docs/Makefile +@@ -1,5 +1,5 @@ + # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +-include ../../../scripts/Makefile.include ++include ../src/Makefile.include + + INSTALL ?= install + RM ?= rm -f +--- src/.gitignore ++++ src/.gitignore +@@ -1,8 +1,8 @@ + # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) ++*.o + *.d + /bootstrap/ + /bpftool +-bpftool*.8 + FEATURE-DUMP.bpftool + feature + libbpf +--- src/Makefile ++++ src/Makefile +@@ -1,10 +1,8 @@ + # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +-include ../../scripts/Makefile.include ++include Makefile.include + + ifeq ($(srctree),) + srctree := $(patsubst %/,%,$(dir $(CURDIR))) +-srctree := $(patsubst %/,%,$(dir $(srctree))) +-srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + + ifeq ($(V),1) +@@ -13,7 +11,7 @@ + Q = @ + endif + +-BPF_DIR = $(srctree)/tools/lib/bpf ++BPF_DIR = $(srctree)/libbpf/src + + ifneq ($(OUTPUT),) + _OUTPUT := $(OUTPUT) +@@ -43,16 +41,16 @@ + $(QUIET_MKDIR)mkdir -p $@ + + $(LIBBPF): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_OUTPUT) +- $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) \ +- DESTDIR=$(LIBBPF_DESTDIR:/=) prefix= $(LIBBPF) install_headers ++ $(Q)$(MAKE) -C $(BPF_DIR) OBJDIR=$(patsubst %/,%,$(LIBBPF_OUTPUT)) \ ++ PREFIX=$(LIBBPF_DESTDIR:/=) $(LIBBPF) install_headers + + $(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_DIR) + $(call QUIET_INSTALL, $@) + $(Q)install -m 644 -t $(LIBBPF_HDRS_DIR) $< + + $(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT) +- $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) \ +- DESTDIR=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) prefix= \ ++ $(Q)$(MAKE) -C $(BPF_DIR) OBJDIR=$(patsubst %/,%,$(LIBBPF_BOOTSTRAP_OUTPUT)) \ ++ PREFIX=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) \ + ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" $@ install_headers + + $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR) +@@ -76,9 +74,9 @@ + CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ + -I$(or $(OUTPUT),.) \ + -I$(LIBBPF_INCLUDE) \ +- -I$(srctree)/kernel/bpf/ \ +- -I$(srctree)/tools/include \ +- -I$(srctree)/tools/include/uapi ++ -I$(srctree)/src/kernel/bpf/ \ ++ -I$(srctree)/include \ ++ -I$(srctree)/include/uapi + ifneq ($(BPFTOOL_VERSION),) + CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' + endif +@@ -119,11 +117,7 @@ + endif + + ifeq ($(check_feat),1) +-ifeq ($(FEATURES_DUMP),) +-include $(srctree)/tools/build/Makefile.feature +-else +-include $(FEATURES_DUMP) +-endif ++include Makefile.feature + endif + + LIBS = $(LIBBPF) -lelf -lz +@@ -213,7 +207,7 @@ + $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP) + $(QUIET_CLANG)$(CLANG) \ + -I$(or $(OUTPUT),.) 
\ +- -I$(srctree)/tools/include/uapi/ \ ++ -I$(srctree)/include/uapi/ \ + -I$(LIBBPF_BOOTSTRAP_INCLUDE) \ + -g -O2 -Wall -fno-stack-protector \ + -target bpf -c $< -o $@ +@@ -231,10 +225,10 @@ + + CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) + +-$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c ++$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c + $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ + +-$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c ++$(OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ + + $(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) +@@ -253,7 +247,7 @@ + $(call QUIET_CLEAN, feature-detect) + $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null + +-clean: $(LIBBPF)-clean $(LIBBPF_BOOTSTRAP)-clean feature-detect-clean ++clean: $(LIBBPF)-clean $(LIBBPF_BOOTSTRAP)-clean + $(call QUIET_CLEAN, bpftool) + $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d + $(Q)$(RM) -- $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h +@@ -269,7 +263,7 @@ + + install: install-bin + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir) +- $(Q)$(INSTALL) -m 0644 bash-completion/bpftool $(DESTDIR)$(bash_compdir) ++ $(Q)$(INSTALL) -m 0644 $(srctree)/bash-completion/bpftool $(DESTDIR)$(bash_compdir) + + uninstall: + $(call QUIET_UNINST, bpftool) +@@ -277,16 +271,16 @@ + $(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool + + doc: +- $(call descend,Documentation) ++ $(call descend,$(srctree)/docs) + + doc-clean: +- $(call descend,Documentation,clean) ++ $(call descend,$(srctree)/docs,clean) + + doc-install: +- $(call descend,Documentation,install) ++ $(call descend,$(srctree)/docs,install) + + doc-uninstall: +- $(call descend,Documentation,uninstall) ++ $(call descend,$(srctree)/docs,uninstall) + + FORCE: + diff --git a/third_party/bpftool/scripts/sync-kernel.sh b/third_party/bpftool/scripts/sync-kernel.sh new file mode 100755 index 0000000..086b60b --- /dev/null +++ b/third_party/bpftool/scripts/sync-kernel.sh @@ -0,0 +1,423 @@ +#!/usr/bin/env bash + +usage () { + echo "USAGE: ./sync-kernel.sh " + echo "" + echo "This script synchronizes the mirror with upstream bpftool sources from the kernel repository." + echo "It performs the following steps:" + echo " - Update the libbpf submodule, commit, and use its new checkpoints as target commits for bpftool." + echo " - Cherry-pick commits from the bpf-next branch, up to the bpf-next target commit." + echo " - Cherry-pick commits from the bpf branch, up to the bpf target commit." + echo " - Create a new commit with the updated version and checkpoints." + echo " - Check consistency." + echo "" + echo "Set BPF_NEXT_BASELINE to override bpf-next tree commit, otherwise read from /CHECKPOINT-COMMIT." + echo "Set BPF_BASELINE to override bpf tree commit, otherwise read from /BPF-CHECKPOINT-COMMIT." + echo "Set BPF_NEXT_TIP_COMMIT to override bpf-next tree target commit, otherwise read from /libbpf/CHECKPOINT-COMMIT, after libbpf update." + echo "Set BPF_TIP_COMMIT to override bpf tree target commit, otherwise read from /libbpf/BPF-CHECKPOINT-COMMIT, after libbpf update." + echo "Set SKIP_LIBBPF_UPDATE to 1 to avoid updating libbpf automatically." + echo "Set MANUAL_MODE to 1 to manually control every cherry-picked commit." 
+ exit 1 +} + +set -eu + +BPFTOOL_REPO=${1-""} +LINUX_REPO=${2-""} + +if [ -z "${BPFTOOL_REPO}" ] || [ -z "${LINUX_REPO}" ]; then + echo "Error: bpftool or linux repos are not specified" + usage +fi + +BASELINE_COMMIT=${BPF_NEXT_BASELINE:-$(cat "${BPFTOOL_REPO}"/CHECKPOINT-COMMIT)} +BPF_BASELINE_COMMIT=${BPF_BASELINE:-$(cat "${BPFTOOL_REPO}"/BPF-CHECKPOINT-COMMIT)} + +if [ -z "${BASELINE_COMMIT}" ] || [ -z "${BPF_BASELINE_COMMIT}" ]; then + echo "Error: bpf or bpf-next baseline commits are not provided" + usage +fi + +SUFFIX=$(date --utc +%Y-%m-%dT%H-%M-%S.%3NZ) +WORKDIR=$(pwd) +TMP_DIR=$(mktemp -d) + +# shellcheck disable=SC2064 +trap "cd ${WORKDIR}; exit" INT TERM EXIT + +BPFTOOL_SRC_DIR="tools/bpf/bpftool" + +declare -A PATH_MAP +PATH_MAP=( \ + [${BPFTOOL_SRC_DIR}]=src \ + [${BPFTOOL_SRC_DIR}/bash-completion]=bash-completion \ + [${BPFTOOL_SRC_DIR}/Documentation]=docs \ + [kernel/bpf/disasm.c]=src/kernel/bpf/disasm.c \ + [kernel/bpf/disasm.h]=src/kernel/bpf/disasm.h \ + [tools/include/tools/dis-asm-compat.h]=include/tools/dis-asm-compat.h \ + [tools/include/uapi/asm-generic/bitsperlong.h]=include/uapi/asm-generic/bitsperlong.h \ + [tools/include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \ + [tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \ + [tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \ + [tools/include/uapi/linux/const.h]=include/uapi/linux/const.h \ + [tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \ + [tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \ + [tools/include/uapi/linux/perf_event.h]=include/uapi/linux/perf_event.h \ + [tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \ + [tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \ + [tools/include/uapi/linux/tc_act/tc_bpf.h]=include/uapi/linux/tc_act/tc_bpf.h \ +) + +BPFTOOL_PATHS=( "${!PATH_MAP[@]}" ) +BPFTOOL_VIEW_PATHS=( "${PATH_MAP[@]}" ) +BPFTOOL_VIEW_EXCLUDE_REGEX='^(docs/\.gitignore|src/Makefile\.(feature|include))$' +LINUX_VIEW_EXCLUDE_REGEX='^$' + +# Deal with tools/bpf/bpftool first, because once we've mkdir-ed src/, command +# "git mv" doesn't move bpftool _as_ src but _into_ src/. +BPFTOOL_TREE_FILTER="mkdir __bpftool && "$'\\\n' +BPFTOOL_TREE_FILTER+="git mv -kf ${BPFTOOL_SRC_DIR} __bpftool/${PATH_MAP[${BPFTOOL_SRC_DIR}]} && "$'\\\n' + +# Extract bash-completion and Documentation from src/. +BPFTOOL_TREE_FILTER+="git mv -kf __bpftool/src/bash-completion __bpftool/bash-completion && "$'\\\n' +BPFTOOL_TREE_FILTER+="git mv -kf __bpftool/src/Documentation __bpftool/docs && "$'\\\n' + +BPFTOOL_TREE_FILTER+="mkdir -p __bpftool/include/tools __bpftool/include/uapi/asm-generic __bpftool/include/uapi/linux/tc_act __bpftool/src/kernel/bpf && "$'\\\n' +for p in "${!PATH_MAP[@]}"; do + case ${p} in + ${BPFTOOL_SRC_DIR}*) + continue;; + esac + BPFTOOL_TREE_FILTER+="git mv -kf ${p} __bpftool/${PATH_MAP[${p}]} && "$'\\\n' +done +BPFTOOL_TREE_FILTER+="true >/dev/null" + +cd_to() +{ + cd "${WORKDIR}" && cd "$1" +} + +# Output brief single-line commit description +# $1 - commit ref +commit_desc() +{ + git log -n1 --pretty='%h ("%s")' "$1" +} + +# Create commit single-line signature, which consists of: +# - full commit subject +# - author date in ISO8601 format +# - full commit body with newlines replaced with vertical bars (|) +# - shortstat appended at the end +# The idea is that this single-line signature is good enough to make final +# decision about whether two commits are the same, across different repos. 
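+# For a commit with a one-line body the signature ends up looking
+# roughly like this (all values illustrative):
+#   ("bpftool: Fix some issue")|2023-01-01T12:00:00+00:00|Some body text|| 1 file changed, 2 insertions(+)|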
+# $1 - commit ref +# $2 - paths filter +commit_signature() +{ + # shellcheck disable=SC2086 + git show --pretty='("%s")|%aI|%b' --shortstat "$1" -- ${2-.} | tr '\n' '|' +} + +# Cherry-pick commits touching bpftool-related files +# $1 - baseline_tag +# $2 - tip_tag +cherry_pick_commits() +{ + local manual_mode=${MANUAL_MODE:-0} + local baseline_tag=$1 + local tip_tag=$2 + local new_commits + local signature + local should_skip + local synced_cnt + local manual_check + local bpftool_conflict_cnt + local desc + + # shellcheck disable=SC2068 + new_commits=$(git rev-list --no-merges --topo-order --reverse "${baseline_tag}".."${tip_tag}" ${BPFTOOL_PATHS[@]}) + for new_commit in ${new_commits}; do + if [[ "${baseline_tag}" == "${BPF_BASELINE_TAG}" ]]; then + if git merge-base --is-ancestor "${new_commit}" "${BASELINE_COMMIT}"; then + echo "Commit ${new_commit::12} from bpf is already in bpf-next branch, skipping." + continue + fi + fi + desc="$(commit_desc "${new_commit}")" + signature="$(commit_signature "${new_commit}" "${BPFTOOL_PATHS[*]}")" + # shellcheck disable=SC2126 + synced_cnt=$(grep -F "${signature}" "${TMP_DIR}"/bpftool_commits.txt | wc -l) + manual_check=0 + if (("${synced_cnt}" > 0)); then + # commit with the same subject is already in bpftool, but it's + # not 100% the same commit, so check with user + echo "Commit '${desc}' is synced into bpftool as:" + grep -F "${signature}" "${TMP_DIR}"/bpftool_commits.txt | \ + cut -d'|' -f1 | sed -e 's/^/- /' + if (("${manual_mode}" != 1 && "${synced_cnt}" == 1)); then + echo "Skipping '${desc}' due to unique match..." + continue + fi + if (("${synced_cnt}" > 1)); then + echo "'${desc} matches multiple commits, please, double-check!" + manual_check=1 + fi + fi + if (("${manual_mode}" == 1 || "${manual_check}" == 1)); then + read -rp "Do you want to skip '${desc}'? [y/N]: " should_skip + case "${should_skip}" in + "y" | "Y") + echo "Skipping '${desc}'..." + continue + ;; + esac + fi + # commit hasn't been synced into bpftool yet + echo "Picking '${desc}'..." + if ! git cherry-pick "${new_commit}" &>/dev/null; then + echo "Warning! Cherry-picking '${desc} failed, checking if it's non-bpftool files causing problems..." + # shellcheck disable=SC2068 + bpftool_conflict_cnt=$(git diff --name-only --diff-filter=U -- ${BPFTOOL_PATHS[@]} | wc -l) + conflict_cnt=$(git diff --name-only | wc -l) + prompt_resolution=1 + + if (("${bpftool_conflict_cnt}" == 0)); then + echo "Looks like only non-bpftool files have conflicts, ignoring..." + if (("${conflict_cnt}" == 0)); then + echo "Empty cherry-pick, skipping it..." + git cherry-pick --abort + continue + fi + + git add . + # GIT_EDITOR=true to avoid editor popping up to edit commit message + if ! GIT_EDITOR=true git cherry-pick --continue &>/dev/null; then + echo "Error! That still failed! Please resolve manually." + else + echo "Success! All cherry-pick conflicts were resolved for '${desc}'!" + prompt_resolution=0 + fi + fi + + if (("${prompt_resolution}" == 1)); then + read -rp "Error! Cherry-picking '${desc}' failed, please fix manually and press to proceed..." + fi + fi + # Append signature of just cherry-picked commit to avoid + # potentially cherry-picking the same commit twice later when + # processing bpf tree commits. At this point we don't know yet + # the final commit sha in bpftool repo, so we record Linux SHA + # instead as LINUX_. + echo "LINUX_$(git log --pretty='%h' -n1) ${signature}" >> "${TMP_DIR}"/bpftool_commits.txt + done +} + +cleanup() +{ + echo "Cleaning up..." 
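+	# Remove the temporary working directory, switch back to the
+	# original ref and delete the helper branches created by this run.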
+ rm -r -- "${TMP_DIR}" + cd_to "${LINUX_REPO}" + git checkout "${TIP_SYM_REF}" + git branch -D "${BASELINE_TAG}" "${TIP_TAG}" "${BPF_BASELINE_TAG}" "${BPF_TIP_TAG}" \ + "${SQUASH_BASE_TAG}" "${SQUASH_TIP_TAG}" || true + # shellcheck disable=SC2015 + git show-ref --verify --quiet refs/heads/"${VIEW_TAG}" && \ + git branch -D "${VIEW_TAG}" || true + + cd_to . + echo "DONE." +} + +cd_to "${BPFTOOL_REPO}" +BPFTOOL_SYNC_TAG="bpftool-sync-${SUFFIX}" +git checkout -b "${BPFTOOL_SYNC_TAG}" + +# Update libbpf +if [[ "${SKIP_LIBBPF_UPDATE:-0}" -ne 1 ]]; then + cd_to "${BPFTOOL_REPO}"/libbpf + git pull origin master + LIBBPF_VERSION=$(grep -oE '^LIBBPF_([0-9.]+)' src/libbpf.map | sort -rV | head -n1 | cut -d'_' -f2) + LIBBPF_COMMIT=$(git rev-parse HEAD) + cd_to "${BPFTOOL_REPO}" + if [[ -n "$(git status --porcelain --untracked-files=no)" ]]; then + git add libbpf + git commit --signoff -m 'sync: Update libbpf submodule' \ + -m "\ +Pull latest libbpf from mirror. +Libbpf version: ${LIBBPF_VERSION} +Libbpf commit: ${LIBBPF_COMMIT}" \ + -- libbpf + fi +fi + +# Use libbpf's new checkpoints as tips +TIP_COMMIT=${BPF_NEXT_TIP_COMMIT:-$(cat "${BPFTOOL_REPO}"/libbpf/CHECKPOINT-COMMIT)} +BPF_TIP_COMMIT=${BPF_TIP_COMMIT:-$(cat "${BPFTOOL_REPO}"/libbpf/BPF-CHECKPOINT-COMMIT)} +if [ -z "${TIP_COMMIT}" ] || [ -z "${BPF_TIP_COMMIT}" ]; then + echo "Error: bpf or bpf-next tip commits are not provided" + usage +fi + +cd_to "${BPFTOOL_REPO}" +GITHUB_ABS_DIR=$(pwd) +echo "Dumping existing bpftool commit signatures..." +for h in $(git log --pretty='%h' -n500); do + echo "$h" "$(commit_signature "$h")" >> "${TMP_DIR}"/bpftool_commits.txt +done + +# Use current kernel repo HEAD as a source of patches +cd_to "${LINUX_REPO}" +LINUX_ABS_DIR=$(pwd) +TIP_SYM_REF=$(git symbolic-ref -q --short HEAD || git rev-parse HEAD) +BASELINE_TAG="bpftool-baseline-${SUFFIX}" +TIP_TAG="bpftool-tip-${SUFFIX}" +BPF_BASELINE_TAG="bpftool-bpf-baseline-${SUFFIX}" +BPF_TIP_TAG="bpftool-bpf-tip-${SUFFIX}" +VIEW_TAG="bpftool-view-${SUFFIX}" + +# Squash state of kernel repo at baseline into single commit +SQUASH_BASE_TAG="bpftool-squash-base-${SUFFIX}" +SQUASH_TIP_TAG="bpftool-squash-tip-${SUFFIX}" +SQUASH_COMMIT=$(git commit-tree "${BASELINE_COMMIT}^{tree}" -m "BASELINE SQUASH ${BASELINE_COMMIT}") + +echo "WORKDIR: ${WORKDIR}" +echo "LINUX REPO: ${LINUX_REPO}" +echo "BPFTOOL REPO: ${BPFTOOL_REPO}" +echo "TEMP DIR: ${TMP_DIR}" +echo "SUFFIX: ${SUFFIX}" +echo "BASE COMMIT: '$(commit_desc "${BASELINE_COMMIT}")'" +echo "TIP COMMIT: '$(commit_desc "${TIP_COMMIT}")'" +echo "BPF BASE COMMIT: '$(commit_desc "${BPF_BASELINE_COMMIT}")'" +echo "BPF TIP COMMIT: '$(commit_desc "${BPF_TIP_COMMIT}")'" +echo "SQUASH COMMIT: ${SQUASH_COMMIT}" +echo "BASELINE TAG: ${BASELINE_TAG}" +echo "TIP TAG: ${TIP_TAG}" +echo "BPF BASELINE TAG: ${BPF_BASELINE_TAG}" +echo "BPF TIP TAG: ${BPF_TIP_TAG}" +echo "SQUASH BASE TAG: ${SQUASH_BASE_TAG}" +echo "SQUASH TIP TAG: ${SQUASH_TIP_TAG}" +echo "VIEW TAG: ${VIEW_TAG}" +echo "BPFTOOL SYNC TAG: ${BPFTOOL_SYNC_TAG}" +echo "PATCHES: ${TMP_DIR}/patches" + +git branch "${BASELINE_TAG}" "${BASELINE_COMMIT}" +git branch "${TIP_TAG}" "${TIP_COMMIT}" +git branch "${BPF_BASELINE_TAG}" "${BPF_BASELINE_COMMIT}" +git branch "${BPF_TIP_TAG}" "${BPF_TIP_COMMIT}" +git branch "${SQUASH_BASE_TAG}" "${SQUASH_COMMIT}" +git checkout -b "${SQUASH_TIP_TAG}" "${SQUASH_COMMIT}" + +# Cherry-pick new commits onto squashed baseline commit +echo "Cherry-pick for bpf-next..." +cherry_pick_commits "${BASELINE_TAG}" "${TIP_TAG}" +echo "Cherry-pick for bpf..." 
+cherry_pick_commits "${BPF_BASELINE_TAG}" "${BPF_TIP_TAG}" + +# Move all bpftool files into __bpftool directory. +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --tree-filter "${BPFTOOL_TREE_FILTER}" "${SQUASH_TIP_TAG}" "${SQUASH_BASE_TAG}" +# Make __bpftool a new root directory +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --subdirectory-filter __bpftool "${SQUASH_TIP_TAG}" "${SQUASH_BASE_TAG}" + +# If there are no new commits with bpftool-related changes, bail out +COMMIT_CNT=$(git rev-list --count "${SQUASH_BASE_TAG}".."${SQUASH_TIP_TAG}") +if (("${COMMIT_CNT}" <= 0)); then + echo "No new changes to apply, we are done!" + cleanup + exit 2 +fi + +# Exclude baseline commit and generate nice cover letter with summary +git format-patch --no-signature "${SQUASH_BASE_TAG}".."${SQUASH_TIP_TAG}" --cover-letter -o "${TMP_DIR}"/patches + +# Now is time to re-apply bpftool-related linux patches to bpftool repo +cd_to "${BPFTOOL_REPO}" + +# shellcheck disable=SC2012 +for patch in $(ls -1 "${TMP_DIR}"/patches | tail -n +2); do + if ! git am --3way --committer-date-is-author-date "${TMP_DIR}/patches/${patch}"; then + read -rp "Applying ${TMP_DIR}/patches/${patch} failed, please resolve manually and press to proceed..." + fi +done + +# Use generated cover-letter as a template for "sync commit" with +# baseline and checkpoint commits from kernel repo (and leave summary +# from cover letter intact, of course) +echo "${TIP_COMMIT}" > CHECKPOINT-COMMIT && \ +echo "${BPF_TIP_COMMIT}" > BPF-CHECKPOINT-COMMIT && \ +git add CHECKPOINT-COMMIT && \ +git add BPF-CHECKPOINT-COMMIT && \ +awk '/\*\*\* BLURB HERE \*\*\*/ {p=1} p' "${TMP_DIR}"/patches/0000-cover-letter.patch | \ +sed "s/\*\*\* BLURB HERE \*\*\*/\ +sync: Pull latest bpftool changes from kernel\n\ +\n\ +Syncing latest bpftool commits from kernel repository.\n\ +Baseline bpf-next commit: ${BASELINE_COMMIT}\n\ +Checkpoint bpf-next commit: ${TIP_COMMIT}\n\ +Baseline bpf commit: ${BPF_BASELINE_COMMIT}\n\ +Checkpoint bpf commit: ${BPF_TIP_COMMIT}/" | \ +git commit --signoff --file=- + +echo "SUCCESS! ${COMMIT_CNT} commits synced." + +echo "Verifying Linux's and Github's bpftool state" + +cd_to "${LINUX_REPO}" +git checkout -b "${VIEW_TAG}" "${TIP_COMMIT}" +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --tree-filter "${BPFTOOL_TREE_FILTER}" "${VIEW_TAG}"^.."${VIEW_TAG}" +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --subdirectory-filter __bpftool "${VIEW_TAG}"^.."${VIEW_TAG}" +# shellcheck disable=SC2068 +git ls-files -- ${BPFTOOL_VIEW_PATHS[@]} | grep -v -E "${LINUX_VIEW_EXCLUDE_REGEX}" > "${TMP_DIR}"/linux-view.ls +# Before we compare each file, try to apply to the mirror a patch containing the +# expected differences between the two repositories; this is to avoid checking +# "known" differences visually and taking the risk of missing a new, relevant +# differences. +echo "Patching to account for expected differences..." +patch -d "${LINUX_ABS_DIR}" -p0 -f --reject-file=- --no-backup-if-mismatch < "${GITHUB_ABS_DIR}/scripts/sync-kernel-expected-diff.patch" || true +git add -u +git commit -m 'tmp: apply expected differences to compare github/kernel repos' || true + +cd_to "${BPFTOOL_REPO}" +# shellcheck disable=SC2068 +git ls-files -- ${BPFTOOL_VIEW_PATHS[@]} | grep -v -E "${BPFTOOL_VIEW_EXCLUDE_REGEX}" > "${TMP_DIR}"/github-view.ls + +echo "Comparing list of files..." +diff -u "${TMP_DIR}"/linux-view.ls "${TMP_DIR}"/github-view.ls +echo "Comparing file contents..." 
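+# Diff every file from the Linux-side view against its mirror
+# counterpart; any difference marks the two trees as inconsistent.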
+CONSISTENT=1 +while IFS= read -r F; do + if ! diff -u --color "${LINUX_ABS_DIR}/${F}" "${GITHUB_ABS_DIR}/${F}"; then + echo "${LINUX_ABS_DIR}/${F} and ${GITHUB_ABS_DIR}/${F} are different!" + CONSISTENT=0 + fi +done < "${TMP_DIR}"/linux-view.ls +echo "" +if (("${CONSISTENT}" == 1)); then + echo "Great! Content is identical!" +else + ignore_inconsistency=n + echo "Unfortunately, there are some inconsistencies, please double check." + echo "Some of them may come from patches in bpf tree but absent from bpf-next." + echo "Note: I applied scripts/sync-kernel-expected-diff.patch before checking," + echo "to account for expected changes. If this patch needs an update," + echo "you can do it now with:" + echo "------" + echo " (cd \"${LINUX_ABS_DIR}\" && git -c advice.detachedHead=false checkout HEAD~)" + echo " for f in \$(cat \"${TMP_DIR}/linux-view.ls\"); do" + echo " diff -u --label \"\${f}\" --label \"\${f}\" \\" + echo " \"${LINUX_ABS_DIR}/\${f}\" \\" + echo " \"${GITHUB_ABS_DIR}/\${f}\"" + echo " done > \"${GITHUB_ABS_DIR}/scripts/sync-kernel-expected-diff.patch\"" + echo "------" + read -rp "Does everything look good? [y/N]: " ignore_inconsistency + case "${ignore_inconsistency}" in + "y" | "Y") + echo "Ok, proceeding..." + ;; + *) + echo "Oops, exiting with error..." + exit 4 + esac +fi + +cleanup diff --git a/third_party/bpftool/src/.gitignore b/third_party/bpftool/src/.gitignore new file mode 100644 index 0000000..712adf5 --- /dev/null +++ b/third_party/bpftool/src/.gitignore @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +*.o +*.d +/bootstrap/ +/bpftool +FEATURE-DUMP.bpftool +feature +libbpf +/*.skel.h +/vmlinux.h diff --git a/third_party/bpftool/src/Makefile b/third_party/bpftool/src/Makefile new file mode 100644 index 0000000..048d221 --- /dev/null +++ b/third_party/bpftool/src/Makefile @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +include Makefile.include + +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +endif + +ifeq ($(V),1) + Q = +else + Q = @ +endif + +BPF_DIR = $(srctree)/libbpf/src + +ifneq ($(OUTPUT),) + _OUTPUT := $(OUTPUT) +else + _OUTPUT := $(CURDIR)/ +endif +BOOTSTRAP_OUTPUT := $(_OUTPUT)bootstrap/ + +LIBBPF_OUTPUT := $(_OUTPUT)libbpf/ +LIBBPF_DESTDIR := $(LIBBPF_OUTPUT) +LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)include +LIBBPF_HDRS_DIR := $(LIBBPF_INCLUDE)/bpf +LIBBPF := $(LIBBPF_OUTPUT)libbpf.a + +LIBBPF_BOOTSTRAP_OUTPUT := $(BOOTSTRAP_OUTPUT)libbpf/ +LIBBPF_BOOTSTRAP_DESTDIR := $(LIBBPF_BOOTSTRAP_OUTPUT) +LIBBPF_BOOTSTRAP_INCLUDE := $(LIBBPF_BOOTSTRAP_DESTDIR)include +LIBBPF_BOOTSTRAP_HDRS_DIR := $(LIBBPF_BOOTSTRAP_INCLUDE)/bpf +LIBBPF_BOOTSTRAP := $(LIBBPF_BOOTSTRAP_OUTPUT)libbpf.a + +# We need to copy hashmap.h, nlattr.h, relo_core.h and libbpf_internal.h +# which are not otherwise exported by libbpf, but still required by bpftool. 
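+# The two *_INTERNAL_HDRS variables below expand (via $(addprefix)) to the
+# destination paths of those headers; the static pattern rules further down
+# copy each one from $(BPF_DIR) with 'install -m 644' into the regular and
+# bootstrap include directories, e.g. hashmap.h is installed as
+# $(LIBBPF_HDRS_DIR)/hashmap.h and $(LIBBPF_BOOTSTRAP_HDRS_DIR)/hashmap.h.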
+LIBBPF_INTERNAL_HDRS := $(addprefix $(LIBBPF_HDRS_DIR)/,hashmap.h nlattr.h relo_core.h libbpf_internal.h) +LIBBPF_BOOTSTRAP_INTERNAL_HDRS := $(addprefix $(LIBBPF_BOOTSTRAP_HDRS_DIR)/,hashmap.h relo_core.h libbpf_internal.h) + +$(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) $(LIBBPF_BOOTSTRAP_OUTPUT) $(LIBBPF_HDRS_DIR) $(LIBBPF_BOOTSTRAP_HDRS_DIR): + $(QUIET_MKDIR)mkdir -p $@ + +$(LIBBPF): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_OUTPUT) + $(Q)$(MAKE) -C $(BPF_DIR) OBJDIR=$(patsubst %/,%,$(LIBBPF_OUTPUT)) \ + PREFIX=$(LIBBPF_DESTDIR:/=) $(LIBBPF) install_headers + +$(LIBBPF_INTERNAL_HDRS): $(LIBBPF_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_HDRS_DIR) + $(call QUIET_INSTALL, $@) + $(Q)install -m 644 -t $(LIBBPF_HDRS_DIR) $< + +$(LIBBPF_BOOTSTRAP): $(wildcard $(BPF_DIR)/*.[ch] $(BPF_DIR)/Makefile) | $(LIBBPF_BOOTSTRAP_OUTPUT) + $(Q)$(MAKE) -C $(BPF_DIR) OBJDIR=$(patsubst %/,%,$(LIBBPF_BOOTSTRAP_OUTPUT)) \ + PREFIX=$(LIBBPF_BOOTSTRAP_DESTDIR:/=) \ + ARCH= CROSS_COMPILE= CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" $@ install_headers + +$(LIBBPF_BOOTSTRAP_INTERNAL_HDRS): $(LIBBPF_BOOTSTRAP_HDRS_DIR)/%.h: $(BPF_DIR)/%.h | $(LIBBPF_BOOTSTRAP_HDRS_DIR) + $(call QUIET_INSTALL, $@) + $(Q)install -m 644 -t $(LIBBPF_BOOTSTRAP_HDRS_DIR) $< + +$(LIBBPF)-clean: FORCE | $(LIBBPF_OUTPUT) + $(call QUIET_CLEAN, libbpf) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null + +$(LIBBPF_BOOTSTRAP)-clean: FORCE | $(LIBBPF_BOOTSTRAP_OUTPUT) + $(call QUIET_CLEAN, libbpf-bootstrap) + $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_BOOTSTRAP_OUTPUT) clean >/dev/null + +prefix ?= /usr/local +bash_compdir ?= /usr/share/bash-completion/completions + +CFLAGS += -O2 +CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers +CFLAGS += $(filter-out -Wswitch-enum -Wnested-externs,$(EXTRA_WARNINGS)) +CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \ + -I$(or $(OUTPUT),.) 
\ + -I$(LIBBPF_INCLUDE) \ + -I$(srctree)/src/kernel/bpf/ \ + -I$(srctree)/include \ + -I$(srctree)/include/uapi +ifneq ($(BPFTOOL_VERSION),) +CFLAGS += -DBPFTOOL_VERSION='"$(BPFTOOL_VERSION)"' +endif +ifneq ($(EXTRA_CFLAGS),) +CFLAGS += $(EXTRA_CFLAGS) +endif +ifneq ($(EXTRA_LDFLAGS),) +LDFLAGS += $(EXTRA_LDFLAGS) +endif + +INSTALL ?= install +RM ?= rm -f + +FEATURE_USER = .bpftool + +FEATURE_TESTS := clang-bpf-co-re +FEATURE_TESTS += llvm +FEATURE_TESTS += libcap +FEATURE_TESTS += libbfd +FEATURE_TESTS += libbfd-liberty +FEATURE_TESTS += libbfd-liberty-z +FEATURE_TESTS += disassembler-four-args +FEATURE_TESTS += disassembler-init-styled + +FEATURE_DISPLAY := clang-bpf-co-re +FEATURE_DISPLAY += llvm +FEATURE_DISPLAY += libcap +FEATURE_DISPLAY += libbfd +FEATURE_DISPLAY += libbfd-liberty +FEATURE_DISPLAY += libbfd-liberty-z + +check_feat := 1 +NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall +ifdef MAKECMDGOALS +ifeq ($(filter-out $(NON_CHECK_FEAT_TARGETS),$(MAKECMDGOALS)),) + check_feat := 0 +endif +endif + +ifeq ($(check_feat),1) +include Makefile.feature +endif + +LIBS = $(LIBBPF) -lelf -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz +ifeq ($(feature-libcap), 1) +CFLAGS += -DUSE_LIBCAP +LIBS += -lcap +endif + +include $(wildcard $(OUTPUT)*.d) + +all: $(OUTPUT)bpftool + +SRCS := $(wildcard *.c) + +ifeq ($(feature-llvm),1) + # If LLVM is available, use it for JIT disassembly + CFLAGS += -DHAVE_LLVM_SUPPORT + LLVM_CONFIG_LIB_COMPONENTS := mcdisassembler all-targets + CFLAGS += $(shell $(LLVM_CONFIG) --cflags --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LIBS += $(shell $(LLVM_CONFIG) --libs $(LLVM_CONFIG_LIB_COMPONENTS)) + ifeq ($(shell $(LLVM_CONFIG) --shared-mode),static) + LIBS += $(shell $(LLVM_CONFIG) --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) + LIBS += -lstdc++ + endif + LDFLAGS += $(shell $(LLVM_CONFIG) --ldflags) +else + # Fall back on libbfd + ifeq ($(feature-libbfd),1) + LIBS += -lbfd -ldl -lopcodes + else ifeq ($(feature-libbfd-liberty),1) + LIBS += -lbfd -ldl -lopcodes -liberty + else ifeq ($(feature-libbfd-liberty-z),1) + LIBS += -lbfd -ldl -lopcodes -liberty -lz + endif + + # If one of the above feature combinations is set, we support libbfd + ifneq ($(filter -lbfd,$(LIBS)),) + CFLAGS += -DHAVE_LIBBFD_SUPPORT + + # Libbfd interface changed over time, figure out what we need + ifeq ($(feature-disassembler-four-args), 1) + CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE + endif + ifeq ($(feature-disassembler-init-styled), 1) + CFLAGS += -DDISASM_INIT_STYLED + endif + endif +endif +ifeq ($(filter -DHAVE_LLVM_SUPPORT -DHAVE_LIBBFD_SUPPORT,$(CFLAGS)),) + # No support for JIT disassembly + SRCS := $(filter-out jit_disasm.c,$(SRCS)) +endif + +HOST_CFLAGS = $(subst -I$(LIBBPF_INCLUDE),-I$(LIBBPF_BOOTSTRAP_INCLUDE),\ + $(subst $(CLANG_CROSS_FLAGS),,$(CFLAGS))) + +BPFTOOL_BOOTSTRAP := $(BOOTSTRAP_OUTPUT)bpftool + +BOOTSTRAP_OBJS = $(addprefix $(BOOTSTRAP_OUTPUT),main.o common.o json_writer.o gen.o btf.o xlated_dumper.o btf_dumper.o disasm.o) +$(BOOTSTRAP_OBJS): $(LIBBPF_BOOTSTRAP) + +OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o +$(OBJS): $(LIBBPF) $(LIBBPF_INTERNAL_HDRS) + +VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ + $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ + ../../../vmlinux \ + /sys/kernel/btf/vmlinux \ + /boot/vmlinux-$(shell uname -r) +VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) + +bootstrap: $(BPFTOOL_BOOTSTRAP) + +ifneq ($(VMLINUX_BTF)$(VMLINUX_H),) +ifeq ($(feature-clang-bpf-co-re),1) + 
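+# Both prerequisites are met: a BTF-enabled vmlinux (or a pre-generated
+# vmlinux.h) was found and clang can emit CO-RE relocations. The rules below
+# dump vmlinux.h with the bootstrap bpftool, compile the skeleton BPF programs
+# (profiler, pid_iter) with clang --target=bpf, and embed them via
+# 'bpftool gen skeleton'. If either check fails, BUILD_BPF_SKELS stays unset
+# and -DBPFTOOL_WITHOUT_SKELETONS is added to CFLAGS further down.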
+BUILD_BPF_SKELS := 1 + +$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) +ifeq ($(VMLINUX_H),) + $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) btf dump file $< format c > $@ +else + $(Q)cp "$(VMLINUX_H)" $@ +endif + +$(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF_BOOTSTRAP) + $(QUIET_CLANG)$(CLANG) \ + -I$(or $(OUTPUT),.) \ + -I$(srctree)/include/uapi/ \ + -I$(LIBBPF_BOOTSTRAP_INCLUDE) \ + -g -O2 -Wall -fno-stack-protector \ + --target=bpf -c $< -o $@ + $(Q)$(LLVM_STRIP) -g $@ + +$(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP) + $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@ + +$(OUTPUT)prog.o: $(OUTPUT)profiler.skel.h + +$(OUTPUT)pids.o: $(OUTPUT)pid_iter.skel.h + +endif +endif + +CFLAGS += $(if $(BUILD_BPF_SKELS),,-DBPFTOOL_WITHOUT_SKELETONS) + +$(BOOTSTRAP_OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c + $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ + +$(OUTPUT)disasm.o: $(srctree)/src/kernel/bpf/disasm.c + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ + +$(BPFTOOL_BOOTSTRAP): $(BOOTSTRAP_OBJS) $(LIBBPF_BOOTSTRAP) + $(QUIET_LINK)$(HOSTCC) $(HOST_CFLAGS) $(LDFLAGS) $(BOOTSTRAP_OBJS) $(LIBS_BOOTSTRAP) -o $@ + +$(OUTPUT)bpftool: $(OBJS) $(LIBBPF) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@ + +$(BOOTSTRAP_OUTPUT)%.o: %.c $(LIBBPF_BOOTSTRAP_INTERNAL_HDRS) | $(BOOTSTRAP_OUTPUT) + $(QUIET_CC)$(HOSTCC) $(HOST_CFLAGS) -c -MMD $< -o $@ + +$(OUTPUT)%.o: %.c + $(QUIET_CC)$(CC) $(CFLAGS) -c -MMD $< -o $@ + +feature-detect-clean: + $(call QUIET_CLEAN, feature-detect) + $(Q)$(MAKE) -C $(srctree)/tools/build/feature/ clean >/dev/null + +clean: $(LIBBPF)-clean $(LIBBPF_BOOTSTRAP)-clean + $(call QUIET_CLEAN, bpftool) + $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d + $(Q)$(RM) -- $(OUTPUT)*.skel.h $(OUTPUT)vmlinux.h + $(Q)$(RM) -r -- $(LIBBPF_OUTPUT) $(BOOTSTRAP_OUTPUT) + $(call QUIET_CLEAN, core-gen) + $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool + $(Q)$(RM) -r -- $(OUTPUT)feature/ + +install-bin: $(OUTPUT)bpftool + $(call QUIET_INSTALL, bpftool) + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/sbin + $(Q)$(INSTALL) $(OUTPUT)bpftool $(DESTDIR)$(prefix)/sbin/bpftool + +install: install-bin + $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir) + $(Q)$(INSTALL) -m 0644 $(srctree)/bash-completion/bpftool $(DESTDIR)$(bash_compdir) + +uninstall: + $(call QUIET_UNINST, bpftool) + $(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool + $(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool + +doc: + $(call descend,$(srctree)/docs) + +doc-clean: + $(call descend,$(srctree)/docs,clean) + +doc-install: + $(call descend,$(srctree)/docs,install) + +doc-uninstall: + $(call descend,$(srctree)/docs,uninstall) + +FORCE: + +.SECONDARY: +.PHONY: all FORCE bootstrap clean install-bin install uninstall +.PHONY: doc doc-clean doc-install doc-uninstall +.DEFAULT_GOAL := all + +# Delete partially updated (corrupted) files on error +.DELETE_ON_ERROR: diff --git a/third_party/bpftool/src/Makefile.feature b/third_party/bpftool/src/Makefile.feature new file mode 100644 index 0000000..d091cda --- /dev/null +++ b/third_party/bpftool/src/Makefile.feature @@ -0,0 +1,179 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +pound := \# + +CFLAGS_BACKUP := $(CFLAGS) +CFLAGS := $(EXTRA_CFLAGS) +ifneq ($(LLVM),) + CFLAGS += -Wno-unused-command-line-argument +endif + +ifeq ($(V),1) + LOG=$(warning $(1)) + LOG_RES = (echo $(1) && >&2 echo result: $(1)) + define detect + $(warning $(1) && $(call LOG_RES,1) || $(call LOG_RES,0)) + $(shell $(1) && $(call LOG_RES,1) || $(call LOG_RES,0)) 
+ endef +else + LOG= + LOG_RES = (echo $(1)) + define detect + $(shell $(1) 2>&1 && $(call LOG_RES,1) || $(call LOG_RES,0)) + endef + QUIET_STDERR := 2>/dev/null +endif + +### feature-clang-bpf-co-re + +CLANG_BPF_CO_RE_PROBE_CMD = \ + printf '%s\n' 'struct s { int i; } __attribute__((preserve_access_index)); struct s foo;' | \ + $(CLANG) -g -target bpf -S -o - -x c - $(QUIET_STDERR) | grep -q BTF_KIND_VAR + +ifneq ($(findstring clang-bpf-co-re,$(FEATURE_TESTS)),) +$(call LOG,Probing: feature-clang-bpf-co-re) +feature-clang-bpf-co-re := \ + $(findstring 1,$(call detect,$(CLANG_BPF_CO_RE_PROBE_CMD))) +endif # clang-bpf-co-re + +### feature-libbfd + +ifneq ($(findstring libbfd,$(FEATURE_TESTS)),) +LIBBFD_PROBE := '$(pound)include \n' +LIBBFD_PROBE += 'int main(void) {' +LIBBFD_PROBE += ' bfd_demangle(0, 0, 0);' +LIBBFD_PROBE += ' return 0;' +LIBBFD_PROBE += '}' +LIBBFD_PROBE_CMD = printf '%b\n' $(LIBBFD_PROBE) | \ + $(CC) $(CFLAGS) -Wall -Werror -x c - $(1) -o /dev/null >/dev/null + +define libbfd_build + $(call detect,$(LIBBFD_PROBE_CMD)) +endef + +$(call LOG,Probing: feature-libbfd) +feature-libbfd := \ + $(findstring 1,$(call libbfd_build,-lbfd -ldl)) +ifneq ($(feature-libbfd),1) + $(call LOG,Probing: feature-libbfd-liberty) + feature-libbfd-liberty := \ + $(findstring 1,$(call libbfd_build,-lbfd -ldl -liberty)) + ifneq ($(feature-libbfd-liberty),1) + $(call LOG,Probing: feature-libbfd-liberty-z) + feature-libbfd-liberty-z := \ + $(findstring 1,$(call libbfd_build,-lbfd -ldl -liberty -lz)) + endif +endif +HAS_LIBBFD := $(findstring 1, \ + $(feature-libbfd)$(feature-libbfd-liberty)$(feature-libbfd-liberty-z)) +endif # libbfd + +### feature-disassembler-four-args + +ifneq ($(findstring disassembler-four-args,$(FEATURE_TESTS)),) +DISASSEMBLER_PROBE := '$(pound)include \n' +DISASSEMBLER_PROBE += 'int main(void) {' +DISASSEMBLER_PROBE += ' disassembler((enum bfd_architecture)0, 0, 0, NULL);' +DISASSEMBLER_PROBE += ' return 0;' +DISASSEMBLER_PROBE += '}' + +DISASSEMBLER_PROBE_CMD = printf '%b\n' $(1) | \ + $(CC) $(CFLAGS) -Wall -Werror -x c - -lbfd -lopcodes -S -o - >/dev/null +define disassembler_build + $(call detect,$(DISASSEMBLER_PROBE_CMD)) +endef + +$(call LOG,Probing: feature-disassembler-four-args) +feature-disassembler-four-args := \ + $(findstring 1, $(call disassembler_build,$(DISASSEMBLER_PROBE))) +endif # disassembler-four-args + +### feature-disassembler-init-styled + +ifneq ($(findstring disassembler-init-styled,$(FEATURE_TESTS)),) +DISASSEMBLER_STYLED_PROBE := '$(pound)include \n' +DISASSEMBLER_STYLED_PROBE += 'int main(void) {' +DISASSEMBLER_STYLED_PROBE += ' init_disassemble_info(NULL, 0, NULL, NULL);' +DISASSEMBLER_STYLED_PROBE += ' return 0;' +DISASSEMBLER_STYLED_PROBE += '}' + +$(call LOG,Probing: feature-disassembler-styled) +feature-disassembler-init-styled := \ + $(findstring 1, $(call disassembler_build,$(DISASSEMBLER_STYLED_PROBE))) +endif # disassembler-init-styled + +### feature-libcap + +ifneq ($(findstring libcap,$(FEATURE_TESTS)),) +LIBCAP_PROBE := '$(pound)include \n' +LIBCAP_PROBE += 'int main(void) {' +LIBCAP_PROBE += ' cap_free(0);' +LIBCAP_PROBE += ' return 0;' +LIBCAP_PROBE += '}' +LIBCAP_PROBE_CMD = printf '%b\n' $(LIBCAP_PROBE) | \ + $(CC) $(CFLAGS) -Wall -Werror -x c - -lcap -S -o - >/dev/null + +define libcap_build + $(call detect,$(LIBCAP_PROBE_CMD)) +endef + +$(call LOG,Probing: feature-libcap) +feature-libcap := $(findstring 1, $(call libcap_build)) +endif # libcap + +### feature-llvm + +ifneq ($(findstring llvm,$(FEATURE_TESTS)),) +LLVM_PROBE := 
'$(pound)include \n' +LLVM_PROBE += '$(pound)include \n' +LLVM_PROBE += 'int main(void) {' +LLVM_PROBE += ' char *triple = LLVMNormalizeTargetTriple("");' +LLVM_PROBE += ' LLVMDisposeMessage(triple);' +LLVM_PROBE += ' return 0;' +LLVM_PROBE += '}' + +# We need some adjustments for the flags. +# - $(CFLAGS) was set to parent $(EXTRA_CFLAGS) at the beginning of this file. +# - $(EXTRA_LDFLAGS) from parent Makefile should be kept as well. +# - Libraries to use depend on whether we have a static or shared version of +# LLVM, pass the llvm-config flag and adjust the list of libraries +# accordingly. +FEATURE_LLVM_CFLAGS := $(CFLAGS) $(shell $(LLVM_CONFIG) --cflags 2>/dev/null) +FEATURE_LLVM_LIBS := $(shell $(LLVM_CONFIG) --libs target 2>/dev/null) +ifeq ($(shell $(LLVM_CONFIG) --shared-mode 2>/dev/null),static) + FEATURE_LLVM_LIBS += $(shell $(LLVM_CONFIG) --system-libs target 2>/dev/null) + FEATURE_LLVM_LIBS += -lstdc++ +endif +FEATURE_LDFLAGS := $(EXTRA_LDFLAGS) $(shell $(LLVM_CONFIG) --ldflags 2>/dev/null) + +LLVM_PROBE_CMD = printf '%b\n' $(LLVM_PROBE) | \ + $(CC) $(FEATURE_LLVM_CFLAGS) $(FEATURE_LDFLAGS) \ + -Wall -Werror -x c - $(FEATURE_LLVM_LIBS) \ + -o /dev/null >/dev/null + +define llvm_build + $(call detect,$(LLVM_PROBE_CMD)) +endef + +$(call LOG,Probing: feature-llvm) +feature-llvm := $(findstring 1, $(call llvm_build)) +endif # llvm + +### Print detection results + +define print_status + ifeq ($(1), 1) + MSG = $(shell printf '...%30s: [ \033[32mon\033[m ]' $(2)) + else + MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(2)) + endif +endef +feature_print_status = $(eval $(print_status)) $(info $(MSG)) + +$(call feature_print_status,$(HAS_LIBBFD),libbfd) + +$(foreach feature,$(filter-out libbfd%,$(FEATURE_DISPLAY)), \ + $(call feature_print_status,$(feature-$(feature)),$(feature))) + +CFLAGS := $(CFLAGS_BACKUP) +undefine LOG LOG_RES diff --git a/third_party/bpftool/src/Makefile.include b/third_party/bpftool/src/Makefile.include new file mode 100644 index 0000000..d01b7d1 --- /dev/null +++ b/third_party/bpftool/src/Makefile.include @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + +ifneq ($(OUTPUT),) +$(if $(shell [ -d "$(OUTPUT)" -a -x "$(OUTPUT)" ] && echo 1),, \ + $(error output directory "$(OUTPUT)" does not exist)) +endif + +LLVM_VERSION ?= +CLANG ?= clang$(LLVM_VERSION) +LLVM_CONFIG ?= llvm-config$(LLVM_VERSION) +LLVM_STRIP ?= llvm-strip$(LLVM_VERSION) + +ifneq ($(LLVM),) + $(if $(findstring default,$(origin AR)),$(eval AR := llvm-ar$(LLVM_VERSION))) + $(if $(findstring default,$(origin CC)),$(eval CC := clang$(LLVM_VERSION))) + $(if $(findstring default,$(origin LD)),$(eval LD := ld.lld$(LLVM_VERSION))) + HOSTAR ?= llvm-ar + HOSTCC ?= clang + HOSTLD ?= ld.lld +else + $(if $(findstring default,$(origin AR)),$(eval AR = $(CROSS_COMPILE)$(AR))) + $(if $(findstring default,$(origin CC)),$(eval CC = $(CROSS_COMPILE)$(CC))) + $(if $(findstring default,$(origin LD)),$(eval LD = $(CROSS_COMPILE)$(LD))) + HOSTAR ?= ar + HOSTCC ?= gcc + HOSTLD ?= ld +endif + +EXTRA_WARNINGS := \ + -Wbad-function-cast \ + -Wdeclaration-after-statement \ + -Wformat-security \ + -Wformat-y2k \ + -Winit-self \ + -Wmissing-declarations \ + -Wmissing-prototypes \ + -Wold-style-definition \ + -Wpacked \ + -Wredundant-decls \ + -Wshadow \ + -Wstrict-prototypes \ + -Wswitch-default \ + -Wundef \ + -Wwrite-strings \ + +ifeq ($(findstring s,$(filter-out --%,$(MAKEFLAGS))),) + ifneq ($(V),1) + + define def_quiet_msg + $(eval QUIET_$(1) = @printf ' %-9s%s\n' $(1) $$@;) + endef + 
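+# def_quiet_msg instantiates one QUIET_<ACTION> variable per action name; each
+# expands to an @printf that prints an aligned status line (e.g.
+# " CC       bpftool") in front of the recipe instead of echoing the full
+# command. The block is skipped when make runs with -s (silent anyway) or with
+# V=1, in which case the QUIET_* variables stay empty and the complete
+# commands are echoed as usual.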
$(foreach action,CC CLANG LINK MKDIR GEN,$(call def_quiet_msg,$(action))) + + define def_quiet_msg_subdir + $(eval QUIET_$(1) = @printf ' %-9s%s\n' $(1) $$1;) + endef + $(foreach action,CLEAN INSTALL UNINST,$(call def_quiet_msg_subdir,$(action))) + + define descend + @printf ' DESCEND %s\n' $(1); mkdir -p $(OUTPUT)$(1) && \ + $(MAKE) --no-print-directory -C $(1) $(2) + endef + + endif +endif diff --git a/third_party/bpftool/src/btf.c b/third_party/bpftool/src/btf.c new file mode 100644 index 0000000..91fcb75 --- /dev/null +++ b/third_party/bpftool/src/btf.c @@ -0,0 +1,1088 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", + [BTF_KIND_FLOAT] = "FLOAT", + [BTF_KIND_DECL_TAG] = "DECL_TAG", + [BTF_KIND_TYPE_TAG] = "TYPE_TAG", + [BTF_KIND_ENUM64] = "ENUM64", +}; + +static const char *btf_int_enc_str(__u8 encoding) +{ + switch (encoding) { + case 0: + return "(none)"; + case BTF_INT_SIGNED: + return "SIGNED"; + case BTF_INT_CHAR: + return "CHAR"; + case BTF_INT_BOOL: + return "BOOL"; + default: + return "UNKN"; + } +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: + return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: + return "global"; + case BTF_VAR_GLOBAL_EXTERN: + return "extern"; + default: + return "(unknown)"; + } +} + +static const char *btf_func_linkage_str(const struct btf_type *t) +{ + switch (btf_vlen(t)) { + case BTF_FUNC_STATIC: + return "static"; + case BTF_FUNC_GLOBAL: + return "global"; + case BTF_FUNC_EXTERN: + return "extern"; + default: + return "(unknown)"; + } +} + +static const char *btf_str(const struct btf *btf, __u32 off) +{ + if (!off) + return "(anon)"; + return btf__name_by_offset(btf, off) ? : "(invalid)"; +} + +static int btf_kind_safe(int kind) +{ + return kind <= BTF_KIND_MAX ? 
kind : BTF_KIND_UNKN; +} + +static int dump_btf_type(const struct btf *btf, __u32 id, + const struct btf_type *t) +{ + json_writer_t *w = json_wtr; + int kind = btf_kind(t); + + if (json_output) { + jsonw_start_object(w); + jsonw_uint_field(w, "id", id); + jsonw_string_field(w, "kind", btf_kind_str[btf_kind_safe(kind)]); + jsonw_string_field(w, "name", btf_str(btf, t->name_off)); + } else { + printf("[%u] %s '%s'", id, btf_kind_str[btf_kind_safe(kind)], + btf_str(btf, t->name_off)); + } + + switch (kind) { + case BTF_KIND_INT: { + __u32 v = *(__u32 *)(t + 1); + const char *enc; + + enc = btf_int_enc_str(BTF_INT_ENCODING(v)); + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "bits_offset", BTF_INT_OFFSET(v)); + jsonw_uint_field(w, "nr_bits", BTF_INT_BITS(v)); + jsonw_string_field(w, "encoding", enc); + } else { + printf(" size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(v), BTF_INT_BITS(v), + enc); + } + break; + } + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_TYPE_TAG: + if (json_output) + jsonw_uint_field(w, "type_id", t->type); + else + printf(" type_id=%u", t->type); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *arr = (const void *)(t + 1); + + if (json_output) { + jsonw_uint_field(w, "type_id", arr->type); + jsonw_uint_field(w, "index_type_id", arr->index_type); + jsonw_uint_field(w, "nr_elems", arr->nelems); + } else { + printf(" type_id=%u index_type_id=%u nr_elems=%u", + arr->type, arr->index_type, arr->nelems); + } + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "members"); + jsonw_start_array(w); + } else { + printf(" size=%u vlen=%u", t->size, vlen); + } + for (i = 0; i < vlen; i++, m++) { + const char *name = btf_str(btf, m->name_off); + __u32 bit_off, bit_sz; + + if (BTF_INFO_KFLAG(t->info)) { + bit_off = BTF_MEMBER_BIT_OFFSET(m->offset); + bit_sz = BTF_MEMBER_BITFIELD_SIZE(m->offset); + } else { + bit_off = m->offset; + bit_sz = 0; + } + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + jsonw_uint_field(w, "type_id", m->type); + jsonw_uint_field(w, "bits_offset", bit_off); + if (bit_sz) { + jsonw_uint_field(w, "bitfield_size", + bit_sz); + } + jsonw_end_object(w); + } else { + printf("\n\t'%s' type_id=%u bits_offset=%u", + name, m->type, bit_off); + if (bit_sz) + printf(" bitfield_size=%u", bit_sz); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *v = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + const char *encoding; + int i; + + encoding = btf_kflag(t) ? 
"SIGNED" : "UNSIGNED"; + if (json_output) { + jsonw_string_field(w, "encoding", encoding); + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "values"); + jsonw_start_array(w); + } else { + printf(" encoding=%s size=%u vlen=%u", encoding, t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + const char *name = btf_str(btf, v->name_off); + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + if (btf_kflag(t)) + jsonw_int_field(w, "val", v->val); + else + jsonw_uint_field(w, "val", v->val); + jsonw_end_object(w); + } else { + if (btf_kflag(t)) + printf("\n\t'%s' val=%d", name, v->val); + else + printf("\n\t'%s' val=%u", name, v->val); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_ENUM64: { + const struct btf_enum64 *v = btf_enum64(t); + __u16 vlen = btf_vlen(t); + const char *encoding; + int i; + + encoding = btf_kflag(t) ? "SIGNED" : "UNSIGNED"; + if (json_output) { + jsonw_string_field(w, "encoding", encoding); + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "values"); + jsonw_start_array(w); + } else { + printf(" encoding=%s size=%u vlen=%u", encoding, t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + const char *name = btf_str(btf, v->name_off); + __u64 val = ((__u64)v->val_hi32 << 32) | v->val_lo32; + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + if (btf_kflag(t)) + jsonw_int_field(w, "val", val); + else + jsonw_uint_field(w, "val", val); + jsonw_end_object(w); + } else { + if (btf_kflag(t)) + printf("\n\t'%s' val=%lldLL", name, + (unsigned long long)val); + else + printf("\n\t'%s' val=%lluULL", name, + (unsigned long long)val); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_FWD: { + const char *fwd_kind = BTF_INFO_KFLAG(t->info) ? 
"union" + : "struct"; + + if (json_output) + jsonw_string_field(w, "fwd_kind", fwd_kind); + else + printf(" fwd_kind=%s", fwd_kind); + break; + } + case BTF_KIND_FUNC: { + const char *linkage = btf_func_linkage_str(t); + + if (json_output) { + jsonw_uint_field(w, "type_id", t->type); + jsonw_string_field(w, "linkage", linkage); + } else { + printf(" type_id=%u linkage=%s", t->type, linkage); + } + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = (const void *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "ret_type_id", t->type); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "params"); + jsonw_start_array(w); + } else { + printf(" ret_type_id=%u vlen=%u", t->type, vlen); + } + for (i = 0; i < vlen; i++, p++) { + const char *name = btf_str(btf, p->name_off); + + if (json_output) { + jsonw_start_object(w); + jsonw_string_field(w, "name", name); + jsonw_uint_field(w, "type_id", p->type); + jsonw_end_object(w); + } else { + printf("\n\t'%s' type_id=%u", name, p->type); + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_VAR: { + const struct btf_var *v = (const void *)(t + 1); + const char *linkage; + + linkage = btf_var_linkage_str(v->linkage); + + if (json_output) { + jsonw_uint_field(w, "type_id", t->type); + jsonw_string_field(w, "linkage", linkage); + } else { + printf(" type_id=%u, linkage=%s", t->type, linkage); + } + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = (const void *)(t + 1); + const struct btf_type *vt; + __u16 vlen = BTF_INFO_VLEN(t->info); + int i; + + if (json_output) { + jsonw_uint_field(w, "size", t->size); + jsonw_uint_field(w, "vlen", vlen); + jsonw_name(w, "vars"); + jsonw_start_array(w); + } else { + printf(" size=%u vlen=%u", t->size, vlen); + } + for (i = 0; i < vlen; i++, v++) { + if (json_output) { + jsonw_start_object(w); + jsonw_uint_field(w, "type_id", v->type); + jsonw_uint_field(w, "offset", v->offset); + jsonw_uint_field(w, "size", v->size); + jsonw_end_object(w); + } else { + printf("\n\ttype_id=%u offset=%u size=%u", + v->type, v->offset, v->size); + + if (v->type < btf__type_cnt(btf)) { + vt = btf__type_by_id(btf, v->type); + printf(" (%s '%s')", + btf_kind_str[btf_kind_safe(btf_kind(vt))], + btf_str(btf, vt->name_off)); + } + } + } + if (json_output) + jsonw_end_array(w); + break; + } + case BTF_KIND_FLOAT: { + if (json_output) + jsonw_uint_field(w, "size", t->size); + else + printf(" size=%u", t->size); + break; + } + case BTF_KIND_DECL_TAG: { + const struct btf_decl_tag *tag = (const void *)(t + 1); + + if (json_output) { + jsonw_uint_field(w, "type_id", t->type); + jsonw_int_field(w, "component_idx", tag->component_idx); + } else { + printf(" type_id=%u component_idx=%d", t->type, tag->component_idx); + } + break; + } + default: + break; + } + + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + + return 0; +} + +static int dump_btf_raw(const struct btf *btf, + __u32 *root_type_ids, int root_type_cnt) +{ + const struct btf_type *t; + int i; + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "types"); + jsonw_start_array(json_wtr); + } + + if (root_type_cnt) { + for (i = 0; i < root_type_cnt; i++) { + t = btf__type_by_id(btf, root_type_ids[i]); + dump_btf_type(btf, root_type_ids[i], t); + } + } else { + const struct btf *base; + int cnt = btf__type_cnt(btf); + int start_id = 1; + + base = btf__base_btf(btf); + if (base) + start_id = btf__type_cnt(base); + + for (i = 
start_id; i < cnt; i++) { + t = btf__type_by_id(btf, i); + dump_btf_type(btf, i, t); + } + } + + if (json_output) { + jsonw_end_array(json_wtr); + jsonw_end_object(json_wtr); + } + return 0; +} + +static void __printf(2, 0) btf_dump_printf(void *ctx, + const char *fmt, va_list args) +{ + vfprintf(stdout, fmt, args); +} + +static int dump_btf_c(const struct btf *btf, + __u32 *root_type_ids, int root_type_cnt) +{ + struct btf_dump *d; + int err = 0, i; + + d = btf_dump__new(btf, btf_dump_printf, NULL, NULL); + if (!d) + return -errno; + + printf("#ifndef __VMLINUX_H__\n"); + printf("#define __VMLINUX_H__\n"); + printf("\n"); + printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); + printf("#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)\n"); + printf("#endif\n\n"); + + if (root_type_cnt) { + for (i = 0; i < root_type_cnt; i++) { + err = btf_dump__dump_type(d, root_type_ids[i]); + if (err) + goto done; + } + } else { + int cnt = btf__type_cnt(btf); + + for (i = 1; i < cnt; i++) { + err = btf_dump__dump_type(d, i); + if (err) + goto done; + } + } + + printf("#ifndef BPF_NO_PRESERVE_ACCESS_INDEX\n"); + printf("#pragma clang attribute pop\n"); + printf("#endif\n"); + printf("\n"); + printf("#endif /* __VMLINUX_H__ */\n"); + +done: + btf_dump__free(d); + return err; +} + +static const char sysfs_vmlinux[] = "/sys/kernel/btf/vmlinux"; + +static struct btf *get_vmlinux_btf_from_sysfs(void) +{ + struct btf *base; + + base = btf__parse(sysfs_vmlinux, NULL); + if (!base) + p_err("failed to parse vmlinux BTF at '%s': %d\n", + sysfs_vmlinux, -errno); + + return base; +} + +#define BTF_NAME_BUFF_LEN 64 + +static bool btf_is_kernel_module(__u32 btf_id) +{ + struct bpf_btf_info btf_info = {}; + char btf_name[BTF_NAME_BUFF_LEN]; + int btf_fd; + __u32 len; + int err; + + btf_fd = bpf_btf_get_fd_by_id(btf_id); + if (btf_fd < 0) { + p_err("can't get BTF object by id (%u): %s", btf_id, strerror(errno)); + return false; + } + + len = sizeof(btf_info); + btf_info.name = ptr_to_u64(btf_name); + btf_info.name_len = sizeof(btf_name); + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); + close(btf_fd); + if (err) { + p_err("can't get BTF (ID %u) object info: %s", btf_id, strerror(errno)); + return false; + } + + return btf_info.kernel_btf && strncmp(btf_name, "vmlinux", sizeof(btf_name)) != 0; +} + +static int do_dump(int argc, char **argv) +{ + struct btf *btf = NULL, *base = NULL; + __u32 root_type_ids[2]; + int root_type_cnt = 0; + bool dump_c = false; + __u32 btf_id = -1; + const char *src; + int fd = -1; + int err = 0; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + src = GET_ARG(); + if (is_prefix(src, "map")) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + btf_id = info.btf_id; + if (argc && is_prefix(*argv, "key")) { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "value")) { + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "all")) { + NEXT_ARG(); + } else if (argc && is_prefix(*argv, "kv")) { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + NEXT_ARG(); + } else { + root_type_ids[root_type_cnt++] = info.btf_key_type_id; + root_type_ids[root_type_cnt++] = info.btf_value_type_id; + } + } else if (is_prefix(src, "prog")) { 
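+		/*
+		 * "prog" source: resolve the program fd from the PROG
+		 * specifier, query its bpf_prog_info and remember info.btf_id.
+		 * The BTF object itself is loaded later by that id via
+		 * btf__load_from_kernel_by_id_split(), exactly as for the
+		 * "map" branch above.
+		 */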
+ struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + err = bpf_prog_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get prog info: %s", strerror(errno)); + goto done; + } + + btf_id = info.btf_id; + } else if (is_prefix(src, "id")) { + char *endptr; + + btf_id = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(src, "file")) { + const char sysfs_prefix[] = "/sys/kernel/btf/"; + + if (!base_btf && + strncmp(*argv, sysfs_prefix, sizeof(sysfs_prefix) - 1) == 0 && + strcmp(*argv, sysfs_vmlinux) != 0) + base = get_vmlinux_btf_from_sysfs(); + + btf = btf__parse_split(*argv, base ?: base_btf); + if (!btf) { + err = -errno; + p_err("failed to load BTF from %s: %s", + *argv, strerror(errno)); + goto done; + } + NEXT_ARG(); + } else { + err = -1; + p_err("unrecognized BTF source specifier: '%s'", src); + goto done; + } + + while (argc) { + if (is_prefix(*argv, "format")) { + NEXT_ARG(); + if (argc < 1) { + p_err("expecting value for 'format' option\n"); + err = -EINVAL; + goto done; + } + if (strcmp(*argv, "c") == 0) { + dump_c = true; + } else if (strcmp(*argv, "raw") == 0) { + dump_c = false; + } else { + p_err("unrecognized format specifier: '%s', possible values: raw, c", + *argv); + err = -EINVAL; + goto done; + } + NEXT_ARG(); + } else { + p_err("unrecognized option: '%s'", *argv); + err = -EINVAL; + goto done; + } + } + + if (!btf) { + if (!base_btf && btf_is_kernel_module(btf_id)) { + p_info("Warning: valid base BTF was not specified with -B option, falling back to standard base BTF (%s)", + sysfs_vmlinux); + base_btf = get_vmlinux_btf_from_sysfs(); + } + + btf = btf__load_from_kernel_by_id_split(btf_id, base_btf); + if (!btf) { + err = -errno; + p_err("get btf by id (%u): %s", btf_id, strerror(errno)); + goto done; + } + } + + if (dump_c) { + if (json_output) { + p_err("JSON output for C-syntax dump is not supported"); + err = -ENOTSUP; + goto done; + } + err = dump_btf_c(btf, root_type_ids, root_type_cnt); + } else { + err = dump_btf_raw(btf, root_type_ids, root_type_cnt); + } + +done: + close(fd); + btf__free(btf); + btf__free(base); + return err; +} + +static int btf_parse_fd(int *argc, char ***argv) +{ + unsigned int id; + char *endptr; + int fd; + + if (!is_prefix(*argv[0], "id")) { + p_err("expected 'id', got: '%s'?", **argv); + return -1; + } + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARGP(); + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) + p_err("can't get BTF object by id (%u): %s", + id, strerror(errno)); + + return fd; +} + +static int +build_btf_type_table(struct hashmap *tab, enum bpf_obj_type type, + void *info, __u32 *len) +{ + static const char * const names[] = { + [BPF_OBJ_UNKNOWN] = "unknown", + [BPF_OBJ_PROG] = "prog", + [BPF_OBJ_MAP] = "map", + }; + __u32 btf_id, id = 0; + int err; + int fd; + + while (true) { + switch (type) { + case BPF_OBJ_PROG: + err = bpf_prog_get_next_id(id, &id); + break; + case BPF_OBJ_MAP: + err = bpf_map_get_next_id(id, &id); + break; + default: + err = -1; + p_err("unexpected object type: %d", type); + goto err_free; + } + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + p_err("can't get next %s: %s%s", names[type], + strerror(errno), + errno == EINVAL ? " -- kernel too old?" 
: ""); + goto err_free; + } + + switch (type) { + case BPF_OBJ_PROG: + fd = bpf_prog_get_fd_by_id(id); + break; + case BPF_OBJ_MAP: + fd = bpf_map_get_fd_by_id(id); + break; + default: + err = -1; + p_err("unexpected object type: %d", type); + goto err_free; + } + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get %s by id (%u): %s", names[type], id, + strerror(errno)); + err = -1; + goto err_free; + } + + memset(info, 0, *len); + if (type == BPF_OBJ_PROG) + err = bpf_prog_get_info_by_fd(fd, info, len); + else + err = bpf_map_get_info_by_fd(fd, info, len); + close(fd); + if (err) { + p_err("can't get %s info: %s", names[type], + strerror(errno)); + goto err_free; + } + + switch (type) { + case BPF_OBJ_PROG: + btf_id = ((struct bpf_prog_info *)info)->btf_id; + break; + case BPF_OBJ_MAP: + btf_id = ((struct bpf_map_info *)info)->btf_id; + break; + default: + err = -1; + p_err("unexpected object type: %d", type); + goto err_free; + } + if (!btf_id) + continue; + + err = hashmap__append(tab, btf_id, id); + if (err) { + p_err("failed to append entry to hashmap for BTF ID %u, object ID %u: %s", + btf_id, id, strerror(-err)); + goto err_free; + } + } + + return 0; + +err_free: + hashmap__free(tab); + return err; +} + +static int +build_btf_tables(struct hashmap *btf_prog_table, + struct hashmap *btf_map_table) +{ + struct bpf_prog_info prog_info; + __u32 prog_len = sizeof(prog_info); + struct bpf_map_info map_info; + __u32 map_len = sizeof(map_info); + int err = 0; + + err = build_btf_type_table(btf_prog_table, BPF_OBJ_PROG, &prog_info, + &prog_len); + if (err) + return err; + + err = build_btf_type_table(btf_map_table, BPF_OBJ_MAP, &map_info, + &map_len); + if (err) { + hashmap__free(btf_prog_table); + return err; + } + + return 0; +} + +static void +show_btf_plain(struct bpf_btf_info *info, int fd, + struct hashmap *btf_prog_table, + struct hashmap *btf_map_table) +{ + struct hashmap_entry *entry; + const char *name = u64_to_ptr(info->name); + int n; + + printf("%u: ", info->id); + if (info->kernel_btf) + printf("name [%s] ", name); + else if (name && name[0]) + printf("name %s ", name); + else + printf("name "); + printf("size %uB", info->btf_size); + + n = 0; + hashmap__for_each_key_entry(btf_prog_table, entry, info->id) { + printf("%s%lu", n++ == 0 ? " prog_ids " : ",", entry->value); + } + + n = 0; + hashmap__for_each_key_entry(btf_map_table, entry, info->id) { + printf("%s%lu", n++ == 0 ? 
" map_ids " : ",", entry->value); + } + + emit_obj_refs_plain(refs_table, info->id, "\n\tpids "); + + printf("\n"); +} + +static void +show_btf_json(struct bpf_btf_info *info, int fd, + struct hashmap *btf_prog_table, + struct hashmap *btf_map_table) +{ + struct hashmap_entry *entry; + const char *name = u64_to_ptr(info->name); + + jsonw_start_object(json_wtr); /* btf object */ + jsonw_uint_field(json_wtr, "id", info->id); + jsonw_uint_field(json_wtr, "size", info->btf_size); + + jsonw_name(json_wtr, "prog_ids"); + jsonw_start_array(json_wtr); /* prog_ids */ + hashmap__for_each_key_entry(btf_prog_table, entry, info->id) { + jsonw_uint(json_wtr, entry->value); + } + jsonw_end_array(json_wtr); /* prog_ids */ + + jsonw_name(json_wtr, "map_ids"); + jsonw_start_array(json_wtr); /* map_ids */ + hashmap__for_each_key_entry(btf_map_table, entry, info->id) { + jsonw_uint(json_wtr, entry->value); + } + jsonw_end_array(json_wtr); /* map_ids */ + + emit_obj_refs_json(refs_table, info->id, json_wtr); /* pids */ + + jsonw_bool_field(json_wtr, "kernel", info->kernel_btf); + + if (name && name[0]) + jsonw_string_field(json_wtr, "name", name); + + jsonw_end_object(json_wtr); /* btf object */ +} + +static int +show_btf(int fd, struct hashmap *btf_prog_table, + struct hashmap *btf_map_table) +{ + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[64]; + int err; + + memset(&info, 0, sizeof(info)); + err = bpf_btf_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get BTF object info: %s", strerror(errno)); + return -1; + } + /* if kernel support emitting BTF object name, pass name pointer */ + if (info.name_len) { + memset(&info, 0, sizeof(info)); + info.name_len = sizeof(name); + info.name = ptr_to_u64(name); + len = sizeof(info); + + err = bpf_btf_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get BTF object info: %s", strerror(errno)); + return -1; + } + } + + if (json_output) + show_btf_json(&info, fd, btf_prog_table, btf_map_table); + else + show_btf_plain(&info, fd, btf_prog_table, btf_map_table); + + return 0; +} + +static int do_show(int argc, char **argv) +{ + struct hashmap *btf_prog_table; + struct hashmap *btf_map_table; + int err, fd = -1; + __u32 id = 0; + + if (argc == 2) { + fd = btf_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + } + + if (argc) { + if (fd >= 0) + close(fd); + return BAD_ARG(); + } + + btf_prog_table = hashmap__new(hash_fn_for_key_as_id, + equal_fn_for_key_as_id, NULL); + btf_map_table = hashmap__new(hash_fn_for_key_as_id, + equal_fn_for_key_as_id, NULL); + if (IS_ERR(btf_prog_table) || IS_ERR(btf_map_table)) { + hashmap__free(btf_prog_table); + hashmap__free(btf_map_table); + if (fd >= 0) + close(fd); + p_err("failed to create hashmap for object references"); + return -1; + } + err = build_btf_tables(btf_prog_table, btf_map_table); + if (err) { + if (fd >= 0) + close(fd); + return err; + } + build_obj_refs_table(&refs_table, BPF_OBJ_BTF); + + if (fd >= 0) { + err = show_btf(fd, btf_prog_table, btf_map_table); + close(fd); + goto exit_free; + } + + if (json_output) + jsonw_start_array(json_wtr); /* root array */ + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + p_err("can't get next BTF object: %s%s", + strerror(errno), + errno == EINVAL ? " -- kernel too old?" 
: ""); + err = -1; + break; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get BTF object by id (%u): %s", + id, strerror(errno)); + err = -1; + break; + } + + err = show_btf(fd, btf_prog_table, btf_map_table); + close(fd); + if (err) + break; + } + + if (json_output) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + hashmap__free(btf_prog_table); + hashmap__free(btf_map_table); + delete_obj_refs_table(refs_table); + + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [id BTF_ID]\n" + " %1$s %2$s dump BTF_SRC [format FORMAT]\n" + " %1$s %2$s help\n" + "\n" + " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" + " FORMAT := { raw | c }\n" + " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_PROGRAM "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-B|--base-btf} }\n" + "", + bin_name, "btf"); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { "dump", do_dump }, + { 0 } +}; + +int do_btf(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/btf_dumper.c b/third_party/bpftool/src/btf_dumper.c new file mode 100644 index 0000000..294de23 --- /dev/null +++ b/third_party/bpftool/src/btf_dumper.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include /* for (FILE *) used by json_writer */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, + __u8 bit_offset, const void *data); + +static int btf_dump_func(const struct btf *btf, char *func_sig, + const struct btf_type *func_proto, + const struct btf_type *func, int pos, int size); + +static int dump_prog_id_as_func_ptr(const struct btf_dumper *d, + const struct btf_type *func_proto, + __u32 prog_id) +{ + const struct btf_type *func_type; + int prog_fd = -1, func_sig_len; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + const char *prog_name = NULL; + struct btf *prog_btf = NULL; + struct bpf_func_info finfo; + __u32 finfo_rec_size; + char prog_str[1024]; + int err; + + /* Get the ptr's func_proto */ + func_sig_len = btf_dump_func(d->btf, prog_str, func_proto, NULL, 0, + sizeof(prog_str)); + if (func_sig_len == -1) + return -1; + + if (!prog_id) + goto print; + + /* Get the bpf_prog's name. Obtain from func_info. 
*/ + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) + goto print; + + err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); + if (err) + goto print; + + if (!info.btf_id || !info.nr_func_info) + goto print; + + finfo_rec_size = info.func_info_rec_size; + memset(&info, 0, sizeof(info)); + info.nr_func_info = 1; + info.func_info_rec_size = finfo_rec_size; + info.func_info = ptr_to_u64(&finfo); + + err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); + if (err) + goto print; + + prog_btf = btf__load_from_kernel_by_id(info.btf_id); + if (!prog_btf) + goto print; + func_type = btf__type_by_id(prog_btf, finfo.type_id); + if (!func_type || !btf_is_func(func_type)) + goto print; + + prog_name = btf__name_by_offset(prog_btf, func_type->name_off); + +print: + if (!prog_id) + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, " 0"); + else if (prog_name) + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, + " %s/prog_id:%u", prog_name, prog_id); + else + snprintf(&prog_str[func_sig_len], + sizeof(prog_str) - func_sig_len, + " /prog_id:%u", prog_id); + + prog_str[sizeof(prog_str) - 1] = '\0'; + jsonw_string(d->jw, prog_str); + btf__free(prog_btf); + if (prog_fd >= 0) + close(prog_fd); + return 0; +} + +static void btf_dumper_ptr(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + unsigned long value = *(unsigned long *)data; + const struct btf_type *ptr_type; + __s32 ptr_type_id; + + if (!d->prog_id_as_func_ptr || value > UINT32_MAX) + goto print_ptr_value; + + ptr_type_id = btf__resolve_type(d->btf, t->type); + if (ptr_type_id < 0) + goto print_ptr_value; + ptr_type = btf__type_by_id(d->btf, ptr_type_id); + if (!ptr_type || !btf_is_func_proto(ptr_type)) + goto print_ptr_value; + + if (!dump_prog_id_as_func_ptr(d, ptr_type, value)) + return; + +print_ptr_value: + if (d->is_plain_text) + jsonw_printf(d->jw, "%p", (void *)value); + else + jsonw_printf(d->jw, "%lu", value); +} + +static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id, + __u8 bit_offset, const void *data) +{ + int actual_type_id; + + actual_type_id = btf__resolve_type(d->btf, type_id); + if (actual_type_id < 0) + return actual_type_id; + + return btf_dumper_do_type(d, actual_type_id, bit_offset, data); +} + +static int btf_dumper_enum(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + const struct btf_enum *enums = btf_enum(t); + __s64 value; + __u16 i; + + switch (t->size) { + case 8: + value = *(__s64 *)data; + break; + case 4: + value = *(__s32 *)data; + break; + case 2: + value = *(__s16 *)data; + break; + case 1: + value = *(__s8 *)data; + break; + default: + return -EINVAL; + } + + for (i = 0; i < btf_vlen(t); i++) { + if (value == enums[i].val) { + jsonw_string(d->jw, + btf__name_by_offset(d->btf, + enums[i].name_off)); + return 0; + } + } + + jsonw_int(d->jw, value); + return 0; +} + +static int btf_dumper_enum64(const struct btf_dumper *d, + const struct btf_type *t, + const void *data) +{ + const struct btf_enum64 *enums = btf_enum64(t); + __u32 val_lo32, val_hi32; + __u64 value; + __u16 i; + + value = *(__u64 *)data; + val_lo32 = (__u32)value; + val_hi32 = value >> 32; + + for (i = 0; i < btf_vlen(t); i++) { + if (val_lo32 == enums[i].val_lo32 && val_hi32 == enums[i].val_hi32) { + jsonw_string(d->jw, + btf__name_by_offset(d->btf, + enums[i].name_off)); + return 0; + } + } + + jsonw_int(d->jw, value); + return 0; +} + +static bool is_str_array(const struct btf *btf, const struct btf_array *arr, 
+ const char *s) +{ + const struct btf_type *elem_type; + const char *end_s; + + if (!arr->nelems) + return false; + + elem_type = btf__type_by_id(btf, arr->type); + /* Not skipping typedef. typedef to char does not count as + * a string now. + */ + while (elem_type && btf_is_mod(elem_type)) + elem_type = btf__type_by_id(btf, elem_type->type); + + if (!elem_type || !btf_is_int(elem_type) || elem_type->size != 1) + return false; + + if (btf_int_encoding(elem_type) != BTF_INT_CHAR && + strcmp("char", btf__name_by_offset(btf, elem_type->name_off))) + return false; + + end_s = s + arr->nelems; + while (s < end_s) { + if (!*s) + return true; + if (*s <= 0x1f || *s >= 0x7f) + return false; + s++; + } + + /* '\0' is not found */ + return false; +} + +static int btf_dumper_array(const struct btf_dumper *d, __u32 type_id, + const void *data) +{ + const struct btf_type *t = btf__type_by_id(d->btf, type_id); + struct btf_array *arr = (struct btf_array *)(t + 1); + long long elem_size; + int ret = 0; + __u32 i; + + if (is_str_array(d->btf, arr, data)) { + jsonw_string(d->jw, data); + return 0; + } + + elem_size = btf__resolve_size(d->btf, arr->type); + if (elem_size < 0) + return elem_size; + + jsonw_start_array(d->jw); + for (i = 0; i < arr->nelems; i++) { + ret = btf_dumper_do_type(d, arr->type, 0, + data + i * elem_size); + if (ret) + break; + } + + jsonw_end_array(d->jw); + return ret; +} + +static void btf_int128_print(json_writer_t *jw, const void *data, + bool is_plain_text) +{ + /* data points to a __int128 number. + * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + __u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(__u64 *)data; + lower_num = *(__u64 *)(data + 8); +#else + upper_num = *(__u64 *)(data + 8); + lower_num = *(__u64 *)data; +#endif + + if (is_plain_text) { + if (upper_num == 0) + jsonw_printf(jw, "0x%llx", lower_num); + else + jsonw_printf(jw, "0x%llx%016llx", upper_num, lower_num); + } else { + if (upper_num == 0) + jsonw_printf(jw, "\"0x%llx\"", lower_num); + else + jsonw_printf(jw, "\"0x%llx%016llx\"", upper_num, lower_num); + } +} + +static void btf_int128_shift(__u64 *print_num, __u16 left_shift_bits, + __u16 right_shift_bits) +{ + __u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + print_num[1] = upper_num; +#endif +} + +static void btf_dumper_bitfield(__u32 nr_bits, __u8 bit_offset, + const void *data, json_writer_t *jw, + bool is_plain_text) +{ + int left_shift_bits, right_shift_bits; + __u64 print_num[2] = {}; + int bytes_to_copy; + int bits_to_copy; + + bits_to_copy = bit_offset + nr_bits; 
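+	/*
+	 * Copy only the bytes covering the bitfield into a zeroed 128-bit
+	 * buffer, then let btf_int128_shift() push the field's most
+	 * significant bit up to bit 127 and right-align it again, so that
+	 * every bit outside the field is shifted out. The #if below selects
+	 * the left shift for big- vs little-endian bitfield layouts; e.g. on
+	 * a little-endian build, a 5-bit field at bit_offset 3 copies one
+	 * byte, shifts left by 128 - 8 = 120, then right by 128 - 5 = 123.
+	 */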
+ bytes_to_copy = BITS_ROUNDUP_BYTES(bits_to_copy); + + memcpy(print_num, data, bytes_to_copy); +#if defined(__BIG_ENDIAN_BITFIELD) + left_shift_bits = bit_offset; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + left_shift_bits = 128 - bits_to_copy; +#else +#error neither big nor little endian +#endif + right_shift_bits = 128 - nr_bits; + + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(jw, print_num, is_plain_text); +} + + +static void btf_dumper_int_bits(__u32 int_type, __u8 bit_offset, + const void *data, json_writer_t *jw, + bool is_plain_text) +{ + int nr_bits = BTF_INT_BITS(int_type); + int total_bits_offset; + + /* bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 128 bits. + */ + total_bits_offset = bit_offset + BTF_INT_OFFSET(int_type); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bit_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_dumper_bitfield(nr_bits, bit_offset, data, jw, + is_plain_text); +} + +static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset, + const void *data, json_writer_t *jw, + bool is_plain_text) +{ + __u32 *int_type; + __u32 nr_bits; + + int_type = (__u32 *)(t + 1); + nr_bits = BTF_INT_BITS(*int_type); + /* if this is bit field */ + if (bit_offset || BTF_INT_OFFSET(*int_type) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_dumper_int_bits(*int_type, bit_offset, data, jw, + is_plain_text); + return 0; + } + + if (nr_bits == 128) { + btf_int128_print(jw, data, is_plain_text); + return 0; + } + + switch (BTF_INT_ENCODING(*int_type)) { + case 0: + if (BTF_INT_BITS(*int_type) == 64) + jsonw_printf(jw, "%llu", *(__u64 *)data); + else if (BTF_INT_BITS(*int_type) == 32) + jsonw_printf(jw, "%u", *(__u32 *)data); + else if (BTF_INT_BITS(*int_type) == 16) + jsonw_printf(jw, "%hu", *(__u16 *)data); + else if (BTF_INT_BITS(*int_type) == 8) + jsonw_printf(jw, "%hhu", *(__u8 *)data); + else + btf_dumper_int_bits(*int_type, bit_offset, data, jw, + is_plain_text); + break; + case BTF_INT_SIGNED: + if (BTF_INT_BITS(*int_type) == 64) + jsonw_printf(jw, "%lld", *(long long *)data); + else if (BTF_INT_BITS(*int_type) == 32) + jsonw_printf(jw, "%d", *(int *)data); + else if (BTF_INT_BITS(*int_type) == 16) + jsonw_printf(jw, "%hd", *(short *)data); + else if (BTF_INT_BITS(*int_type) == 8) + jsonw_printf(jw, "%hhd", *(char *)data); + else + btf_dumper_int_bits(*int_type, bit_offset, data, jw, + is_plain_text); + break; + case BTF_INT_CHAR: + if (isprint(*(char *)data)) + jsonw_printf(jw, "\"%c\"", *(char *)data); + else + if (is_plain_text) + jsonw_printf(jw, "0x%hhx", *(char *)data); + else + jsonw_printf(jw, "\"\\u00%02hhx\"", + *(char *)data); + break; + case BTF_INT_BOOL: + jsonw_bool(jw, *(bool *)data); + break; + default: + /* shouldn't happen */ + return -EINVAL; + } + + return 0; +} + +static int btf_dumper_struct(const struct btf_dumper *d, __u32 type_id, + const void *data) +{ + const struct btf_type *t; + struct btf_member *m; + const void *data_off; + int kind_flag; + int ret = 0; + int i, vlen; + + t = btf__type_by_id(d->btf, type_id); + if (!t) + return -EINVAL; + + kind_flag = BTF_INFO_KFLAG(t->info); + vlen = BTF_INFO_VLEN(t->info); + jsonw_start_object(d->jw); + m = (struct btf_member *)(t + 1); + + for (i = 0; i < vlen; i++) { + __u32 bit_offset = m[i].offset; + __u32 bitfield_size = 0; + + if (kind_flag) { + bitfield_size = BTF_MEMBER_BITFIELD_SIZE(bit_offset); + bit_offset = BTF_MEMBER_BIT_OFFSET(bit_offset); + } + + jsonw_name(d->jw, btf__name_by_offset(d->btf, m[i].name_off)); + data_off = data + 
BITS_ROUNDDOWN_BYTES(bit_offset); + if (bitfield_size) { + btf_dumper_bitfield(bitfield_size, + BITS_PER_BYTE_MASKED(bit_offset), + data_off, d->jw, d->is_plain_text); + } else { + ret = btf_dumper_do_type(d, m[i].type, + BITS_PER_BYTE_MASKED(bit_offset), + data_off); + if (ret) + break; + } + } + + jsonw_end_object(d->jw); + + return ret; +} + +static int btf_dumper_var(const struct btf_dumper *d, __u32 type_id, + __u8 bit_offset, const void *data) +{ + const struct btf_type *t = btf__type_by_id(d->btf, type_id); + int ret; + + jsonw_start_object(d->jw); + jsonw_name(d->jw, btf__name_by_offset(d->btf, t->name_off)); + ret = btf_dumper_do_type(d, t->type, bit_offset, data); + jsonw_end_object(d->jw); + + return ret; +} + +static int btf_dumper_datasec(const struct btf_dumper *d, __u32 type_id, + const void *data) +{ + struct btf_var_secinfo *vsi; + const struct btf_type *t; + int ret = 0, i, vlen; + + t = btf__type_by_id(d->btf, type_id); + if (!t) + return -EINVAL; + + vlen = BTF_INFO_VLEN(t->info); + vsi = (struct btf_var_secinfo *)(t + 1); + + jsonw_start_object(d->jw); + jsonw_name(d->jw, btf__name_by_offset(d->btf, t->name_off)); + jsonw_start_array(d->jw); + for (i = 0; i < vlen; i++) { + ret = btf_dumper_do_type(d, vsi[i].type, 0, data + vsi[i].offset); + if (ret) + break; + } + jsonw_end_array(d->jw); + jsonw_end_object(d->jw); + + return ret; +} + +static int btf_dumper_do_type(const struct btf_dumper *d, __u32 type_id, + __u8 bit_offset, const void *data) +{ + const struct btf_type *t = btf__type_by_id(d->btf, type_id); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + return btf_dumper_int(t, bit_offset, data, d->jw, + d->is_plain_text); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return btf_dumper_struct(d, type_id, data); + case BTF_KIND_ARRAY: + return btf_dumper_array(d, type_id, data); + case BTF_KIND_ENUM: + return btf_dumper_enum(d, t, data); + case BTF_KIND_ENUM64: + return btf_dumper_enum64(d, t, data); + case BTF_KIND_PTR: + btf_dumper_ptr(d, t, data); + return 0; + case BTF_KIND_UNKN: + jsonw_printf(d->jw, "(unknown)"); + return 0; + case BTF_KIND_FWD: + /* map key or value can't be forward */ + jsonw_printf(d->jw, "(fwd-kind-invalid)"); + return -EINVAL; + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return btf_dumper_modifier(d, type_id, bit_offset, data); + case BTF_KIND_VAR: + return btf_dumper_var(d, type_id, bit_offset, data); + case BTF_KIND_DATASEC: + return btf_dumper_datasec(d, type_id, data); + default: + jsonw_printf(d->jw, "(unsupported-kind"); + return -EINVAL; + } +} + +int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, + const void *data) +{ + return btf_dumper_do_type(d, type_id, 0, data); +} + +#define BTF_PRINT_ARG(...) 
\ + do { \ + pos += snprintf(func_sig + pos, size - pos, \ + __VA_ARGS__); \ + if (pos >= size) \ + return -1; \ + } while (0) +#define BTF_PRINT_TYPE(type) \ + do { \ + pos = __btf_dumper_type_only(btf, type, func_sig, \ + pos, size); \ + if (pos == -1) \ + return -1; \ + } while (0) + +static int __btf_dumper_type_only(const struct btf *btf, __u32 type_id, + char *func_sig, int pos, int size) +{ + const struct btf_type *proto_type; + const struct btf_array *array; + const struct btf_var *var; + const struct btf_type *t; + + if (!type_id) { + BTF_PRINT_ARG("void "); + return pos; + } + + t = btf__type_by_id(btf, type_id); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: + BTF_PRINT_ARG("%s ", btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_STRUCT: + BTF_PRINT_ARG("struct %s ", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_UNION: + BTF_PRINT_ARG("union %s ", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + BTF_PRINT_ARG("enum %s ", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_ARRAY: + array = (struct btf_array *)(t + 1); + BTF_PRINT_TYPE(array->type); + BTF_PRINT_ARG("[%d]", array->nelems); + break; + case BTF_KIND_PTR: + BTF_PRINT_TYPE(t->type); + BTF_PRINT_ARG("* "); + break; + case BTF_KIND_FWD: + BTF_PRINT_ARG("%s %s ", + BTF_INFO_KFLAG(t->info) ? "union" : "struct", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_VOLATILE: + BTF_PRINT_ARG("volatile "); + BTF_PRINT_TYPE(t->type); + break; + case BTF_KIND_CONST: + BTF_PRINT_ARG("const "); + BTF_PRINT_TYPE(t->type); + break; + case BTF_KIND_RESTRICT: + BTF_PRINT_ARG("restrict "); + BTF_PRINT_TYPE(t->type); + break; + case BTF_KIND_FUNC_PROTO: + pos = btf_dump_func(btf, func_sig, t, NULL, pos, size); + if (pos == -1) + return -1; + break; + case BTF_KIND_FUNC: + proto_type = btf__type_by_id(btf, t->type); + pos = btf_dump_func(btf, func_sig, proto_type, t, pos, size); + if (pos == -1) + return -1; + break; + case BTF_KIND_VAR: + var = (struct btf_var *)(t + 1); + if (var->linkage == BTF_VAR_STATIC) + BTF_PRINT_ARG("static "); + BTF_PRINT_TYPE(t->type); + BTF_PRINT_ARG(" %s", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_DATASEC: + BTF_PRINT_ARG("section (\"%s\") ", + btf__name_by_offset(btf, t->name_off)); + break; + case BTF_KIND_UNKN: + default: + return -1; + } + + return pos; +} + +static int btf_dump_func(const struct btf *btf, char *func_sig, + const struct btf_type *func_proto, + const struct btf_type *func, int pos, int size) +{ + int i, vlen; + + BTF_PRINT_TYPE(func_proto->type); + if (func) + BTF_PRINT_ARG("%s(", btf__name_by_offset(btf, func->name_off)); + else + BTF_PRINT_ARG("("); + vlen = BTF_INFO_VLEN(func_proto->info); + for (i = 0; i < vlen; i++) { + struct btf_param *arg = &((struct btf_param *)(func_proto + 1))[i]; + + if (i) + BTF_PRINT_ARG(", "); + if (arg->type) { + BTF_PRINT_TYPE(arg->type); + if (arg->name_off) + BTF_PRINT_ARG("%s", + btf__name_by_offset(btf, arg->name_off)); + else if (pos && func_sig[pos - 1] == ' ') + /* Remove unnecessary space for + * FUNC_PROTO that does not have + * arg->name_off + */ + func_sig[--pos] = '\0'; + } else { + BTF_PRINT_ARG("..."); + } + } + BTF_PRINT_ARG(")"); + + return pos; +} + +void btf_dumper_type_only(const struct btf *btf, __u32 type_id, char *func_sig, + int size) +{ + int err; + + func_sig[0] = '\0'; + if (!btf) + return; + + err = __btf_dumper_type_only(btf, 
type_id, func_sig, 0, size); + if (err < 0) + func_sig[0] = '\0'; +} + +static const char *ltrim(const char *s) +{ + while (isspace(*s)) + s++; + + return s; +} + +void btf_dump_linfo_plain(const struct btf *btf, + const struct bpf_line_info *linfo, + const char *prefix, bool linum) +{ + const char *line = btf__name_by_offset(btf, linfo->line_off); + + if (!line) + return; + line = ltrim(line); + + if (!prefix) + prefix = ""; + + if (linum) { + const char *file = btf__name_by_offset(btf, linfo->file_name_off); + + /* More forgiving on file because linum option is + * expected to provide more info than the already + * available src line. + */ + if (!file) + file = ""; + + printf("%s%s [file:%s line_num:%u line_col:%u]\n", + prefix, line, file, + BPF_LINE_INFO_LINE_NUM(linfo->line_col), + BPF_LINE_INFO_LINE_COL(linfo->line_col)); + } else { + printf("%s%s\n", prefix, line); + } +} + +void btf_dump_linfo_json(const struct btf *btf, + const struct bpf_line_info *linfo, bool linum) +{ + const char *line = btf__name_by_offset(btf, linfo->line_off); + + if (line) + jsonw_string_field(json_wtr, "src", ltrim(line)); + + if (linum) { + const char *file = btf__name_by_offset(btf, linfo->file_name_off); + + if (file) + jsonw_string_field(json_wtr, "file", file); + + if (BPF_LINE_INFO_LINE_NUM(linfo->line_col)) + jsonw_int_field(json_wtr, "line_num", + BPF_LINE_INFO_LINE_NUM(linfo->line_col)); + + if (BPF_LINE_INFO_LINE_COL(linfo->line_col)) + jsonw_int_field(json_wtr, "line_col", + BPF_LINE_INFO_LINE_COL(linfo->line_col)); + } +} + +static void dotlabel_puts(const char *s) +{ + for (; *s; ++s) { + switch (*s) { + case '\\': + case '"': + case '{': + case '}': + case '<': + case '>': + case '|': + case ' ': + putchar('\\'); + /* fallthrough */ + default: + putchar(*s); + } + } +} + +static const char *shorten_path(const char *path) +{ + const unsigned int MAX_PATH_LEN = 32; + size_t len = strlen(path); + const char *shortpath; + + if (len <= MAX_PATH_LEN) + return path; + + /* Search for last '/' under the MAX_PATH_LEN limit */ + shortpath = strchr(path + len - MAX_PATH_LEN, '/'); + if (shortpath) { + if (shortpath < path + strlen("...")) + /* We removed a very short prefix, e.g. "/w", and we'll + * make the path longer by prefixing with the ellipsis. + * Not worth it, keep initial path. + */ + return path; + return shortpath; + } + + /* File base name length is > MAX_PATH_LEN, search for last '/' */ + shortpath = strrchr(path, '/'); + if (shortpath) + return shortpath; + + return path; +} + +void btf_dump_linfo_dotlabel(const struct btf *btf, + const struct bpf_line_info *linfo, bool linum) +{ + const char *line = btf__name_by_offset(btf, linfo->line_off); + + if (!line || !strlen(line)) + return; + line = ltrim(line); + + if (linum) { + const char *file = btf__name_by_offset(btf, linfo->file_name_off); + const char *shortfile; + + /* More forgiving on file because linum option is + * expected to provide more info than the already + * available src line. + */ + if (!file) + shortfile = ""; + else + shortfile = shorten_path(file); + + printf("; [%s", shortfile > file ? "..." 
: ""); + dotlabel_puts(shortfile); + printf(" line:%u col:%u]\\l\\\n", + BPF_LINE_INFO_LINE_NUM(linfo->line_col), + BPF_LINE_INFO_LINE_COL(linfo->line_col)); + } + + printf("; "); + dotlabel_puts(line); + printf("\\l\\\n"); +} diff --git a/third_party/bpftool/src/cfg.c b/third_party/bpftool/src/cfg.c new file mode 100644 index 0000000..eec437c --- /dev/null +++ b/third_party/bpftool/src/cfg.c @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2018 Netronome Systems, Inc. */ + +#include +#include +#include + +#include "cfg.h" +#include "main.h" +#include "xlated_dumper.h" + +struct cfg { + struct list_head funcs; + int func_num; +}; + +struct func_node { + struct list_head l; + struct list_head bbs; + struct bpf_insn *start; + struct bpf_insn *end; + int idx; + int bb_num; +}; + +struct bb_node { + struct list_head l; + struct list_head e_prevs; + struct list_head e_succs; + struct bpf_insn *head; + struct bpf_insn *tail; + int idx; +}; + +#define EDGE_FLAG_EMPTY 0x0 +#define EDGE_FLAG_FALLTHROUGH 0x1 +#define EDGE_FLAG_JUMP 0x2 +struct edge_node { + struct list_head l; + struct bb_node *src; + struct bb_node *dst; + int flags; +}; + +#define ENTRY_BLOCK_INDEX 0 +#define EXIT_BLOCK_INDEX 1 +#define NUM_FIXED_BLOCKS 2 +#define func_prev(func) list_prev_entry(func, l) +#define func_next(func) list_next_entry(func, l) +#define bb_prev(bb) list_prev_entry(bb, l) +#define bb_next(bb) list_next_entry(bb, l) +#define entry_bb(func) func_first_bb(func) +#define exit_bb(func) func_last_bb(func) +#define cfg_first_func(cfg) \ + list_first_entry(&cfg->funcs, struct func_node, l) +#define cfg_last_func(cfg) \ + list_last_entry(&cfg->funcs, struct func_node, l) +#define func_first_bb(func) \ + list_first_entry(&func->bbs, struct bb_node, l) +#define func_last_bb(func) \ + list_last_entry(&func->bbs, struct bb_node, l) + +static struct func_node *cfg_append_func(struct cfg *cfg, struct bpf_insn *insn) +{ + struct func_node *new_func, *func; + + list_for_each_entry(func, &cfg->funcs, l) { + if (func->start == insn) + return func; + else if (func->start > insn) + break; + } + + func = func_prev(func); + new_func = calloc(1, sizeof(*new_func)); + if (!new_func) { + p_err("OOM when allocating FUNC node"); + return NULL; + } + new_func->start = insn; + new_func->idx = cfg->func_num; + list_add(&new_func->l, &func->l); + cfg->func_num++; + + return new_func; +} + +static struct bb_node *func_append_bb(struct func_node *func, + struct bpf_insn *insn) +{ + struct bb_node *new_bb, *bb; + + list_for_each_entry(bb, &func->bbs, l) { + if (bb->head == insn) + return bb; + else if (bb->head > insn) + break; + } + + bb = bb_prev(bb); + new_bb = calloc(1, sizeof(*new_bb)); + if (!new_bb) { + p_err("OOM when allocating BB node"); + return NULL; + } + new_bb->head = insn; + INIT_LIST_HEAD(&new_bb->e_prevs); + INIT_LIST_HEAD(&new_bb->e_succs); + list_add(&new_bb->l, &bb->l); + + return new_bb; +} + +static struct bb_node *func_insert_dummy_bb(struct list_head *after) +{ + struct bb_node *bb; + + bb = calloc(1, sizeof(*bb)); + if (!bb) { + p_err("OOM when allocating BB node"); + return NULL; + } + + INIT_LIST_HEAD(&bb->e_prevs); + INIT_LIST_HEAD(&bb->e_succs); + list_add(&bb->l, after); + + return bb; +} + +static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur, + struct bpf_insn *end) +{ + struct func_node *func, *last_func; + + func = cfg_append_func(cfg, cur); + if (!func) + return true; + + for (; cur < end; cur++) { + if (cur->code != (BPF_JMP | BPF_CALL)) + 
continue; + if (cur->src_reg != BPF_PSEUDO_CALL) + continue; + func = cfg_append_func(cfg, cur + cur->off + 1); + if (!func) + return true; + } + + last_func = cfg_last_func(cfg); + last_func->end = end - 1; + func = cfg_first_func(cfg); + list_for_each_entry_from(func, &last_func->l, l) { + func->end = func_next(func)->start - 1; + } + + return false; +} + +static bool is_jmp_insn(__u8 code) +{ + return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32; +} + +static bool func_partition_bb_head(struct func_node *func) +{ + struct bpf_insn *cur, *end; + struct bb_node *bb; + + cur = func->start; + end = func->end; + INIT_LIST_HEAD(&func->bbs); + bb = func_append_bb(func, cur); + if (!bb) + return true; + + for (; cur <= end; cur++) { + if (is_jmp_insn(cur->code)) { + __u8 opcode = BPF_OP(cur->code); + + if (opcode == BPF_EXIT || opcode == BPF_CALL) + continue; + + bb = func_append_bb(func, cur + cur->off + 1); + if (!bb) + return true; + + if (opcode != BPF_JA) { + bb = func_append_bb(func, cur + 1); + if (!bb) + return true; + } + } + } + + return false; +} + +static void func_partition_bb_tail(struct func_node *func) +{ + unsigned int bb_idx = NUM_FIXED_BLOCKS; + struct bb_node *bb, *last; + + last = func_last_bb(func); + last->tail = func->end; + bb = func_first_bb(func); + list_for_each_entry_from(bb, &last->l, l) { + bb->tail = bb_next(bb)->head - 1; + bb->idx = bb_idx++; + } + + last->idx = bb_idx++; + func->bb_num = bb_idx; +} + +static bool func_add_special_bb(struct func_node *func) +{ + struct bb_node *bb; + + bb = func_insert_dummy_bb(&func->bbs); + if (!bb) + return true; + bb->idx = ENTRY_BLOCK_INDEX; + + bb = func_insert_dummy_bb(&func_last_bb(func)->l); + if (!bb) + return true; + bb->idx = EXIT_BLOCK_INDEX; + + return false; +} + +static bool func_partition_bb(struct func_node *func) +{ + if (func_partition_bb_head(func)) + return true; + + func_partition_bb_tail(func); + + return false; +} + +static struct bb_node *func_search_bb_with_head(struct func_node *func, + struct bpf_insn *insn) +{ + struct bb_node *bb; + + list_for_each_entry(bb, &func->bbs, l) { + if (bb->head == insn) + return bb; + } + + return NULL; +} + +static struct edge_node *new_edge(struct bb_node *src, struct bb_node *dst, + int flags) +{ + struct edge_node *e; + + e = calloc(1, sizeof(*e)); + if (!e) { + p_err("OOM when allocating edge node"); + return NULL; + } + + if (src) + e->src = src; + if (dst) + e->dst = dst; + + e->flags |= flags; + + return e; +} + +static bool func_add_bb_edges(struct func_node *func) +{ + struct bpf_insn *insn; + struct edge_node *e; + struct bb_node *bb; + + bb = entry_bb(func); + e = new_edge(bb, bb_next(bb), EDGE_FLAG_FALLTHROUGH); + if (!e) + return true; + list_add_tail(&e->l, &bb->e_succs); + + bb = exit_bb(func); + e = new_edge(bb_prev(bb), bb, EDGE_FLAG_FALLTHROUGH); + if (!e) + return true; + list_add_tail(&e->l, &bb->e_prevs); + + bb = entry_bb(func); + bb = bb_next(bb); + list_for_each_entry_from(bb, &exit_bb(func)->l, l) { + e = new_edge(bb, NULL, EDGE_FLAG_EMPTY); + if (!e) + return true; + e->src = bb; + + insn = bb->tail; + if (!is_jmp_insn(insn->code) || + BPF_OP(insn->code) == BPF_EXIT) { + e->dst = bb_next(bb); + e->flags |= EDGE_FLAG_FALLTHROUGH; + list_add_tail(&e->l, &bb->e_succs); + continue; + } else if (BPF_OP(insn->code) == BPF_JA) { + e->dst = func_search_bb_with_head(func, + insn + insn->off + 1); + e->flags |= EDGE_FLAG_JUMP; + list_add_tail(&e->l, &bb->e_succs); + continue; + } + + e->dst = bb_next(bb); + e->flags |= 
EDGE_FLAG_FALLTHROUGH; + list_add_tail(&e->l, &bb->e_succs); + + e = new_edge(bb, NULL, EDGE_FLAG_JUMP); + if (!e) + return true; + e->src = bb; + e->dst = func_search_bb_with_head(func, insn + insn->off + 1); + list_add_tail(&e->l, &bb->e_succs); + } + + return false; +} + +static bool cfg_build(struct cfg *cfg, struct bpf_insn *insn, unsigned int len) +{ + int cnt = len / sizeof(*insn); + struct func_node *func; + + INIT_LIST_HEAD(&cfg->funcs); + + if (cfg_partition_funcs(cfg, insn, insn + cnt)) + return true; + + list_for_each_entry(func, &cfg->funcs, l) { + if (func_partition_bb(func) || func_add_special_bb(func)) + return true; + + if (func_add_bb_edges(func)) + return true; + } + + return false; +} + +static void cfg_destroy(struct cfg *cfg) +{ + struct func_node *func, *func2; + + list_for_each_entry_safe(func, func2, &cfg->funcs, l) { + struct bb_node *bb, *bb2; + + list_for_each_entry_safe(bb, bb2, &func->bbs, l) { + struct edge_node *e, *e2; + + list_for_each_entry_safe(e, e2, &bb->e_prevs, l) { + list_del(&e->l); + free(e); + } + + list_for_each_entry_safe(e, e2, &bb->e_succs, l) { + list_del(&e->l); + free(e); + } + + list_del(&bb->l); + free(bb); + } + + list_del(&func->l); + free(func); + } +} + +static void +draw_bb_node(struct func_node *func, struct bb_node *bb, struct dump_data *dd, + bool opcodes, bool linum) +{ + const char *shape; + + if (bb->idx == ENTRY_BLOCK_INDEX || bb->idx == EXIT_BLOCK_INDEX) + shape = "Mdiamond"; + else + shape = "record"; + + printf("\tfn_%d_bb_%d [shape=%s,style=filled,label=\"", + func->idx, bb->idx, shape); + + if (bb->idx == ENTRY_BLOCK_INDEX) { + printf("ENTRY"); + } else if (bb->idx == EXIT_BLOCK_INDEX) { + printf("EXIT"); + } else { + unsigned int start_idx; + printf("{\\\n"); + start_idx = bb->head - func->start; + dump_xlated_for_graph(dd, bb->head, bb->tail, start_idx, + opcodes, linum); + printf("}"); + } + + printf("\"];\n\n"); +} + +static void draw_bb_succ_edges(struct func_node *func, struct bb_node *bb) +{ + const char *style = "\"solid,bold\""; + const char *color = "black"; + int func_idx = func->idx; + struct edge_node *e; + int weight = 10; + + if (list_empty(&bb->e_succs)) + return; + + list_for_each_entry(e, &bb->e_succs, l) { + printf("\tfn_%d_bb_%d:s -> fn_%d_bb_%d:n [style=%s, color=%s, weight=%d, constraint=true", + func_idx, e->src->idx, func_idx, e->dst->idx, + style, color, weight); + printf("];\n"); + } +} + +static void +func_output_bb_def(struct func_node *func, struct dump_data *dd, + bool opcodes, bool linum) +{ + struct bb_node *bb; + + list_for_each_entry(bb, &func->bbs, l) { + draw_bb_node(func, bb, dd, opcodes, linum); + } +} + +static void func_output_edges(struct func_node *func) +{ + int func_idx = func->idx; + struct bb_node *bb; + + list_for_each_entry(bb, &func->bbs, l) { + draw_bb_succ_edges(func, bb); + } + + /* Add an invisible edge from ENTRY to EXIT, this is to + * improve the graph layout. 
+ */ + printf("\tfn_%d_bb_%d:s -> fn_%d_bb_%d:n [style=\"invis\", constraint=true];\n", + func_idx, ENTRY_BLOCK_INDEX, func_idx, EXIT_BLOCK_INDEX); +} + +static void +cfg_dump(struct cfg *cfg, struct dump_data *dd, bool opcodes, bool linum) +{ + struct func_node *func; + + printf("digraph \"DOT graph for eBPF program\" {\n"); + list_for_each_entry(func, &cfg->funcs, l) { + printf("subgraph \"cluster_%d\" {\n\tstyle=\"dashed\";\n\tcolor=\"black\";\n\tlabel=\"func_%d ()\";\n", + func->idx, func->idx); + func_output_bb_def(func, dd, opcodes, linum); + func_output_edges(func); + printf("}\n"); + } + printf("}\n"); +} + +void dump_xlated_cfg(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum) +{ + struct bpf_insn *insn = buf; + struct cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + if (cfg_build(&cfg, insn, len)) + return; + + cfg_dump(&cfg, dd, opcodes, linum); + + cfg_destroy(&cfg); +} diff --git a/third_party/bpftool/src/cfg.h b/third_party/bpftool/src/cfg.h new file mode 100644 index 0000000..b3793f4 --- /dev/null +++ b/third_party/bpftool/src/cfg.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2018 Netronome Systems, Inc. */ + +#ifndef __BPF_TOOL_CFG_H +#define __BPF_TOOL_CFG_H + +#include "xlated_dumper.h" + +void dump_xlated_cfg(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum); + +#endif /* __BPF_TOOL_CFG_H */ diff --git a/third_party/bpftool/src/cgroup.c b/third_party/bpftool/src/cgroup.c new file mode 100644 index 0000000..ac846b0 --- /dev/null +++ b/third_party/bpftool/src/cgroup.c @@ -0,0 +1,655 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2017 Facebook +// Author: Roman Gushchin + +#define _XOPEN_SOURCE 500 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "main.h" + +#define HELP_SPEC_ATTACH_FLAGS \ + "ATTACH_FLAGS := { multi | override }" + +#define HELP_SPEC_ATTACH_TYPES \ + " ATTACH_TYPE := { cgroup_inet_ingress | cgroup_inet_egress |\n" \ + " cgroup_inet_sock_create | cgroup_sock_ops |\n" \ + " cgroup_device | cgroup_inet4_bind |\n" \ + " cgroup_inet6_bind | cgroup_inet4_post_bind |\n" \ + " cgroup_inet6_post_bind | cgroup_inet4_connect |\n" \ + " cgroup_inet6_connect | cgroup_inet4_getpeername |\n" \ + " cgroup_inet6_getpeername | cgroup_inet4_getsockname |\n" \ + " cgroup_inet6_getsockname | cgroup_udp4_sendmsg |\n" \ + " cgroup_udp6_sendmsg | cgroup_udp4_recvmsg |\n" \ + " cgroup_udp6_recvmsg | cgroup_sysctl |\n" \ + " cgroup_getsockopt | cgroup_setsockopt |\n" \ + " cgroup_inet_sock_release }" + +static unsigned int query_flags; +static struct btf *btf_vmlinux; +static __u32 btf_vmlinux_id; + +static enum bpf_attach_type parse_attach_type(const char *str) +{ + const char *attach_type_str; + enum bpf_attach_type type; + + for (type = 0; ; type++) { + attach_type_str = libbpf_bpf_attach_type_str(type); + if (!attach_type_str) + break; + if (!strcmp(str, attach_type_str)) + return type; + } + + /* Also check traditionally used attach type strings. For these we keep + * allowing prefixed usage. 
+ */ + for (type = 0; ; type++) { + attach_type_str = bpf_attach_type_input_str(type); + if (!attach_type_str) + break; + if (is_prefix(str, attach_type_str)) + return type; + } + + return __MAX_BPF_ATTACH_TYPE; +} + +static void guess_vmlinux_btf_id(__u32 attach_btf_obj_id) +{ + struct bpf_btf_info btf_info = {}; + __u32 btf_len = sizeof(btf_info); + char name[16] = {}; + int err; + int fd; + + btf_info.name = ptr_to_u64(name); + btf_info.name_len = sizeof(name); + + fd = bpf_btf_get_fd_by_id(attach_btf_obj_id); + if (fd < 0) + return; + + err = bpf_btf_get_info_by_fd(fd, &btf_info, &btf_len); + if (err) + goto out; + + if (btf_info.kernel_btf && strncmp(name, "vmlinux", sizeof(name)) == 0) + btf_vmlinux_id = btf_info.id; + +out: + close(fd); +} + +static int show_bpf_prog(int id, enum bpf_attach_type attach_type, + const char *attach_flags_str, + int level) +{ + char prog_name[MAX_PROG_FULL_NAME]; + const char *attach_btf_name = NULL; + struct bpf_prog_info info = {}; + const char *attach_type_str; + __u32 info_len = sizeof(info); + int prog_fd; + + prog_fd = bpf_prog_get_fd_by_id(id); + if (prog_fd < 0) + return -1; + + if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) { + close(prog_fd); + return -1; + } + + attach_type_str = libbpf_bpf_attach_type_str(attach_type); + + if (btf_vmlinux) { + if (!btf_vmlinux_id) + guess_vmlinux_btf_id(info.attach_btf_obj_id); + + if (btf_vmlinux_id == info.attach_btf_obj_id && + info.attach_btf_id < btf__type_cnt(btf_vmlinux)) { + const struct btf_type *t = + btf__type_by_id(btf_vmlinux, info.attach_btf_id); + attach_btf_name = + btf__name_by_offset(btf_vmlinux, t->name_off); + } + } + + get_prog_full_name(&info, prog_fd, prog_name, sizeof(prog_name)); + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "id", info.id); + if (attach_type_str) + jsonw_string_field(json_wtr, "attach_type", attach_type_str); + else + jsonw_uint_field(json_wtr, "attach_type", attach_type); + if (!(query_flags & BPF_F_QUERY_EFFECTIVE)) + jsonw_string_field(json_wtr, "attach_flags", attach_flags_str); + jsonw_string_field(json_wtr, "name", prog_name); + if (attach_btf_name) + jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name); + jsonw_uint_field(json_wtr, "attach_btf_obj_id", info.attach_btf_obj_id); + jsonw_uint_field(json_wtr, "attach_btf_id", info.attach_btf_id); + jsonw_end_object(json_wtr); + } else { + printf("%s%-8u ", level ? " " : "", info.id); + if (attach_type_str) + printf("%-15s", attach_type_str); + else + printf("type %-10u", attach_type); + if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf(" %-15s", prog_name); + else + printf(" %-15s %-15s", attach_flags_str, prog_name); + if (attach_btf_name) + printf(" %-15s", attach_btf_name); + else if (info.attach_btf_id) + printf(" attach_btf_obj_id=%d attach_btf_id=%d", + info.attach_btf_obj_id, info.attach_btf_id); + printf("\n"); + } + + close(prog_fd); + return 0; +} + +static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type) +{ + __u32 prog_cnt = 0; + int ret; + + ret = bpf_prog_query(cgroup_fd, type, query_flags, NULL, + NULL, &prog_cnt); + if (ret) + return -1; + + return prog_cnt; +} + +static int cgroup_has_attached_progs(int cgroup_fd) +{ + enum bpf_attach_type type; + bool no_prog = true; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + int count = count_attached_bpf_progs(cgroup_fd, type); + + if (count < 0 && errno != EINVAL) + return -1; + + if (count > 0) { + no_prog = false; + break; + } + } + + return no_prog ? 
0 : 1; +} + +static int show_effective_bpf_progs(int cgroup_fd, enum bpf_attach_type type, + int level) +{ + LIBBPF_OPTS(bpf_prog_query_opts, p); + __u32 prog_ids[1024] = {0}; + __u32 iter; + int ret; + + p.query_flags = query_flags; + p.prog_cnt = ARRAY_SIZE(prog_ids); + p.prog_ids = prog_ids; + + ret = bpf_prog_query_opts(cgroup_fd, type, &p); + if (ret) + return ret; + + if (p.prog_cnt == 0) + return 0; + + for (iter = 0; iter < p.prog_cnt; iter++) + show_bpf_prog(prog_ids[iter], type, NULL, level); + + return 0; +} + +static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type, + int level) +{ + LIBBPF_OPTS(bpf_prog_query_opts, p); + __u32 prog_attach_flags[1024] = {0}; + const char *attach_flags_str; + __u32 prog_ids[1024] = {0}; + char buf[32]; + __u32 iter; + int ret; + + p.query_flags = query_flags; + p.prog_cnt = ARRAY_SIZE(prog_ids); + p.prog_ids = prog_ids; + p.prog_attach_flags = prog_attach_flags; + + ret = bpf_prog_query_opts(cgroup_fd, type, &p); + if (ret) + return ret; + + if (p.prog_cnt == 0) + return 0; + + for (iter = 0; iter < p.prog_cnt; iter++) { + __u32 attach_flags; + + attach_flags = prog_attach_flags[iter] ?: p.attach_flags; + + switch (attach_flags) { + case BPF_F_ALLOW_MULTI: + attach_flags_str = "multi"; + break; + case BPF_F_ALLOW_OVERRIDE: + attach_flags_str = "override"; + break; + case 0: + attach_flags_str = ""; + break; + default: + snprintf(buf, sizeof(buf), "unknown(%x)", attach_flags); + attach_flags_str = buf; + } + + show_bpf_prog(prog_ids[iter], type, + attach_flags_str, level); + } + + return 0; +} + +static int show_bpf_progs(int cgroup_fd, enum bpf_attach_type type, + int level) +{ + return query_flags & BPF_F_QUERY_EFFECTIVE ? + show_effective_bpf_progs(cgroup_fd, type, level) : + show_attached_bpf_progs(cgroup_fd, type, level); +} + +static int do_show(int argc, char **argv) +{ + enum bpf_attach_type type; + int has_attached_progs; + const char *path; + int cgroup_fd; + int ret = -1; + + query_flags = 0; + + if (!REQ_ARGS(1)) + return -1; + path = GET_ARG(); + + while (argc) { + if (is_prefix(*argv, "effective")) { + if (query_flags & BPF_F_QUERY_EFFECTIVE) { + p_err("duplicated argument: %s", *argv); + return -1; + } + query_flags |= BPF_F_QUERY_EFFECTIVE; + NEXT_ARG(); + } else { + p_err("expected no more arguments, 'effective', got: '%s'?", + *argv); + return -1; + } + } + + cgroup_fd = open(path, O_RDONLY); + if (cgroup_fd < 0) { + p_err("can't open cgroup %s", path); + goto exit; + } + + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + path, strerror(errno)); + goto exit_cgroup; + } else if (!has_attached_progs) { + ret = 0; + goto exit_cgroup; + } + + if (json_output) + jsonw_start_array(json_wtr); + else if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf("%-8s %-15s %-15s\n", "ID", "AttachType", "Name"); + else + printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType", + "AttachFlags", "Name"); + + btf_vmlinux = libbpf_find_kernel_btf(); + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + /* + * Not all attach types may be supported, so it's expected, + * that some requests will fail. + * If we were able to get the show for at least one + * attach type, let's return 0. 
+ */ + if (show_bpf_progs(cgroup_fd, type, 0) == 0) + ret = 0; + } + + if (json_output) + jsonw_end_array(json_wtr); + +exit_cgroup: + close(cgroup_fd); +exit: + return ret; +} + +/* + * To distinguish nftw() errors and do_show_tree_fn() errors + * and avoid duplicating error messages, let's return -2 + * from do_show_tree_fn() in case of error. + */ +#define NFTW_ERR -1 +#define SHOW_TREE_FN_ERR -2 +static int do_show_tree_fn(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftw) +{ + enum bpf_attach_type type; + int has_attached_progs; + int cgroup_fd; + + if (typeflag != FTW_D) + return 0; + + cgroup_fd = open(fpath, O_RDONLY); + if (cgroup_fd < 0) { + p_err("can't open cgroup %s: %s", fpath, strerror(errno)); + return SHOW_TREE_FN_ERR; + } + + has_attached_progs = cgroup_has_attached_progs(cgroup_fd); + if (has_attached_progs < 0) { + p_err("can't query bpf programs attached to %s: %s", + fpath, strerror(errno)); + close(cgroup_fd); + return SHOW_TREE_FN_ERR; + } else if (!has_attached_progs) { + close(cgroup_fd); + return 0; + } + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "cgroup", fpath); + jsonw_name(json_wtr, "programs"); + jsonw_start_array(json_wtr); + } else { + printf("%s\n", fpath); + } + + btf_vmlinux = libbpf_find_kernel_btf(); + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) + show_bpf_progs(cgroup_fd, type, ftw->level); + + if (errno == EINVAL) + /* Last attach type does not support query. + * Do not report an error for this, especially because batch + * mode would stop processing commands. + */ + errno = 0; + + if (json_output) { + jsonw_end_array(json_wtr); + jsonw_end_object(json_wtr); + } + + close(cgroup_fd); + + return 0; +} + +static char *find_cgroup_root(void) +{ + struct mntent *mnt; + FILE *f; + + f = fopen("/proc/mounts", "r"); + if (f == NULL) + return NULL; + + while ((mnt = getmntent(f))) { + if (strcmp(mnt->mnt_type, "cgroup2") == 0) { + fclose(f); + return strdup(mnt->mnt_dir); + } + } + + fclose(f); + return NULL; +} + +static int do_show_tree(int argc, char **argv) +{ + char *cgroup_root, *cgroup_alloced = NULL; + int ret; + + query_flags = 0; + + if (!argc) { + cgroup_alloced = find_cgroup_root(); + if (!cgroup_alloced) { + p_err("cgroup v2 isn't mounted"); + return -1; + } + cgroup_root = cgroup_alloced; + } else { + cgroup_root = GET_ARG(); + + while (argc) { + if (is_prefix(*argv, "effective")) { + if (query_flags & BPF_F_QUERY_EFFECTIVE) { + p_err("duplicated argument: %s", *argv); + return -1; + } + query_flags |= BPF_F_QUERY_EFFECTIVE; + NEXT_ARG(); + } else { + p_err("expected no more arguments, 'effective', got: '%s'?", + *argv); + return -1; + } + } + } + + if (json_output) + jsonw_start_array(json_wtr); + else if (query_flags & BPF_F_QUERY_EFFECTIVE) + printf("%s\n" + "%-8s %-15s %-15s\n", + "CgroupPath", + "ID", "AttachType", "Name"); + else + printf("%s\n" + "%-8s %-15s %-15s %-15s\n", + "CgroupPath", + "ID", "AttachType", "AttachFlags", "Name"); + + switch (nftw(cgroup_root, do_show_tree_fn, 1024, FTW_MOUNT)) { + case NFTW_ERR: + p_err("can't iterate over %s: %s", cgroup_root, + strerror(errno)); + ret = -1; + break; + case SHOW_TREE_FN_ERR: + ret = -1; + break; + default: + ret = 0; + } + + if (json_output) + jsonw_end_array(json_wtr); + + free(cgroup_alloced); + + return ret; +} + +static int do_attach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int cgroup_fd, prog_fd; + int attach_flags = 0; + int ret = -1; + int i; + + if (argc < 4) { + p_err("too 
few parameters for cgroup attach"); + goto exit; + } + + cgroup_fd = open(argv[0], O_RDONLY); + if (cgroup_fd < 0) { + p_err("can't open cgroup %s", argv[0]); + goto exit; + } + + attach_type = parse_attach_type(argv[1]); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + goto exit_cgroup; + } + + argc -= 2; + argv = &argv[2]; + prog_fd = prog_parse_fd(&argc, &argv); + if (prog_fd < 0) + goto exit_cgroup; + + for (i = 0; i < argc; i++) { + if (is_prefix(argv[i], "multi")) { + attach_flags |= BPF_F_ALLOW_MULTI; + } else if (is_prefix(argv[i], "override")) { + attach_flags |= BPF_F_ALLOW_OVERRIDE; + } else { + p_err("unknown option: %s", argv[i]); + goto exit_cgroup; + } + } + + if (bpf_prog_attach(prog_fd, cgroup_fd, attach_type, attach_flags)) { + p_err("failed to attach program"); + goto exit_prog; + } + + if (json_output) + jsonw_null(json_wtr); + + ret = 0; + +exit_prog: + close(prog_fd); +exit_cgroup: + close(cgroup_fd); +exit: + return ret; +} + +static int do_detach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int prog_fd, cgroup_fd; + int ret = -1; + + if (argc < 4) { + p_err("too few parameters for cgroup detach"); + goto exit; + } + + cgroup_fd = open(argv[0], O_RDONLY); + if (cgroup_fd < 0) { + p_err("can't open cgroup %s", argv[0]); + goto exit; + } + + attach_type = parse_attach_type(argv[1]); + if (attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach type"); + goto exit_cgroup; + } + + argc -= 2; + argv = &argv[2]; + prog_fd = prog_parse_fd(&argc, &argv); + if (prog_fd < 0) + goto exit_cgroup; + + if (bpf_prog_detach2(prog_fd, cgroup_fd, attach_type)) { + p_err("failed to detach program"); + goto exit_prog; + } + + if (json_output) + jsonw_null(json_wtr); + + ret = 0; + +exit_prog: + close(prog_fd); +exit_cgroup: + close(cgroup_fd); +exit: + return ret; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } CGROUP [**effective**]\n" + " %1$s %2$s tree [CGROUP_ROOT] [**effective**]\n" + " %1$s %2$s attach CGROUP ATTACH_TYPE PROG [ATTACH_FLAGS]\n" + " %1$s %2$s detach CGROUP ATTACH_TYPE PROG\n" + " %1$s %2$s help\n" + "\n" + HELP_SPEC_ATTACH_TYPES "\n" + " " HELP_SPEC_ATTACH_FLAGS "\n" + " " HELP_SPEC_PROGRAM "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "tree", do_show_tree }, + { "attach", do_attach }, + { "detach", do_detach }, + { "help", do_help }, + { 0 } +}; + +int do_cgroup(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/common.c b/third_party/bpftool/src/common.c new file mode 100644 index 0000000..cc6e6aa --- /dev/null +++ b/third_party/bpftool/src/common.c @@ -0,0 +1,1110 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2017-2018 Netronome Systems, Inc. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include /* libbpf_num_possible_cpus */ +#include + +#include "main.h" + +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + +void p_err(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "error"); + jsonw_vprintf_enquote(json_wtr, fmt, ap); + jsonw_end_object(json_wtr); + } else { + fprintf(stderr, "Error: "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + } + va_end(ap); +} + +void p_info(const char *fmt, ...) +{ + va_list ap; + + if (json_output) + return; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); +} + +static bool is_bpffs(const char *path) +{ + struct statfs st_fs; + + if (statfs(path, &st_fs) < 0) + return false; + + return (unsigned long)st_fs.f_type == BPF_FS_MAGIC; +} + +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and programs. This was done in + * commit 97306be45fbe ("Merge branch 'switch to memcg-based memory + * accounting'"), in Linux 5.11. + * + * Libbpf also offers to probe for memcg-based accounting vs rlimit, but does + * so by checking for the availability of a given BPF helper and this has + * failed on some kernels with backports in the past, see commit 6b4384ff1088 + * ("Revert "bpftool: Use libbpf 1.0 API mode instead of RLIMIT_MEMLOCK""). + * Instead, we can probe by lowering the process-based rlimit to 0, trying to + * load a BPF object, and resetting the rlimit. If the load succeeds then + * memcg-based accounting is supported. + * + * This would be too dangerous to do in the library, because multithreaded + * applications might attempt to load items while the rlimit is at 0. Given + * that bpftool is single-threaded, this is fine to do here. + */ +static bool known_to_need_rlimit(void) +{ + struct rlimit rlim_init, rlim_cur_zero = {}; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd, err; + + memset(&attr, 0, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim_init)) + return false; + + /* Drop the soft limit to zero. We maintain the hard limit to its + * current value, because lowering it would be a permanent operation + * for unprivileged users. + */ + rlim_cur_zero.rlim_max = rlim_init.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rlim_cur_zero)) + return false; + + /* Do not use bpf_prog_load() from libbpf here, because it calls + * bump_rlimit_memlock(), interfering with the current probe. 
+ */ + prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + err = errno; + + /* reset soft rlimit to its initial value */ + setrlimit(RLIMIT_MEMLOCK, &rlim_init); + + if (prog_fd < 0) + return err == EPERM; + + close(prog_fd); + return false; +} + +void set_max_rlimit(void) +{ + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; + + if (known_to_need_rlimit()) + setrlimit(RLIMIT_MEMLOCK, &rinf); +} + +static int +mnt_fs(const char *target, const char *type, char *buff, size_t bufflen) +{ + bool bind_done = false; + + while (mount("", target, "none", MS_PRIVATE | MS_REC, NULL)) { + if (errno != EINVAL || bind_done) { + snprintf(buff, bufflen, + "mount --make-private %s failed: %s", + target, strerror(errno)); + return -1; + } + + if (mount(target, target, "none", MS_BIND, NULL)) { + snprintf(buff, bufflen, + "mount --bind %s %s failed: %s", + target, target, strerror(errno)); + return -1; + } + + bind_done = true; + } + + if (mount(type, target, type, 0, "mode=0700")) { + snprintf(buff, bufflen, "mount -t %s %s %s failed: %s", + type, type, target, strerror(errno)); + return -1; + } + + return 0; +} + +int mount_tracefs(const char *target) +{ + char err_str[ERR_MAX_LEN]; + int err; + + err = mnt_fs(target, "tracefs", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount tracefs: %s", err_str); + } + + return err; +} + +int open_obj_pinned(const char *path, bool quiet) +{ + char *pname; + int fd = -1; + + pname = strdup(path); + if (!pname) { + if (!quiet) + p_err("mem alloc failed"); + goto out_ret; + } + + fd = bpf_obj_get(pname); + if (fd < 0) { + if (!quiet) + p_err("bpf obj get (%s): %s", pname, + errno == EACCES && !is_bpffs(dirname(pname)) ? + "directory not in bpf file system (bpffs)" : + strerror(errno)); + goto out_free; + } + +out_free: + free(pname); +out_ret: + return fd; +} + +int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type) +{ + enum bpf_obj_type type; + int fd; + + fd = open_obj_pinned(path, false); + if (fd < 0) + return -1; + + type = get_fd_type(fd); + if (type < 0) { + close(fd); + return type; + } + if (type != exp_type) { + p_err("incorrect object type: %s", get_fd_type_name(type)); + close(fd); + return -1; + } + + return fd; +} + +int mount_bpffs_for_pin(const char *name, bool is_dir) +{ + char err_str[ERR_MAX_LEN]; + char *file; + char *dir; + int err = 0; + + if (is_dir && is_bpffs(name)) + return err; + + file = malloc(strlen(name) + 1); + if (!file) { + p_err("mem alloc failed"); + return -1; + } + + strcpy(file, name); + dir = dirname(file); + + if (is_bpffs(dir)) + /* nothing to do if already mounted */ + goto out_free; + + if (block_mount) { + p_err("no BPF file system found, not mounting it due to --nomount option"); + err = -1; + goto out_free; + } + + err = mnt_fs(dir, "bpf", err_str, ERR_MAX_LEN); + if (err) { + err_str[ERR_MAX_LEN - 1] = '\0'; + p_err("can't mount BPF file system to pin the object (%s): %s", + name, err_str); + } + +out_free: + free(file); + return err; +} + +int do_pin_fd(int fd, const char *name) +{ + int err; + + err = mount_bpffs_for_pin(name, false); + if (err) + return err; + + err = bpf_obj_pin(fd, name); + if (err) + p_err("can't pin the object (%s): %s", name, strerror(errno)); + + return err; +} + +int do_pin_any(int argc, char **argv, int (*get_fd)(int *, char ***)) +{ + int err; + int fd; + + if (!REQ_ARGS(3)) + return -EINVAL; + + fd = get_fd(&argc, &argv); + if (fd < 0) + return fd; + + err = do_pin_fd(fd, *argv); + + close(fd); + return err; +} + 
+const char *get_fd_type_name(enum bpf_obj_type type) +{ + static const char * const names[] = { + [BPF_OBJ_UNKNOWN] = "unknown", + [BPF_OBJ_PROG] = "prog", + [BPF_OBJ_MAP] = "map", + [BPF_OBJ_LINK] = "link", + }; + + if (type < 0 || type >= ARRAY_SIZE(names) || !names[type]) + return names[BPF_OBJ_UNKNOWN]; + + return names[type]; +} + +void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd, + char *name_buff, size_t buff_len) +{ + const char *prog_name = prog_info->name; + const struct btf_type *func_type; + const struct bpf_func_info finfo = {}; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + struct btf *prog_btf = NULL; + + if (buff_len <= BPF_OBJ_NAME_LEN || + strlen(prog_info->name) < BPF_OBJ_NAME_LEN - 1) + goto copy_name; + + if (!prog_info->btf_id || prog_info->nr_func_info == 0) + goto copy_name; + + info.nr_func_info = 1; + info.func_info_rec_size = prog_info->func_info_rec_size; + if (info.func_info_rec_size > sizeof(finfo)) + info.func_info_rec_size = sizeof(finfo); + info.func_info = ptr_to_u64(&finfo); + + if (bpf_prog_get_info_by_fd(prog_fd, &info, &info_len)) + goto copy_name; + + prog_btf = btf__load_from_kernel_by_id(info.btf_id); + if (!prog_btf) + goto copy_name; + + func_type = btf__type_by_id(prog_btf, finfo.type_id); + if (!func_type || !btf_is_func(func_type)) + goto copy_name; + + prog_name = btf__name_by_offset(prog_btf, func_type->name_off); + +copy_name: + snprintf(name_buff, buff_len, "%s", prog_name); + + if (prog_btf) + btf__free(prog_btf); +} + +int get_fd_type(int fd) +{ + char path[PATH_MAX]; + char buf[512]; + ssize_t n; + + snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); + + n = readlink(path, buf, sizeof(buf)); + if (n < 0) { + p_err("can't read link type: %s", strerror(errno)); + return -1; + } + if (n == sizeof(path)) { + p_err("can't read link type: path too long!"); + return -1; + } + + if (strstr(buf, "bpf-map")) + return BPF_OBJ_MAP; + else if (strstr(buf, "bpf-prog")) + return BPF_OBJ_PROG; + else if (strstr(buf, "bpf-link")) + return BPF_OBJ_LINK; + + return BPF_OBJ_UNKNOWN; +} + +char *get_fdinfo(int fd, const char *key) +{ + char path[PATH_MAX]; + char *line = NULL; + size_t line_n = 0; + ssize_t n; + FILE *fdi; + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + + fdi = fopen(path, "r"); + if (!fdi) + return NULL; + + while ((n = getline(&line, &line_n, fdi)) > 0) { + char *value; + int len; + + if (!strstr(line, key)) + continue; + + fclose(fdi); + + value = strchr(line, '\t'); + if (!value || !value[1]) { + free(line); + return NULL; + } + value++; + + len = strlen(value); + memmove(line, value, len); + line[len - 1] = '\0'; + + return line; + } + + free(line); + fclose(fdi); + return NULL; +} + +void print_data_json(uint8_t *data, size_t len) +{ + unsigned int i; + + jsonw_start_array(json_wtr); + for (i = 0; i < len; i++) + jsonw_printf(json_wtr, "%d", data[i]); + jsonw_end_array(json_wtr); +} + +void print_hex_data_json(uint8_t *data, size_t len) +{ + unsigned int i; + + jsonw_start_array(json_wtr); + for (i = 0; i < len; i++) + jsonw_printf(json_wtr, "\"0x%02hhx\"", data[i]); + jsonw_end_array(json_wtr); +} + +/* extra params for nftw cb */ +static struct hashmap *build_fn_table; +static enum bpf_obj_type build_fn_type; + +static int do_build_table_cb(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + struct bpf_prog_info pinned_info; + __u32 len = sizeof(pinned_info); + enum bpf_obj_type objtype; + int fd, err = 0; + char *path; + + if (typeflag 
!= FTW_F) + goto out_ret; + + fd = open_obj_pinned(fpath, true); + if (fd < 0) + goto out_ret; + + objtype = get_fd_type(fd); + if (objtype != build_fn_type) + goto out_close; + + memset(&pinned_info, 0, sizeof(pinned_info)); + if (bpf_prog_get_info_by_fd(fd, &pinned_info, &len)) + goto out_close; + + path = strdup(fpath); + if (!path) { + err = -1; + goto out_close; + } + + err = hashmap__append(build_fn_table, pinned_info.id, path); + if (err) { + p_err("failed to append entry to hashmap for ID %u, path '%s': %s", + pinned_info.id, path, strerror(errno)); + free(path); + goto out_close; + } + +out_close: + close(fd); +out_ret: + return err; +} + +int build_pinned_obj_table(struct hashmap *tab, + enum bpf_obj_type type) +{ + struct mntent *mntent = NULL; + FILE *mntfile = NULL; + int flags = FTW_PHYS; + int nopenfd = 16; + int err = 0; + + mntfile = setmntent("/proc/mounts", "r"); + if (!mntfile) + return -1; + + build_fn_table = tab; + build_fn_type = type; + + while ((mntent = getmntent(mntfile))) { + char *path = mntent->mnt_dir; + + if (strncmp(mntent->mnt_type, "bpf", 3) != 0) + continue; + err = nftw(path, do_build_table_cb, nopenfd, flags); + if (err) + break; + } + fclose(mntfile); + return err; +} + +void delete_pinned_obj_table(struct hashmap *map) +{ + struct hashmap_entry *entry; + size_t bkt; + + if (!map) + return; + + hashmap__for_each_entry(map, entry, bkt) + free(entry->pvalue); + + hashmap__free(map); +} + +unsigned int get_page_size(void) +{ + static int result; + + if (!result) + result = getpagesize(); + return result; +} + +unsigned int get_possible_cpus(void) +{ + int cpus = libbpf_num_possible_cpus(); + + if (cpus < 0) { + p_err("Can't get # of possible cpus: %s", strerror(-cpus)); + exit(-1); + } + return cpus; +} + +static char * +ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf) +{ + struct stat st; + int err; + + err = stat("/proc/self/ns/net", &st); + if (err) { + p_err("Can't stat /proc/self: %s", strerror(errno)); + return NULL; + } + + if (st.st_dev != ns_dev || st.st_ino != ns_ino) + return NULL; + + return if_indextoname(ifindex, buf); +} + +static int read_sysfs_hex_int(char *path) +{ + char vendor_id_buf[8]; + int len; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) { + p_err("Can't open %s: %s", path, strerror(errno)); + return -1; + } + + len = read(fd, vendor_id_buf, sizeof(vendor_id_buf)); + close(fd); + if (len < 0) { + p_err("Can't read %s: %s", path, strerror(errno)); + return -1; + } + if (len >= (int)sizeof(vendor_id_buf)) { + p_err("Value in %s too long", path); + return -1; + } + + vendor_id_buf[len] = 0; + + return strtol(vendor_id_buf, NULL, 0); +} + +static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name) +{ + char full_path[64]; + + snprintf(full_path, sizeof(full_path), "/sys/class/net/%s/device/%s", + devname, entry_name); + + return read_sysfs_hex_int(full_path); +} + +const char * +ifindex_to_arch(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, const char **opt) +{ + __maybe_unused int device_id; + char devname[IF_NAMESIZE]; + int vendor_id; + + if (!ifindex_to_name_ns(ifindex, ns_dev, ns_ino, devname)) { + p_err("Can't get net device name for ifindex %d: %s", ifindex, + strerror(errno)); + return NULL; + } + + vendor_id = read_sysfs_netdev_hex_int(devname, "vendor"); + if (vendor_id < 0) { + p_err("Can't get device vendor id for %s", devname); + return NULL; + } + + switch (vendor_id) { +#ifdef HAVE_LIBBFD_SUPPORT + case 0x19ee: + device_id = read_sysfs_netdev_hex_int(devname, "device"); + 
if (device_id != 0x4000 && + device_id != 0x6000 && + device_id != 0x6003) + p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch"); + *opt = "ctx4"; + return "NFP-6xxx"; +#endif /* HAVE_LIBBFD_SUPPORT */ + /* No NFP support in LLVM, we have no valid triple to return. */ + default: + p_err("Can't get arch name for device vendor id 0x%04x", + vendor_id); + return NULL; + } +} + +void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) +{ + char name[IF_NAMESIZE]; + + if (!ifindex) + return; + + printf(" offloaded_to "); + if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name)) + printf("%s", name); + else + printf("ifindex %u ns_dev %llu ns_ino %llu", + ifindex, ns_dev, ns_inode); +} + +void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode) +{ + char name[IF_NAMESIZE]; + + if (!ifindex) + return; + + jsonw_name(json_wtr, "dev"); + jsonw_start_object(json_wtr); + jsonw_uint_field(json_wtr, "ifindex", ifindex); + jsonw_uint_field(json_wtr, "ns_dev", ns_dev); + jsonw_uint_field(json_wtr, "ns_inode", ns_inode); + if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name)) + jsonw_string_field(json_wtr, "ifname", name); + jsonw_end_object(json_wtr); +} + +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what) +{ + char *endptr; + + NEXT_ARGP(); + + if (*val) { + p_err("%s already specified", what); + return -1; + } + + *val = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as %s", **argv, what); + return -1; + } + NEXT_ARGP(); + + return 0; +} + +int __printf(2, 0) +print_all_levels(__maybe_unused enum libbpf_print_level level, + const char *format, va_list args) +{ + return vfprintf(stderr, format, args); +} + +static int prog_fd_by_nametag(void *nametag, int **fds, bool tag) +{ + char prog_name[MAX_PROG_FULL_NAME]; + unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; + int err; + + while (true) { + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + + err = bpf_prog_get_next_id(id, &id); + if (err) { + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; + } + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get prog by id (%u): %s", + id, strerror(errno)); + goto err_close_fds; + } + + err = bpf_prog_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get prog info (%u): %s", + id, strerror(errno)); + goto err_close_fd; + } + + if (tag && memcmp(nametag, info.tag, BPF_TAG_SIZE)) { + close(fd); + continue; + } + + if (!tag) { + get_prog_full_name(&info, fd, prog_name, + sizeof(prog_name)); + if (strncmp(nametag, prog_name, sizeof(prog_name))) { + close(fd); + continue; + } + } + + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; + } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; +} + +int prog_parse_fds(int *argc, char ***argv, int **fds) +{ + if (is_prefix(**argv, "id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARGP(); + + (*fds)[0] = bpf_prog_get_fd_by_id(id); + if ((*fds)[0] < 0) { + p_err("get by id (%u): %s", id, strerror(errno)); + return -1; + } + return 1; + } else if (is_prefix(**argv, "tag")) { + unsigned char tag[BPF_TAG_SIZE]; + + NEXT_ARGP(); + + if (sscanf(**argv, BPF_TAG_FMT, tag, tag + 1, tag + 2, + tag + 3, tag 
+ 4, tag + 5, tag + 6, tag + 7) + != BPF_TAG_SIZE) { + p_err("can't parse tag"); + return -1; + } + NEXT_ARGP(); + + return prog_fd_by_nametag(tag, fds, true); + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > MAX_PROG_FULL_NAME - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return prog_fd_by_nametag(name, fds, false); + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG); + if ((*fds)[0] < 0) + return -1; + return 1; + } + + p_err("expected 'id', 'tag', 'name' or 'pinned', got: '%s'?", **argv); + return -1; +} + +int prog_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several programs match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = fds[0]; +exit_free: + free(fds); + return fd; +} + +static int map_fd_by_name(char *name, int **fds) +{ + unsigned int id = 0; + int fd, nb_fds = 0; + void *tmp; + int err; + + while (true) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno != ENOENT) { + p_err("%s", strerror(errno)); + goto err_close_fds; + } + return nb_fds; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + goto err_close_fds; + } + + err = bpf_map_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get map info (%u): %s", + id, strerror(errno)); + goto err_close_fd; + } + + if (strncmp(name, info.name, BPF_OBJ_NAME_LEN)) { + close(fd); + continue; + } + + if (nb_fds > 0) { + tmp = realloc(*fds, (nb_fds + 1) * sizeof(int)); + if (!tmp) { + p_err("failed to realloc"); + goto err_close_fd; + } + *fds = tmp; + } + (*fds)[nb_fds++] = fd; + } + +err_close_fd: + close(fd); +err_close_fds: + while (--nb_fds >= 0) + close((*fds)[nb_fds]); + return -1; +} + +int map_parse_fds(int *argc, char ***argv, int **fds) +{ + if (is_prefix(**argv, "id")) { + unsigned int id; + char *endptr; + + NEXT_ARGP(); + + id = strtoul(**argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as ID", **argv); + return -1; + } + NEXT_ARGP(); + + (*fds)[0] = bpf_map_get_fd_by_id(id); + if ((*fds)[0] < 0) { + p_err("get map by id (%u): %s", id, strerror(errno)); + return -1; + } + return 1; + } else if (is_prefix(**argv, "name")) { + char *name; + + NEXT_ARGP(); + + name = **argv; + if (strlen(name) > BPF_OBJ_NAME_LEN - 1) { + p_err("can't parse name"); + return -1; + } + NEXT_ARGP(); + + return map_fd_by_name(name, fds); + } else if (is_prefix(**argv, "pinned")) { + char *path; + + NEXT_ARGP(); + + path = **argv; + NEXT_ARGP(); + + (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_MAP); + if ((*fds)[0] < 0) + return -1; + return 1; + } + + p_err("expected 'id', 'name' or 'pinned', got: '%s'?", **argv); + return -1; +} + +int map_parse_fd(int *argc, char ***argv) +{ + int *fds = NULL; + int nb_fds, fd; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(argc, argv, &fds); + if (nb_fds != 1) { + if (nb_fds > 1) { + p_err("several maps match this handle"); + while (nb_fds--) + close(fds[nb_fds]); + } + fd = -1; + goto exit_free; + } + + fd = 
fds[0]; +exit_free: + free(fds); + return fd; +} + +int map_parse_fd_and_info(int *argc, char ***argv, struct bpf_map_info *info, + __u32 *info_len) +{ + int err; + int fd; + + fd = map_parse_fd(argc, argv); + if (fd < 0) + return -1; + + err = bpf_map_get_info_by_fd(fd, info, info_len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + close(fd); + return err; + } + + return fd; +} + +size_t hash_fn_for_key_as_id(long key, void *ctx) +{ + return key; +} + +bool equal_fn_for_key_as_id(long k1, long k2, void *ctx) +{ + return k1 == k2; +} + +const char *bpf_attach_type_input_str(enum bpf_attach_type t) +{ + switch (t) { + case BPF_CGROUP_INET_INGRESS: return "ingress"; + case BPF_CGROUP_INET_EGRESS: return "egress"; + case BPF_CGROUP_INET_SOCK_CREATE: return "sock_create"; + case BPF_CGROUP_INET_SOCK_RELEASE: return "sock_release"; + case BPF_CGROUP_SOCK_OPS: return "sock_ops"; + case BPF_CGROUP_DEVICE: return "device"; + case BPF_CGROUP_INET4_BIND: return "bind4"; + case BPF_CGROUP_INET6_BIND: return "bind6"; + case BPF_CGROUP_INET4_CONNECT: return "connect4"; + case BPF_CGROUP_INET6_CONNECT: return "connect6"; + case BPF_CGROUP_INET4_POST_BIND: return "post_bind4"; + case BPF_CGROUP_INET6_POST_BIND: return "post_bind6"; + case BPF_CGROUP_INET4_GETPEERNAME: return "getpeername4"; + case BPF_CGROUP_INET6_GETPEERNAME: return "getpeername6"; + case BPF_CGROUP_INET4_GETSOCKNAME: return "getsockname4"; + case BPF_CGROUP_INET6_GETSOCKNAME: return "getsockname6"; + case BPF_CGROUP_UDP4_SENDMSG: return "sendmsg4"; + case BPF_CGROUP_UDP6_SENDMSG: return "sendmsg6"; + case BPF_CGROUP_SYSCTL: return "sysctl"; + case BPF_CGROUP_UDP4_RECVMSG: return "recvmsg4"; + case BPF_CGROUP_UDP6_RECVMSG: return "recvmsg6"; + case BPF_CGROUP_GETSOCKOPT: return "getsockopt"; + case BPF_CGROUP_SETSOCKOPT: return "setsockopt"; + case BPF_TRACE_RAW_TP: return "raw_tp"; + case BPF_TRACE_FENTRY: return "fentry"; + case BPF_TRACE_FEXIT: return "fexit"; + case BPF_MODIFY_RETURN: return "mod_ret"; + case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select"; + case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate"; + default: return libbpf_bpf_attach_type_str(t); + } +} + +int pathname_concat(char *buf, int buf_sz, const char *path, + const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} diff --git a/third_party/bpftool/src/feature.c b/third_party/bpftool/src/feature.c new file mode 100644 index 0000000..0675d6a --- /dev/null +++ b/third_party/bpftool/src/feature.c @@ -0,0 +1,1344 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#ifdef USE_LIBCAP +#include +#endif +#include +#include + +#include +#include + +#include +#include +#include + +#include "main.h" + +#ifndef PROC_SUPER_MAGIC +# define PROC_SUPER_MAGIC 0x9fa0 +#endif + +enum probe_component { + COMPONENT_UNSPEC, + COMPONENT_KERNEL, + COMPONENT_DEVICE, +}; + +#define BPF_HELPER_MAKE_ENTRY(name) [BPF_FUNC_ ## name] = "bpf_" # name +static const char * const helper_name[] = { + __BPF_FUNC_MAPPER(BPF_HELPER_MAKE_ENTRY) +}; + +#undef BPF_HELPER_MAKE_ENTRY + +static bool full_mode; +#ifdef USE_LIBCAP +static bool run_as_unprivileged; +#endif + +/* Miscellaneous utility functions */ + +static bool grep(const char *buffer, const char *pattern) +{ + return !!strstr(buffer, pattern); +} + +static bool check_procfs(void) +{ + struct statfs st_fs; + + if (statfs("/proc", &st_fs) < 0) + return false; + if ((unsigned long)st_fs.f_type != PROC_SUPER_MAGIC) + return false; + + return true; +} + +static void uppercase(char *str, size_t len) +{ + size_t i; + + for (i = 0; i < len && str[i] != '\0'; i++) + str[i] = toupper(str[i]); +} + +/* Printing utility functions */ + +static void +print_bool_feature(const char *feat_name, const char *plain_name, + const char *define_name, bool res, const char *define_prefix) +{ + if (json_output) + jsonw_bool_field(json_wtr, feat_name, res); + else if (define_prefix) + printf("#define %s%sHAVE_%s\n", define_prefix, + res ? "" : "NO_", define_name); + else + printf("%s is %savailable\n", plain_name, res ? "" : "NOT "); +} + +static void print_kernel_option(const char *name, const char *value, + const char *define_prefix) +{ + char *endptr; + int res; + + if (json_output) { + if (!value) { + jsonw_null_field(json_wtr, name); + return; + } + errno = 0; + res = strtol(value, &endptr, 0); + if (!errno && *endptr == '\n') + jsonw_int_field(json_wtr, name, res); + else + jsonw_string_field(json_wtr, name, value); + } else if (define_prefix) { + if (value) + printf("#define %s%s %s\n", define_prefix, + name, value); + else + printf("/* %s%s is not set */\n", define_prefix, name); + } else { + if (value) + printf("%s is set to %s\n", name, value); + else + printf("%s is not set\n", name); + } +} + +static void +print_start_section(const char *json_title, const char *plain_title, + const char *define_comment, const char *define_prefix) +{ + if (json_output) { + jsonw_name(json_wtr, json_title); + jsonw_start_object(json_wtr); + } else if (define_prefix) { + printf("%s\n", define_comment); + } else { + printf("%s\n", plain_title); + } +} + +static void print_end_section(void) +{ + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); +} + +/* Probing functions */ + +static int get_vendor_id(int ifindex) +{ + char ifname[IF_NAMESIZE], path[64], buf[8]; + ssize_t len; + int fd; + + if (!if_indextoname(ifindex, ifname)) + return -1; + + snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifname); + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -1; + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) + return -1; + if (len >= (ssize_t)sizeof(buf)) + return -1; + buf[len] = '\0'; + + return strtol(buf, NULL, 0); +} + +static long read_procfs(const char *path) +{ + char *endptr, *line = NULL; + size_t len = 0; + FILE *fd; + long res; + + fd = fopen(path, "r"); + if (!fd) + return -1; + + res = getline(&line, &len, fd); + fclose(fd); + if (res < 0) + return -1; + + errno = 0; + res = strtol(line, &endptr, 10); + if (errno || *line 
== '\0' || *endptr != '\n') + res = -1; + free(line); + + return res; +} + +static void probe_unprivileged_disabled(void) +{ + long res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/kernel/unprivileged_bpf_disabled"); + if (json_output) { + jsonw_int_field(json_wtr, "unprivileged_bpf_disabled", res); + } else { + switch (res) { + case 0: + printf("bpf() syscall for unprivileged users is enabled\n"); + break; + case 1: + printf("bpf() syscall restricted to privileged users (without recovery)\n"); + break; + case 2: + printf("bpf() syscall restricted to privileged users (admin can change)\n"); + break; + case -1: + printf("Unable to retrieve required privileges for bpf() syscall\n"); + break; + default: + printf("bpf() syscall restriction has unknown value %ld\n", res); + } + } +} + +static void probe_jit_enable(void) +{ + long res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_enable"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_enable", res); + } else { + switch (res) { + case 0: + printf("JIT compiler is disabled\n"); + break; + case 1: + printf("JIT compiler is enabled\n"); + break; + case 2: + printf("JIT compiler is enabled with debugging traces in kernel logs\n"); + break; + case -1: + printf("Unable to retrieve JIT-compiler status\n"); + break; + default: + printf("JIT-compiler status has unknown value %ld\n", + res); + } + } +} + +static void probe_jit_harden(void) +{ + long res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_harden"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_harden", res); + } else { + switch (res) { + case 0: + printf("JIT compiler hardening is disabled\n"); + break; + case 1: + printf("JIT compiler hardening is enabled for unprivileged users\n"); + break; + case 2: + printf("JIT compiler hardening is enabled for all users\n"); + break; + case -1: + printf("Unable to retrieve JIT hardening status\n"); + break; + default: + printf("JIT hardening status has unknown value %ld\n", + res); + } + } +} + +static void probe_jit_kallsyms(void) +{ + long res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_kallsyms"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_kallsyms", res); + } else { + switch (res) { + case 0: + printf("JIT compiler kallsyms exports are disabled\n"); + break; + case 1: + printf("JIT compiler kallsyms exports are enabled for root\n"); + break; + case -1: + printf("Unable to retrieve JIT kallsyms export status\n"); + break; + default: + printf("JIT kallsyms exports status has unknown value %ld\n", res); + } + } +} + +static void probe_jit_limit(void) +{ + long res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_limit"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_limit", res); + } else { + switch (res) { + case -1: + printf("Unable to retrieve global memory limit for JIT compiler for unprivileged users\n"); + break; + default: + printf("Global memory limit for JIT compiler for unprivileged users is %ld bytes\n", res); + } + } +} + +static bool read_next_kernel_config_option(gzFile file, char *buf, size_t n, + char **value) +{ + char *sep; + + while (gzgets(file, buf, n)) { + if (strncmp(buf, "CONFIG_", 7)) + continue; + + sep = strchr(buf, '='); + if (!sep) + continue; + + /* Trim ending '\n' */ + buf[strlen(buf) - 1] = '\0'; + + /* Split on '=' and ensure that a value is present. 
*/ + *sep = '\0'; + if (!sep[1]) + continue; + + *value = sep + 1; + return true; + } + + return false; +} + +static void probe_kernel_image_config(const char *define_prefix) +{ + static const struct { + const char * const name; + bool macro_dump; + } options[] = { + /* Enable BPF */ + { "CONFIG_BPF", }, + /* Enable bpf() syscall */ + { "CONFIG_BPF_SYSCALL", }, + /* Does selected architecture support eBPF JIT compiler */ + { "CONFIG_HAVE_EBPF_JIT", }, + /* Compile eBPF JIT compiler */ + { "CONFIG_BPF_JIT", }, + /* Avoid compiling eBPF interpreter (use JIT only) */ + { "CONFIG_BPF_JIT_ALWAYS_ON", }, + /* Kernel BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF", }, + /* Kernel module BTF debug information available */ + { "CONFIG_DEBUG_INFO_BTF_MODULES", }, + + /* cgroups */ + { "CONFIG_CGROUPS", }, + /* BPF programs attached to cgroups */ + { "CONFIG_CGROUP_BPF", }, + /* bpf_get_cgroup_classid() helper */ + { "CONFIG_CGROUP_NET_CLASSID", }, + /* bpf_skb_{,ancestor_}cgroup_id() helpers */ + { "CONFIG_SOCK_CGROUP_DATA", }, + + /* Tracing: attach BPF to kprobes, tracepoints, etc. */ + { "CONFIG_BPF_EVENTS", }, + /* Kprobes */ + { "CONFIG_KPROBE_EVENTS", }, + /* Uprobes */ + { "CONFIG_UPROBE_EVENTS", }, + /* Tracepoints */ + { "CONFIG_TRACING", }, + /* Syscall tracepoints */ + { "CONFIG_FTRACE_SYSCALLS", }, + /* bpf_override_return() helper support for selected arch */ + { "CONFIG_FUNCTION_ERROR_INJECTION", }, + /* bpf_override_return() helper */ + { "CONFIG_BPF_KPROBE_OVERRIDE", }, + + /* Network */ + { "CONFIG_NET", }, + /* AF_XDP sockets */ + { "CONFIG_XDP_SOCKETS", }, + /* BPF_PROG_TYPE_LWT_* and related helpers */ + { "CONFIG_LWTUNNEL_BPF", }, + /* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */ + { "CONFIG_NET_ACT_BPF", }, + /* BPF_PROG_TYPE_SCHED_CLS, TC filters */ + { "CONFIG_NET_CLS_BPF", }, + /* TC clsact qdisc */ + { "CONFIG_NET_CLS_ACT", }, + /* Ingress filtering with TC */ + { "CONFIG_NET_SCH_INGRESS", }, + /* bpf_skb_get_xfrm_state() helper */ + { "CONFIG_XFRM", }, + /* bpf_get_route_realm() helper */ + { "CONFIG_IP_ROUTE_CLASSID", }, + /* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */ + { "CONFIG_IPV6_SEG6_BPF", }, + /* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */ + { "CONFIG_BPF_LIRC_MODE2", }, + /* BPF stream parser and BPF socket maps */ + { "CONFIG_BPF_STREAM_PARSER", }, + /* xt_bpf module for passing BPF programs to netfilter */ + { "CONFIG_NETFILTER_XT_MATCH_BPF", }, + /* bpfilter back-end for iptables */ + { "CONFIG_BPFILTER", }, + /* bpftilter module with "user mode helper" */ + { "CONFIG_BPFILTER_UMH", }, + + /* test_bpf module for BPF tests */ + { "CONFIG_TEST_BPF", }, + + /* Misc configs useful in BPF C programs */ + /* jiffies <-> sec conversion for bpf_jiffies64() helper */ + { "CONFIG_HZ", true, } + }; + char *values[ARRAY_SIZE(options)] = { }; + struct utsname utsn; + char path[PATH_MAX]; + gzFile file = NULL; + char buf[4096]; + char *value; + size_t i; + + if (!uname(&utsn)) { + snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); + + /* gzopen also accepts uncompressed files. */ + file = gzopen(path, "r"); + } + + if (!file) { + /* Some distributions build with CONFIG_IKCONFIG=y and put the + * config file at /proc/config.gz. 
+ */ + file = gzopen("/proc/config.gz", "r"); + } + if (!file) { + p_info("skipping kernel config, can't open file: %s", + strerror(errno)); + goto end_parse; + } + /* Sanity checks */ + if (!gzgets(file, buf, sizeof(buf)) || + !gzgets(file, buf, sizeof(buf))) { + p_info("skipping kernel config, can't read from file: %s", + strerror(errno)); + goto end_parse; + } + if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { + p_info("skipping kernel config, can't find correct file"); + goto end_parse; + } + + while (read_next_kernel_config_option(file, buf, sizeof(buf), &value)) { + for (i = 0; i < ARRAY_SIZE(options); i++) { + if ((define_prefix && !options[i].macro_dump) || + values[i] || strcmp(buf, options[i].name)) + continue; + + values[i] = strdup(value); + } + } + + for (i = 0; i < ARRAY_SIZE(options); i++) { + if (define_prefix && !options[i].macro_dump) + continue; + print_kernel_option(options[i].name, values[i], define_prefix); + free(values[i]); + } + +end_parse: + if (file) + gzclose(file); +} + +static bool probe_bpf_syscall(const char *define_prefix) +{ + bool res; + + bpf_prog_load(BPF_PROG_TYPE_UNSPEC, NULL, NULL, NULL, 0, NULL); + res = (errno != ENOSYS); + + print_bool_feature("have_bpf_syscall", + "bpf() syscall", + "BPF_SYSCALL", + res, define_prefix); + + return res; +} + +static bool +probe_prog_load_ifindex(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, size_t insns_cnt, + char *log_buf, size_t log_buf_sz, + __u32 ifindex) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = log_buf_sz, + .log_level = log_buf ? 1 : 0, + .prog_ifindex = ifindex, + ); + int fd; + + errno = 0; + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); + if (fd >= 0) + close(fd); + + return fd >= 0 && errno != EINVAL && errno != EOPNOTSUPP; +} + +static bool probe_prog_type_ifindex(enum bpf_prog_type prog_type, __u32 ifindex) +{ + /* nfp returns -EINVAL on exit(0) with TC offload */ + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN() + }; + + return probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), + NULL, 0, ifindex); +} + +static void +probe_prog_type(enum bpf_prog_type prog_type, const char *prog_type_str, + bool *supported_types, const char *define_prefix, __u32 ifindex) +{ + char feat_name[128], plain_desc[128], define_name[128]; + const char *plain_comment = "eBPF program_type "; + size_t maxlen; + bool res; + + if (ifindex) { + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_XDP: + break; + default: + return; + } + + res = probe_prog_type_ifindex(prog_type, ifindex); + } else { + res = libbpf_probe_bpf_prog_type(prog_type, NULL) > 0; + } + +#ifdef USE_LIBCAP + /* Probe may succeed even if program load fails, for unprivileged users + * check that we did not fail because of insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; +#endif + + supported_types[prog_type] |= res; + + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; + if (strlen(prog_type_str) > maxlen) { + p_info("program type name too long"); + return; + } + + sprintf(feat_name, "have_%s_prog_type", prog_type_str); + sprintf(define_name, "%s_prog_type", prog_type_str); + uppercase(define_name, sizeof(define_name)); + sprintf(plain_desc, "%s%s", plain_comment, prog_type_str); + print_bool_feature(feat_name, plain_desc, define_name, res, + define_prefix); +} + +static bool probe_map_type_ifindex(enum bpf_map_type map_type, __u32 ifindex) +{ + 
LIBBPF_OPTS(bpf_map_create_opts, opts); + int key_size, value_size, max_entries; + int fd; + + opts.map_ifindex = ifindex; + + key_size = sizeof(__u32); + value_size = sizeof(__u32); + max_entries = 1; + + fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, + &opts); + if (fd >= 0) + close(fd); + + return fd >= 0; +} + +static void +probe_map_type(enum bpf_map_type map_type, char const *map_type_str, + const char *define_prefix, __u32 ifindex) +{ + char feat_name[128], plain_desc[128], define_name[128]; + const char *plain_comment = "eBPF map_type "; + size_t maxlen; + bool res; + + if (ifindex) { + switch (map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + break; + default: + return; + } + + res = probe_map_type_ifindex(map_type, ifindex); + } else { + res = libbpf_probe_bpf_map_type(map_type, NULL) > 0; + } + + /* Probe result depends on the success of map creation, no additional + * check required for unprivileged users + */ + + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; + if (strlen(map_type_str) > maxlen) { + p_info("map type name too long"); + return; + } + + sprintf(feat_name, "have_%s_map_type", map_type_str); + sprintf(define_name, "%s_map_type", map_type_str); + uppercase(define_name, sizeof(define_name)); + sprintf(plain_desc, "%s%s", plain_comment, map_type_str); + print_bool_feature(feat_name, plain_desc, define_name, res, + define_prefix); +} + +static bool +probe_helper_ifindex(enum bpf_func_id id, enum bpf_prog_type prog_type, + __u32 ifindex) +{ + struct bpf_insn insns[2] = { + BPF_EMIT_CALL(id), + BPF_EXIT_INSN() + }; + char buf[4096] = {}; + bool res; + + probe_prog_load_ifindex(prog_type, insns, ARRAY_SIZE(insns), buf, + sizeof(buf), ifindex); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + + switch (get_vendor_id(ifindex)) { + case 0x19ee: /* Netronome specific */ + res = res && !grep(buf, "not supported by FW") && + !grep(buf, "unsupported function id"); + break; + default: + break; + } + + return res; +} + +static bool +probe_helper_for_progtype(enum bpf_prog_type prog_type, bool supported_type, + const char *define_prefix, unsigned int id, + const char *ptype_name, __u32 ifindex) +{ + bool res = false; + + if (supported_type) { + if (ifindex) + res = probe_helper_ifindex(id, prog_type, ifindex); + else + res = libbpf_probe_bpf_helper(prog_type, id, NULL) > 0; +#ifdef USE_LIBCAP + /* Probe may succeed even if program load fails, for + * unprivileged users check that we did not fail because of + * insufficient permissions + */ + if (run_as_unprivileged && errno == EPERM) + res = false; +#endif + } + + if (json_output) { + if (res) + jsonw_string(json_wtr, helper_name[id]); + } else if (define_prefix) { + printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", + define_prefix, ptype_name, helper_name[id], + res ? 
"1" : "0"); + } else { + if (res) + printf("\n\t- %s", helper_name[id]); + } + + return res; +} + +static void +probe_helpers_for_progtype(enum bpf_prog_type prog_type, + const char *prog_type_str, bool supported_type, + const char *define_prefix, __u32 ifindex) +{ + char feat_name[128]; + unsigned int id; + bool probe_res = false; + + if (ifindex) + /* Only test helpers for offload-able program types */ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_XDP: + break; + default: + return; + } + + if (json_output) { + sprintf(feat_name, "%s_available_helpers", prog_type_str); + jsonw_name(json_wtr, feat_name); + jsonw_start_array(json_wtr); + } else if (!define_prefix) { + printf("eBPF helpers supported for program type %s:", + prog_type_str); + } + + for (id = 1; id < ARRAY_SIZE(helper_name); id++) { + /* Skip helper functions which emit dmesg messages when not in + * the full mode. + */ + switch (id) { + case BPF_FUNC_trace_printk: + case BPF_FUNC_trace_vprintk: + case BPF_FUNC_probe_write_user: + if (!full_mode) + continue; + /* fallthrough */ + default: + probe_res |= probe_helper_for_progtype(prog_type, supported_type, + define_prefix, id, prog_type_str, + ifindex); + } + } + + if (json_output) + jsonw_end_array(json_wtr); + else if (!define_prefix) { + printf("\n"); + if (!probe_res) { + if (!supported_type) + printf("\tProgram type not supported\n"); + else + printf("\tCould not determine which helpers are available\n"); + } + } + + +} + +static void +probe_misc_feature(struct bpf_insn *insns, size_t len, + const char *define_prefix, __u32 ifindex, + const char *feat_name, const char *plain_name, + const char *define_name) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .prog_ifindex = ifindex, + ); + bool res; + int fd; + + errno = 0; + fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", + insns, len, &opts); + res = fd >= 0 || !errno; + + if (fd >= 0) + close(fd); + + print_bool_feature(feat_name, plain_name, define_name, res, + define_prefix); +} + +/* + * Probe for availability of kernel commit (5.3): + * + * c04c0d2b968a ("bpf: increase complexity limit and maximum program size") + */ +static void probe_large_insn_limit(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[BPF_MAXINSNS + 1]; + int i; + + for (i = 0; i < BPF_MAXINSNS; i++) + insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); + insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_large_insn_limit", + "Large program size limit", + "LARGE_INSN_LIMIT"); +} + +/* + * Probe for bounded loop support introduced in commit 2589726d12a1 + * ("bpf: introduce bounded loops"). + */ +static void +probe_bounded_loops(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 10), + BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, -2), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_bounded_loops", + "Bounded loop support", + "BOUNDED_LOOPS"); +} + +/* + * Probe for the v2 instruction set extension introduced in commit 92b31a9af73b + * ("bpf: add BPF_J{LT,LE,SLT,SLE} instructions"). 
+ */ +static void +probe_v2_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v2_isa_extension", + "ISA extension v2", + "V2_ISA_EXTENSION"); +} + +/* + * Probe for the v3 instruction set extension introduced in commit 092ed0968bb6 + * ("bpf: verifier support JMP32"). + */ +static void +probe_v3_isa_extension(const char *define_prefix, __u32 ifindex) +{ + struct bpf_insn insns[4] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP32_IMM(BPF_JLT, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + probe_misc_feature(insns, ARRAY_SIZE(insns), + define_prefix, ifindex, + "have_v3_isa_extension", + "ISA extension v3", + "V3_ISA_EXTENSION"); +} + +static void +section_system_config(enum probe_component target, const char *define_prefix) +{ + switch (target) { + case COMPONENT_KERNEL: + case COMPONENT_UNSPEC: + print_start_section("system_config", + "Scanning system configuration...", + "/*** Misc kernel config items ***/", + define_prefix); + if (!define_prefix) { + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } + } + probe_kernel_image_config(define_prefix); + print_end_section(); + break; + default: + break; + } +} + +static bool section_syscall_config(const char *define_prefix) +{ + bool res; + + print_start_section("syscall_config", + "Scanning system call availability...", + "/*** System call availability ***/", + define_prefix); + res = probe_bpf_syscall(define_prefix); + print_end_section(); + + return res; +} + +static void +section_program_types(bool *supported_types, const char *define_prefix, + __u32 ifindex) +{ + unsigned int prog_type = BPF_PROG_TYPE_UNSPEC; + const char *prog_type_str; + + print_start_section("program_types", + "Scanning eBPF program types...", + "/*** eBPF program types ***/", + define_prefix); + + while (true) { + prog_type++; + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + /* libbpf will return NULL for variants unknown to it. */ + if (!prog_type_str) + break; + + probe_prog_type(prog_type, prog_type_str, supported_types, define_prefix, + ifindex); + } + + print_end_section(); +} + +static void section_map_types(const char *define_prefix, __u32 ifindex) +{ + unsigned int map_type = BPF_MAP_TYPE_UNSPEC; + const char *map_type_str; + + print_start_section("map_types", + "Scanning eBPF map types...", + "/*** eBPF map types ***/", + define_prefix); + + while (true) { + map_type++; + map_type_str = libbpf_bpf_map_type_str(map_type); + /* libbpf will return NULL for variants unknown to it. 
*/ + if (!map_type_str) + break; + + probe_map_type(map_type, map_type_str, define_prefix, ifindex); + } + + print_end_section(); +} + +static void +section_helpers(bool *supported_types, const char *define_prefix, __u32 ifindex) +{ + unsigned int prog_type = BPF_PROG_TYPE_UNSPEC; + const char *prog_type_str; + + print_start_section("helpers", + "Scanning eBPF helper functions...", + "/*** eBPF helper functions ***/", + define_prefix); + + if (define_prefix) + printf("/*\n" + " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n" + " * to determine if is available for ,\n" + " * e.g.\n" + " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" + " * // do stuff with this helper\n" + " * #elif\n" + " * // use a workaround\n" + " * #endif\n" + " */\n" + "#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" + " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", + define_prefix, define_prefix, define_prefix, + define_prefix); + while (true) { + prog_type++; + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + /* libbpf will return NULL for variants unknown to it. */ + if (!prog_type_str) + break; + + probe_helpers_for_progtype(prog_type, prog_type_str, + supported_types[prog_type], + define_prefix, + ifindex); + } + + print_end_section(); +} + +static void section_misc(const char *define_prefix, __u32 ifindex) +{ + print_start_section("misc", + "Scanning miscellaneous eBPF features...", + "/*** eBPF misc features ***/", + define_prefix); + probe_large_insn_limit(define_prefix, ifindex); + probe_bounded_loops(define_prefix, ifindex); + probe_v2_isa_extension(define_prefix, ifindex); + probe_v3_isa_extension(define_prefix, ifindex); + print_end_section(); +} + +#ifdef USE_LIBCAP +#define capability(c) { c, false, #c } +#define capability_msg(a, i) a[i].set ? "" : a[i].name, a[i].set ? "" : ", " +#endif + +static int handle_perms(void) +{ +#ifdef USE_LIBCAP + struct { + cap_value_t cap; + bool set; + char name[14]; /* strlen("CAP_SYS_ADMIN") */ + } bpf_caps[] = { + capability(CAP_SYS_ADMIN), +#ifdef CAP_BPF + capability(CAP_BPF), + capability(CAP_NET_ADMIN), + capability(CAP_PERFMON), +#endif + }; + cap_value_t cap_list[ARRAY_SIZE(bpf_caps)]; + unsigned int i, nb_bpf_caps = 0; + bool cap_sys_admin_only = true; + cap_flag_value_t val; + int res = -1; + cap_t caps; + + caps = cap_get_proc(); + if (!caps) { + p_err("failed to get capabilities for process: %s", + strerror(errno)); + return -1; + } + +#ifdef CAP_BPF + if (CAP_IS_SUPPORTED(CAP_BPF)) + cap_sys_admin_only = false; +#endif + + for (i = 0; i < ARRAY_SIZE(bpf_caps); i++) { + const char *cap_name = bpf_caps[i].name; + cap_value_t cap = bpf_caps[i].cap; + + if (cap_get_flag(caps, cap, CAP_EFFECTIVE, &val)) { + p_err("bug: failed to retrieve %s status: %s", cap_name, + strerror(errno)); + goto exit_free; + } + + if (val == CAP_SET) { + bpf_caps[i].set = true; + cap_list[nb_bpf_caps++] = cap; + } + + if (cap_sys_admin_only) + /* System does not know about CAP_BPF, meaning that + * CAP_SYS_ADMIN is the only capability required. We + * just checked it, break. 
+ */ + break; + } + + if ((run_as_unprivileged && !nb_bpf_caps) || + (!run_as_unprivileged && nb_bpf_caps == ARRAY_SIZE(bpf_caps)) || + (!run_as_unprivileged && cap_sys_admin_only && nb_bpf_caps)) { + /* We are all good, exit now */ + res = 0; + goto exit_free; + } + + if (!run_as_unprivileged) { + if (cap_sys_admin_only) + p_err("missing %s, required for full feature probing; run as root or use 'unprivileged'", + bpf_caps[0].name); + else + p_err("missing %s%s%s%s%s%s%s%srequired for full feature probing; run as root or use 'unprivileged'", + capability_msg(bpf_caps, 0), +#ifdef CAP_BPF + capability_msg(bpf_caps, 1), + capability_msg(bpf_caps, 2), + capability_msg(bpf_caps, 3) +#else + "", "", "", "", "", "" +#endif /* CAP_BPF */ + ); + goto exit_free; + } + + /* if (run_as_unprivileged && nb_bpf_caps > 0), drop capabilities. */ + if (cap_set_flag(caps, CAP_EFFECTIVE, nb_bpf_caps, cap_list, + CAP_CLEAR)) { + p_err("bug: failed to clear capabilities: %s", strerror(errno)); + goto exit_free; + } + + if (cap_set_proc(caps)) { + p_err("failed to drop capabilities: %s", strerror(errno)); + goto exit_free; + } + + res = 0; + +exit_free: + if (cap_free(caps) && !res) { + p_err("failed to clear storage object for capabilities: %s", + strerror(errno)); + res = -1; + } + + return res; +#else + /* Detection assumes user has specific privileges. + * We do not use libcap so let's approximate, and restrict usage to + * root user only. + */ + if (geteuid()) { + p_err("full feature probing requires root privileges"); + return -1; + } + + return 0; +#endif /* USE_LIBCAP */ +} + +static int do_probe(int argc, char **argv) +{ + enum probe_component target = COMPONENT_UNSPEC; + const char *define_prefix = NULL; + bool supported_types[128] = {}; + __u32 ifindex = 0; + char *ifname; + + set_max_rlimit(); + + while (argc) { + if (is_prefix(*argv, "kernel")) { + if (target != COMPONENT_UNSPEC) { + p_err("component to probe already specified"); + return -1; + } + target = COMPONENT_KERNEL; + NEXT_ARG(); + } else if (is_prefix(*argv, "dev")) { + NEXT_ARG(); + + if (target != COMPONENT_UNSPEC || ifindex) { + p_err("component to probe already specified"); + return -1; + } + if (!REQ_ARGS(1)) + return -1; + + target = COMPONENT_DEVICE; + ifname = GET_ARG(); + ifindex = if_nametoindex(ifname); + if (!ifindex) { + p_err("unrecognized netdevice '%s': %s", ifname, + strerror(errno)); + return -1; + } + } else if (is_prefix(*argv, "full")) { + full_mode = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "macros") && !define_prefix) { + define_prefix = ""; + NEXT_ARG(); + } else if (is_prefix(*argv, "prefix")) { + if (!define_prefix) { + p_err("'prefix' argument can only be use after 'macros'"); + return -1; + } + if (strcmp(define_prefix, "")) { + p_err("'prefix' already defined"); + return -1; + } + NEXT_ARG(); + + if (!REQ_ARGS(1)) + return -1; + define_prefix = GET_ARG(); + } else if (is_prefix(*argv, "unprivileged")) { +#ifdef USE_LIBCAP + run_as_unprivileged = true; + NEXT_ARG(); +#else + p_err("unprivileged run not supported, recompile bpftool with libcap"); + return -1; +#endif + } else { + p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", + *argv); + return -1; + } + } + + /* Full feature detection requires specific privileges. + * Let's approximate, and warn if user is not root. 
+ */ + if (handle_perms()) + return -1; + + if (json_output) { + define_prefix = NULL; + jsonw_start_object(json_wtr); + } + + section_system_config(target, define_prefix); + if (!section_syscall_config(define_prefix)) + /* bpf() syscall unavailable, don't probe other BPF features */ + goto exit_close_json; + section_program_types(supported_types, define_prefix, ifindex); + section_map_types(define_prefix, ifindex); + section_helpers(supported_types, define_prefix, ifindex); + section_misc(define_prefix, ifindex); + +exit_close_json: + if (json_output) + /* End root object */ + jsonw_end_object(json_wtr); + + return 0; +} + +static const char *get_helper_name(unsigned int id) +{ + if (id >= ARRAY_SIZE(helper_name)) + return NULL; + + return helper_name[id]; +} + +static int do_list_builtins(int argc, char **argv) +{ + const char *(*get_name)(unsigned int id); + unsigned int id = 0; + + if (argc < 1) + usage(); + + if (is_prefix(*argv, "prog_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_prog_type_str; + } else if (is_prefix(*argv, "map_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_map_type_str; + } else if (is_prefix(*argv, "attach_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_attach_type_str; + } else if (is_prefix(*argv, "link_types")) { + get_name = (const char *(*)(unsigned int))libbpf_bpf_link_type_str; + } else if (is_prefix(*argv, "helpers")) { + get_name = get_helper_name; + } else { + p_err("expected 'prog_types', 'map_types', 'attach_types', 'link_types' or 'helpers', got: %s", *argv); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); /* root array */ + + while (true) { + const char *name; + + name = get_name(id++); + if (!name) + break; + if (json_output) + jsonw_string(json_wtr, name); + else + printf("%s\n", name); + } + + if (json_output) + jsonw_end_array(json_wtr); /* root array */ + + return 0; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s probe [COMPONENT] [full] [unprivileged] [macros [prefix PREFIX]]\n" + " %1$s %2$s list_builtins GROUP\n" + " %1$s %2$s help\n" + "\n" + " COMPONENT := { kernel | dev NAME }\n" + " GROUP := { prog_types | map_types | attach_types | link_types | helpers }\n" + " " HELP_SPEC_OPTIONS " }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "probe", do_probe }, + { "list_builtins", do_list_builtins }, + { "help", do_help }, + { 0 } +}; + +int do_feature(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/gen.c b/third_party/bpftool/src/gen.c new file mode 100644 index 0000000..2883660 --- /dev/null +++ b/third_party/bpftool/src/gen.c @@ -0,0 +1,2333 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2019 Facebook */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +#define MAX_OBJ_NAME_LEN 64 + +static void sanitize_identifier(char *name) +{ + int i; + + for (i = 0; name[i]; i++) + if (!isalnum(name[i]) && name[i] != '_') + name[i] = '_'; +} + +static bool str_has_prefix(const char *str, const char *prefix) +{ + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +static bool str_has_suffix(const char *str, const char *suffix) +{ + 
size_t i, n1 = strlen(str), n2 = strlen(suffix); + + if (n1 < n2) + return false; + + for (i = 0; i < n2; i++) { + if (str[n1 - i - 1] != suffix[n2 - i - 1]) + return false; + } + + return true; +} + +static void get_obj_name(char *name, const char *file) +{ + /* Using basename() GNU version which doesn't modify arg. */ + strncpy(name, basename(file), MAX_OBJ_NAME_LEN - 1); + name[MAX_OBJ_NAME_LEN - 1] = '\0'; + if (str_has_suffix(name, ".o")) + name[strlen(name) - 2] = '\0'; + sanitize_identifier(name); +} + +static void get_header_guard(char *guard, const char *obj_name, const char *suffix) +{ + int i; + + sprintf(guard, "__%s_%s__", obj_name, suffix); + for (i = 0; guard[i]; i++) + guard[i] = toupper(guard[i]); +} + +static bool get_map_ident(const struct bpf_map *map, char *buf, size_t buf_sz) +{ + static const char *sfxs[] = { ".data", ".rodata", ".bss", ".kconfig" }; + const char *name = bpf_map__name(map); + int i, n; + + if (!bpf_map__is_internal(map)) { + snprintf(buf, buf_sz, "%s", name); + return true; + } + + for (i = 0, n = ARRAY_SIZE(sfxs); i < n; i++) { + const char *sfx = sfxs[i], *p; + + p = strstr(name, sfx); + if (p) { + snprintf(buf, buf_sz, "%s", p + 1); + sanitize_identifier(buf); + return true; + } + } + + return false; +} + +static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) +{ + static const char *pfxs[] = { ".data", ".rodata", ".bss", ".kconfig" }; + int i, n; + + for (i = 0, n = ARRAY_SIZE(pfxs); i < n; i++) { + const char *pfx = pfxs[i]; + + if (str_has_prefix(sec_name, pfx)) { + snprintf(buf, buf_sz, "%s", sec_name + 1); + sanitize_identifier(buf); + return true; + } + } + + return false; +} + +static void codegen_btf_dump_printf(void *ctx, const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static int codegen_datasec_def(struct bpf_object *obj, + struct btf *btf, + struct btf_dump *d, + const struct btf_type *sec, + const char *obj_name) +{ + const char *sec_name = btf__name_by_offset(btf, sec->name_off); + const struct btf_var_secinfo *sec_var = btf_var_secinfos(sec); + int i, err, off = 0, pad_cnt = 0, vlen = btf_vlen(sec); + char var_ident[256], sec_ident[256]; + bool strip_mods = false; + + if (!get_datasec_ident(sec_name, sec_ident, sizeof(sec_ident))) + return 0; + + if (strcmp(sec_name, ".kconfig") != 0) + strip_mods = true; + + printf(" struct %s__%s {\n", obj_name, sec_ident); + for (i = 0; i < vlen; i++, sec_var++) { + const struct btf_type *var = btf__type_by_id(btf, sec_var->type); + const char *var_name = btf__name_by_offset(btf, var->name_off); + DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, + .field_name = var_ident, + .indent_level = 2, + .strip_mods = strip_mods, + ); + int need_off = sec_var->offset, align_off, align; + __u32 var_type_id = var->type; + + /* static variables are not exposed through BPF skeleton */ + if (btf_var(var)->linkage == BTF_VAR_STATIC) + continue; + + if (off > need_off) { + p_err("Something is wrong for %s's variable #%d: need offset %d, already at %d.\n", + sec_name, i, need_off, off); + return -EINVAL; + } + + align = btf__align_of(btf, var->type); + if (align <= 0) { + p_err("Failed to determine alignment of variable '%s': %d", + var_name, align); + return -EINVAL; + } + /* Assume 32-bit architectures when generating data section + * struct memory layout. Given bpftool can't know which target + * host architecture it's emitting skeleton for, we need to be + * conservative and assume 32-bit one to ensure enough padding + * bytes are generated for pointer and long types. 
This will + * still work correctly for 64-bit architectures, because in + * the worst case we'll generate unnecessary padding field, + * which on 64-bit architectures is not strictly necessary and + * would be handled by natural 8-byte alignment. But it still + * will be a correct memory layout, based on recorded offsets + * in BTF. + */ + if (align > 4) + align = 4; + + align_off = (off + align - 1) / align * align; + if (align_off != need_off) { + printf("\t\tchar __pad%d[%d];\n", + pad_cnt, need_off - off); + pad_cnt++; + } + + /* sanitize variable name, e.g., for static vars inside + * a function, it's name is '.', + * which we'll turn into a '_' + */ + var_ident[0] = '\0'; + strncat(var_ident, var_name, sizeof(var_ident) - 1); + sanitize_identifier(var_ident); + + printf("\t\t"); + err = btf_dump__emit_type_decl(d, var_type_id, &opts); + if (err) + return err; + printf(";\n"); + + off = sec_var->offset + sec_var->size; + } + printf(" } *%s;\n", sec_ident); + return 0; +} + +static const struct btf_type *find_type_for_map(struct btf *btf, const char *map_ident) +{ + int n = btf__type_cnt(btf), i; + char sec_ident[256]; + + for (i = 1; i < n; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name; + + if (!btf_is_datasec(t)) + continue; + + name = btf__str_by_offset(btf, t->name_off); + if (!get_datasec_ident(name, sec_ident, sizeof(sec_ident))) + continue; + + if (strcmp(sec_ident, map_ident) == 0) + return t; + } + return NULL; +} + +static bool is_internal_mmapable_map(const struct bpf_map *map, char *buf, size_t sz) +{ + if (!bpf_map__is_internal(map) || !(bpf_map__map_flags(map) & BPF_F_MMAPABLE)) + return false; + + if (!get_map_ident(map, buf, sz)) + return false; + + return true; +} + +static int codegen_datasecs(struct bpf_object *obj, const char *obj_name) +{ + struct btf *btf = bpf_object__btf(obj); + struct btf_dump *d; + struct bpf_map *map; + const struct btf_type *sec; + char map_ident[256]; + int err = 0; + + d = btf_dump__new(btf, codegen_btf_dump_printf, NULL, NULL); + if (!d) + return -errno; + + bpf_object__for_each_map(map, obj) { + /* only generate definitions for memory-mapped internal maps */ + if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + continue; + + sec = find_type_for_map(btf, map_ident); + + /* In some cases (e.g., sections like .rodata.cst16 containing + * compiler allocated string constants only) there will be + * special internal maps with no corresponding DATASEC BTF + * type. In such case, generate empty structs for each such + * map. It will still be memory-mapped and its contents + * accessible from user-space through BPF skeleton. 
+ */ + if (!sec) { + printf(" struct %s__%s {\n", obj_name, map_ident); + printf(" } *%s;\n", map_ident); + } else { + err = codegen_datasec_def(obj, btf, d, sec, obj_name); + if (err) + goto out; + } + } + + +out: + btf_dump__free(d); + return err; +} + +static bool btf_is_ptr_to_func_proto(const struct btf *btf, + const struct btf_type *v) +{ + return btf_is_ptr(v) && btf_is_func_proto(btf__type_by_id(btf, v->type)); +} + +static int codegen_subskel_datasecs(struct bpf_object *obj, const char *obj_name) +{ + struct btf *btf = bpf_object__btf(obj); + struct btf_dump *d; + struct bpf_map *map; + const struct btf_type *sec, *var; + const struct btf_var_secinfo *sec_var; + int i, err = 0, vlen; + char map_ident[256], sec_ident[256]; + bool strip_mods = false, needs_typeof = false; + const char *sec_name, *var_name; + __u32 var_type_id; + + d = btf_dump__new(btf, codegen_btf_dump_printf, NULL, NULL); + if (!d) + return -errno; + + bpf_object__for_each_map(map, obj) { + /* only generate definitions for memory-mapped internal maps */ + if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + continue; + + sec = find_type_for_map(btf, map_ident); + if (!sec) + continue; + + sec_name = btf__name_by_offset(btf, sec->name_off); + if (!get_datasec_ident(sec_name, sec_ident, sizeof(sec_ident))) + continue; + + strip_mods = strcmp(sec_name, ".kconfig") != 0; + printf(" struct %s__%s {\n", obj_name, sec_ident); + + sec_var = btf_var_secinfos(sec); + vlen = btf_vlen(sec); + for (i = 0; i < vlen; i++, sec_var++) { + DECLARE_LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts, + .indent_level = 2, + .strip_mods = strip_mods, + /* we'll print the name separately */ + .field_name = "", + ); + + var = btf__type_by_id(btf, sec_var->type); + var_name = btf__name_by_offset(btf, var->name_off); + var_type_id = var->type; + + /* static variables are not exposed through BPF skeleton */ + if (btf_var(var)->linkage == BTF_VAR_STATIC) + continue; + + /* The datasec member has KIND_VAR but we want the + * underlying type of the variable (e.g. KIND_INT). + */ + var = skip_mods_and_typedefs(btf, var->type, NULL); + + printf("\t\t"); + /* Func and array members require special handling. + * Instead of producing `typename *var`, they produce + * `typeof(typename) *var`. This allows us to keep a + * similar syntax where the identifier is just prefixed + * by *, allowing us to ignore C declaration minutiae. + */ + needs_typeof = btf_is_array(var) || btf_is_ptr_to_func_proto(btf, var); + if (needs_typeof) + printf("typeof("); + + err = btf_dump__emit_type_decl(d, var_type_id, &opts); + if (err) + goto out; + + if (needs_typeof) + printf(")"); + + printf(" *%s;\n", var_name); + } + printf(" } %s;\n", sec_ident); + } + +out: + btf_dump__free(d); + return err; +} + +static void codegen(const char *template, ...) 
+{ + const char *src, *end; + int skip_tabs = 0, n; + char *s, *dst; + va_list args; + char c; + + n = strlen(template); + s = malloc(n + 1); + if (!s) + exit(-1); + src = template; + dst = s; + + /* find out "baseline" indentation to skip */ + while ((c = *src++)) { + if (c == '\t') { + skip_tabs++; + } else if (c == '\n') { + break; + } else { + p_err("unrecognized character at pos %td in template '%s': '%c'", + src - template - 1, template, c); + free(s); + exit(-1); + } + } + + while (*src) { + /* skip baseline indentation tabs */ + for (n = skip_tabs; n > 0; n--, src++) { + if (*src != '\t') { + p_err("not enough tabs at pos %td in template '%s'", + src - template - 1, template); + free(s); + exit(-1); + } + } + /* trim trailing whitespace */ + end = strchrnul(src, '\n'); + for (n = end - src; n > 0 && isspace(src[n - 1]); n--) + ; + memcpy(dst, src, n); + dst += n; + if (*end) + *dst++ = '\n'; + src = *end ? end + 1 : end; + } + *dst++ = '\0'; + + /* print out using adjusted template */ + va_start(args, template); + n = vprintf(s, args); + va_end(args); + + free(s); +} + +static void print_hex(const char *data, int data_sz) +{ + int i, len; + + for (i = 0, len = 0; i < data_sz; i++) { + int w = data[i] ? 4 : 2; + + len += w; + if (len > 78) { + printf("\\\n"); + len = w; + } + if (!data[i]) + printf("\\0"); + else + printf("\\x%02x", (unsigned char)data[i]); + } +} + +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = (size_t)roundup(bpf_map__value_size(map), 8) * bpf_map__max_entries(map); + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + +/* Emit type size asserts for all top-level fields in memory-mapped internal maps. */ +static void codegen_asserts(struct bpf_object *obj, const char *obj_name) +{ + struct btf *btf = bpf_object__btf(obj); + struct bpf_map *map; + struct btf_var_secinfo *sec_var; + int i, vlen; + const struct btf_type *sec; + char map_ident[256], var_ident[256]; + + if (!btf) + return; + + codegen("\ + \n\ + __attribute__((unused)) static void \n\ + %1$s__assert(struct %1$s *s __attribute__((unused))) \n\ + { \n\ + #ifdef __cplusplus \n\ + #define _Static_assert static_assert \n\ + #endif \n\ + ", obj_name); + + bpf_object__for_each_map(map, obj) { + if (!is_internal_mmapable_map(map, map_ident, sizeof(map_ident))) + continue; + + sec = find_type_for_map(btf, map_ident); + if (!sec) { + /* best effort, couldn't find the type for this map */ + continue; + } + + sec_var = btf_var_secinfos(sec); + vlen = btf_vlen(sec); + + for (i = 0; i < vlen; i++, sec_var++) { + const struct btf_type *var = btf__type_by_id(btf, sec_var->type); + const char *var_name = btf__name_by_offset(btf, var->name_off); + long var_size; + + /* static variables are not exposed through BPF skeleton */ + if (btf_var(var)->linkage == BTF_VAR_STATIC) + continue; + + var_size = btf__resolve_size(btf, var->type); + if (var_size < 0) + continue; + + var_ident[0] = '\0'; + strncat(var_ident, var_name, sizeof(var_ident) - 1); + sanitize_identifier(var_ident); + + printf("\t_Static_assert(sizeof(s->%s->%s) == %ld, \"unexpected size of '%s'\");\n", + map_ident, var_ident, var_size, var_ident); + } + } + codegen("\ + \n\ + #ifdef __cplusplus \n\ + #undef _Static_assert \n\ + #endif \n\ + } \n\ + "); +} + +static void codegen_attach_detach(struct bpf_object *obj, const char *obj_name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + const char *tp_name; + + codegen("\ + \n\ + \n\ + static 
inline int \n\ + %1$s__%2$s__attach(struct %1$s *skel) \n\ + { \n\ + int prog_fd = skel->progs.%2$s.prog_fd; \n\ + ", obj_name, bpf_program__name(prog)); + + switch (bpf_program__type(prog)) { + case BPF_PROG_TYPE_RAW_TRACEPOINT: + tp_name = strchr(bpf_program__section_name(prog), '/') + 1; + printf("\tint fd = skel_raw_tracepoint_open(\"%s\", prog_fd);\n", tp_name); + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + if (bpf_program__expected_attach_type(prog) == BPF_TRACE_ITER) + printf("\tint fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);\n"); + else + printf("\tint fd = skel_raw_tracepoint_open(NULL, prog_fd);\n"); + break; + default: + printf("\tint fd = ((void)prog_fd, 0); /* auto-attach not supported */\n"); + break; + } + codegen("\ + \n\ + \n\ + if (fd > 0) \n\ + skel->links.%1$s_fd = fd; \n\ + return fd; \n\ + } \n\ + ", bpf_program__name(prog)); + } + + codegen("\ + \n\ + \n\ + static inline int \n\ + %1$s__attach(struct %1$s *skel) \n\ + { \n\ + int ret = 0; \n\ + \n\ + ", obj_name); + + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + ret = ret < 0 ? ret : %1$s__%2$s__attach(skel); \n\ + ", obj_name, bpf_program__name(prog)); + } + + codegen("\ + \n\ + return ret < 0 ? ret : 0; \n\ + } \n\ + \n\ + static inline void \n\ + %1$s__detach(struct %1$s *skel) \n\ + { \n\ + ", obj_name); + + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + skel_closenz(skel->links.%1$s_fd); \n\ + ", bpf_program__name(prog)); + } + + codegen("\ + \n\ + } \n\ + "); +} + +static void codegen_destroy(struct bpf_object *obj, const char *obj_name) +{ + struct bpf_program *prog; + struct bpf_map *map; + char ident[256]; + + codegen("\ + \n\ + static void \n\ + %1$s__destroy(struct %1$s *skel) \n\ + { \n\ + if (!skel) \n\ + return; \n\ + %1$s__detach(skel); \n\ + ", + obj_name); + + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + skel_closenz(skel->progs.%1$s.prog_fd); \n\ + ", bpf_program__name(prog)); + } + + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + if (bpf_map__is_internal(map) && + (bpf_map__map_flags(map) & BPF_F_MMAPABLE)) + printf("\tskel_free_map_data(skel->%1$s, skel->maps.%1$s.initial_value, %2$zd);\n", + ident, bpf_map_mmap_sz(map)); + codegen("\ + \n\ + skel_closenz(skel->maps.%1$s.map_fd); \n\ + ", ident); + } + codegen("\ + \n\ + skel_free(skel); \n\ + } \n\ + ", + obj_name); +} + +static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *header_guard) +{ + DECLARE_LIBBPF_OPTS(gen_loader_opts, opts); + struct bpf_map *map; + char ident[256]; + int err = 0; + + err = bpf_object__gen_loader(obj, &opts); + if (err) + return err; + + err = bpf_object__load(obj); + if (err) { + p_err("failed to load object file"); + goto out; + } + /* If there was no error during load then gen_loader_opts + * are populated with the loader program. 
+ */ + + /* finish generating 'struct skel' */ + codegen("\ + \n\ + }; \n\ + ", obj_name); + + + codegen_attach_detach(obj, obj_name); + + codegen_destroy(obj, obj_name); + + codegen("\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(void) \n\ + { \n\ + struct %1$s *skel; \n\ + \n\ + skel = skel_alloc(sizeof(*skel)); \n\ + if (!skel) \n\ + goto cleanup; \n\ + skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\ + ", + obj_name, opts.data_sz); + bpf_object__for_each_map(map, obj) { + const void *mmap_data = NULL; + size_t mmap_size = 0; + + if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + continue; + + codegen("\ + \n\ + skel->%1$s = skel_prep_map_data((void *)\"\\ \n\ + ", ident); + mmap_data = bpf_map__initial_value(map, &mmap_size); + print_hex(mmap_data, mmap_size); + codegen("\ + \n\ + \", %1$zd, %2$zd); \n\ + if (!skel->%3$s) \n\ + goto cleanup; \n\ + skel->maps.%3$s.initial_value = (__u64) (long) skel->%3$s;\n\ + ", bpf_map_mmap_sz(map), mmap_size, ident); + } + codegen("\ + \n\ + return skel; \n\ + cleanup: \n\ + %1$s__destroy(skel); \n\ + return NULL; \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__load(struct %1$s *skel) \n\ + { \n\ + struct bpf_load_and_run_opts opts = {}; \n\ + int err; \n\ + \n\ + opts.ctx = (struct bpf_loader_ctx *)skel; \n\ + opts.data_sz = %2$d; \n\ + opts.data = (void *)\"\\ \n\ + ", + obj_name, opts.data_sz); + print_hex(opts.data, opts.data_sz); + codegen("\ + \n\ + \"; \n\ + "); + + codegen("\ + \n\ + opts.insns_sz = %d; \n\ + opts.insns = (void *)\"\\ \n\ + ", + opts.insns_sz); + print_hex(opts.insns, opts.insns_sz); + codegen("\ + \n\ + \"; \n\ + err = bpf_load_and_run(&opts); \n\ + if (err < 0) \n\ + return err; \n\ + ", obj_name); + bpf_object__for_each_map(map, obj) { + const char *mmap_flags; + + if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + continue; + + if (bpf_map__map_flags(map) & BPF_F_RDONLY_PROG) + mmap_flags = "PROT_READ"; + else + mmap_flags = "PROT_READ | PROT_WRITE"; + + codegen("\ + \n\ + skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value, \n\ + %2$zd, %3$s, skel->maps.%1$s.map_fd);\n\ + if (!skel->%1$s) \n\ + return -ENOMEM; \n\ + ", + ident, bpf_map_mmap_sz(map), mmap_flags); + } + codegen("\ + \n\ + return 0; \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_and_load(void) \n\ + { \n\ + struct %1$s *skel; \n\ + \n\ + skel = %1$s__open(); \n\ + if (!skel) \n\ + return NULL; \n\ + if (%1$s__load(skel)) { \n\ + %1$s__destroy(skel); \n\ + return NULL; \n\ + } \n\ + return skel; \n\ + } \n\ + \n\ + ", obj_name); + + codegen_asserts(obj, obj_name); + + codegen("\ + \n\ + \n\ + #endif /* %s */ \n\ + ", + header_guard); + err = 0; +out: + return err; +} + +static void +codegen_maps_skeleton(struct bpf_object *obj, size_t map_cnt, bool mmaped) +{ + struct bpf_map *map; + char ident[256]; + size_t i; + + if (!map_cnt) + return; + + codegen("\ + \n\ + \n\ + /* maps */ \n\ + s->map_cnt = %zu; \n\ + s->map_skel_sz = sizeof(*s->maps); \n\ + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);\n\ + if (!s->maps) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + ", + map_cnt + ); + i = 0; + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + + codegen("\ + \n\ + \n\ + s->maps[%zu].name = \"%s\"; \n\ + s->maps[%zu].map = &obj->maps.%s; \n\ + ", + i, bpf_map__name(map), i, ident); + /* memory-mapped internal maps */ + if (mmaped && is_internal_mmapable_map(map, ident, sizeof(ident))) { + printf("\ts->maps[%zu].mmaped 
= (void **)&obj->%s;\n", + i, ident); + } + i++; + } +} + +static void +codegen_progs_skeleton(struct bpf_object *obj, size_t prog_cnt, bool populate_links) +{ + struct bpf_program *prog; + int i; + + if (!prog_cnt) + return; + + codegen("\ + \n\ + \n\ + /* programs */ \n\ + s->prog_cnt = %zu; \n\ + s->prog_skel_sz = sizeof(*s->progs); \n\ + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);\n\ + if (!s->progs) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + ", + prog_cnt + ); + i = 0; + bpf_object__for_each_program(prog, obj) { + codegen("\ + \n\ + \n\ + s->progs[%1$zu].name = \"%2$s\"; \n\ + s->progs[%1$zu].prog = &obj->progs.%2$s;\n\ + ", + i, bpf_program__name(prog)); + + if (populate_links) { + codegen("\ + \n\ + s->progs[%1$zu].link = &obj->links.%2$s;\n\ + ", + i, bpf_program__name(prog)); + } + i++; + } +} + +static int do_skeleton(int argc, char **argv) +{ + char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; + size_t map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + char obj_name[MAX_OBJ_NAME_LEN] = "", *obj_data; + struct bpf_object *obj = NULL; + const char *file; + char ident[256]; + struct bpf_program *prog; + int fd, err = -1; + struct bpf_map *map; + struct btf *btf; + struct stat st; + + if (!REQ_ARGS(1)) { + usage(); + return -1; + } + file = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + if (obj_name[0] != '\0') { + p_err("object name already specified"); + return -1; + } + + strncpy(obj_name, *argv, MAX_OBJ_NAME_LEN - 1); + obj_name[MAX_OBJ_NAME_LEN - 1] = '\0'; + } else { + p_err("unknown arg %s", *argv); + return -1; + } + + NEXT_ARG(); + } + + if (argc) { + p_err("extra unknown arguments"); + return -1; + } + + if (stat(file, &st)) { + p_err("failed to stat() %s: %s", file, strerror(errno)); + return -1; + } + file_sz = st.st_size; + mmap_sz = roundup(file_sz, sysconf(_SC_PAGE_SIZE)); + fd = open(file, O_RDONLY); + if (fd < 0) { + p_err("failed to open() %s: %s", file, strerror(errno)); + return -1; + } + obj_data = mmap(NULL, mmap_sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (obj_data == MAP_FAILED) { + obj_data = NULL; + p_err("failed to mmap() %s: %s", file, strerror(errno)); + goto out; + } + if (obj_name[0] == '\0') + get_obj_name(obj_name, file); + opts.object_name = obj_name; + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + opts.kernel_log_level = 1 + 2 + 4; + obj = bpf_object__open_mem(obj_data, file_sz, &opts); + if (!obj) { + char err_buf[256]; + + err = -errno; + libbpf_strerror(err, err_buf, sizeof(err_buf)); + p_err("failed to open BPF object file: %s", err_buf); + goto out; + } + + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) { + p_err("ignoring unrecognized internal map '%s'...", + bpf_map__name(map)); + continue; + } + map_cnt++; + } + bpf_object__for_each_program(prog, obj) { + prog_cnt++; + } + + get_header_guard(header_guard, obj_name, "SKEL_H"); + if (use_loader) { + codegen("\ + \n\ + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ \n\ + /* THIS FILE IS AUTOGENERATED BY BPFTOOL! */ \n\ + #ifndef %2$s \n\ + #define %2$s \n\ + \n\ + #include \n\ + \n\ + struct %1$s { \n\ + struct bpf_loader_ctx ctx; \n\ + ", + obj_name, header_guard + ); + } else { + codegen("\ + \n\ + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ \n\ + \n\ + /* THIS FILE IS AUTOGENERATED BY BPFTOOL! 
*/ \n\ + #ifndef %2$s \n\ + #define %2$s \n\ + \n\ + #include \n\ + #include \n\ + #include \n\ + \n\ + struct %1$s { \n\ + struct bpf_object_skeleton *skeleton; \n\ + struct bpf_object *obj; \n\ + ", + obj_name, header_guard + ); + } + + if (map_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + if (use_loader) + printf("\t\tstruct bpf_map_desc %s;\n", ident); + else + printf("\t\tstruct bpf_map *%s;\n", ident); + } + printf("\t} maps;\n"); + } + + if (prog_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + if (use_loader) + printf("\t\tstruct bpf_prog_desc %s;\n", + bpf_program__name(prog)); + else + printf("\t\tstruct bpf_program *%s;\n", + bpf_program__name(prog)); + } + printf("\t} progs;\n"); + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + if (use_loader) + printf("\t\tint %s_fd;\n", + bpf_program__name(prog)); + else + printf("\t\tstruct bpf_link *%s;\n", + bpf_program__name(prog)); + } + printf("\t} links;\n"); + } + + btf = bpf_object__btf(obj); + if (btf) { + err = codegen_datasecs(obj, obj_name); + if (err) + goto out; + } + if (use_loader) { + err = gen_trace(obj, obj_name, header_guard); + goto out; + } + + codegen("\ + \n\ + \n\ + #ifdef __cplusplus \n\ + static inline struct %1$s *open(const struct bpf_object_open_opts *opts = nullptr);\n\ + static inline struct %1$s *open_and_load(); \n\ + static inline int load(struct %1$s *skel); \n\ + static inline int attach(struct %1$s *skel); \n\ + static inline void detach(struct %1$s *skel); \n\ + static inline void destroy(struct %1$s *skel); \n\ + static inline const void *elf_bytes(size_t *sz); \n\ + #endif /* __cplusplus */ \n\ + }; \n\ + \n\ + static void \n\ + %1$s__destroy(struct %1$s *obj) \n\ + { \n\ + if (!obj) \n\ + return; \n\ + if (obj->skeleton) \n\ + bpf_object__destroy_skeleton(obj->skeleton);\n\ + free(obj); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj); \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_opts(const struct bpf_object_open_opts *opts) \n\ + { \n\ + struct %1$s *obj; \n\ + int err; \n\ + \n\ + obj = (struct %1$s *)calloc(1, sizeof(*obj)); \n\ + if (!obj) { \n\ + errno = ENOMEM; \n\ + return NULL; \n\ + } \n\ + \n\ + err = %1$s__create_skeleton(obj); \n\ + if (err) \n\ + goto err_out; \n\ + \n\ + err = bpf_object__open_skeleton(obj->skeleton, opts);\n\ + if (err) \n\ + goto err_out; \n\ + \n\ + return obj; \n\ + err_out: \n\ + %1$s__destroy(obj); \n\ + errno = -err; \n\ + return NULL; \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(void) \n\ + { \n\ + return %1$s__open_opts(NULL); \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__load(struct %1$s *obj) \n\ + { \n\ + return bpf_object__load_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open_and_load(void) \n\ + { \n\ + struct %1$s *obj; \n\ + int err; \n\ + \n\ + obj = %1$s__open(); \n\ + if (!obj) \n\ + return NULL; \n\ + err = %1$s__load(obj); \n\ + if (err) { \n\ + %1$s__destroy(obj); \n\ + errno = -err; \n\ + return NULL; \n\ + } \n\ + return obj; \n\ + } \n\ + \n\ + static inline int \n\ + %1$s__attach(struct %1$s *obj) \n\ + { \n\ + return bpf_object__attach_skeleton(obj->skeleton); \n\ + } \n\ + \n\ + static inline void \n\ + %1$s__detach(struct %1$s *obj) \n\ + { \n\ + bpf_object__detach_skeleton(obj->skeleton); \n\ + } \n\ + ", + obj_name + ); + + codegen("\ + \n\ + \n\ + static inline const void 
*%1$s__elf_bytes(size_t *sz); \n\ + \n\ + static inline int \n\ + %1$s__create_skeleton(struct %1$s *obj) \n\ + { \n\ + struct bpf_object_skeleton *s; \n\ + int err; \n\ + \n\ + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));\n\ + if (!s) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + \n\ + s->sz = sizeof(*s); \n\ + s->name = \"%1$s\"; \n\ + s->obj = &obj->obj; \n\ + ", + obj_name + ); + + codegen_maps_skeleton(obj, map_cnt, true /*mmaped*/); + codegen_progs_skeleton(obj, prog_cnt, true /*populate_links*/); + + codegen("\ + \n\ + \n\ + s->data = (void *)%2$s__elf_bytes(&s->data_sz); \n\ + \n\ + obj->skeleton = s; \n\ + return 0; \n\ + err: \n\ + bpf_object__destroy_skeleton(s); \n\ + return err; \n\ + } \n\ + \n\ + static inline const void *%2$s__elf_bytes(size_t *sz) \n\ + { \n\ + *sz = %1$d; \n\ + return (const void *)\"\\ \n\ + " + , file_sz, obj_name); + + /* embed contents of BPF object file */ + print_hex(obj_data, file_sz); + + codegen("\ + \n\ + \"; \n\ + } \n\ + \n\ + #ifdef __cplusplus \n\ + struct %1$s *%1$s::open(const struct bpf_object_open_opts *opts) { return %1$s__open_opts(opts); }\n\ + struct %1$s *%1$s::open_and_load() { return %1$s__open_and_load(); } \n\ + int %1$s::load(struct %1$s *skel) { return %1$s__load(skel); } \n\ + int %1$s::attach(struct %1$s *skel) { return %1$s__attach(skel); } \n\ + void %1$s::detach(struct %1$s *skel) { %1$s__detach(skel); } \n\ + void %1$s::destroy(struct %1$s *skel) { %1$s__destroy(skel); } \n\ + const void *%1$s::elf_bytes(size_t *sz) { return %1$s__elf_bytes(sz); } \n\ + #endif /* __cplusplus */ \n\ + \n\ + ", + obj_name); + + codegen_asserts(obj, obj_name); + + codegen("\ + \n\ + \n\ + #endif /* %1$s */ \n\ + ", + header_guard); + err = 0; +out: + bpf_object__close(obj); + if (obj_data) + munmap(obj_data, mmap_sz); + close(fd); + return err; +} + +/* Subskeletons are like skeletons, except they don't own the bpf_object, + * associated maps, links, etc. Instead, they know about the existence of + * variables, maps, programs and are able to find their locations + * _at runtime_ from an already loaded bpf_object. + * + * This allows for library-like BPF objects to have userspace counterparts + * with access to their own items without having to know anything about the + * final BPF object that the library was linked into. 
+ */ +static int do_subskeleton(int argc, char **argv) +{ + char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SUBSKEL_H__")]; + size_t i, len, file_sz, map_cnt = 0, prog_cnt = 0, mmap_sz, var_cnt = 0, var_idx = 0; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + char obj_name[MAX_OBJ_NAME_LEN] = "", *obj_data; + struct bpf_object *obj = NULL; + const char *file, *var_name; + char ident[256]; + int fd, err = -1, map_type_id; + const struct bpf_map *map; + struct bpf_program *prog; + struct btf *btf; + const struct btf_type *map_type, *var_type; + const struct btf_var_secinfo *var; + struct stat st; + + if (!REQ_ARGS(1)) { + usage(); + return -1; + } + file = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + if (obj_name[0] != '\0') { + p_err("object name already specified"); + return -1; + } + + strncpy(obj_name, *argv, MAX_OBJ_NAME_LEN - 1); + obj_name[MAX_OBJ_NAME_LEN - 1] = '\0'; + } else { + p_err("unknown arg %s", *argv); + return -1; + } + + NEXT_ARG(); + } + + if (argc) { + p_err("extra unknown arguments"); + return -1; + } + + if (use_loader) { + p_err("cannot use loader for subskeletons"); + return -1; + } + + if (stat(file, &st)) { + p_err("failed to stat() %s: %s", file, strerror(errno)); + return -1; + } + file_sz = st.st_size; + mmap_sz = roundup(file_sz, sysconf(_SC_PAGE_SIZE)); + fd = open(file, O_RDONLY); + if (fd < 0) { + p_err("failed to open() %s: %s", file, strerror(errno)); + return -1; + } + obj_data = mmap(NULL, mmap_sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (obj_data == MAP_FAILED) { + obj_data = NULL; + p_err("failed to mmap() %s: %s", file, strerror(errno)); + goto out; + } + if (obj_name[0] == '\0') + get_obj_name(obj_name, file); + + /* The empty object name allows us to use bpf_map__name and produce + * ELF section names out of it. (".data" instead of "obj.data") + */ + opts.object_name = ""; + obj = bpf_object__open_mem(obj_data, file_sz, &opts); + if (!obj) { + char err_buf[256]; + + libbpf_strerror(errno, err_buf, sizeof(err_buf)); + p_err("failed to open BPF object file: %s", err_buf); + obj = NULL; + goto out; + } + + btf = bpf_object__btf(obj); + if (!btf) { + err = -1; + p_err("need btf type information for %s", obj_name); + goto out; + } + + bpf_object__for_each_program(prog, obj) { + prog_cnt++; + } + + /* First, count how many variables we have to find. + * We need this in advance so the subskel can allocate the right + * amount of storage. + */ + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + + /* Also count all maps that have a name */ + map_cnt++; + + if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + continue; + + map_type_id = bpf_map__btf_value_type_id(map); + if (map_type_id <= 0) { + err = map_type_id; + goto out; + } + map_type = btf__type_by_id(btf, map_type_id); + + var = btf_var_secinfos(map_type); + len = btf_vlen(map_type); + for (i = 0; i < len; i++, var++) { + var_type = btf__type_by_id(btf, var->type); + + if (btf_var(var_type)->linkage == BTF_VAR_STATIC) + continue; + + var_cnt++; + } + } + + get_header_guard(header_guard, obj_name, "SUBSKEL_H"); + codegen("\ + \n\ + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ \n\ + \n\ + /* THIS FILE IS AUTOGENERATED! 
*/ \n\ + #ifndef %2$s \n\ + #define %2$s \n\ + \n\ + #include \n\ + #include \n\ + #include \n\ + \n\ + struct %1$s { \n\ + struct bpf_object *obj; \n\ + struct bpf_object_subskeleton *subskel; \n\ + ", obj_name, header_guard); + + if (map_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_map(map, obj) { + if (!get_map_ident(map, ident, sizeof(ident))) + continue; + printf("\t\tstruct bpf_map *%s;\n", ident); + } + printf("\t} maps;\n"); + } + + if (prog_cnt) { + printf("\tstruct {\n"); + bpf_object__for_each_program(prog, obj) { + printf("\t\tstruct bpf_program *%s;\n", + bpf_program__name(prog)); + } + printf("\t} progs;\n"); + } + + err = codegen_subskel_datasecs(obj, obj_name); + if (err) + goto out; + + /* emit code that will allocate enough storage for all symbols */ + codegen("\ + \n\ + \n\ + #ifdef __cplusplus \n\ + static inline struct %1$s *open(const struct bpf_object *src);\n\ + static inline void destroy(struct %1$s *skel); \n\ + #endif /* __cplusplus */ \n\ + }; \n\ + \n\ + static inline void \n\ + %1$s__destroy(struct %1$s *skel) \n\ + { \n\ + if (!skel) \n\ + return; \n\ + if (skel->subskel) \n\ + bpf_object__destroy_subskeleton(skel->subskel);\n\ + free(skel); \n\ + } \n\ + \n\ + static inline struct %1$s * \n\ + %1$s__open(const struct bpf_object *src) \n\ + { \n\ + struct %1$s *obj; \n\ + struct bpf_object_subskeleton *s; \n\ + int err; \n\ + \n\ + obj = (struct %1$s *)calloc(1, sizeof(*obj)); \n\ + if (!obj) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + s = (struct bpf_object_subskeleton *)calloc(1, sizeof(*s));\n\ + if (!s) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + s->sz = sizeof(*s); \n\ + s->obj = src; \n\ + s->var_skel_sz = sizeof(*s->vars); \n\ + obj->subskel = s; \n\ + \n\ + /* vars */ \n\ + s->var_cnt = %2$d; \n\ + s->vars = (struct bpf_var_skeleton *)calloc(%2$d, sizeof(*s->vars));\n\ + if (!s->vars) { \n\ + err = -ENOMEM; \n\ + goto err; \n\ + } \n\ + ", + obj_name, var_cnt + ); + + /* walk through each symbol and emit the runtime representation */ + bpf_object__for_each_map(map, obj) { + if (!is_internal_mmapable_map(map, ident, sizeof(ident))) + continue; + + map_type_id = bpf_map__btf_value_type_id(map); + if (map_type_id <= 0) + /* skip over internal maps with no type*/ + continue; + + map_type = btf__type_by_id(btf, map_type_id); + var = btf_var_secinfos(map_type); + len = btf_vlen(map_type); + for (i = 0; i < len; i++, var++) { + var_type = btf__type_by_id(btf, var->type); + var_name = btf__name_by_offset(btf, var_type->name_off); + + if (btf_var(var_type)->linkage == BTF_VAR_STATIC) + continue; + + /* Note that we use the dot prefix in .data as the + * field access operator i.e. 
maps%s becomes maps.data + */ + codegen("\ + \n\ + \n\ + s->vars[%3$d].name = \"%1$s\"; \n\ + s->vars[%3$d].map = &obj->maps.%2$s; \n\ + s->vars[%3$d].addr = (void **) &obj->%2$s.%1$s;\n\ + ", var_name, ident, var_idx); + + var_idx++; + } + } + + codegen_maps_skeleton(obj, map_cnt, false /*mmaped*/); + codegen_progs_skeleton(obj, prog_cnt, false /*links*/); + + codegen("\ + \n\ + \n\ + err = bpf_object__open_subskeleton(s); \n\ + if (err) \n\ + goto err; \n\ + \n\ + return obj; \n\ + err: \n\ + %1$s__destroy(obj); \n\ + errno = -err; \n\ + return NULL; \n\ + } \n\ + \n\ + #ifdef __cplusplus \n\ + struct %1$s *%1$s::open(const struct bpf_object *src) { return %1$s__open(src); }\n\ + void %1$s::destroy(struct %1$s *skel) { %1$s__destroy(skel); }\n\ + #endif /* __cplusplus */ \n\ + \n\ + #endif /* %2$s */ \n\ + ", + obj_name, header_guard); + err = 0; +out: + bpf_object__close(obj); + if (obj_data) + munmap(obj_data, mmap_sz); + close(fd); + return err; +} + +static int do_object(int argc, char **argv) +{ + struct bpf_linker *linker; + const char *output_file, *file; + int err = 0; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + output_file = GET_ARG(); + + linker = bpf_linker__new(output_file, NULL); + if (!linker) { + p_err("failed to create BPF linker instance"); + return -1; + } + + while (argc) { + file = GET_ARG(); + + err = bpf_linker__add_file(linker, file, NULL); + if (err) { + p_err("failed to link '%s': %s (%d)", file, strerror(errno), errno); + goto out; + } + } + + err = bpf_linker__finalize(linker); + if (err) { + p_err("failed to finalize ELF file: %s (%d)", strerror(errno), errno); + goto out; + } + + err = 0; +out: + bpf_linker__free(linker); + return err; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s object OUTPUT_FILE INPUT_FILE [INPUT_FILE...]\n" + " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" + " %1$s %2$s subskeleton FILE [name OBJECT_NAME]\n" + " %1$s %2$s min_core_btf INPUT OUTPUT OBJECT [OBJECT...]\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-L|--use-loader} }\n" + "", + bin_name, "gen"); + + return 0; +} + +static int btf_save_raw(const struct btf *btf, const char *path) +{ + const void *data; + FILE *f = NULL; + __u32 data_sz; + int err = 0; + + data = btf__raw_data(btf, &data_sz); + if (!data) + return -ENOMEM; + + f = fopen(path, "wb"); + if (!f) + return -errno; + + if (fwrite(data, 1, data_sz, f) != data_sz) + err = -errno; + + fclose(f); + return err; +} + +struct btfgen_info { + struct btf *src_btf; + struct btf *marked_btf; /* btf structure used to mark used types */ +}; + +static size_t btfgen_hash_fn(long key, void *ctx) +{ + return key; +} + +static bool btfgen_equal_fn(long k1, long k2, void *ctx) +{ + return k1 == k2; +} + +static void btfgen_free_info(struct btfgen_info *info) +{ + if (!info) + return; + + btf__free(info->src_btf); + btf__free(info->marked_btf); + + free(info); +} + +static struct btfgen_info * +btfgen_new_info(const char *targ_btf_path) +{ + struct btfgen_info *info; + int err; + + info = calloc(1, sizeof(*info)); + if (!info) + return NULL; + + info->src_btf = btf__parse(targ_btf_path, NULL); + if (!info->src_btf) { + err = -errno; + p_err("failed parsing '%s' BTF file: %s", targ_btf_path, strerror(errno)); + goto err_out; + } + + info->marked_btf = btf__parse(targ_btf_path, NULL); + if (!info->marked_btf) { + err = -errno; + p_err("failed parsing '%s' BTF file: %s", targ_btf_path, 
strerror(errno)); + goto err_out; + } + + return info; + +err_out: + btfgen_free_info(info); + errno = -err; + return NULL; +} + +#define MARKED UINT32_MAX + +static void btfgen_mark_member(struct btfgen_info *info, int type_id, int idx) +{ + const struct btf_type *t = btf__type_by_id(info->marked_btf, type_id); + struct btf_member *m = btf_members(t) + idx; + + m->name_off = MARKED; +} + +static int +btfgen_mark_type(struct btfgen_info *info, unsigned int type_id, bool follow_pointers) +{ + const struct btf_type *btf_type = btf__type_by_id(info->src_btf, type_id); + struct btf_type *cloned_type; + struct btf_param *param; + struct btf_array *array; + int err, i; + + if (type_id == 0) + return 0; + + /* mark type on cloned BTF as used */ + cloned_type = (struct btf_type *) btf__type_by_id(info->marked_btf, type_id); + cloned_type->name_off = MARKED; + + /* recursively mark other types needed by it */ + switch (btf_kind(btf_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + break; + case BTF_KIND_PTR: + if (follow_pointers) { + err = btfgen_mark_type(info, btf_type->type, follow_pointers); + if (err) + return err; + } + break; + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VOLATILE: + case BTF_KIND_TYPEDEF: + err = btfgen_mark_type(info, btf_type->type, follow_pointers); + if (err) + return err; + break; + case BTF_KIND_ARRAY: + array = btf_array(btf_type); + + /* mark array type */ + err = btfgen_mark_type(info, array->type, follow_pointers); + /* mark array's index type */ + err = err ? : btfgen_mark_type(info, array->index_type, follow_pointers); + if (err) + return err; + break; + case BTF_KIND_FUNC_PROTO: + /* mark ret type */ + err = btfgen_mark_type(info, btf_type->type, follow_pointers); + if (err) + return err; + + /* mark parameters types */ + param = btf_params(btf_type); + for (i = 0; i < btf_vlen(btf_type); i++) { + err = btfgen_mark_type(info, param->type, follow_pointers); + if (err) + return err; + param++; + } + break; + /* tells if some other type needs to be handled */ + default: + p_err("unsupported kind: %s (%d)", btf_kind_str(btf_type), type_id); + return -EINVAL; + } + + return 0; +} + +static int btfgen_record_field_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + struct btf *btf = info->src_btf; + const struct btf_type *btf_type; + struct btf_member *btf_member; + struct btf_array *array; + unsigned int type_id = targ_spec->root_type_id; + int idx, err; + + /* mark root type */ + btf_type = btf__type_by_id(btf, type_id); + err = btfgen_mark_type(info, type_id, false); + if (err) + return err; + + /* mark types for complex types (arrays, unions, structures) */ + for (int i = 1; i < targ_spec->raw_len; i++) { + /* skip typedefs and mods */ + while (btf_is_mod(btf_type) || btf_is_typedef(btf_type)) { + type_id = btf_type->type; + btf_type = btf__type_by_id(btf, type_id); + } + + switch (btf_kind(btf_type)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + idx = targ_spec->raw_spec[i]; + btf_member = btf_members(btf_type) + idx; + + /* mark member */ + btfgen_mark_member(info, type_id, idx); + + /* mark member's type */ + type_id = btf_member->type; + btf_type = btf__type_by_id(btf, type_id); + err = btfgen_mark_type(info, type_id, false); + if (err) + return err; + break; + case BTF_KIND_ARRAY: + array = btf_array(btf_type); + type_id = array->type; + btf_type = btf__type_by_id(btf, type_id); + break; + default: + 
p_err("unsupported kind: %s (%d)", + btf_kind_str(btf_type), btf_type->type); + return -EINVAL; + } + } + + return 0; +} + +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. + * + * The `behind_ptr` argument is used to stop marking of composite types reached + * through a pointer. This way, we can keep BTF size in check while providing + * reasonable match semantics. + */ +static int btfgen_mark_type_match(struct btfgen_info *info, __u32 type_id, bool behind_ptr) +{ + const struct btf_type *btf_type; + struct btf *btf = info->src_btf; + struct btf_type *cloned_type; + int i, err; + + if (type_id == 0) + return 0; + + btf_type = btf__type_by_id(btf, type_id); + /* mark type on cloned BTF as used */ + cloned_type = (struct btf_type *)btf__type_by_id(info->marked_btf, type_id); + cloned_type->name_off = MARKED; + + switch (btf_kind(btf_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(btf_type); + __u16 vlen = btf_vlen(btf_type); + + if (behind_ptr) + break; + + for (i = 0; i < vlen; i++, m++) { + /* mark member */ + btfgen_mark_member(info, type_id, i); + + /* mark member's type */ + err = btfgen_mark_type_match(info, m->type, false); + if (err) + return err; + } + break; + } + case BTF_KIND_CONST: + case BTF_KIND_FWD: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + return btfgen_mark_type_match(info, btf_type->type, behind_ptr); + case BTF_KIND_PTR: + return btfgen_mark_type_match(info, btf_type->type, true); + case BTF_KIND_ARRAY: { + struct btf_array *array; + + array = btf_array(btf_type); + /* mark array type */ + err = btfgen_mark_type_match(info, array->type, false); + /* mark array's index type */ + err = err ? : btfgen_mark_type_match(info, array->index_type, false); + if (err) + return err; + break; + } + case BTF_KIND_FUNC_PROTO: { + __u16 vlen = btf_vlen(btf_type); + struct btf_param *param; + + /* mark ret type */ + err = btfgen_mark_type_match(info, btf_type->type, false); + if (err) + return err; + + /* mark parameters types */ + param = btf_params(btf_type); + for (i = 0; i < vlen; i++) { + err = btfgen_mark_type_match(info, param->type, false); + if (err) + return err; + param++; + } + break; + } + /* tells if some other type needs to be handled */ + default: + p_err("unsupported kind: %s (%d)", btf_kind_str(btf_type), type_id); + return -EINVAL; + } + + return 0; +} + +/* Mark types, members, and member types. Compared to btfgen_record_field_relo, + * this function does not rely on the target spec for inferring members, but + * uses the associated BTF. 
+ */ +static int btfgen_record_type_match_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + return btfgen_mark_type_match(info, targ_spec->root_type_id, false); +} + +static int btfgen_record_type_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + return btfgen_mark_type(info, targ_spec->root_type_id, true); +} + +static int btfgen_record_enumval_relo(struct btfgen_info *info, struct bpf_core_spec *targ_spec) +{ + return btfgen_mark_type(info, targ_spec->root_type_id, false); +} + +static int btfgen_record_reloc(struct btfgen_info *info, struct bpf_core_spec *res) +{ + switch (res->relo_kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_EXISTS: + case BPF_CORE_FIELD_SIGNED: + case BPF_CORE_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: + return btfgen_record_field_relo(info, res); + case BPF_CORE_TYPE_ID_LOCAL: /* BPF_CORE_TYPE_ID_LOCAL doesn't require kernel BTF */ + return 0; + case BPF_CORE_TYPE_ID_TARGET: + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_SIZE: + return btfgen_record_type_relo(info, res); + case BPF_CORE_TYPE_MATCHES: + return btfgen_record_type_match_relo(info, res); + case BPF_CORE_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_VALUE: + return btfgen_record_enumval_relo(info, res); + default: + return -EINVAL; + } +} + +static struct bpf_core_cand_list * +btfgen_find_cands(const struct btf *local_btf, const struct btf *targ_btf, __u32 local_id) +{ + const struct btf_type *local_type; + struct bpf_core_cand_list *cands = NULL; + struct bpf_core_cand local_cand = {}; + size_t local_essent_len; + const char *local_name; + int err; + + local_cand.btf = local_btf; + local_cand.id = local_id; + + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) { + err = -EINVAL; + goto err_out; + } + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) { + err = -EINVAL; + goto err_out; + } + local_essent_len = bpf_core_essential_name_len(local_name); + + cands = calloc(1, sizeof(*cands)); + if (!cands) + return NULL; + + err = bpf_core_add_cands(&local_cand, local_essent_len, targ_btf, "vmlinux", 1, cands); + if (err) + goto err_out; + + return cands; + +err_out: + bpf_core_free_cands(cands); + errno = -err; + return NULL; +} + +/* Record relocation information for a single BPF object */ +static int btfgen_record_obj(struct btfgen_info *info, const char *obj_path) +{ + const struct btf_ext_info_sec *sec; + const struct bpf_core_relo *relo; + const struct btf_ext_info *seg; + struct hashmap_entry *entry; + struct hashmap *cand_cache = NULL; + struct btf_ext *btf_ext = NULL; + unsigned int relo_idx; + struct btf *btf = NULL; + size_t i; + int err; + + btf = btf__parse(obj_path, &btf_ext); + if (!btf) { + err = -errno; + p_err("failed to parse BPF object '%s': %s", obj_path, strerror(errno)); + return err; + } + + if (!btf_ext) { + p_err("failed to parse BPF object '%s': section %s not found", + obj_path, BTF_EXT_ELF_SEC); + err = -EINVAL; + goto out; + } + + if (btf_ext->core_relo_info.len == 0) { + err = 0; + goto out; + } + + cand_cache = hashmap__new(btfgen_hash_fn, btfgen_equal_fn, NULL); + if (IS_ERR(cand_cache)) { + err = PTR_ERR(cand_cache); + goto out; + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + for_each_btf_ext_rec(seg, sec, relo_idx, relo) { + struct bpf_core_spec specs_scratch[3] = {}; + struct bpf_core_relo_res targ_res = {}; + struct bpf_core_cand_list *cands = NULL; + const char *sec_name = 
btf__name_by_offset(btf, sec->sec_name_off); + + if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, relo->type_id, &cands)) { + cands = btfgen_find_cands(btf, info->src_btf, relo->type_id); + if (!cands) { + err = -errno; + goto out; + } + + err = hashmap__set(cand_cache, relo->type_id, cands, + NULL, NULL); + if (err) + goto out; + } + + err = bpf_core_calc_relo_insn(sec_name, relo, relo_idx, btf, cands, + specs_scratch, &targ_res); + if (err) + goto out; + + /* specs_scratch[2] is the target spec */ + err = btfgen_record_reloc(info, &specs_scratch[2]); + if (err) + goto out; + } + } + +out: + btf__free(btf); + btf_ext__free(btf_ext); + + if (!IS_ERR_OR_NULL(cand_cache)) { + hashmap__for_each_entry(cand_cache, entry, i) { + bpf_core_free_cands(entry->pvalue); + } + hashmap__free(cand_cache); + } + + return err; +} + +static int btfgen_remap_id(__u32 *type_id, void *ctx) +{ + unsigned int *ids = ctx; + + *type_id = ids[*type_id]; + + return 0; +} + +/* Generate BTF from relocation information previously recorded */ +static struct btf *btfgen_get_btf(struct btfgen_info *info) +{ + struct btf *btf_new = NULL; + unsigned int *ids = NULL; + unsigned int i, n = btf__type_cnt(info->marked_btf); + int err = 0; + + btf_new = btf__new_empty(); + if (!btf_new) { + err = -errno; + goto err_out; + } + + ids = calloc(n, sizeof(*ids)); + if (!ids) { + err = -errno; + goto err_out; + } + + /* first pass: add all marked types to btf_new and add their new ids to the ids map */ + for (i = 1; i < n; i++) { + const struct btf_type *cloned_type, *type; + const char *name; + int new_id; + + cloned_type = btf__type_by_id(info->marked_btf, i); + + if (cloned_type->name_off != MARKED) + continue; + + type = btf__type_by_id(info->src_btf, i); + + /* add members for struct and union */ + if (btf_is_composite(type)) { + struct btf_member *cloned_m, *m; + unsigned short vlen; + int idx_src; + + name = btf__str_by_offset(info->src_btf, type->name_off); + + if (btf_is_struct(type)) + err = btf__add_struct(btf_new, name, type->size); + else + err = btf__add_union(btf_new, name, type->size); + + if (err < 0) + goto err_out; + new_id = err; + + cloned_m = btf_members(cloned_type); + m = btf_members(type); + vlen = btf_vlen(cloned_type); + for (idx_src = 0; idx_src < vlen; idx_src++, cloned_m++, m++) { + /* add only members that are marked as used */ + if (cloned_m->name_off != MARKED) + continue; + + name = btf__str_by_offset(info->src_btf, m->name_off); + err = btf__add_field(btf_new, name, m->type, + btf_member_bit_offset(cloned_type, idx_src), + btf_member_bitfield_size(cloned_type, idx_src)); + if (err < 0) + goto err_out; + } + } else { + err = btf__add_type(btf_new, info->src_btf, type); + if (err < 0) + goto err_out; + new_id = err; + } + + /* add ID mapping */ + ids[i] = new_id; + } + + /* second pass: fix up type ids */ + for (i = 1; i < btf__type_cnt(btf_new); i++) { + struct btf_type *btf_type = (struct btf_type *) btf__type_by_id(btf_new, i); + + err = btf_type_visit_type_ids(btf_type, btfgen_remap_id, ids); + if (err) + goto err_out; + } + + free(ids); + return btf_new; + +err_out: + btf__free(btf_new); + free(ids); + errno = -err; + return NULL; +} + +/* Create minimized BTF file for a set of BPF objects. + * + * The BTFGen algorithm is divided in two main parts: (1) collect the + * BTF types that are involved in relocations and (2) generate the BTF + * object using the collected types. 
+ * + * In order to collect the types involved in the relocations, we parse + * the BTF and BTF.ext sections of the BPF objects and use + * bpf_core_calc_relo_insn() to get the target specification, this + * indicates how the types and fields are used in a relocation. + * + * Types are recorded in different ways according to the kind of the + * relocation. For field-based relocations only the members that are + * actually used are saved in order to reduce the size of the generated + * BTF file. For type-based relocations empty struct / unions are + * generated and for enum-based relocations the whole type is saved. + * + * The second part of the algorithm generates the BTF object. It creates + * an empty BTF object and fills it with the types recorded in the + * previous step. This function takes care of only adding the structure + * and union members that were marked as used and it also fixes up the + * type IDs on the generated BTF object. + */ +static int minimize_btf(const char *src_btf, const char *dst_btf, const char *objspaths[]) +{ + struct btfgen_info *info; + struct btf *btf_new = NULL; + int err, i; + + info = btfgen_new_info(src_btf); + if (!info) { + err = -errno; + p_err("failed to allocate info structure: %s", strerror(errno)); + goto out; + } + + for (i = 0; objspaths[i] != NULL; i++) { + err = btfgen_record_obj(info, objspaths[i]); + if (err) { + p_err("error recording relocations for %s: %s", objspaths[i], + strerror(errno)); + goto out; + } + } + + btf_new = btfgen_get_btf(info); + if (!btf_new) { + err = -errno; + p_err("error generating BTF: %s", strerror(errno)); + goto out; + } + + err = btf_save_raw(btf_new, dst_btf); + if (err) { + p_err("error saving btf file: %s", strerror(errno)); + goto out; + } + +out: + btf__free(btf_new); + btfgen_free_info(info); + + return err; +} + +static int do_min_core_btf(int argc, char **argv) +{ + const char *input, *output, **objs; + int i, err; + + if (!REQ_ARGS(3)) { + usage(); + return -1; + } + + input = GET_ARG(); + output = GET_ARG(); + + objs = (const char **) calloc(argc + 1, sizeof(*objs)); + if (!objs) { + p_err("failed to allocate array for object names"); + return -ENOMEM; + } + + i = 0; + while (argc) + objs[i++] = GET_ARG(); + + err = minimize_btf(input, output, objs); + free(objs); + return err; +} + +static const struct cmd cmds[] = { + { "object", do_object }, + { "skeleton", do_skeleton }, + { "subskeleton", do_subskeleton }, + { "min_core_btf", do_min_core_btf}, + { "help", do_help }, + { 0 } +}; + +int do_gen(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/iter.c b/third_party/bpftool/src/iter.c new file mode 100644 index 0000000..6b0e520 --- /dev/null +++ b/third_party/bpftool/src/iter.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2020 Facebook + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include + +#include "main.h" + +static int do_pin(int argc, char **argv) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, iter_opts); + union bpf_iter_link_info linfo; + const char *objfile, *path; + struct bpf_program *prog; + struct bpf_object *obj; + struct bpf_link *link; + int err = -1, map_fd = -1; + + if (!REQ_ARGS(2)) + usage(); + + objfile = GET_ARG(); + path = GET_ARG(); + + /* optional arguments */ + if (argc) { + if (is_prefix(*argv, "map")) { + NEXT_ARG(); + + if (!REQ_ARGS(2)) { + p_err("incorrect map spec"); + return -1; + } + + map_fd = map_parse_fd(&argc, 
&argv); + if (map_fd < 0) + return -1; + + memset(&linfo, 0, sizeof(linfo)); + linfo.map.map_fd = map_fd; + iter_opts.link_info = &linfo; + iter_opts.link_info_len = sizeof(linfo); + } + } + + obj = bpf_object__open(objfile); + if (!obj) { + err = -errno; + p_err("can't open objfile %s", objfile); + goto close_map_fd; + } + + err = bpf_object__load(obj); + if (err) { + p_err("can't load objfile %s", objfile); + goto close_obj; + } + + prog = bpf_object__next_program(obj, NULL); + if (!prog) { + err = -errno; + p_err("can't find bpf program in objfile %s", objfile); + goto close_obj; + } + + link = bpf_program__attach_iter(prog, &iter_opts); + if (!link) { + err = -errno; + p_err("attach_iter failed for program %s", + bpf_program__name(prog)); + goto close_obj; + } + + err = mount_bpffs_for_pin(path, false); + if (err) + goto close_link; + + err = bpf_link__pin(link, path); + if (err) { + p_err("pin_iter failed for program %s to path %s", + bpf_program__name(prog), path); + goto close_link; + } + +close_link: + bpf_link__destroy(link); +close_obj: + bpf_object__close(obj); +close_map_fd: + if (map_fd >= 0) + close(map_fd); + return err; +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %1$s %2$s pin OBJ PATH [map MAP]\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_OPTIONS " }\n" + "", + bin_name, "iter"); + + return 0; +} + +static const struct cmd cmds[] = { + { "help", do_help }, + { "pin", do_pin }, + { 0 } +}; + +int do_iter(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/jit_disasm.c b/third_party/bpftool/src/jit_disasm.c new file mode 100644 index 0000000..7b8d9ec --- /dev/null +++ b/third_party/bpftool/src/jit_disasm.c @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * Based on: + * + * Minimal BPF JIT image disassembler + * + * Disassembles BPF JIT compiler emitted opcodes back to asm insn's for + * debugging or verification purposes. + * + * Copyright 2013 Daniel Borkmann + * Licensed under the GNU General Public License, version 2.0 (GPLv2) + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_LLVM_SUPPORT +#include +#include +#include +#include +#endif + +#ifdef HAVE_LIBBFD_SUPPORT +#include +#include +#include +#endif + +#include "json_writer.h" +#include "main.h" + +static int oper_count; + +#ifdef HAVE_LLVM_SUPPORT +#define DISASM_SPACER + +typedef LLVMDisasmContextRef disasm_ctx_t; + +static int printf_json(char *s) +{ + s = strtok(s, " \t"); + jsonw_string_field(json_wtr, "operation", s); + + jsonw_name(json_wtr, "operands"); + jsonw_start_array(json_wtr); + oper_count = 1; + + while ((s = strtok(NULL, " \t,()")) != 0) { + jsonw_string(json_wtr, s); + oper_count++; + } + return 0; +} + +/* This callback to set the ref_type is necessary to have the LLVM disassembler + * print PC-relative addresses instead of byte offsets for branch instruction + * targets. 
+ */ +static const char * +symbol_lookup_callback(__maybe_unused void *disasm_info, + __maybe_unused uint64_t ref_value, + uint64_t *ref_type, __maybe_unused uint64_t ref_PC, + __maybe_unused const char **ref_name) +{ + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + return NULL; +} + +static int +init_context(disasm_ctx_t *ctx, const char *arch, + __maybe_unused const char *disassembler_options, + __maybe_unused unsigned char *image, __maybe_unused ssize_t len) +{ + char *triple; + + if (arch) + triple = LLVMNormalizeTargetTriple(arch); + else + triple = LLVMGetDefaultTargetTriple(); + if (!triple) { + p_err("Failed to retrieve triple"); + return -1; + } + *ctx = LLVMCreateDisasm(triple, NULL, 0, NULL, symbol_lookup_callback); + LLVMDisposeMessage(triple); + + if (!*ctx) { + p_err("Failed to create disassembler"); + return -1; + } + + return 0; +} + +static void destroy_context(disasm_ctx_t *ctx) +{ + LLVMDisposeMessage(*ctx); +} + +static int +disassemble_insn(disasm_ctx_t *ctx, unsigned char *image, ssize_t len, int pc) +{ + char buf[256]; + int count; + + count = LLVMDisasmInstruction(*ctx, image + pc, len - pc, pc, + buf, sizeof(buf)); + if (json_output) + printf_json(buf); + else + printf("%s", buf); + + return count; +} + +int disasm_init(void) +{ + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + return 0; +} +#endif /* HAVE_LLVM_SUPPORT */ + +#ifdef HAVE_LIBBFD_SUPPORT +#define DISASM_SPACER "\t" + +typedef struct { + struct disassemble_info *info; + disassembler_ftype disassemble; + bfd *bfdf; +} disasm_ctx_t; + +static int get_exec_path(char *tpath, size_t size) +{ + const char *path = "/proc/self/exe"; + ssize_t len; + + len = readlink(path, tpath, size - 1); + if (len <= 0) + return -1; + + tpath[len] = 0; + + return 0; +} + +static int printf_json(void *out, const char *fmt, va_list ap) +{ + char *s; + int err; + + err = vasprintf(&s, fmt, ap); + if (err < 0) + return -1; + + if (!oper_count) { + int i; + + /* Strip trailing spaces */ + i = strlen(s) - 1; + while (s[i] == ' ') + s[i--] = '\0'; + + jsonw_string_field(json_wtr, "operation", s); + jsonw_name(json_wtr, "operands"); + jsonw_start_array(json_wtr); + oper_count++; + } else if (!strcmp(fmt, ",")) { + /* Skip */ + } else { + jsonw_string(json_wtr, s); + oper_count++; + } + free(s); + return 0; +} + +static int fprintf_json(void *out, const char *fmt, ...) +{ + va_list ap; + int r; + + va_start(ap, fmt); + r = printf_json(out, fmt, ap); + va_end(ap); + + return r; +} + +static int fprintf_json_styled(void *out, + enum disassembler_style style __maybe_unused, + const char *fmt, ...) 
+{ + va_list ap; + int r; + + va_start(ap, fmt); + r = printf_json(out, fmt, ap); + va_end(ap); + + return r; +} + +static int init_context(disasm_ctx_t *ctx, const char *arch, + const char *disassembler_options, + unsigned char *image, ssize_t len) +{ + struct disassemble_info *info; + char tpath[PATH_MAX]; + bfd *bfdf; + + memset(tpath, 0, sizeof(tpath)); + if (get_exec_path(tpath, sizeof(tpath))) { + p_err("failed to create disassembler (get_exec_path)"); + return -1; + } + + ctx->bfdf = bfd_openr(tpath, NULL); + if (!ctx->bfdf) { + p_err("failed to create disassembler (bfd_openr)"); + return -1; + } + if (!bfd_check_format(ctx->bfdf, bfd_object)) { + p_err("failed to create disassembler (bfd_check_format)"); + goto err_close; + } + bfdf = ctx->bfdf; + + ctx->info = malloc(sizeof(struct disassemble_info)); + if (!ctx->info) { + p_err("mem alloc failed"); + goto err_close; + } + info = ctx->info; + + if (json_output) + init_disassemble_info_compat(info, stdout, + (fprintf_ftype) fprintf_json, + fprintf_json_styled); + else + init_disassemble_info_compat(info, stdout, + (fprintf_ftype) fprintf, + fprintf_styled); + + /* Update architecture info for offload. */ + if (arch) { + const bfd_arch_info_type *inf = bfd_scan_arch(arch); + + if (inf) { + bfdf->arch_info = inf; + } else { + p_err("No libbfd support for %s", arch); + goto err_free; + } + } + + info->arch = bfd_get_arch(bfdf); + info->mach = bfd_get_mach(bfdf); + if (disassembler_options) + info->disassembler_options = disassembler_options; + info->buffer = image; + info->buffer_length = len; + + disassemble_init_for_target(info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + ctx->disassemble = disassembler(info->arch, + bfd_big_endian(bfdf), + info->mach, + bfdf); +#else + ctx->disassemble = disassembler(bfdf); +#endif + if (!ctx->disassemble) { + p_err("failed to create disassembler"); + goto err_free; + } + return 0; + +err_free: + free(info); +err_close: + bfd_close(ctx->bfdf); + return -1; +} + +static void destroy_context(disasm_ctx_t *ctx) +{ + free(ctx->info); + bfd_close(ctx->bfdf); +} + +static int +disassemble_insn(disasm_ctx_t *ctx, __maybe_unused unsigned char *image, + __maybe_unused ssize_t len, int pc) +{ + return ctx->disassemble(pc, ctx->info); +} + +int disasm_init(void) +{ + bfd_init(); + return 0; +} +#endif /* HAVE_LIBBPFD_SUPPORT */ + +int disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch, const char *disassembler_options, + const struct btf *btf, + const struct bpf_prog_linfo *prog_linfo, + __u64 func_ksym, unsigned int func_idx, + bool linum) +{ + const struct bpf_line_info *linfo = NULL; + unsigned int nr_skip = 0; + int count, i, pc = 0; + disasm_ctx_t ctx; + + if (!len) + return -1; + + if (init_context(&ctx, arch, disassembler_options, image, len)) + return -1; + + if (json_output) + jsonw_start_array(json_wtr); + do { + if (prog_linfo) { + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + func_ksym + pc, + func_idx, + nr_skip); + if (linfo) + nr_skip++; + } + + if (json_output) { + jsonw_start_object(json_wtr); + oper_count = 0; + if (linfo) + btf_dump_linfo_json(btf, linfo, linum); + jsonw_name(json_wtr, "pc"); + jsonw_printf(json_wtr, "\"0x%x\"", pc); + } else { + if (linfo) + btf_dump_linfo_plain(btf, linfo, "; ", + linum); + printf("%4x:" DISASM_SPACER, pc); + } + + count = disassemble_insn(&ctx, image, len, pc); + + if (json_output) { + /* Operand array, was started in fprintf_json. 
Before + * that, make sure we have a _null_ value if no operand + * other than operation code was present. + */ + if (oper_count == 1) + jsonw_null(json_wtr); + jsonw_end_array(json_wtr); + } + + if (opcodes) { + if (json_output) { + jsonw_name(json_wtr, "opcodes"); + jsonw_start_array(json_wtr); + for (i = 0; i < count; ++i) + jsonw_printf(json_wtr, "\"0x%02hhx\"", + (uint8_t)image[pc + i]); + jsonw_end_array(json_wtr); + } else { + printf("\n\t"); + for (i = 0; i < count; ++i) + printf("%02x ", + (uint8_t)image[pc + i]); + } + } + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + + pc += count; + } while (count > 0 && pc < len); + if (json_output) + jsonw_end_array(json_wtr); + + destroy_context(&ctx); + + return 0; +} diff --git a/third_party/bpftool/src/json_writer.c b/third_party/bpftool/src/json_writer.c new file mode 100644 index 0000000..be37961 --- /dev/null +++ b/third_party/bpftool/src/json_writer.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause) +/* + * Simple streaming JSON writer + * + * This takes care of the annoying bits of JSON syntax like the commas + * after elements + * + * Authors: Stephen Hemminger + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "json_writer.h" + +struct json_writer { + FILE *out; /* output file */ + unsigned depth; /* nesting */ + bool pretty; /* optional whitepace */ + char sep; /* either nul or comma */ +}; + +/* indentation for pretty print */ +static void jsonw_indent(json_writer_t *self) +{ + unsigned i; + for (i = 0; i < self->depth; ++i) + fputs(" ", self->out); +} + +/* end current line and indent if pretty printing */ +static void jsonw_eol(json_writer_t *self) +{ + if (!self->pretty) + return; + + putc('\n', self->out); + jsonw_indent(self); +} + +/* If current object is not empty print a comma */ +static void jsonw_eor(json_writer_t *self) +{ + if (self->sep != '\0') + putc(self->sep, self->out); + self->sep = ','; +} + + +/* Output JSON encoded string */ +/* Handles C escapes, does not do Unicode */ +static void jsonw_puts(json_writer_t *self, const char *str) +{ + putc('"', self->out); + for (; *str; ++str) + switch (*str) { + case '\t': + fputs("\\t", self->out); + break; + case '\n': + fputs("\\n", self->out); + break; + case '\r': + fputs("\\r", self->out); + break; + case '\f': + fputs("\\f", self->out); + break; + case '\b': + fputs("\\b", self->out); + break; + case '\\': + fputs("\\\\", self->out); + break; + case '"': + fputs("\\\"", self->out); + break; + default: + putc(*str, self->out); + } + putc('"', self->out); +} + +/* Create a new JSON stream */ +json_writer_t *jsonw_new(FILE *f) +{ + json_writer_t *self = malloc(sizeof(*self)); + if (self) { + self->out = f; + self->depth = 0; + self->pretty = false; + self->sep = '\0'; + } + return self; +} + +/* End output to JSON stream */ +void jsonw_destroy(json_writer_t **self_p) +{ + json_writer_t *self = *self_p; + + assert(self->depth == 0); + fputs("\n", self->out); + fflush(self->out); + free(self); + *self_p = NULL; +} + +void jsonw_pretty(json_writer_t *self, bool on) +{ + self->pretty = on; +} + +void jsonw_reset(json_writer_t *self) +{ + assert(self->depth == 0); + self->sep = '\0'; +} + +/* Basic blocks */ +static void jsonw_begin(json_writer_t *self, int c) +{ + jsonw_eor(self); + putc(c, self->out); + ++self->depth; + self->sep = '\0'; +} + +static void jsonw_end(json_writer_t *self, int c) +{ + assert(self->depth > 0); + + --self->depth; + if (self->sep != '\0') + 
jsonw_eol(self); + putc(c, self->out); + self->sep = ','; +} + + +/* Add a JSON property name */ +void jsonw_name(json_writer_t *self, const char *name) +{ + jsonw_eor(self); + jsonw_eol(self); + self->sep = '\0'; + jsonw_puts(self, name); + putc(':', self->out); + if (self->pretty) + putc(' ', self->out); +} + +void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap) +{ + jsonw_eor(self); + putc('"', self->out); + vfprintf(self->out, fmt, ap); + putc('"', self->out); +} + +void jsonw_printf(json_writer_t *self, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + jsonw_eor(self); + vfprintf(self->out, fmt, ap); + va_end(ap); +} + +/* Collections */ +void jsonw_start_object(json_writer_t *self) +{ + jsonw_begin(self, '{'); +} + +void jsonw_end_object(json_writer_t *self) +{ + jsonw_end(self, '}'); +} + +void jsonw_start_array(json_writer_t *self) +{ + jsonw_begin(self, '['); +} + +void jsonw_end_array(json_writer_t *self) +{ + jsonw_end(self, ']'); +} + +/* JSON value types */ +void jsonw_string(json_writer_t *self, const char *value) +{ + jsonw_eor(self); + jsonw_puts(self, value); +} + +void jsonw_bool(json_writer_t *self, bool val) +{ + jsonw_printf(self, "%s", val ? "true" : "false"); +} + +void jsonw_null(json_writer_t *self) +{ + jsonw_printf(self, "null"); +} + +void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num) +{ + jsonw_printf(self, fmt, num); +} + +#ifdef notused +void jsonw_float(json_writer_t *self, double num) +{ + jsonw_printf(self, "%g", num); +} +#endif + +void jsonw_hu(json_writer_t *self, unsigned short num) +{ + jsonw_printf(self, "%hu", num); +} + +void jsonw_uint(json_writer_t *self, uint64_t num) +{ + jsonw_printf(self, "%"PRIu64, num); +} + +void jsonw_lluint(json_writer_t *self, unsigned long long int num) +{ + jsonw_printf(self, "%llu", num); +} + +void jsonw_int(json_writer_t *self, int64_t num) +{ + jsonw_printf(self, "%"PRId64, num); +} + +/* Basic name/value objects */ +void jsonw_string_field(json_writer_t *self, const char *prop, const char *val) +{ + jsonw_name(self, prop); + jsonw_string(self, val); +} + +void jsonw_bool_field(json_writer_t *self, const char *prop, bool val) +{ + jsonw_name(self, prop); + jsonw_bool(self, val); +} + +#ifdef notused +void jsonw_float_field(json_writer_t *self, const char *prop, double val) +{ + jsonw_name(self, prop); + jsonw_float(self, val); +} +#endif + +void jsonw_float_field_fmt(json_writer_t *self, + const char *prop, + const char *fmt, + double val) +{ + jsonw_name(self, prop); + jsonw_float_fmt(self, fmt, val); +} + +void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num) +{ + jsonw_name(self, prop); + jsonw_uint(self, num); +} + +void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num) +{ + jsonw_name(self, prop); + jsonw_hu(self, num); +} + +void jsonw_lluint_field(json_writer_t *self, + const char *prop, + unsigned long long int num) +{ + jsonw_name(self, prop); + jsonw_lluint(self, num); +} + +void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num) +{ + jsonw_name(self, prop); + jsonw_int(self, num); +} + +void jsonw_null_field(json_writer_t *self, const char *prop) +{ + jsonw_name(self, prop); + jsonw_null(self); +} + +#ifdef TEST +int main(int argc, char **argv) +{ + json_writer_t *wr = jsonw_new(stdout); + + jsonw_start_object(wr); + jsonw_pretty(wr, true); + jsonw_name(wr, "Vyatta"); + jsonw_start_object(wr); + jsonw_string_field(wr, "url", "http://vyatta.com"); + jsonw_uint_field(wr, "downloads", 
2000000ul); + jsonw_float_field(wr, "stock", 8.16); + + jsonw_name(wr, "ARGV"); + jsonw_start_array(wr); + while (--argc) + jsonw_string(wr, *++argv); + jsonw_end_array(wr); + + jsonw_name(wr, "empty"); + jsonw_start_array(wr); + jsonw_end_array(wr); + + jsonw_name(wr, "NIL"); + jsonw_start_object(wr); + jsonw_end_object(wr); + + jsonw_null_field(wr, "my_null"); + + jsonw_name(wr, "special chars"); + jsonw_start_array(wr); + jsonw_string_field(wr, "slash", "/"); + jsonw_string_field(wr, "newline", "\n"); + jsonw_string_field(wr, "tab", "\t"); + jsonw_string_field(wr, "ff", "\f"); + jsonw_string_field(wr, "quote", "\""); + jsonw_string_field(wr, "tick", "\'"); + jsonw_string_field(wr, "backslash", "\\"); + jsonw_end_array(wr); + + jsonw_end_object(wr); + + jsonw_end_object(wr); + jsonw_destroy(&wr); + return 0; +} + +#endif diff --git a/third_party/bpftool/src/json_writer.h b/third_party/bpftool/src/json_writer.h new file mode 100644 index 0000000..5aaffd3 --- /dev/null +++ b/third_party/bpftool/src/json_writer.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Simple streaming JSON writer + * + * This takes care of the annoying bits of JSON syntax like the commas + * after elements + * + * Authors: Stephen Hemminger + */ + +#ifndef _JSON_WRITER_H_ +#define _JSON_WRITER_H_ + +#include +#include +#include +#include +#include + +/* Opaque class structure */ +typedef struct json_writer json_writer_t; + +/* Create a new JSON stream */ +json_writer_t *jsonw_new(FILE *f); +/* End output to JSON stream */ +void jsonw_destroy(json_writer_t **self_p); + +/* Cause output to have pretty whitespace */ +void jsonw_pretty(json_writer_t *self, bool on); + +/* Reset separator to create new JSON */ +void jsonw_reset(json_writer_t *self); + +/* Add property name */ +void jsonw_name(json_writer_t *self, const char *name); + +/* Add value */ +void __printf(2, 0) jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, + va_list ap); +void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...); +void jsonw_string(json_writer_t *self, const char *value); +void jsonw_bool(json_writer_t *self, bool value); +void jsonw_float(json_writer_t *self, double number); +void jsonw_float_fmt(json_writer_t *self, const char *fmt, double num); +void jsonw_uint(json_writer_t *self, uint64_t number); +void jsonw_hu(json_writer_t *self, unsigned short number); +void jsonw_int(json_writer_t *self, int64_t number); +void jsonw_null(json_writer_t *self); +void jsonw_lluint(json_writer_t *self, unsigned long long int num); + +/* Useful Combinations of name and value */ +void jsonw_string_field(json_writer_t *self, const char *prop, const char *val); +void jsonw_bool_field(json_writer_t *self, const char *prop, bool value); +void jsonw_float_field(json_writer_t *self, const char *prop, double num); +void jsonw_uint_field(json_writer_t *self, const char *prop, uint64_t num); +void jsonw_hu_field(json_writer_t *self, const char *prop, unsigned short num); +void jsonw_int_field(json_writer_t *self, const char *prop, int64_t num); +void jsonw_null_field(json_writer_t *self, const char *prop); +void jsonw_lluint_field(json_writer_t *self, const char *prop, + unsigned long long int num); +void jsonw_float_field_fmt(json_writer_t *self, const char *prop, + const char *fmt, double val); + +/* Collections */ +void jsonw_start_object(json_writer_t *self); +void jsonw_end_object(json_writer_t *self); + +void jsonw_start_array(json_writer_t *self); +void jsonw_end_array(json_writer_t 
*self); + +/* Override default exception handling */ +typedef void (jsonw_err_handler_fn)(const char *); + +#endif /* _JSON_WRITER_H_ */ diff --git a/third_party/bpftool/src/kernel/bpf/disasm.c b/third_party/bpftool/src/kernel/bpf/disasm.c new file mode 100644 index 0000000..7b4afb7 --- /dev/null +++ b/third_party/bpftool/src/kernel/bpf/disasm.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + */ + +#include + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (!insn->src_reg && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) { + const char *res; + + res = cbs->cb_call(cbs->private_data, insn); + if (res) + return res; + } + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + snprintf(buff, len, "kernel-function"); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_JMP32] = "jmp32", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "xor", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_t verbose, + void *private_data, + const struct bpf_insn *insn) +{ + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + bool allow_ptr_leaks) +{ + const bpf_insn_print_t verbose = cbs->cb_print; + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, cbs->private_data, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 'w' : 'r', + insn->src_reg); + } else { + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == BPF_ADD || insn->imm == BPF_AND || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", + insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? 
"64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); + } else { + verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); + } + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; + char tmp[64]; + + if (is_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(cbs->private_data, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); + } else { + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP32 || class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(cbs->private_data, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(cbs->private_data, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(cbs->private_data, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', + insn->src_reg, insn->off); + } else { + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 
'w' : 'r',
+            insn->dst_reg,
+            bpf_jmp_string[BPF_OP(insn->code) >> 4],
+            insn->imm, insn->off);
+        }
+    } else {
+        verbose(cbs->private_data, "(%02x) %s\n",
+            insn->code, bpf_class_string[class]);
+    }
+}
diff --git a/third_party/bpftool/src/kernel/bpf/disasm.h b/third_party/bpftool/src/kernel/bpf/disasm.h
new file mode 100644
index 0000000..a4b0407
--- /dev/null
+++ b/third_party/bpftool/src/kernel/bpf/disasm.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+#ifndef __KERNEL__
+#include <stdio.h>
+#include <string.h>
+#endif
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data,
+                        const char *, ...);
+typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
+                          const struct bpf_insn *insn);
+typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
+                        const struct bpf_insn *insn,
+                        __u64 full_imm);
+
+struct bpf_insn_cbs {
+    bpf_insn_print_t cb_print;
+    bpf_insn_revmap_call_t cb_call;
+    bpf_insn_print_imm_t cb_imm;
+    void *private_data;
+};
+
+void print_bpf_insn(const struct bpf_insn_cbs *cbs,
+            const struct bpf_insn *insn,
+            bool allow_ptr_leaks);
+#endif
diff --git a/third_party/bpftool/src/link.c b/third_party/bpftool/src/link.c
new file mode 100644
index 0000000..2d78607
--- /dev/null
+++ b/third_party/bpftool/src/link.c
@@ -0,0 +1,588 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2020 Facebook */
+
+#include <errno.h>
+#include <linux/err.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+#include <bpf/hashmap.h>
+
+#include "json_writer.h"
+#include "main.h"
+
+static struct hashmap *link_table;
+
+static int link_parse_fd(int *argc, char ***argv)
+{
+    int fd;
+
+    if (is_prefix(**argv, "id")) {
+        unsigned int id;
+        char *endptr;
+
+        NEXT_ARGP();
+
+        id = strtoul(**argv, &endptr, 0);
+        if (*endptr) {
+            p_err("can't parse %s as ID", **argv);
+            return -1;
+        }
+        NEXT_ARGP();
+
+        fd = bpf_link_get_fd_by_id(id);
+        if (fd < 0)
+            p_err("failed to get link with ID %d: %s", id, strerror(errno));
+        return fd;
+    } else if (is_prefix(**argv, "pinned")) {
+        char *path;
+
+        NEXT_ARGP();
+
+        path = **argv;
+        NEXT_ARGP();
+
+        return open_obj_pinned_any(path, BPF_OBJ_LINK);
+    }
+
+    p_err("expected 'id' or 'pinned', got: '%s'?", **argv);
+    return -1;
+}
+
+static void
+show_link_header_json(struct bpf_link_info *info, json_writer_t *wtr)
+{
+    const char *link_type_str;
+
+    jsonw_uint_field(wtr, "id", info->id);
+    link_type_str = libbpf_bpf_link_type_str(info->type);
+    if (link_type_str)
+        jsonw_string_field(wtr, "type", link_type_str);
+    else
+        jsonw_uint_field(wtr, "type", info->type);
+
+    jsonw_uint_field(json_wtr, "prog_id", info->prog_id);
+}
+
+static void show_link_attach_type_json(__u32 attach_type, json_writer_t *wtr)
+{
+    const char *attach_type_str;
+
+    attach_type_str = libbpf_bpf_attach_type_str(attach_type);
+    if (attach_type_str)
+        jsonw_string_field(wtr, "attach_type", attach_type_str);
+    else
+        jsonw_uint_field(wtr, "attach_type", attach_type);
+}
+
+static bool is_iter_map_target(const char *target_name)
+{
+    return strcmp(target_name, "bpf_map_elem") == 0 ||
+           strcmp(target_name, "bpf_sk_storage_map") == 0;
+}
+
+static bool is_iter_cgroup_target(const char *target_name)
+{
+    return strcmp(target_name, "cgroup") == 0;
+}
+
+static const char *cgroup_order_string(__u32 order) +{ + switch (order) { + case BPF_CGROUP_ITER_ORDER_UNSPEC: + return "order_unspec"; + case BPF_CGROUP_ITER_SELF_ONLY: + return "self_only"; + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + return "descendants_pre"; + case BPF_CGROUP_ITER_DESCENDANTS_POST: + return "descendants_post"; + case BPF_CGROUP_ITER_ANCESTORS_UP: + return "ancestors_up"; + default: /* won't happen */ + return "unknown"; + } +} + +static bool is_iter_task_target(const char *target_name) +{ + return strcmp(target_name, "task") == 0 || + strcmp(target_name, "task_file") == 0 || + strcmp(target_name, "task_vma") == 0; +} + +static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr) +{ + const char *target_name = u64_to_ptr(info->iter.target_name); + + jsonw_string_field(wtr, "target_name", target_name); + + if (is_iter_map_target(target_name)) + jsonw_uint_field(wtr, "map_id", info->iter.map.map_id); + else if (is_iter_task_target(target_name)) { + if (info->iter.task.tid) + jsonw_uint_field(wtr, "tid", info->iter.task.tid); + else if (info->iter.task.pid) + jsonw_uint_field(wtr, "pid", info->iter.task.pid); + } + + if (is_iter_cgroup_target(target_name)) { + jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id); + jsonw_string_field(wtr, "order", + cgroup_order_string(info->iter.cgroup.order)); + } +} + +void netfilter_dump_json(const struct bpf_link_info *info, json_writer_t *wtr) +{ + jsonw_uint_field(json_wtr, "pf", + info->netfilter.pf); + jsonw_uint_field(json_wtr, "hook", + info->netfilter.hooknum); + jsonw_int_field(json_wtr, "prio", + info->netfilter.priority); + jsonw_uint_field(json_wtr, "flags", + info->netfilter.flags); +} + +static int get_prog_info(int prog_id, struct bpf_prog_info *info) +{ + __u32 len = sizeof(*info); + int err, prog_fd; + + prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (prog_fd < 0) + return prog_fd; + + memset(info, 0, sizeof(*info)); + err = bpf_prog_get_info_by_fd(prog_fd, info, &len); + if (err) + p_err("can't get prog info: %s", strerror(errno)); + close(prog_fd); + return err; +} + +static int show_link_close_json(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + const char *prog_type_str; + int err; + + jsonw_start_object(json_wtr); + + show_link_header_json(info, json_wtr); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + jsonw_string_field(json_wtr, "tp_name", + u64_to_ptr(info->raw_tracepoint.tp_name)); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + prog_type_str = libbpf_bpf_prog_type_str(prog_info.type); + /* libbpf will return NULL for variants unknown to it. 
*/ + if (prog_type_str) + jsonw_string_field(json_wtr, "prog_type", prog_type_str); + else + jsonw_uint_field(json_wtr, "prog_type", prog_info.type); + + show_link_attach_type_json(info->tracing.attach_type, + json_wtr); + jsonw_uint_field(json_wtr, "target_obj_id", info->tracing.target_obj_id); + jsonw_uint_field(json_wtr, "target_btf_id", info->tracing.target_btf_id); + break; + case BPF_LINK_TYPE_CGROUP: + jsonw_lluint_field(json_wtr, "cgroup_id", + info->cgroup.cgroup_id); + show_link_attach_type_json(info->cgroup.attach_type, json_wtr); + break; + case BPF_LINK_TYPE_ITER: + show_iter_json(info, json_wtr); + break; + case BPF_LINK_TYPE_NETNS: + jsonw_uint_field(json_wtr, "netns_ino", + info->netns.netns_ino); + show_link_attach_type_json(info->netns.attach_type, json_wtr); + break; + case BPF_LINK_TYPE_NETFILTER: + netfilter_dump_json(info, json_wtr); + break; + case BPF_LINK_TYPE_STRUCT_OPS: + jsonw_uint_field(json_wtr, "map_id", + info->struct_ops.map_id); + break; + default: + break; + } + + if (!hashmap__empty(link_table)) { + struct hashmap_entry *entry; + + jsonw_name(json_wtr, "pinned"); + jsonw_start_array(json_wtr); + hashmap__for_each_key_entry(link_table, entry, info->id) + jsonw_string(json_wtr, entry->pvalue); + jsonw_end_array(json_wtr); + } + + emit_obj_refs_json(refs_table, info->id, json_wtr); + + jsonw_end_object(json_wtr); + + return 0; +} + +static void show_link_header_plain(struct bpf_link_info *info) +{ + const char *link_type_str; + + printf("%u: ", info->id); + link_type_str = libbpf_bpf_link_type_str(info->type); + if (link_type_str) + printf("%s ", link_type_str); + else + printf("type %u ", info->type); + + if (info->type == BPF_LINK_TYPE_STRUCT_OPS) + printf("map %u ", info->struct_ops.map_id); + else + printf("prog %u ", info->prog_id); +} + +static void show_link_attach_type_plain(__u32 attach_type) +{ + const char *attach_type_str; + + attach_type_str = libbpf_bpf_attach_type_str(attach_type); + if (attach_type_str) + printf("attach_type %s ", attach_type_str); + else + printf("attach_type %u ", attach_type); +} + +static void show_iter_plain(struct bpf_link_info *info) +{ + const char *target_name = u64_to_ptr(info->iter.target_name); + + printf("target_name %s ", target_name); + + if (is_iter_map_target(target_name)) + printf("map_id %u ", info->iter.map.map_id); + else if (is_iter_task_target(target_name)) { + if (info->iter.task.tid) + printf("tid %u ", info->iter.task.tid); + else if (info->iter.task.pid) + printf("pid %u ", info->iter.task.pid); + } + + if (is_iter_cgroup_target(target_name)) { + printf("cgroup_id %llu ", info->iter.cgroup.cgroup_id); + printf("order %s ", + cgroup_order_string(info->iter.cgroup.order)); + } +} + +static const char * const pf2name[] = { + [NFPROTO_INET] = "inet", + [NFPROTO_IPV4] = "ip", + [NFPROTO_ARP] = "arp", + [NFPROTO_NETDEV] = "netdev", + [NFPROTO_BRIDGE] = "bridge", + [NFPROTO_IPV6] = "ip6", +}; + +static const char * const inethook2name[] = { + [NF_INET_PRE_ROUTING] = "prerouting", + [NF_INET_LOCAL_IN] = "input", + [NF_INET_FORWARD] = "forward", + [NF_INET_LOCAL_OUT] = "output", + [NF_INET_POST_ROUTING] = "postrouting", +}; + +static const char * const arphook2name[] = { + [NF_ARP_IN] = "input", + [NF_ARP_OUT] = "output", +}; + +void netfilter_dump_plain(const struct bpf_link_info *info) +{ + const char *hookname = NULL, *pfname = NULL; + unsigned int hook = info->netfilter.hooknum; + unsigned int pf = info->netfilter.pf; + + if (pf < ARRAY_SIZE(pf2name)) + pfname = pf2name[pf]; + + switch (pf) { + case 
NFPROTO_BRIDGE: /* bridge shares numbers with enum nf_inet_hooks */ + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + if (hook < ARRAY_SIZE(inethook2name)) + hookname = inethook2name[hook]; + break; + case NFPROTO_ARP: + if (hook < ARRAY_SIZE(arphook2name)) + hookname = arphook2name[hook]; + default: + break; + } + + if (pfname) + printf("\n\t%s", pfname); + else + printf("\n\tpf: %d", pf); + + if (hookname) + printf(" %s", hookname); + else + printf(", hook %u,", hook); + + printf(" prio %d", info->netfilter.priority); + + if (info->netfilter.flags) + printf(" flags 0x%x", info->netfilter.flags); +} + +static int show_link_close_plain(int fd, struct bpf_link_info *info) +{ + struct bpf_prog_info prog_info; + const char *prog_type_str; + int err; + + show_link_header_plain(info); + + switch (info->type) { + case BPF_LINK_TYPE_RAW_TRACEPOINT: + printf("\n\ttp '%s' ", + (const char *)u64_to_ptr(info->raw_tracepoint.tp_name)); + break; + case BPF_LINK_TYPE_TRACING: + err = get_prog_info(info->prog_id, &prog_info); + if (err) + return err; + + prog_type_str = libbpf_bpf_prog_type_str(prog_info.type); + /* libbpf will return NULL for variants unknown to it. */ + if (prog_type_str) + printf("\n\tprog_type %s ", prog_type_str); + else + printf("\n\tprog_type %u ", prog_info.type); + + show_link_attach_type_plain(info->tracing.attach_type); + if (info->tracing.target_obj_id || info->tracing.target_btf_id) + printf("\n\ttarget_obj_id %u target_btf_id %u ", + info->tracing.target_obj_id, + info->tracing.target_btf_id); + break; + case BPF_LINK_TYPE_CGROUP: + printf("\n\tcgroup_id %zu ", (size_t)info->cgroup.cgroup_id); + show_link_attach_type_plain(info->cgroup.attach_type); + break; + case BPF_LINK_TYPE_ITER: + show_iter_plain(info); + break; + case BPF_LINK_TYPE_NETNS: + printf("\n\tnetns_ino %u ", info->netns.netns_ino); + show_link_attach_type_plain(info->netns.attach_type); + break; + case BPF_LINK_TYPE_NETFILTER: + netfilter_dump_plain(info); + break; + default: + break; + } + + if (!hashmap__empty(link_table)) { + struct hashmap_entry *entry; + + hashmap__for_each_key_entry(link_table, entry, info->id) + printf("\n\tpinned %s", (char *)entry->pvalue); + } + emit_obj_refs_plain(refs_table, info->id, "\n\tpids "); + + printf("\n"); + + return 0; +} + +static int do_show_link(int fd) +{ + struct bpf_link_info info; + __u32 len = sizeof(info); + char buf[256]; + int err; + + memset(&info, 0, sizeof(info)); +again: + err = bpf_link_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get link info: %s", + strerror(errno)); + close(fd); + return err; + } + if (info.type == BPF_LINK_TYPE_RAW_TRACEPOINT && + !info.raw_tracepoint.tp_name) { + info.raw_tracepoint.tp_name = (unsigned long)&buf; + info.raw_tracepoint.tp_name_len = sizeof(buf); + goto again; + } + if (info.type == BPF_LINK_TYPE_ITER && + !info.iter.target_name) { + info.iter.target_name = (unsigned long)&buf; + info.iter.target_name_len = sizeof(buf); + goto again; + } + + if (json_output) + show_link_close_json(fd, &info); + else + show_link_close_plain(fd, &info); + + close(fd); + return 0; +} + +static int do_show(int argc, char **argv) +{ + __u32 id = 0; + int err, fd; + + if (show_pinned) { + link_table = hashmap__new(hash_fn_for_key_as_id, + equal_fn_for_key_as_id, NULL); + if (IS_ERR(link_table)) { + p_err("failed to create hashmap for pinned paths"); + return -1; + } + build_pinned_obj_table(link_table, BPF_OBJ_LINK); + } + build_obj_refs_table(&refs_table, BPF_OBJ_LINK); + + if (argc == 2) { + fd = 
link_parse_fd(&argc, &argv); + if (fd < 0) + return fd; + return do_show_link(fd); + } + + if (argc) + return BAD_ARG(); + + if (json_output) + jsonw_start_array(json_wtr); + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + p_err("can't get next link: %s%s", strerror(errno), + errno == EINVAL ? " -- kernel too old?" : ""); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get link by id (%u): %s", + id, strerror(errno)); + break; + } + + err = do_show_link(fd); + if (err) + break; + } + if (json_output) + jsonw_end_array(json_wtr); + + delete_obj_refs_table(refs_table); + + if (show_pinned) + delete_pinned_obj_table(link_table); + + return errno == ENOENT ? 0 : -1; +} + +static int do_pin(int argc, char **argv) +{ + int err; + + err = do_pin_any(argc, argv, link_parse_fd); + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static int do_detach(int argc, char **argv) +{ + int err, fd; + + if (argc != 2) { + p_err("link specifier is invalid or missing\n"); + return 1; + } + + fd = link_parse_fd(&argc, &argv); + if (fd < 0) + return 1; + + err = bpf_link_detach(fd); + if (err) + err = -errno; + close(fd); + if (err) { + p_err("failed link detach: %s", strerror(-err)); + return 1; + } + + if (json_output) + jsonw_null(json_wtr); + + return 0; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [LINK]\n" + " %1$s %2$s pin LINK FILE\n" + " %1$s %2$s detach LINK\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_LINK "\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-n|--nomount} }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { "pin", do_pin }, + { "detach", do_detach }, + { 0 } +}; + +int do_link(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/main.c b/third_party/bpftool/src/main.c new file mode 100644 index 0000000..08d0ac5 --- /dev/null +++ b/third_party/bpftool/src/main.c @@ -0,0 +1,547 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2017-2018 Netronome Systems, Inc. 
 */
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <linux/bpf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/btf.h>
+#include <bpf/hashmap.h>
+#include <bpf/libbpf.h>
+
+#include "main.h"
+
+#define BATCH_LINE_LEN_MAX 65536
+#define BATCH_ARG_NB_MAX 4096
+
+const char *bin_name;
+static int last_argc;
+static char **last_argv;
+static int (*last_do_help)(int argc, char **argv);
+json_writer_t *json_wtr;
+bool pretty_output;
+bool json_output;
+bool show_pinned;
+bool block_mount;
+bool verifier_logs;
+bool relaxed_maps;
+bool use_loader;
+struct btf *base_btf;
+struct hashmap *refs_table;
+
+static void __noreturn clean_and_exit(int i)
+{
+    if (json_output)
+        jsonw_destroy(&json_wtr);
+
+    exit(i);
+}
+
+void usage(void)
+{
+    last_do_help(last_argc - 1, last_argv + 1);
+
+    clean_and_exit(-1);
+}
+
+static int do_help(int argc, char **argv)
+{
+    if (json_output) {
+        jsonw_null(json_wtr);
+        return 0;
+    }
+
+    fprintf(stderr,
+        "Usage: %s [OPTIONS] OBJECT { COMMAND | help }\n"
+        " %s batch file FILE\n"
+        " %s version\n"
+        "\n"
+        " OBJECT := { prog | map | link | cgroup | perf | net | feature | btf | gen | struct_ops | iter }\n"
+        " " HELP_SPEC_OPTIONS " |\n"
+        " {-V|--version} }\n"
+        "",
+        bin_name, bin_name, bin_name);
+
+    return 0;
+}
+
+static int do_batch(int argc, char **argv);
+static int do_version(int argc, char **argv);
+
+static const struct cmd commands[] = {
+    { "help", do_help },
+    { "batch", do_batch },
+    { "prog", do_prog },
+    { "map", do_map },
+    { "link", do_link },
+    { "cgroup", do_cgroup },
+    { "perf", do_perf },
+    { "net", do_net },
+    { "feature", do_feature },
+    { "btf", do_btf },
+    { "gen", do_gen },
+    { "struct_ops", do_struct_ops },
+    { "iter", do_iter },
+    { "version", do_version },
+    { 0 }
+};
+
+#ifndef BPFTOOL_VERSION
+/* bpftool's major and minor version numbers are aligned on libbpf's. There is
+ * an offset of 6 for the version number, because bpftool's version was higher
+ * than libbpf's when we adopted this scheme. The patch number remains at 0
+ * for now. Set BPFTOOL_VERSION to override.
+ */
+#define BPFTOOL_MAJOR_VERSION (LIBBPF_MAJOR_VERSION + 6)
+#define BPFTOOL_MINOR_VERSION LIBBPF_MINOR_VERSION
+#define BPFTOOL_PATCH_VERSION 0
+#endif
+
+static void
+print_feature(const char *feature, bool state, unsigned int *nb_features)
+{
+    if (state) {
+        printf("%s %s", *nb_features ? "," : "", feature);
+        *nb_features = *nb_features + 1;
+    }
+}
+
+static int do_version(int argc, char **argv)
+{
+#ifdef HAVE_LIBBFD_SUPPORT
+    const bool has_libbfd = true;
+#else
+    const bool has_libbfd = false;
+#endif
+#ifdef HAVE_LLVM_SUPPORT
+    const bool has_llvm = true;
+#else
+    const bool has_llvm = false;
+#endif
+#ifdef BPFTOOL_WITHOUT_SKELETONS
+    const bool has_skeletons = false;
+#else
+    const bool has_skeletons = true;
+#endif
+    bool bootstrap = false;
+    int i;
+
+    for (i = 0; commands[i].cmd; i++) {
+        if (!strcmp(commands[i].cmd, "prog")) {
+            /* Assume we run a bootstrap version if "bpftool prog"
+             * is not available.
+ */ + bootstrap = !commands[i].func; + break; + } + } + + if (json_output) { + jsonw_start_object(json_wtr); /* root object */ + + jsonw_name(json_wtr, "version"); +#ifdef BPFTOOL_VERSION + jsonw_printf(json_wtr, "\"%s\"", BPFTOOL_VERSION); +#else + jsonw_printf(json_wtr, "\"%d.%d.%d\"", BPFTOOL_MAJOR_VERSION, + BPFTOOL_MINOR_VERSION, BPFTOOL_PATCH_VERSION); +#endif + jsonw_name(json_wtr, "libbpf_version"); + jsonw_printf(json_wtr, "\"%d.%d\"", + libbpf_major_version(), libbpf_minor_version()); + + jsonw_name(json_wtr, "features"); + jsonw_start_object(json_wtr); /* features */ + jsonw_bool_field(json_wtr, "libbfd", has_libbfd); + jsonw_bool_field(json_wtr, "llvm", has_llvm); + jsonw_bool_field(json_wtr, "skeletons", has_skeletons); + jsonw_bool_field(json_wtr, "bootstrap", bootstrap); + jsonw_end_object(json_wtr); /* features */ + + jsonw_end_object(json_wtr); /* root object */ + } else { + unsigned int nb_features = 0; + +#ifdef BPFTOOL_VERSION + printf("%s v%s\n", bin_name, BPFTOOL_VERSION); +#else + printf("%s v%d.%d.%d\n", bin_name, BPFTOOL_MAJOR_VERSION, + BPFTOOL_MINOR_VERSION, BPFTOOL_PATCH_VERSION); +#endif + printf("using libbpf %s\n", libbpf_version_string()); + printf("features:"); + print_feature("libbfd", has_libbfd, &nb_features); + print_feature("llvm", has_llvm, &nb_features); + print_feature("skeletons", has_skeletons, &nb_features); + print_feature("bootstrap", bootstrap, &nb_features); + printf("\n"); + } + return 0; +} + +int cmd_select(const struct cmd *cmds, int argc, char **argv, + int (*help)(int argc, char **argv)) +{ + unsigned int i; + + last_argc = argc; + last_argv = argv; + last_do_help = help; + + if (argc < 1 && cmds[0].func) + return cmds[0].func(argc, argv); + + for (i = 0; cmds[i].cmd; i++) { + if (is_prefix(*argv, cmds[i].cmd)) { + if (!cmds[i].func) { + p_err("command '%s' is not supported in bootstrap mode", + cmds[i].cmd); + return -1; + } + return cmds[i].func(argc - 1, argv + 1); + } + } + + help(argc - 1, argv + 1); + + return -1; +} + +bool is_prefix(const char *pfx, const char *str) +{ + if (!pfx) + return false; + if (strlen(str) < strlen(pfx)) + return false; + + return !memcmp(str, pfx, strlen(pfx)); +} + +/* Last argument MUST be NULL pointer */ +int detect_common_prefix(const char *arg, ...) +{ + unsigned int count = 0; + const char *ref; + char msg[256]; + va_list ap; + + snprintf(msg, sizeof(msg), "ambiguous prefix: '%s' could be '", arg); + va_start(ap, arg); + while ((ref = va_arg(ap, const char *))) { + if (!is_prefix(arg, ref)) + continue; + count++; + if (count > 1) + strncat(msg, "' or '", sizeof(msg) - strlen(msg) - 1); + strncat(msg, ref, sizeof(msg) - strlen(msg) - 1); + } + va_end(ap); + strncat(msg, "'", sizeof(msg) - strlen(msg) - 1); + + if (count >= 2) { + p_err("%s", msg); + return -1; + } + + return 0; +} + +void fprint_hex(FILE *f, void *arg, unsigned int n, const char *sep) +{ + unsigned char *data = arg; + unsigned int i; + + for (i = 0; i < n; i++) { + const char *pfx = ""; + + if (!i) + /* nothing */; + else if (!(i % 16)) + fprintf(f, "\n"); + else if (!(i % 8)) + fprintf(f, " "); + else + pfx = sep; + + fprintf(f, "%s%02hhx", i ? pfx : "", data[i]); + } +} + +/* Split command line into argument vector. */ +static int make_args(char *line, char *n_argv[], int maxargs, int cmd_nb) +{ + static const char ws[] = " \t\r\n"; + char *cp = line; + int n_argc = 0; + + while (*cp) { + /* Skip leading whitespace. 
*/ + cp += strspn(cp, ws); + + if (*cp == '\0') + break; + + if (n_argc >= (maxargs - 1)) { + p_err("too many arguments to command %d", cmd_nb); + return -1; + } + + /* Word begins with quote. */ + if (*cp == '\'' || *cp == '"') { + char quote = *cp++; + + n_argv[n_argc++] = cp; + /* Find ending quote. */ + cp = strchr(cp, quote); + if (!cp) { + p_err("unterminated quoted string in command %d", + cmd_nb); + return -1; + } + } else { + n_argv[n_argc++] = cp; + + /* Find end of word. */ + cp += strcspn(cp, ws); + if (*cp == '\0') + break; + } + + /* Separate words. */ + *cp++ = 0; + } + n_argv[n_argc] = NULL; + + return n_argc; +} + +static int do_batch(int argc, char **argv) +{ + char buf[BATCH_LINE_LEN_MAX], contline[BATCH_LINE_LEN_MAX]; + char *n_argv[BATCH_ARG_NB_MAX]; + unsigned int lines = 0; + int n_argc; + FILE *fp; + char *cp; + int err = 0; + int i; + + if (argc < 2) { + p_err("too few parameters for batch"); + return -1; + } else if (argc > 2) { + p_err("too many parameters for batch"); + return -1; + } else if (!is_prefix(*argv, "file")) { + p_err("expected 'file', got: %s", *argv); + return -1; + } + NEXT_ARG(); + + if (!strcmp(*argv, "-")) + fp = stdin; + else + fp = fopen(*argv, "r"); + if (!fp) { + p_err("Can't open file (%s): %s", *argv, strerror(errno)); + return -1; + } + + if (json_output) + jsonw_start_array(json_wtr); + while (fgets(buf, sizeof(buf), fp)) { + cp = strchr(buf, '#'); + if (cp) + *cp = '\0'; + + if (strlen(buf) == sizeof(buf) - 1) { + errno = E2BIG; + break; + } + + /* Append continuation lines if any (coming after a line ending + * with '\' in the batch file). + */ + while ((cp = strstr(buf, "\\\n")) != NULL) { + if (!fgets(contline, sizeof(contline), fp) || + strlen(contline) == 0) { + p_err("missing continuation line on command %d", + lines); + err = -1; + goto err_close; + } + + cp = strchr(contline, '#'); + if (cp) + *cp = '\0'; + + if (strlen(buf) + strlen(contline) + 1 > sizeof(buf)) { + p_err("command %d is too long", lines); + err = -1; + goto err_close; + } + buf[strlen(buf) - 2] = '\0'; + strcat(buf, contline); + } + + n_argc = make_args(buf, n_argv, BATCH_ARG_NB_MAX, lines); + if (!n_argc) + continue; + if (n_argc < 0) { + err = n_argc; + goto err_close; + } + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "command"); + jsonw_start_array(json_wtr); + for (i = 0; i < n_argc; i++) + jsonw_string(json_wtr, n_argv[i]); + jsonw_end_array(json_wtr); + jsonw_name(json_wtr, "output"); + } + + err = cmd_select(commands, n_argc, n_argv, do_help); + + if (json_output) + jsonw_end_object(json_wtr); + + if (err) + goto err_close; + + lines++; + } + + if (errno && errno != ENOENT) { + p_err("reading batch file failed: %s", strerror(errno)); + err = -1; + } else { + if (!json_output) + printf("processed %d commands\n", lines); + } +err_close: + if (fp != stdin) + fclose(fp); + + if (json_output) + jsonw_end_array(json_wtr); + + return err; +} + +int main(int argc, char **argv) +{ + static const struct option options[] = { + { "json", no_argument, NULL, 'j' }, + { "help", no_argument, NULL, 'h' }, + { "pretty", no_argument, NULL, 'p' }, + { "version", no_argument, NULL, 'V' }, + { "bpffs", no_argument, NULL, 'f' }, + { "mapcompat", no_argument, NULL, 'm' }, + { "nomount", no_argument, NULL, 'n' }, + { "debug", no_argument, NULL, 'd' }, + { "use-loader", no_argument, NULL, 'L' }, + { "base-btf", required_argument, NULL, 'B' }, + { 0 } + }; + bool version_requested = false; + int opt, ret; + + setlinebuf(stdout); + +#ifdef USE_LIBCAP 
+ /* Libcap < 2.63 hooks before main() to compute the number of + * capabilities of the running kernel, and doing so it calls prctl() + * which may fail and set errno to non-zero. + * Let's reset errno to make sure this does not interfere with the + * batch mode. + */ + errno = 0; +#endif + + last_do_help = do_help; + pretty_output = false; + json_output = false; + show_pinned = false; + block_mount = false; + bin_name = "bpftool"; + + opterr = 0; + while ((opt = getopt_long(argc, argv, "VhpjfLmndB:l", + options, NULL)) >= 0) { + switch (opt) { + case 'V': + version_requested = true; + break; + case 'h': + return do_help(argc, argv); + case 'p': + pretty_output = true; + /* fall through */ + case 'j': + if (!json_output) { + json_wtr = jsonw_new(stdout); + if (!json_wtr) { + p_err("failed to create JSON writer"); + return -1; + } + json_output = true; + } + jsonw_pretty(json_wtr, pretty_output); + break; + case 'f': + show_pinned = true; + break; + case 'm': + relaxed_maps = true; + break; + case 'n': + block_mount = true; + break; + case 'd': + libbpf_set_print(print_all_levels); + verifier_logs = true; + break; + case 'B': + base_btf = btf__parse(optarg, NULL); + if (!base_btf) { + p_err("failed to parse base BTF at '%s': %d\n", + optarg, -errno); + return -1; + } + break; + case 'L': + use_loader = true; + break; + default: + p_err("unrecognized option '%s'", argv[optind - 1]); + if (json_output) + clean_and_exit(-1); + else + usage(); + } + } + + argc -= optind; + argv += optind; + if (argc < 0) + usage(); + + if (version_requested) + return do_version(argc, argv); + + ret = cmd_select(commands, argc, argv, do_help); + + if (json_output) + jsonw_destroy(&json_wtr); + + btf__free(base_btf); + + return ret; +} diff --git a/third_party/bpftool/src/main.h b/third_party/bpftool/src/main.h new file mode 100644 index 0000000..b8bb08d --- /dev/null +++ b/third_party/bpftool/src/main.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2017-2018 Netronome Systems, Inc. 
 */
+
+#ifndef __BPF_TOOL_H
+#define __BPF_TOOL_H
+
+/* BFD and kernel.h both define GCC_VERSION, differently */
+#undef GCC_VERSION
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bpf.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+
+#include <bpf/hashmap.h>
+#include <bpf/libbpf.h>
+
+#include "json_writer.h"
+
+/* Make sure we do not use kernel-only integer typedefs */
+#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
+
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+    return (__u64)(unsigned long)ptr;
+}
+
+static inline void *u64_to_ptr(__u64 ptr)
+{
+    return (void *)(unsigned long)ptr;
+}
+
+#define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); })
+#define NEXT_ARGP() ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
+#define BAD_ARG() ({ p_err("what is '%s'?", *argv); -1; })
+#define GET_ARG() ({ argc--; *argv++; })
+#define REQ_ARGS(cnt) \
+    ({ \
+        int _cnt = (cnt); \
+        bool _res; \
+        \
+        if (argc < _cnt) { \
+            p_err("'%s' needs at least %d arguments, %d found", \
+                  argv[-1], _cnt, argc); \
+            _res = false; \
+        } else { \
+            _res = true; \
+        } \
+        _res; \
+    })
+
+#define ERR_MAX_LEN 1024
+
+#define BPF_TAG_FMT "%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx%02hhx"
+
+#define HELP_SPEC_PROGRAM \
+    "PROG := { id PROG_ID | pinned FILE | tag PROG_TAG | name PROG_NAME }"
+#define HELP_SPEC_OPTIONS \
+    "OPTIONS := { {-j|--json} [{-p|--pretty}] | {-d|--debug}"
+#define HELP_SPEC_MAP \
+    "MAP := { id MAP_ID | pinned FILE | name MAP_NAME }"
+#define HELP_SPEC_LINK \
+    "LINK := { id LINK_ID | pinned FILE }"
+
+/* keep in sync with the definition in skeleton/pid_iter.bpf.c */
+enum bpf_obj_type {
+    BPF_OBJ_UNKNOWN,
+    BPF_OBJ_PROG,
+    BPF_OBJ_MAP,
+    BPF_OBJ_LINK,
+    BPF_OBJ_BTF,
+};
+
+extern const char *bin_name;
+
+extern json_writer_t *json_wtr;
+extern bool json_output;
+extern bool show_pinned;
+extern bool show_pids;
+extern bool block_mount;
+extern bool verifier_logs;
+extern bool relaxed_maps;
+extern bool use_loader;
+extern struct btf *base_btf;
+extern struct hashmap *refs_table;
+
+void __printf(1, 2) p_err(const char *fmt, ...);
+void __printf(1, 2) p_info(const char *fmt, ...);
+
+bool is_prefix(const char *pfx, const char *str);
+int detect_common_prefix(const char *arg, ...);
+void fprint_hex(FILE *f, void *arg, unsigned int n, const char *sep);
+void usage(void) __noreturn;
+
+void set_max_rlimit(void);
+
+int mount_tracefs(const char *target);
+
+struct obj_ref {
+    int pid;
+    char comm[16];
+};
+
+struct obj_refs {
+    int ref_cnt;
+    bool has_bpf_cookie;
+    struct obj_ref *refs;
+    __u64 bpf_cookie;
+};
+
+struct btf;
+struct bpf_line_info;
+
+int build_pinned_obj_table(struct hashmap *table,
+               enum bpf_obj_type type);
+void delete_pinned_obj_table(struct hashmap *table);
+__weak int build_obj_refs_table(struct hashmap **table,
+                enum bpf_obj_type type);
+__weak void delete_obj_refs_table(struct hashmap *table);
+__weak void emit_obj_refs_json(struct hashmap *table, __u32 id,
+                   json_writer_t *json_wtr);
+__weak void emit_obj_refs_plain(struct hashmap *table, __u32 id,
+                const char *prefix);
+void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
+void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
+
+struct cmd {
+    const char *cmd;
+    int (*func)(int argc, char **argv);
+};
+
+int cmd_select(const struct cmd *cmds, int argc, char **argv,
+           int (*help)(int argc, char **argv));
+
+#define MAX_PROG_FULL_NAME 128
+void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd,
+            char *name_buff, size_t buff_len);
+
+int get_fd_type(int fd);
+const char *get_fd_type_name(enum bpf_obj_type type);
+char
*get_fdinfo(int fd, const char *key); +int open_obj_pinned(const char *path, bool quiet); +int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type); +int mount_bpffs_for_pin(const char *name, bool is_dir); +int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***)); +int do_pin_fd(int fd, const char *name); + +/* commands available in bootstrap mode */ +int do_gen(int argc, char **argv); +int do_btf(int argc, char **argv); + +/* non-bootstrap only commands */ +int do_prog(int argc, char **arg) __weak; +int do_map(int argc, char **arg) __weak; +int do_link(int argc, char **arg) __weak; +int do_event_pipe(int argc, char **argv) __weak; +int do_cgroup(int argc, char **arg) __weak; +int do_perf(int argc, char **arg) __weak; +int do_net(int argc, char **arg) __weak; +int do_tracelog(int argc, char **arg) __weak; +int do_feature(int argc, char **argv) __weak; +int do_struct_ops(int argc, char **argv) __weak; +int do_iter(int argc, char **argv) __weak; + +int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); +int prog_parse_fd(int *argc, char ***argv); +int prog_parse_fds(int *argc, char ***argv, int **fds); +int map_parse_fd(int *argc, char ***argv); +int map_parse_fds(int *argc, char ***argv, int **fds); +int map_parse_fd_and_info(int *argc, char ***argv, struct bpf_map_info *info, + __u32 *info_len); + +struct bpf_prog_linfo; +#if defined(HAVE_LLVM_SUPPORT) || defined(HAVE_LIBBFD_SUPPORT) +int disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch, const char *disassembler_options, + const struct btf *btf, + const struct bpf_prog_linfo *prog_linfo, + __u64 func_ksym, unsigned int func_idx, + bool linum); +int disasm_init(void); +#else +static inline +int disasm_print_insn(unsigned char *image, ssize_t len, int opcodes, + const char *arch, const char *disassembler_options, + const struct btf *btf, + const struct bpf_prog_linfo *prog_linfo, + __u64 func_ksym, unsigned int func_idx, + bool linum) +{ + return 0; +} +static inline int disasm_init(void) +{ + p_err("No JIT disassembly support"); + return -1; +} +#endif +void print_data_json(uint8_t *data, size_t len); +void print_hex_data_json(uint8_t *data, size_t len); + +unsigned int get_page_size(void); +unsigned int get_possible_cpus(void); +const char * +ifindex_to_arch(__u32 ifindex, __u64 ns_dev, __u64 ns_ino, const char **opt); + +struct btf_dumper { + const struct btf *btf; + json_writer_t *jw; + bool is_plain_text; + bool prog_id_as_func_ptr; +}; + +/* btf_dumper_type - print data along with type information + * @d: an instance containing context for dumping types + * @type_id: index in btf->types array. this points to the type to be dumped + * @data: pointer the actual data, i.e. 
the values to be printed + * + * Returns zero on success and negative error code otherwise + */ +int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, + const void *data); +void btf_dumper_type_only(const struct btf *btf, __u32 func_type_id, + char *func_only, int size); + +void btf_dump_linfo_plain(const struct btf *btf, + const struct bpf_line_info *linfo, + const char *prefix, bool linum); +void btf_dump_linfo_json(const struct btf *btf, + const struct bpf_line_info *linfo, bool linum); +void btf_dump_linfo_dotlabel(const struct btf *btf, + const struct bpf_line_info *linfo, bool linum); + +struct nlattr; +struct ifinfomsg; +struct tcmsg; +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind, + const char *devname, int ifindex); + +int print_all_levels(__maybe_unused enum libbpf_print_level level, + const char *format, va_list args); + +size_t hash_fn_for_key_as_id(long key, void *ctx); +bool equal_fn_for_key_as_id(long k1, long k2, void *ctx); + +/* bpf_attach_type_input_str - convert the provided attach type value into a + * textual representation that we accept for input purposes. + * + * This function is similar in nature to libbpf_bpf_attach_type_str, but + * recognizes some attach type names that have been used by the program in the + * past and which do not follow the string inference scheme that libbpf uses. + * These textual representations should only be used for user input. + * + * @t: The attach type + * Returns a pointer to a static string identifying the attach type. NULL is + * returned for unknown bpf_attach_type values. + */ +const char *bpf_attach_type_input_str(enum bpf_attach_type t); + +static inline bool hashmap__empty(struct hashmap *map) +{ + return map ? hashmap__size(map) == 0 : true; +} + +int pathname_concat(char *buf, int buf_sz, const char *path, + const char *name); + +/* print netfilter bpf_link info */ +void netfilter_dump_plain(const struct bpf_link_info *info); +void netfilter_dump_json(const struct bpf_link_info *info, json_writer_t *wtr); +#endif diff --git a/third_party/bpftool/src/map.c b/third_party/bpftool/src/map.c new file mode 100644 index 0000000..f98f7bb --- /dev/null +++ b/third_party/bpftool/src/map.c @@ -0,0 +1,1499 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2017-2018 Netronome Systems, Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +static struct hashmap *map_table; + +static bool map_is_per_cpu(__u32 type) +{ + return type == BPF_MAP_TYPE_PERCPU_HASH || + type == BPF_MAP_TYPE_PERCPU_ARRAY || + type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE; +} + +static bool map_is_map_of_maps(__u32 type) +{ + return type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static bool map_is_map_of_progs(__u32 type) +{ + return type == BPF_MAP_TYPE_PROG_ARRAY; +} + +static int map_type_from_str(const char *type) +{ + const char *map_type_str; + unsigned int i; + + for (i = 0; ; i++) { + map_type_str = libbpf_bpf_map_type_str(i); + if (!map_type_str) + break; + + /* Don't allow prefixing in case of possible future shadowing */ + if (!strcmp(map_type_str, type)) + return i; + } + return -1; +} + +static void *alloc_value(struct bpf_map_info *info) +{ + if (map_is_per_cpu(info->type)) + return malloc(round_up(info->value_size, 8) * + get_possible_cpus()); + else + return malloc(info->value_size); +} + +static int do_dump_btf(const struct btf_dumper *d, + struct bpf_map_info *map_info, void *key, + void *value) +{ + __u32 value_id; + int ret = 0; + + /* start of key-value pair */ + jsonw_start_object(d->jw); + + if (map_info->btf_key_type_id) { + jsonw_name(d->jw, "key"); + + ret = btf_dumper_type(d, map_info->btf_key_type_id, key); + if (ret) + goto err_end_obj; + } + + value_id = map_info->btf_vmlinux_value_type_id ? + : map_info->btf_value_type_id; + + if (!map_is_per_cpu(map_info->type)) { + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, value_id, value); + } else { + unsigned int i, n, step; + + jsonw_name(d->jw, "values"); + jsonw_start_array(d->jw); + n = get_possible_cpus(); + step = round_up(map_info->value_size, 8); + for (i = 0; i < n; i++) { + jsonw_start_object(d->jw); + jsonw_int_field(d->jw, "cpu", i); + jsonw_name(d->jw, "value"); + ret = btf_dumper_type(d, value_id, value + i * step); + jsonw_end_object(d->jw); + if (ret) + break; + } + jsonw_end_array(d->jw); + } + +err_end_obj: + /* end of key-value pair */ + jsonw_end_object(d->jw); + + return ret; +} + +static json_writer_t *get_btf_writer(void) +{ + json_writer_t *jw = jsonw_new(stdout); + + if (!jw) + return NULL; + jsonw_pretty(jw, true); + + return jw; +} + +static void print_entry_json(struct bpf_map_info *info, unsigned char *key, + unsigned char *value, struct btf *btf) +{ + jsonw_start_object(json_wtr); + + if (!map_is_per_cpu(info->type)) { + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, info->key_size); + jsonw_name(json_wtr, "value"); + print_hex_data_json(value, info->value_size); + if (map_is_map_of_maps(info->type)) + jsonw_uint_field(json_wtr, "inner_map_id", + *(unsigned int *)value); + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + jsonw_name(json_wtr, "formatted"); + do_dump_btf(&d, info, key, value); + } + } else { + unsigned int i, n, step; + + n = get_possible_cpus(); + step = round_up(info->value_size, 8); + + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, info->key_size); + + jsonw_name(json_wtr, "values"); + jsonw_start_array(json_wtr); + for (i = 0; i < n; i++) { + jsonw_start_object(json_wtr); + + jsonw_int_field(json_wtr, "cpu", i); + + jsonw_name(json_wtr, "value"); + print_hex_data_json(value 
+ i * step, + info->value_size); + + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); + if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + jsonw_name(json_wtr, "formatted"); + do_dump_btf(&d, info, key, value); + } + } + + jsonw_end_object(json_wtr); +} + +static void +print_entry_error_msg(struct bpf_map_info *info, unsigned char *key, + const char *error_msg) +{ + int msg_size = strlen(error_msg); + bool single_line, break_names; + + break_names = info->key_size > 16 || msg_size > 16; + single_line = info->key_size + msg_size <= 24 && !break_names; + + printf("key:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, key, info->key_size, " "); + + printf(single_line ? " " : "\n"); + + printf("value:%c%s", break_names ? '\n' : ' ', error_msg); + + printf("\n"); +} + +static void +print_entry_error(struct bpf_map_info *map_info, void *key, int lookup_errno) +{ + /* For prog_array maps or arrays of maps, failure to lookup the value + * means there is no entry for that key. Do not print an error message + * in that case. + */ + if ((map_is_map_of_maps(map_info->type) || + map_is_map_of_progs(map_info->type)) && lookup_errno == ENOENT) + return; + + if (json_output) { + jsonw_start_object(json_wtr); /* entry */ + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, map_info->key_size); + jsonw_name(json_wtr, "value"); + jsonw_start_object(json_wtr); /* error */ + jsonw_string_field(json_wtr, "error", strerror(lookup_errno)); + jsonw_end_object(json_wtr); /* error */ + jsonw_end_object(json_wtr); /* entry */ + } else { + const char *msg = NULL; + + if (lookup_errno == ENOENT) + msg = ""; + else if (lookup_errno == ENOSPC && + map_info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + msg = ""; + + print_entry_error_msg(map_info, key, + msg ? : strerror(lookup_errno)); + } +} + +static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, + unsigned char *value) +{ + if (!map_is_per_cpu(info->type)) { + bool single_line, break_names; + + break_names = info->key_size > 16 || info->value_size > 16; + single_line = info->key_size + info->value_size <= 24 && + !break_names; + + if (info->key_size) { + printf("key:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, key, info->key_size, " "); + + printf(single_line ? " " : "\n"); + } + + if (info->value_size) { + if (map_is_map_of_maps(info->type)) { + printf("inner_map_id:%c", break_names ? '\n' : ' '); + printf("%u ", *(unsigned int *)value); + } else { + printf("value:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, value, info->value_size, " "); + } + } + + printf("\n"); + } else { + unsigned int i, n, step; + + n = get_possible_cpus(); + step = round_up(info->value_size, 8); + + if (info->key_size) { + printf("key:\n"); + fprint_hex(stdout, key, info->key_size, " "); + printf("\n"); + } + if (info->value_size) { + for (i = 0; i < n; i++) { + printf("value (CPU %02d):%c", + i, info->value_size > 16 ? 
'\n' : ' '); + fprint_hex(stdout, value + i * step, + info->value_size, " "); + printf("\n"); + } + } + } +} + +static char **parse_bytes(char **argv, const char *name, unsigned char *val, + unsigned int n) +{ + unsigned int i = 0, base = 0; + char *endptr; + + if (is_prefix(*argv, "hex")) { + base = 16; + argv++; + } + + while (i < n && argv[i]) { + val[i] = strtoul(argv[i], &endptr, base); + if (*endptr) { + p_err("error parsing byte: %s", argv[i]); + return NULL; + } + i++; + } + + if (i != n) { + p_err("%s expected %d bytes got %d", name, n, i); + return NULL; + } + + return argv + i; +} + +/* on per cpu maps we must copy the provided value on all value instances */ +static void fill_per_cpu_value(struct bpf_map_info *info, void *value) +{ + unsigned int i, n, step; + + if (!map_is_per_cpu(info->type)) + return; + + n = get_possible_cpus(); + step = round_up(info->value_size, 8); + for (i = 1; i < n; i++) + memcpy(value + i * step, value, info->value_size); +} + +static int parse_elem(char **argv, struct bpf_map_info *info, + void *key, void *value, __u32 key_size, __u32 value_size, + __u32 *flags, __u32 **value_fd) +{ + if (!*argv) { + if (!key && !value) + return 0; + p_err("did not find %s", key ? "key" : "value"); + return -1; + } + + if (is_prefix(*argv, "key")) { + if (!key) { + if (key_size) + p_err("duplicate key"); + else + p_err("unnecessary key"); + return -1; + } + + argv = parse_bytes(argv + 1, "key", key, key_size); + if (!argv) + return -1; + + return parse_elem(argv, info, NULL, value, key_size, value_size, + flags, value_fd); + } else if (is_prefix(*argv, "value")) { + int fd; + + if (!value) { + if (value_size) + p_err("duplicate value"); + else + p_err("unnecessary value"); + return -1; + } + + argv++; + + if (map_is_map_of_maps(info->type)) { + int argc = 2; + + if (value_size != 4) { + p_err("value smaller than 4B for map in map?"); + return -1; + } + if (!argv[0] || !argv[1]) { + p_err("not enough value arguments for map in map"); + return -1; + } + + fd = map_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + *value_fd = value; + **value_fd = fd; + } else if (map_is_map_of_progs(info->type)) { + int argc = 2; + + if (value_size != 4) { + p_err("value smaller than 4B for map of progs?"); + return -1; + } + if (!argv[0] || !argv[1]) { + p_err("not enough value arguments for map of progs"); + return -1; + } + if (is_prefix(*argv, "id")) + p_info("Warning: updating program array via MAP_ID, make sure this map is kept open\n" + " by some process or pinned otherwise update will be lost"); + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + *value_fd = value; + **value_fd = fd; + } else { + argv = parse_bytes(argv, "value", value, value_size); + if (!argv) + return -1; + + fill_per_cpu_value(info, value); + } + + return parse_elem(argv, info, key, NULL, key_size, value_size, + flags, NULL); + } else if (is_prefix(*argv, "any") || is_prefix(*argv, "noexist") || + is_prefix(*argv, "exist")) { + if (!flags) { + p_err("flags specified multiple times: %s", *argv); + return -1; + } + + if (is_prefix(*argv, "any")) + *flags = BPF_ANY; + else if (is_prefix(*argv, "noexist")) + *flags = BPF_NOEXIST; + else if (is_prefix(*argv, "exist")) + *flags = BPF_EXIST; + + return parse_elem(argv + 1, info, key, value, key_size, + value_size, NULL, value_fd); + } + + p_err("expected key or value, got: %s", *argv); + return -1; +} + +static void show_map_header_json(struct bpf_map_info *info, json_writer_t *wtr) +{ + const char *map_type_str; + + jsonw_uint_field(wtr, 
"id", info->id); + map_type_str = libbpf_bpf_map_type_str(info->type); + if (map_type_str) + jsonw_string_field(wtr, "type", map_type_str); + else + jsonw_uint_field(wtr, "type", info->type); + + if (*info->name) + jsonw_string_field(wtr, "name", info->name); + + jsonw_name(wtr, "flags"); + jsonw_printf(wtr, "%d", info->map_flags); +} + +static int show_map_close_json(int fd, struct bpf_map_info *info) +{ + char *memlock, *frozen_str; + int frozen = 0; + + memlock = get_fdinfo(fd, "memlock"); + frozen_str = get_fdinfo(fd, "frozen"); + + jsonw_start_object(json_wtr); + + show_map_header_json(info, json_wtr); + + print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); + + jsonw_uint_field(json_wtr, "bytes_key", info->key_size); + jsonw_uint_field(json_wtr, "bytes_value", info->value_size); + jsonw_uint_field(json_wtr, "max_entries", info->max_entries); + + if (memlock) + jsonw_int_field(json_wtr, "bytes_memlock", atoll(memlock)); + free(memlock); + + if (info->type == BPF_MAP_TYPE_PROG_ARRAY) { + char *owner_prog_type = get_fdinfo(fd, "owner_prog_type"); + char *owner_jited = get_fdinfo(fd, "owner_jited"); + + if (owner_prog_type) { + unsigned int prog_type = atoi(owner_prog_type); + const char *prog_type_str; + + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + if (prog_type_str) + jsonw_string_field(json_wtr, "owner_prog_type", + prog_type_str); + else + jsonw_uint_field(json_wtr, "owner_prog_type", + prog_type); + } + if (owner_jited) + jsonw_bool_field(json_wtr, "owner_jited", + !!atoi(owner_jited)); + + free(owner_prog_type); + free(owner_jited); + } + close(fd); + + if (frozen_str) { + frozen = atoi(frozen_str); + free(frozen_str); + } + jsonw_int_field(json_wtr, "frozen", frozen); + + if (info->btf_id) + jsonw_int_field(json_wtr, "btf_id", info->btf_id); + + if (!hashmap__empty(map_table)) { + struct hashmap_entry *entry; + + jsonw_name(json_wtr, "pinned"); + jsonw_start_array(json_wtr); + hashmap__for_each_key_entry(map_table, entry, info->id) + jsonw_string(json_wtr, entry->pvalue); + jsonw_end_array(json_wtr); + } + + emit_obj_refs_json(refs_table, info->id, json_wtr); + + jsonw_end_object(json_wtr); + + return 0; +} + +static void show_map_header_plain(struct bpf_map_info *info) +{ + const char *map_type_str; + + printf("%u: ", info->id); + + map_type_str = libbpf_bpf_map_type_str(info->type); + if (map_type_str) + printf("%s ", map_type_str); + else + printf("type %u ", info->type); + + if (*info->name) + printf("name %s ", info->name); + + printf("flags 0x%x", info->map_flags); + print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); + printf("\n"); +} + +static int show_map_close_plain(int fd, struct bpf_map_info *info) +{ + char *memlock, *frozen_str; + int frozen = 0; + + memlock = get_fdinfo(fd, "memlock"); + frozen_str = get_fdinfo(fd, "frozen"); + + show_map_header_plain(info); + printf("\tkey %uB value %uB max_entries %u", + info->key_size, info->value_size, info->max_entries); + + if (memlock) + printf(" memlock %sB", memlock); + free(memlock); + + if (info->type == BPF_MAP_TYPE_PROG_ARRAY) { + char *owner_prog_type = get_fdinfo(fd, "owner_prog_type"); + char *owner_jited = get_fdinfo(fd, "owner_jited"); + + if (owner_prog_type || owner_jited) + printf("\n\t"); + if (owner_prog_type) { + unsigned int prog_type = atoi(owner_prog_type); + const char *prog_type_str; + + prog_type_str = libbpf_bpf_prog_type_str(prog_type); + if (prog_type_str) + printf("owner_prog_type %s ", prog_type_str); + else + printf("owner_prog_type %d ", prog_type); + } + if 
(owner_jited) + printf("owner%s jited", + atoi(owner_jited) ? "" : " not"); + + free(owner_prog_type); + free(owner_jited); + } + close(fd); + + if (!hashmap__empty(map_table)) { + struct hashmap_entry *entry; + + hashmap__for_each_key_entry(map_table, entry, info->id) + printf("\n\tpinned %s", (char *)entry->pvalue); + } + + if (frozen_str) { + frozen = atoi(frozen_str); + free(frozen_str); + } + + if (info->btf_id || frozen) + printf("\n\t"); + + if (info->btf_id) + printf("btf_id %d", info->btf_id); + + if (frozen) + printf("%sfrozen", info->btf_id ? " " : ""); + + emit_obj_refs_plain(refs_table, info->id, "\n\tpids "); + + printf("\n"); + return 0; +} + +static int do_show_subset(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = bpf_map_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", + strerror(errno)); + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + + if (json_output) + show_map_close_json(fds[i], &info); + else + show_map_close_plain(fds[i], &info); + + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + +static int do_show(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 id = 0; + int err; + int fd; + + if (show_pinned) { + map_table = hashmap__new(hash_fn_for_key_as_id, + equal_fn_for_key_as_id, NULL); + if (IS_ERR(map_table)) { + p_err("failed to create hashmap for pinned paths"); + return -1; + } + build_pinned_obj_table(map_table, BPF_OBJ_MAP); + } + build_obj_refs_table(&refs_table, BPF_OBJ_MAP); + + if (argc == 2) + return do_show_subset(argc, argv); + + if (argc) + return BAD_ARG(); + + if (json_output) + jsonw_start_array(json_wtr); + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + p_err("can't get next map: %s%s", strerror(errno), + errno == EINVAL ? " -- kernel too old?" : ""); + break; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + break; + } + + err = bpf_map_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + close(fd); + break; + } + + if (json_output) + show_map_close_json(fd, &info); + else + show_map_close_plain(fd, &info); + } + if (json_output) + jsonw_end_array(json_wtr); + + delete_obj_refs_table(refs_table); + + if (show_pinned) + delete_pinned_obj_table(map_table); + + return errno == ENOENT ? 
0 : -1; +} + +static int dump_map_elem(int fd, void *key, void *value, + struct bpf_map_info *map_info, struct btf *btf, + json_writer_t *btf_wtr) +{ + if (bpf_map_lookup_elem(fd, key, value)) { + print_entry_error(map_info, key, errno); + return -1; + } + + if (json_output) { + print_entry_json(map_info, key, value, btf); + } else if (btf) { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, map_info, key, value); + } else { + print_entry_plain(map_info, key, value); + } + + return 0; +} + +static int maps_have_btf(int *fds, int nb_fds) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int err, i; + + for (i = 0; i < nb_fds; i++) { + err = bpf_map_get_info_by_fd(fds[i], &info, &len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + return -1; + } + + if (!info.btf_id) + return 0; + } + + return 1; +} + +static struct btf *btf_vmlinux; + +static int get_map_kv_btf(const struct bpf_map_info *info, struct btf **btf) +{ + int err = 0; + + if (info->btf_vmlinux_value_type_id) { + if (!btf_vmlinux) { + btf_vmlinux = libbpf_find_kernel_btf(); + if (!btf_vmlinux) { + p_err("failed to get kernel btf"); + return -errno; + } + } + *btf = btf_vmlinux; + } else if (info->btf_value_type_id) { + *btf = btf__load_from_kernel_by_id(info->btf_id); + if (!*btf) { + err = -errno; + p_err("failed to get btf"); + } + } else { + *btf = NULL; + } + + return err; +} + +static void free_map_kv_btf(struct btf *btf) +{ + if (btf != btf_vmlinux) + btf__free(btf); +} + +static int +map_dump(int fd, struct bpf_map_info *info, json_writer_t *wtr, + bool show_header) +{ + void *key, *value, *prev_key; + unsigned int num_elems = 0; + struct btf *btf = NULL; + int err; + + key = malloc(info->key_size); + value = alloc_value(info); + if (!key || !value) { + p_err("mem alloc failed"); + err = -1; + goto exit_free; + } + + prev_key = NULL; + + if (wtr) { + err = get_map_kv_btf(info, &btf); + if (err) { + goto exit_free; + } + + if (show_header) { + jsonw_start_object(wtr); /* map object */ + show_map_header_json(info, wtr); + jsonw_name(wtr, "elements"); + } + jsonw_start_array(wtr); /* elements */ + } else if (show_header) { + show_map_header_plain(info); + } + + if (info->type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY && + info->value_size != 8) { + const char *map_type_str; + + map_type_str = libbpf_bpf_map_type_str(info->type); + p_info("Warning: cannot read values from %s map with value_size != 8", + map_type_str); + } + while (true) { + err = bpf_map_get_next_key(fd, prev_key, key); + if (err) { + if (errno == ENOENT) + err = 0; + break; + } + if (!dump_map_elem(fd, key, value, info, btf, wtr)) + num_elems++; + prev_key = key; + } + + if (wtr) { + jsonw_end_array(wtr); /* elements */ + if (show_header) + jsonw_end_object(wtr); /* map object */ + } else { + printf("Found %u element%s\n", num_elems, + num_elems != 1 ? 
"s" : ""); + } + +exit_free: + free(key); + free(value); + close(fd); + free_map_kv_btf(btf); + + return err; +} + +static int do_dump(int argc, char **argv) +{ + json_writer_t *wtr = NULL, *btf_wtr = NULL; + struct bpf_map_info info = {}; + int nb_fds, i = 0; + __u32 len = sizeof(info); + int *fds = NULL; + int err = -1; + + if (argc != 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = map_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output) { + wtr = json_wtr; + } else { + int do_plain_btf; + + do_plain_btf = maps_have_btf(fds, nb_fds); + if (do_plain_btf < 0) + goto exit_close; + + if (do_plain_btf) { + btf_wtr = get_btf_writer(); + wtr = btf_wtr; + if (!btf_wtr) + p_info("failed to create json writer for btf. falling back to plain output"); + } + } + + if (wtr && nb_fds > 1) + jsonw_start_array(wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + if (bpf_map_get_info_by_fd(fds[i], &info, &len)) { + p_err("can't get map info: %s", strerror(errno)); + break; + } + err = map_dump(fds[i], &info, wtr, nb_fds > 1); + if (!wtr && i != nb_fds - 1) + printf("\n"); + + if (err) + break; + close(fds[i]); + } + if (wtr && nb_fds > 1) + jsonw_end_array(wtr); /* root array */ + + if (btf_wtr) + jsonw_destroy(&btf_wtr); +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(fds); + btf__free(btf_vmlinux); + return err; +} + +static int alloc_key_value(struct bpf_map_info *info, void **key, void **value) +{ + *key = NULL; + *value = NULL; + + if (info->key_size) { + *key = malloc(info->key_size); + if (!*key) { + p_err("key mem alloc failed"); + return -1; + } + } + + if (info->value_size) { + *value = alloc_value(info); + if (!*value) { + p_err("value mem alloc failed"); + free(*key); + *key = NULL; + return -1; + } + } + + return 0; +} + +static int do_update(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + __u32 *value_fd = NULL; + __u32 flags = BPF_ANY; + void *key, *value; + int fd, err; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + err = alloc_key_value(&info, &key, &value); + if (err) + goto exit_free; + + err = parse_elem(argv, &info, key, value, info.key_size, + info.value_size, &flags, &value_fd); + if (err) + goto exit_free; + + err = bpf_map_update_elem(fd, key, value, flags); + if (err) { + p_err("update failed: %s", strerror(errno)); + goto exit_free; + } + +exit_free: + if (value_fd) + close(*value_fd); + free(key); + free(value); + close(fd); + + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static void print_key_value(struct bpf_map_info *info, void *key, + void *value) +{ + json_writer_t *btf_wtr; + struct btf *btf; + + if (get_map_kv_btf(info, &btf)) + return; + + if (json_output) { + print_entry_json(info, key, value, btf); + } else if (btf) { + /* if here json_wtr wouldn't have been initialised, + * so let's create separate writer for btf + */ + btf_wtr = get_btf_writer(); + if (!btf_wtr) { + p_info("failed to create json writer for btf. 
falling back to plain output"); + btf__free(btf); + btf = NULL; + print_entry_plain(info, key, value); + } else { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, info, key, value); + jsonw_destroy(&btf_wtr); + } + } else { + print_entry_plain(info, key, value); + } + btf__free(btf); +} + +static int do_lookup(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *value; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + err = alloc_key_value(&info, &key, &value); + if (err) + goto exit_free; + + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL); + if (err) + goto exit_free; + + err = bpf_map_lookup_elem(fd, key, value); + if (err) { + if (errno == ENOENT) { + if (json_output) { + jsonw_null(json_wtr); + } else { + printf("key:\n"); + fprint_hex(stdout, key, info.key_size, " "); + printf("\n\nNot found\n"); + } + } else { + p_err("lookup failed: %s", strerror(errno)); + } + + goto exit_free; + } + + /* here means bpf_map_lookup_elem() succeeded */ + print_key_value(&info, key, value); + +exit_free: + free(key); + free(value); + close(fd); + + return err; +} + +static int do_getnext(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *nextkey; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + nextkey = malloc(info.key_size); + if (!key || !nextkey) { + p_err("mem alloc failed"); + err = -1; + goto exit_free; + } + + if (argc) { + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, + NULL, NULL); + if (err) + goto exit_free; + } else { + free(key); + key = NULL; + } + + err = bpf_map_get_next_key(fd, key, nextkey); + if (err) { + p_err("can't get next key: %s", strerror(errno)); + goto exit_free; + } + + if (json_output) { + jsonw_start_object(json_wtr); + if (key) { + jsonw_name(json_wtr, "key"); + print_hex_data_json(key, info.key_size); + } else { + jsonw_null_field(json_wtr, "key"); + } + jsonw_name(json_wtr, "next_key"); + print_hex_data_json(nextkey, info.key_size); + jsonw_end_object(json_wtr); + } else { + if (key) { + printf("key:\n"); + fprint_hex(stdout, key, info.key_size, " "); + printf("\n"); + } else { + printf("key: None\n"); + } + printf("next key:\n"); + fprint_hex(stdout, nextkey, info.key_size, " "); + printf("\n"); + } + +exit_free: + free(nextkey); + free(key); + close(fd); + + return err; +} + +static int do_delete(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + key = malloc(info.key_size); + if (!key) { + p_err("mem alloc failed"); + err = -1; + goto exit_free; + } + + err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL); + if (err) + goto exit_free; + + err = bpf_map_delete_elem(fd, key); + if (err) + p_err("delete failed: %s", strerror(errno)); + +exit_free: + free(key); + close(fd); + + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static int do_pin(int argc, char **argv) +{ + int err; + + err = do_pin_any(argc, argv, map_parse_fd); + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +static int do_create(int argc, char **argv) +{ + 
LIBBPF_OPTS(bpf_map_create_opts, attr); + enum bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC; + __u32 key_size = 0, value_size = 0, max_entries = 0; + const char *map_name = NULL; + const char *pinfile; + int err = -1, fd; + + if (!REQ_ARGS(7)) + return -1; + pinfile = GET_ARG(); + + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "type")) { + NEXT_ARG(); + + if (map_type) { + p_err("map type already specified"); + goto exit; + } + + map_type = map_type_from_str(*argv); + if ((int)map_type < 0) { + p_err("unrecognized map type: %s", *argv); + goto exit; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "name")) { + NEXT_ARG(); + map_name = GET_ARG(); + } else if (is_prefix(*argv, "key")) { + if (parse_u32_arg(&argc, &argv, &key_size, + "key size")) + goto exit; + } else if (is_prefix(*argv, "value")) { + if (parse_u32_arg(&argc, &argv, &value_size, + "value size")) + goto exit; + } else if (is_prefix(*argv, "entries")) { + if (parse_u32_arg(&argc, &argv, &max_entries, + "max entries")) + goto exit; + } else if (is_prefix(*argv, "flags")) { + if (parse_u32_arg(&argc, &argv, &attr.map_flags, + "flags")) + goto exit; + } else if (is_prefix(*argv, "dev")) { + p_info("Warning: 'bpftool map create [...] dev ' syntax is deprecated.\n" + "Going further, please use 'offload_dev ' to request hardware offload for the map."); + goto offload_dev; + } else if (is_prefix(*argv, "offload_dev")) { +offload_dev: + NEXT_ARG(); + + if (attr.map_ifindex) { + p_err("offload device already specified"); + goto exit; + } + + attr.map_ifindex = if_nametoindex(*argv); + if (!attr.map_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + goto exit; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "inner_map")) { + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + int inner_map_fd; + + NEXT_ARG(); + if (!REQ_ARGS(2)) + usage(); + inner_map_fd = map_parse_fd_and_info(&argc, &argv, + &info, &len); + if (inner_map_fd < 0) + return -1; + attr.inner_map_fd = inner_map_fd; + } else { + p_err("unknown arg %s", *argv); + goto exit; + } + } + + if (!map_name) { + p_err("map name not specified"); + goto exit; + } + + set_max_rlimit(); + + fd = bpf_map_create(map_type, map_name, key_size, value_size, max_entries, &attr); + if (fd < 0) { + p_err("map create failed: %s", strerror(errno)); + goto exit; + } + + err = do_pin_fd(fd, pinfile); + close(fd); + if (err) + goto exit; + + if (json_output) + jsonw_null(json_wtr); + +exit: + if (attr.inner_map_fd > 0) + close(attr.inner_map_fd); + + return err; +} + +static int do_pop_dequeue(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *value; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + err = alloc_key_value(&info, &key, &value); + if (err) + goto exit_free; + + err = bpf_map_lookup_and_delete_elem(fd, key, value); + if (err) { + if (errno == ENOENT) { + if (json_output) + jsonw_null(json_wtr); + else + printf("Error: empty map\n"); + } else { + p_err("pop failed: %s", strerror(errno)); + } + + goto exit_free; + } + + print_key_value(&info, key, value); + +exit_free: + free(key); + free(value); + close(fd); + + return err; +} + +static int do_freeze(int argc, char **argv) +{ + int err, fd; + + if (!REQ_ARGS(2)) + return -1; + + fd = map_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + if (argc) { + close(fd); + return BAD_ARG(); + } + + err = bpf_map_freeze(fd); + close(fd); + 
+	if (err) {
+		p_err("failed to freeze map: %s", strerror(errno));
+		return err;
+	}
+
+	if (json_output)
+		jsonw_null(json_wtr);
+
+	return 0;
+}
+
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+
+	fprintf(stderr,
+		"Usage: %1$s %2$s { show | list } [MAP]\n"
+		" %1$s %2$s create FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n"
+		" entries MAX_ENTRIES name NAME [flags FLAGS] \\\n"
+		" [inner_map MAP] [offload_dev NAME]\n"
+		" %1$s %2$s dump MAP\n"
+		" %1$s %2$s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n"
+		" %1$s %2$s lookup MAP [key DATA]\n"
+		" %1$s %2$s getnext MAP [key DATA]\n"
+		" %1$s %2$s delete MAP key DATA\n"
+		" %1$s %2$s pin MAP FILE\n"
+		" %1$s %2$s event_pipe MAP [cpu N index M]\n"
+		" %1$s %2$s peek MAP\n"
+		" %1$s %2$s push MAP value VALUE\n"
+		" %1$s %2$s pop MAP\n"
+		" %1$s %2$s enqueue MAP value VALUE\n"
+		" %1$s %2$s dequeue MAP\n"
+		" %1$s %2$s freeze MAP\n"
+		" %1$s %2$s help\n"
+		"\n"
+		" " HELP_SPEC_MAP "\n"
+		" DATA := { [hex] BYTES }\n"
+		" " HELP_SPEC_PROGRAM "\n"
+		" VALUE := { DATA | MAP | PROG }\n"
+		" UPDATE_FLAGS := { any | exist | noexist }\n"
+		" TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n"
+		" percpu_array | stack_trace | cgroup_array | lru_hash |\n"
+		" lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
+		" devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
+		" cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
+		" queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
+		" task_storage | bloom_filter | user_ringbuf | cgrp_storage }\n"
+		" " HELP_SPEC_OPTIONS " |\n"
+		" {-f|--bpffs} | {-n|--nomount} }\n"
+		"",
+		bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ "dump",	do_dump },
+	{ "update",	do_update },
+	{ "lookup",	do_lookup },
+	{ "getnext",	do_getnext },
+	{ "delete",	do_delete },
+	{ "pin",	do_pin },
+	{ "event_pipe",	do_event_pipe },
+	{ "create",	do_create },
+	{ "peek",	do_lookup },
+	{ "push",	do_update },
+	{ "enqueue",	do_update },
+	{ "pop",	do_pop_dequeue },
+	{ "dequeue",	do_pop_dequeue },
+	{ "freeze",	do_freeze },
+	{ 0 }
+};
+
+int do_map(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
diff --git a/third_party/bpftool/src/map_perf_ring.c b/third_party/bpftool/src/map_perf_ring.c
new file mode 100644
index 0000000..21d7d44
--- /dev/null
+++ b/third_party/bpftool/src/map_perf_ring.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2018 Netronome Systems, Inc. */
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +#define MMAP_PAGE_CNT 16 + +static volatile bool stop; + +struct perf_event_sample { + struct perf_event_header header; + __u64 time; + __u32 size; + unsigned char data[]; +}; + +struct perf_event_lost { + struct perf_event_header header; + __u64 id; + __u64 lost; +}; + +static void int_exit(int signo) +{ + fprintf(stderr, "Stopping...\n"); + stop = true; +} + +struct event_pipe_ctx { + bool all_cpus; + int cpu; + int idx; +}; + +static enum bpf_perf_event_ret +print_bpf_output(void *private_data, int cpu, struct perf_event_header *event) +{ + struct perf_event_sample *e = container_of(event, + struct perf_event_sample, + header); + struct perf_event_lost *lost = container_of(event, + struct perf_event_lost, + header); + struct event_pipe_ctx *ctx = private_data; + int idx = ctx->all_cpus ? cpu : ctx->idx; + + if (json_output) { + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "type"); + jsonw_uint(json_wtr, e->header.type); + jsonw_name(json_wtr, "cpu"); + jsonw_uint(json_wtr, cpu); + jsonw_name(json_wtr, "index"); + jsonw_uint(json_wtr, idx); + if (e->header.type == PERF_RECORD_SAMPLE) { + jsonw_name(json_wtr, "timestamp"); + jsonw_uint(json_wtr, e->time); + jsonw_name(json_wtr, "data"); + print_data_json(e->data, e->size); + } else if (e->header.type == PERF_RECORD_LOST) { + jsonw_name(json_wtr, "lost"); + jsonw_start_object(json_wtr); + jsonw_name(json_wtr, "id"); + jsonw_uint(json_wtr, lost->id); + jsonw_name(json_wtr, "count"); + jsonw_uint(json_wtr, lost->lost); + jsonw_end_object(json_wtr); + } + jsonw_end_object(json_wtr); + } else { + if (e->header.type == PERF_RECORD_SAMPLE) { + printf("== @%lld.%09lld CPU: %d index: %d =====\n", + e->time / 1000000000ULL, e->time % 1000000000ULL, + cpu, idx); + fprint_hex(stdout, e->data, e->size, " "); + printf("\n"); + } else if (e->header.type == PERF_RECORD_LOST) { + printf("lost %lld events\n", lost->lost); + } else { + printf("unknown event type=%d size=%d\n", + e->header.type, e->header.size); + } + } + + return LIBBPF_PERF_EVENT_CONT; +} + +int do_event_pipe(int argc, char **argv) +{ + struct perf_event_attr perf_attr = { + .sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_TIME, + .type = PERF_TYPE_SOFTWARE, + .config = PERF_COUNT_SW_BPF_OUTPUT, + .sample_period = 1, + .wakeup_events = 1, + }; + struct bpf_map_info map_info = {}; + LIBBPF_OPTS(perf_buffer_raw_opts, opts); + struct event_pipe_ctx ctx = { + .all_cpus = true, + .cpu = -1, + .idx = -1, + }; + struct perf_buffer *pb; + __u32 map_info_len; + int err, map_fd; + + map_info_len = sizeof(map_info); + map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len); + if (map_fd < 0) + return -1; + + if (map_info.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + p_err("map is not a perf event array"); + goto err_close_map; + } + + while (argc) { + if (argc < 2) { + BAD_ARG(); + goto err_close_map; + } + + if (is_prefix(*argv, "cpu")) { + char *endptr; + + NEXT_ARG(); + ctx.cpu = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as CPU ID", *argv); + goto err_close_map; + } + + NEXT_ARG(); + } else if (is_prefix(*argv, "index")) { + char *endptr; + + NEXT_ARG(); + ctx.idx = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as index", *argv); + goto err_close_map; + } + + NEXT_ARG(); + } else { + BAD_ARG(); + goto err_close_map; + } + + 
ctx.all_cpus = false; + } + + if (!ctx.all_cpus) { + if (ctx.idx == -1 || ctx.cpu == -1) { + p_err("cpu and index must be specified together"); + goto err_close_map; + } + } else { + ctx.cpu = 0; + ctx.idx = 0; + } + + opts.cpu_cnt = ctx.all_cpus ? 0 : 1; + opts.cpus = &ctx.cpu; + opts.map_keys = &ctx.idx; + pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &perf_attr, + print_bpf_output, &ctx, &opts); + if (!pb) { + p_err("failed to create perf buffer: %s (%d)", + strerror(errno), errno); + goto err_close_map; + } + + signal(SIGINT, int_exit); + signal(SIGHUP, int_exit); + signal(SIGTERM, int_exit); + + if (json_output) + jsonw_start_array(json_wtr); + + while (!stop) { + err = perf_buffer__poll(pb, 200); + if (err < 0 && err != -EINTR) { + p_err("perf buffer polling failed: %s (%d)", + strerror(errno), errno); + goto err_close_pb; + } + } + + if (json_output) + jsonw_end_array(json_wtr); + + perf_buffer__free(pb); + close(map_fd); + + return 0; + +err_close_pb: + perf_buffer__free(pb); +err_close_map: + close(map_fd); + return -1; +} diff --git a/third_party/bpftool/src/net.c b/third_party/bpftool/src/net.c new file mode 100644 index 0000000..26a4996 --- /dev/null +++ b/third_party/bpftool/src/net.c @@ -0,0 +1,865 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2018 Facebook + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf/nlattr.h" +#include "main.h" +#include "netlink_dumper.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +struct ip_devname_ifindex { + char devname[64]; + int ifindex; +}; + +struct bpf_netdev_t { + struct ip_devname_ifindex *devices; + int used_len; + int array_len; + int filter_idx; +}; + +struct tc_kind_handle { + char kind[64]; + int handle; +}; + +struct bpf_tcinfo_t { + struct tc_kind_handle *handle_array; + int used_len; + int array_len; + bool is_qdisc; +}; + +struct bpf_filter_t { + const char *kind; + const char *devname; + int ifindex; +}; + +struct bpf_attach_info { + __u32 flow_dissector_id; +}; + +enum net_attach_type { + NET_ATTACH_TYPE_XDP, + NET_ATTACH_TYPE_XDP_GENERIC, + NET_ATTACH_TYPE_XDP_DRIVER, + NET_ATTACH_TYPE_XDP_OFFLOAD, +}; + +static const char * const attach_type_strings[] = { + [NET_ATTACH_TYPE_XDP] = "xdp", + [NET_ATTACH_TYPE_XDP_GENERIC] = "xdpgeneric", + [NET_ATTACH_TYPE_XDP_DRIVER] = "xdpdrv", + [NET_ATTACH_TYPE_XDP_OFFLOAD] = "xdpoffload", +}; + +const size_t net_attach_type_size = ARRAY_SIZE(attach_type_strings); + +static enum net_attach_type parse_attach_type(const char *str) +{ + enum net_attach_type type; + + for (type = 0; type < net_attach_type_size; type++) { + if (attach_type_strings[type] && + is_prefix(str, attach_type_strings[type])) + return type; + } + + return net_attach_type_size; +} + +typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); + +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t, void *cookie); + +static int netlink_open(__u32 *nl_pid) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + p_err("Netlink error reporting not supported"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + 
ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static int netlink_recv(int sock, __u32 nl_pid, __u32 seq, + __dump_nlmsg_t _fn, dump_nlmsg_t fn, + void *cookie) +{ + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + char buf[4096]; + int len, ret; + + while (multipart) { + multipart = false; + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + libbpf_nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + return 0; + default: + break; + } + if (_fn) { + ret = _fn(nh, fn, cookie); + if (ret) + return ret; + } + } + } + ret = 0; +done: + return ret; +} + +static int __dump_class_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_class_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_class_nlmsg(cookie, t, tb); +} + +static int netlink_get_class(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTCLASS, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg, + dump_class_nlmsg, cookie); +} + +static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_qdisc_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_qdisc_nlmsg(cookie, t, tb); +} + +static int netlink_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETQDISC, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg, + dump_qdisc_nlmsg, cookie); +} + +static int 
__dump_filter_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_filter_nlmsg, + void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (libbpf_nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_filter_nlmsg(cookie, t, tb); +} + +static int netlink_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTFILTER, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + .t.tcm_parent = handle, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg, + dump_filter_nlmsg, cookie); +} + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +static int netlink_get_link(int sock, unsigned int nl_pid, + dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifm.ifi_family = AF_PACKET, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg, + dump_link_nlmsg, cookie); +} + +static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_netdev_t *netinfo = cookie; + struct ifinfomsg *ifinfo = msg; + + if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index) + return 0; + + if (netinfo->used_len == netinfo->array_len) { + netinfo->devices = realloc(netinfo->devices, + (netinfo->array_len + 16) * + sizeof(struct ip_devname_ifindex)); + if (!netinfo->devices) + return -ENOMEM; + + netinfo->array_len += 16; + } + netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index; + snprintf(netinfo->devices[netinfo->used_len].devname, + sizeof(netinfo->devices[netinfo->used_len].devname), + "%s", + tb[IFLA_IFNAME] + ? 
libbpf_nla_getattr_str(tb[IFLA_IFNAME]) + : ""); + netinfo->used_len++; + + return do_xdp_dump(ifinfo, tb); +} + +static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_tcinfo_t *tcinfo = cookie; + struct tcmsg *info = msg; + + if (tcinfo->is_qdisc) { + /* skip clsact qdisc */ + if (tb[TCA_KIND] && + strcmp(libbpf_nla_data(tb[TCA_KIND]), "clsact") == 0) + return 0; + if (info->tcm_handle == 0) + return 0; + } + + if (tcinfo->used_len == tcinfo->array_len) { + tcinfo->handle_array = realloc(tcinfo->handle_array, + (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + if (!tcinfo->handle_array) + return -ENOMEM; + + tcinfo->array_len += 16; + } + tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; + snprintf(tcinfo->handle_array[tcinfo->used_len].kind, + sizeof(tcinfo->handle_array[tcinfo->used_len].kind), + "%s", + tb[TCA_KIND] + ? libbpf_nla_getattr_str(tb[TCA_KIND]) + : "unknown"); + tcinfo->used_len++; + + return 0; +} + +static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + const struct bpf_filter_t *filter_info = cookie; + + return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind, + filter_info->devname, filter_info->ifindex); +} + +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, + struct ip_devname_ifindex *dev) +{ + struct bpf_filter_t filter_info; + struct bpf_tcinfo_t tcinfo; + int i, handle, ret = 0; + + tcinfo.handle_array = NULL; + tcinfo.used_len = 0; + tcinfo.array_len = 0; + + tcinfo.is_qdisc = false; + ret = netlink_get_class(sock, nl_pid, dev->ifindex, + dump_class_qdisc_nlmsg, &tcinfo); + if (ret) + goto out; + + tcinfo.is_qdisc = true; + ret = netlink_get_qdisc(sock, nl_pid, dev->ifindex, + dump_class_qdisc_nlmsg, &tcinfo); + if (ret) + goto out; + + filter_info.devname = dev->devname; + filter_info.ifindex = dev->ifindex; + for (i = 0; i < tcinfo.used_len; i++) { + filter_info.kind = tcinfo.handle_array[i].kind; + ret = netlink_get_filter(sock, nl_pid, dev->ifindex, + tcinfo.handle_array[i].handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + } + + /* root, ingress and egress handle */ + handle = TC_H_ROOT; + filter_info.kind = "root"; + ret = netlink_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + filter_info.kind = "clsact/ingress"; + ret = netlink_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); + filter_info.kind = "clsact/egress"; + ret = netlink_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); + if (ret) + goto out; + +out: + free(tcinfo.handle_array); + return 0; +} + +static int query_flow_dissector(struct bpf_attach_info *attach_info) +{ + __u32 attach_flags; + __u32 prog_ids[1]; + __u32 prog_cnt; + int err; + int fd; + + fd = open("/proc/self/ns/net", O_RDONLY); + if (fd < 0) { + p_err("can't open /proc/self/ns/net: %s", + strerror(errno)); + return -1; + } + prog_cnt = ARRAY_SIZE(prog_ids); + err = bpf_prog_query(fd, BPF_FLOW_DISSECTOR, 0, + &attach_flags, prog_ids, &prog_cnt); + close(fd); + if (err) { + if (errno == EINVAL) { + /* Older kernel's don't support querying + * flow dissector programs. 
+ */ + errno = 0; + return 0; + } + p_err("can't query prog: %s", strerror(errno)); + return -1; + } + + if (prog_cnt == 1) + attach_info->flow_dissector_id = prog_ids[0]; + + return 0; +} + +static int net_parse_dev(int *argc, char ***argv) +{ + int ifindex; + + if (is_prefix(**argv, "dev")) { + NEXT_ARGP(); + + ifindex = if_nametoindex(**argv); + if (!ifindex) + p_err("invalid devname %s", **argv); + + NEXT_ARGP(); + } else { + p_err("expected 'dev', got: '%s'?", **argv); + return -1; + } + + return ifindex; +} + +static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type, + int ifindex, bool overwrite) +{ + __u32 flags = 0; + + if (!overwrite) + flags = XDP_FLAGS_UPDATE_IF_NOEXIST; + if (attach_type == NET_ATTACH_TYPE_XDP_GENERIC) + flags |= XDP_FLAGS_SKB_MODE; + if (attach_type == NET_ATTACH_TYPE_XDP_DRIVER) + flags |= XDP_FLAGS_DRV_MODE; + if (attach_type == NET_ATTACH_TYPE_XDP_OFFLOAD) + flags |= XDP_FLAGS_HW_MODE; + + return bpf_xdp_attach(ifindex, progfd, flags, NULL); +} + +static int do_attach(int argc, char **argv) +{ + enum net_attach_type attach_type; + int progfd, ifindex, err = 0; + bool overwrite = false; + + /* parse attach args */ + if (!REQ_ARGS(5)) + return -EINVAL; + + attach_type = parse_attach_type(*argv); + if (attach_type == net_attach_type_size) { + p_err("invalid net attach/detach type: %s", *argv); + return -EINVAL; + } + NEXT_ARG(); + + progfd = prog_parse_fd(&argc, &argv); + if (progfd < 0) + return -EINVAL; + + ifindex = net_parse_dev(&argc, &argv); + if (ifindex < 1) { + err = -EINVAL; + goto cleanup; + } + + if (argc) { + if (is_prefix(*argv, "overwrite")) { + overwrite = true; + } else { + p_err("expected 'overwrite', got: '%s'?", *argv); + err = -EINVAL; + goto cleanup; + } + } + + /* attach xdp prog */ + if (is_prefix("xdp", attach_type_strings[attach_type])) + err = do_attach_detach_xdp(progfd, attach_type, ifindex, + overwrite); + if (err) { + p_err("interface %s attach failed: %s", + attach_type_strings[attach_type], strerror(-err)); + goto cleanup; + } + + if (json_output) + jsonw_null(json_wtr); +cleanup: + close(progfd); + return err; +} + +static int do_detach(int argc, char **argv) +{ + enum net_attach_type attach_type; + int progfd, ifindex, err = 0; + + /* parse detach args */ + if (!REQ_ARGS(3)) + return -EINVAL; + + attach_type = parse_attach_type(*argv); + if (attach_type == net_attach_type_size) { + p_err("invalid net attach/detach type: %s", *argv); + return -EINVAL; + } + NEXT_ARG(); + + ifindex = net_parse_dev(&argc, &argv); + if (ifindex < 1) + return -EINVAL; + + /* detach xdp prog */ + progfd = -1; + if (is_prefix("xdp", attach_type_strings[attach_type])) + err = do_attach_detach_xdp(progfd, attach_type, ifindex, NULL); + + if (err < 0) { + p_err("interface %s detach failed: %s", + attach_type_strings[attach_type], strerror(-err)); + return err; + } + + if (json_output) + jsonw_null(json_wtr); + + return 0; +} + +static int netfilter_link_compar(const void *a, const void *b) +{ + const struct bpf_link_info *nfa = a; + const struct bpf_link_info *nfb = b; + int delta; + + delta = nfa->netfilter.pf - nfb->netfilter.pf; + if (delta) + return delta; + + delta = nfa->netfilter.hooknum - nfb->netfilter.hooknum; + if (delta) + return delta; + + if (nfa->netfilter.priority < nfb->netfilter.priority) + return -1; + if (nfa->netfilter.priority > nfb->netfilter.priority) + return 1; + + return nfa->netfilter.flags - nfb->netfilter.flags; +} + +static void show_link_netfilter(void) +{ + unsigned int nf_link_len = 0, 
nf_link_count = 0; + struct bpf_link_info *nf_link_info = NULL; + __u32 id = 0; + + while (true) { + struct bpf_link_info info; + int fd, err; + __u32 len; + + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + break; + p_err("can't get next link: %s (id %d)", strerror(errno), id); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get link by id (%u): %s", id, strerror(errno)); + continue; + } + + memset(&info, 0, sizeof(info)); + len = sizeof(info); + + err = bpf_link_get_info_by_fd(fd, &info, &len); + + close(fd); + + if (err) { + p_err("can't get link info for fd %d: %s", fd, strerror(errno)); + continue; + } + + if (info.type != BPF_LINK_TYPE_NETFILTER) + continue; + + if (nf_link_count >= nf_link_len) { + static const unsigned int max_link_count = INT_MAX / sizeof(info); + struct bpf_link_info *expand; + + if (nf_link_count > max_link_count) { + p_err("cannot handle more than %u links\n", max_link_count); + break; + } + + nf_link_len += 16; + + expand = realloc(nf_link_info, nf_link_len * sizeof(info)); + if (!expand) { + p_err("realloc: %s", strerror(errno)); + break; + } + + nf_link_info = expand; + } + + nf_link_info[nf_link_count] = info; + nf_link_count++; + } + + qsort(nf_link_info, nf_link_count, sizeof(*nf_link_info), netfilter_link_compar); + + for (id = 0; id < nf_link_count; id++) { + NET_START_OBJECT; + if (json_output) + netfilter_dump_json(&nf_link_info[id], json_wtr); + else + netfilter_dump_plain(&nf_link_info[id]); + + NET_DUMP_UINT("id", " prog_id %u", nf_link_info[id].prog_id); + NET_END_OBJECT; + } + + free(nf_link_info); +} + +static int do_show(int argc, char **argv) +{ + struct bpf_attach_info attach_info = {}; + int i, sock, ret, filter_idx = -1; + struct bpf_netdev_t dev_array; + unsigned int nl_pid = 0; + char err_buf[256]; + + if (argc == 2) { + filter_idx = net_parse_dev(&argc, &argv); + if (filter_idx < 1) + return -1; + } else if (argc != 0) { + usage(); + } + + ret = query_flow_dissector(&attach_info); + if (ret) + return -1; + + sock = netlink_open(&nl_pid); + if (sock < 0) { + fprintf(stderr, "failed to open netlink sock\n"); + return -1; + } + + dev_array.devices = NULL; + dev_array.used_len = 0; + dev_array.array_len = 0; + dev_array.filter_idx = filter_idx; + + if (json_output) + jsonw_start_array(json_wtr); + NET_START_OBJECT; + NET_START_ARRAY("xdp", "%s:\n"); + ret = netlink_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); + NET_END_ARRAY("\n"); + + if (!ret) { + NET_START_ARRAY("tc", "%s:\n"); + for (i = 0; i < dev_array.used_len; i++) { + ret = show_dev_tc_bpf(sock, nl_pid, + &dev_array.devices[i]); + if (ret) + break; + } + NET_END_ARRAY("\n"); + } + + NET_START_ARRAY("flow_dissector", "%s:\n"); + if (attach_info.flow_dissector_id > 0) + NET_DUMP_UINT("id", "id %u", attach_info.flow_dissector_id); + NET_END_ARRAY("\n"); + + NET_START_ARRAY("netfilter", "%s:\n"); + show_link_netfilter(); + NET_END_ARRAY("\n"); + + NET_END_OBJECT; + if (json_output) + jsonw_end_array(json_wtr); + + if (ret) { + if (json_output) + jsonw_null(json_wtr); + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "Error: %s\n", err_buf); + } + free(dev_array.devices); + close(sock); + return ret; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [dev ]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite ]\n" + " %1$s %2$s detach ATTACH_TYPE dev \n" + " %1$s %2$s help\n" + "\n" + 
" " HELP_SPEC_PROGRAM "\n" + " ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n" + " " HELP_SPEC_OPTIONS " }\n" + "\n" + "Note: Only xdp and tc attachments are supported now.\n" + " For progs attached to cgroups, use \"bpftool cgroup\"\n" + " to dump program attachments. For program types\n" + " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" + " consult iproute2.\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "attach", do_attach }, + { "detach", do_detach }, + { "help", do_help }, + { 0 } +}; + +int do_net(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/netlink_dumper.c b/third_party/bpftool/src/netlink_dumper.c new file mode 100644 index 0000000..5f65140 --- /dev/null +++ b/third_party/bpftool/src/netlink_dumper.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2018 Facebook + +#include +#include +#include +#include +#include + +#include "bpf/nlattr.h" +#include "main.h" +#include "netlink_dumper.h" + +static void xdp_dump_prog_id(struct nlattr **tb, int attr, + const char *mode, + bool new_json_object) +{ + if (!tb[attr]) + return; + + if (new_json_object) + NET_START_OBJECT + NET_DUMP_STR("mode", " %s", mode); + NET_DUMP_UINT("id", " id %u", libbpf_nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_END_OBJECT +} + +static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, + const char *name) +{ + struct nlattr *tb[IFLA_XDP_MAX + 1]; + unsigned char mode; + + if (libbpf_nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0) + return -1; + + if (!tb[IFLA_XDP_ATTACHED]) + return 0; + + mode = libbpf_nla_getattr_u8(tb[IFLA_XDP_ATTACHED]); + if (mode == XDP_ATTACHED_NONE) + return 0; + + NET_START_OBJECT; + if (name) + NET_DUMP_STR("devname", "%s", name); + NET_DUMP_UINT("ifindex", "(%d)", ifindex); + + if (mode == XDP_ATTACHED_MULTI) { + if (json_output) { + jsonw_name(json_wtr, "multi_attachments"); + jsonw_start_array(json_wtr); + } + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true); + if (json_output) + jsonw_end_array(json_wtr); + } else if (mode == XDP_ATTACHED_DRV) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false); + } else if (mode == XDP_ATTACHED_SKB) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false); + } else if (mode == XDP_ATTACHED_HW) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false); + } + + NET_END_OBJECT_FINAL; + return 0; +} + +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) +{ + if (!tb[IFLA_XDP]) + return 0; + + return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index, + libbpf_nla_getattr_str(tb[IFLA_IFNAME])); +} + +static int do_bpf_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (!tb[TCA_ACT_BPF_PARMS]) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_OBJECT_NESTED2; + if (tb[TCA_ACT_BPF_NAME]) + NET_DUMP_STR("name", "%s", + libbpf_nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + if (tb[TCA_ACT_BPF_ID]) + NET_DUMP_UINT("id", " id %u", + libbpf_nla_getattr_u32(tb[TCA_ACT_BPF_ID])); + NET_END_OBJECT_NESTED; + return 0; +} + +static int do_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX + 1]; + + if (!attr) + 
return 0; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_ACT_KIND] && + strcmp(libbpf_nla_data(tb[TCA_ACT_KIND]), "bpf") == 0) + return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]); + + return 0; +} + +static int do_bpf_act_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + int act, ret; + + if (libbpf_nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_ARRAY("act", " %s ["); + for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { + ret = do_dump_one_act(tb[act]); + if (ret) + break; + } + NET_END_ARRAY("] "); + + return ret; +} + +static int do_bpf_filter_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_BPF_MAX + 1]; + int ret; + + if (libbpf_nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_BPF_NAME]) + NET_DUMP_STR("name", " %s", + libbpf_nla_getattr_str(tb[TCA_BPF_NAME])); + if (tb[TCA_BPF_ID]) + NET_DUMP_UINT("id", " id %u", + libbpf_nla_getattr_u32(tb[TCA_BPF_ID])); + if (tb[TCA_BPF_ACT]) { + ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); + if (ret) + return ret; + } + + return 0; +} + +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind, + const char *devname, int ifindex) +{ + int ret = 0; + + if (tb[TCA_OPTIONS] && + strcmp(libbpf_nla_data(tb[TCA_KIND]), "bpf") == 0) { + NET_START_OBJECT; + if (devname[0] != '\0') + NET_DUMP_STR("devname", "%s", devname); + NET_DUMP_UINT("ifindex", "(%u)", ifindex); + NET_DUMP_STR("kind", " %s", kind); + ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); + NET_END_OBJECT_FINAL; + } + + return ret; +} diff --git a/third_party/bpftool/src/netlink_dumper.h b/third_party/bpftool/src/netlink_dumper.h new file mode 100644 index 0000000..774af6c --- /dev/null +++ b/third_party/bpftool/src/netlink_dumper.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +// Copyright (C) 2018 Facebook + +#ifndef _NETLINK_DUMPER_H_ +#define _NETLINK_DUMPER_H_ + +#define NET_START_OBJECT \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ +} + +#define NET_START_OBJECT_NESTED(name) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_object(json_wtr); \ + } else { \ + fprintf(stdout, "%s {", name); \ + } \ +} + +#define NET_START_OBJECT_NESTED2 \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ + else \ + fprintf(stdout, "{"); \ +} + +#define NET_END_OBJECT_NESTED \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stdout, "}"); \ +} + +#define NET_END_OBJECT \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ +} + +#define NET_END_OBJECT_FINAL \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stdout, "\n"); \ +} + +#define NET_START_ARRAY(name, fmt_str) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_array(json_wtr); \ + } else { \ + fprintf(stdout, fmt_str, name); \ + } \ +} + +#define NET_END_ARRAY(endstr) \ +{ \ + if (json_output) \ + jsonw_end_array(json_wtr); \ + else \ + fprintf(stdout, "%s", endstr); \ +} + +#define NET_DUMP_UINT(name, fmt_str, val) \ +{ \ + if (json_output) \ + jsonw_uint_field(json_wtr, name, val); \ + else \ + fprintf(stdout, fmt_str, val); \ +} + +#define NET_DUMP_STR(name, fmt_str, str) \ +{ \ + if (json_output) \ + jsonw_string_field(json_wtr, name, str);\ + else \ + fprintf(stdout, fmt_str, str); \ +} + +#define NET_DUMP_STR_ONLY(str) \ +{ \ + if (json_output) \ + jsonw_string(json_wtr, 
str); \ + else \ + fprintf(stdout, "%s ", str); \ +} + +#endif diff --git a/third_party/bpftool/src/perf.c b/third_party/bpftool/src/perf.c new file mode 100644 index 0000000..9174344 --- /dev/null +++ b/third_party/bpftool/src/perf.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (C) 2018 Facebook +// Author: Yonghong Song + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "main.h" + +/* 0: undecided, 1: supported, 2: not supported */ +static int perf_query_supported; +static bool has_perf_query_support(void) +{ + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + char buf[256]; + int fd; + + if (perf_query_supported) + goto out; + + fd = open("/", O_RDONLY); + if (fd < 0) { + p_err("perf_query_support: cannot open directory \"/\" (%s)", + strerror(errno)); + goto out; + } + + /* the following query will fail as no bpf attachment, + * the expected errno is ENOTSUPP + */ + errno = 0; + len = sizeof(buf); + bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id, + &fd_type, &probe_offset, &probe_addr); + + if (errno == 524 /* ENOTSUPP */) { + perf_query_supported = 1; + goto close_fd; + } + + perf_query_supported = 2; + p_err("perf_query_support: %s", strerror(errno)); + fprintf(stderr, + "HINT: non root or kernel doesn't support TASK_FD_QUERY\n"); + +close_fd: + close(fd); +out: + return perf_query_supported == 1; +} + +static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type, + char *buf, __u64 probe_offset, __u64 probe_addr) +{ + jsonw_start_object(json_wtr); + jsonw_int_field(json_wtr, "pid", pid); + jsonw_int_field(json_wtr, "fd", fd); + jsonw_uint_field(json_wtr, "prog_id", prog_id); + switch (fd_type) { + case BPF_FD_TYPE_RAW_TRACEPOINT: + jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint"); + jsonw_string_field(json_wtr, "tracepoint", buf); + break; + case BPF_FD_TYPE_TRACEPOINT: + jsonw_string_field(json_wtr, "fd_type", "tracepoint"); + jsonw_string_field(json_wtr, "tracepoint", buf); + break; + case BPF_FD_TYPE_KPROBE: + jsonw_string_field(json_wtr, "fd_type", "kprobe"); + if (buf[0] != '\0') { + jsonw_string_field(json_wtr, "func", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + } else { + jsonw_lluint_field(json_wtr, "addr", probe_addr); + } + break; + case BPF_FD_TYPE_KRETPROBE: + jsonw_string_field(json_wtr, "fd_type", "kretprobe"); + if (buf[0] != '\0') { + jsonw_string_field(json_wtr, "func", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + } else { + jsonw_lluint_field(json_wtr, "addr", probe_addr); + } + break; + case BPF_FD_TYPE_UPROBE: + jsonw_string_field(json_wtr, "fd_type", "uprobe"); + jsonw_string_field(json_wtr, "filename", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + break; + case BPF_FD_TYPE_URETPROBE: + jsonw_string_field(json_wtr, "fd_type", "uretprobe"); + jsonw_string_field(json_wtr, "filename", buf); + jsonw_lluint_field(json_wtr, "offset", probe_offset); + break; + default: + break; + } + jsonw_end_object(json_wtr); +} + +static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type, + char *buf, __u64 probe_offset, __u64 probe_addr) +{ + printf("pid %d fd %d: prog_id %u ", pid, fd, prog_id); + switch (fd_type) { + case BPF_FD_TYPE_RAW_TRACEPOINT: + printf("raw_tracepoint %s\n", buf); + break; + case BPF_FD_TYPE_TRACEPOINT: + printf("tracepoint %s\n", buf); + break; + case BPF_FD_TYPE_KPROBE: + if 
(buf[0] != '\0') + printf("kprobe func %s offset %llu\n", buf, + probe_offset); + else + printf("kprobe addr %llu\n", probe_addr); + break; + case BPF_FD_TYPE_KRETPROBE: + if (buf[0] != '\0') + printf("kretprobe func %s offset %llu\n", buf, + probe_offset); + else + printf("kretprobe addr %llu\n", probe_addr); + break; + case BPF_FD_TYPE_UPROBE: + printf("uprobe filename %s offset %llu\n", buf, probe_offset); + break; + case BPF_FD_TYPE_URETPROBE: + printf("uretprobe filename %s offset %llu\n", buf, + probe_offset); + break; + default: + break; + } +} + +static int show_proc(void) +{ + struct dirent *proc_de, *pid_fd_de; + __u64 probe_offset, probe_addr; + __u32 len, prog_id, fd_type; + DIR *proc, *pid_fd; + int err, pid, fd; + const char *pch; + char buf[4096]; + + proc = opendir("/proc"); + if (!proc) + return -1; + + while ((proc_de = readdir(proc))) { + pid = 0; + pch = proc_de->d_name; + + /* pid should be all numbers */ + while (isdigit(*pch)) { + pid = pid * 10 + *pch - '0'; + pch++; + } + if (*pch != '\0') + continue; + + err = snprintf(buf, sizeof(buf), "/proc/%s/fd", proc_de->d_name); + if (err < 0 || err >= (int)sizeof(buf)) + continue; + + pid_fd = opendir(buf); + if (!pid_fd) + continue; + + while ((pid_fd_de = readdir(pid_fd))) { + fd = 0; + pch = pid_fd_de->d_name; + + /* fd should be all numbers */ + while (isdigit(*pch)) { + fd = fd * 10 + *pch - '0'; + pch++; + } + if (*pch != '\0') + continue; + + /* query (pid, fd) for potential perf events */ + len = sizeof(buf); + err = bpf_task_fd_query(pid, fd, 0, buf, &len, + &prog_id, &fd_type, + &probe_offset, &probe_addr); + if (err < 0) + continue; + + if (json_output) + print_perf_json(pid, fd, prog_id, fd_type, buf, + probe_offset, probe_addr); + else + print_perf_plain(pid, fd, prog_id, fd_type, buf, + probe_offset, probe_addr); + } + closedir(pid_fd); + } + closedir(proc); + return 0; +} + +static int do_show(int argc, char **argv) +{ + int err; + + if (!has_perf_query_support()) + return -1; + + if (json_output) + jsonw_start_array(json_wtr); + err = show_proc(); + if (json_output) + jsonw_end_array(json_wtr); + + return err; +} + +static int do_help(int argc, char **argv) +{ + fprintf(stderr, + "Usage: %1$s %2$s { show | list }\n" + " %1$s %2$s help }\n" + "\n" + " " HELP_SPEC_OPTIONS " }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_perf(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/pids.c b/third_party/bpftool/src/pids.c new file mode 100644 index 0000000..00c77ed --- /dev/null +++ b/third_party/bpftool/src/pids.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "main.h" +#include "skeleton/pid_iter.h" + +#ifdef BPFTOOL_WITHOUT_SKELETONS + +int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) +{ + return -ENOTSUP; +} +void delete_obj_refs_table(struct hashmap *map) {} +void emit_obj_refs_plain(struct hashmap *map, __u32 id, const char *prefix) {} +void emit_obj_refs_json(struct hashmap *map, __u32 id, json_writer_t *json_writer) {} + +#else /* BPFTOOL_WITHOUT_SKELETONS */ + +#include "pid_iter.skel.h" + +static void add_ref(struct hashmap *map, struct pid_iter_entry *e) +{ + struct hashmap_entry *entry; + struct obj_refs *refs; + struct 
obj_ref *ref; + int err, i; + void *tmp; + + hashmap__for_each_key_entry(map, entry, e->id) { + refs = entry->pvalue; + + for (i = 0; i < refs->ref_cnt; i++) { + if (refs->refs[i].pid == e->pid) + return; + } + + tmp = realloc(refs->refs, (refs->ref_cnt + 1) * sizeof(*ref)); + if (!tmp) { + p_err("failed to re-alloc memory for ID %u, PID %d, COMM %s...", + e->id, e->pid, e->comm); + return; + } + refs->refs = tmp; + ref = &refs->refs[refs->ref_cnt]; + ref->pid = e->pid; + memcpy(ref->comm, e->comm, sizeof(ref->comm)); + refs->ref_cnt++; + + return; + } + + /* new ref */ + refs = calloc(1, sizeof(*refs)); + if (!refs) { + p_err("failed to alloc memory for ID %u, PID %d, COMM %s...", + e->id, e->pid, e->comm); + return; + } + + refs->refs = malloc(sizeof(*refs->refs)); + if (!refs->refs) { + free(refs); + p_err("failed to alloc memory for ID %u, PID %d, COMM %s...", + e->id, e->pid, e->comm); + return; + } + ref = &refs->refs[0]; + ref->pid = e->pid; + memcpy(ref->comm, e->comm, sizeof(ref->comm)); + refs->ref_cnt = 1; + refs->has_bpf_cookie = e->has_bpf_cookie; + refs->bpf_cookie = e->bpf_cookie; + + err = hashmap__append(map, e->id, refs); + if (err) + p_err("failed to append entry to hashmap for ID %u: %s", + e->id, strerror(errno)); +} + +static int __printf(2, 0) +libbpf_print_none(__maybe_unused enum libbpf_print_level level, + __maybe_unused const char *format, + __maybe_unused va_list args) +{ + return 0; +} + +int build_obj_refs_table(struct hashmap **map, enum bpf_obj_type type) +{ + struct pid_iter_entry *e; + char buf[4096 / sizeof(*e) * sizeof(*e)]; + struct pid_iter_bpf *skel; + int err, ret, fd = -1, i; + libbpf_print_fn_t default_print; + + *map = hashmap__new(hash_fn_for_key_as_id, equal_fn_for_key_as_id, NULL); + if (IS_ERR(*map)) { + p_err("failed to create hashmap for PID references"); + return -1; + } + set_max_rlimit(); + + skel = pid_iter_bpf__open(); + if (!skel) { + p_err("failed to open PID iterator skeleton"); + return -1; + } + + skel->rodata->obj_type = type; + + /* we don't want output polluted with libbpf errors if bpf_iter is not + * supported + */ + default_print = libbpf_set_print(libbpf_print_none); + err = pid_iter_bpf__load(skel); + libbpf_set_print(default_print); + if (err) { + /* too bad, kernel doesn't support BPF iterators yet */ + err = 0; + goto out; + } + err = pid_iter_bpf__attach(skel); + if (err) { + /* if we loaded above successfully, attach has to succeed */ + p_err("failed to attach PID iterator: %d", err); + goto out; + } + + fd = bpf_iter_create(bpf_link__fd(skel->links.iter)); + if (fd < 0) { + err = -errno; + p_err("failed to create PID iterator session: %d", err); + goto out; + } + + while (true) { + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + if (errno == EAGAIN) + continue; + err = -errno; + p_err("failed to read PID iterator output: %d", err); + goto out; + } + if (ret == 0) + break; + if (ret % sizeof(*e)) { + err = -EINVAL; + p_err("invalid PID iterator output format"); + goto out; + } + ret /= sizeof(*e); + + e = (void *)buf; + for (i = 0; i < ret; i++, e++) { + add_ref(*map, e); + } + } + err = 0; +out: + if (fd >= 0) + close(fd); + pid_iter_bpf__destroy(skel); + return err; +} + +void delete_obj_refs_table(struct hashmap *map) +{ + struct hashmap_entry *entry; + size_t bkt; + + if (!map) + return; + + hashmap__for_each_entry(map, entry, bkt) { + struct obj_refs *refs = entry->pvalue; + + free(refs->refs); + free(refs); + } + + hashmap__free(map); +} + +void emit_obj_refs_json(struct hashmap *map, __u32 id, + 
json_writer_t *json_writer) +{ + struct hashmap_entry *entry; + + if (hashmap__empty(map)) + return; + + hashmap__for_each_key_entry(map, entry, id) { + struct obj_refs *refs = entry->pvalue; + int i; + + if (refs->ref_cnt == 0) + break; + + if (refs->has_bpf_cookie) + jsonw_lluint_field(json_writer, "bpf_cookie", refs->bpf_cookie); + + jsonw_name(json_writer, "pids"); + jsonw_start_array(json_writer); + for (i = 0; i < refs->ref_cnt; i++) { + struct obj_ref *ref = &refs->refs[i]; + + jsonw_start_object(json_writer); + jsonw_int_field(json_writer, "pid", ref->pid); + jsonw_string_field(json_writer, "comm", ref->comm); + jsonw_end_object(json_writer); + } + jsonw_end_array(json_writer); + break; + } +} + +void emit_obj_refs_plain(struct hashmap *map, __u32 id, const char *prefix) +{ + struct hashmap_entry *entry; + + if (hashmap__empty(map)) + return; + + hashmap__for_each_key_entry(map, entry, id) { + struct obj_refs *refs = entry->pvalue; + int i; + + if (refs->ref_cnt == 0) + break; + + if (refs->has_bpf_cookie) + printf("\n\tbpf_cookie %llu", (unsigned long long) refs->bpf_cookie); + + printf("%s", prefix); + for (i = 0; i < refs->ref_cnt; i++) { + struct obj_ref *ref = &refs->refs[i]; + + printf("%s%s(%d)", i == 0 ? "" : ", ", ref->comm, ref->pid); + } + break; + } +} + + +#endif diff --git a/third_party/bpftool/src/prog.c b/third_party/bpftool/src/prog.c new file mode 100644 index 0000000..8443a14 --- /dev/null +++ b/third_party/bpftool/src/prog.c @@ -0,0 +1,2514 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2017-2018 Netronome Systems, Inc. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "cfg.h" +#include "main.h" +#include "xlated_dumper.h" + +#define BPF_METADATA_PREFIX "bpf_metadata_" +#define BPF_METADATA_PREFIX_LEN (sizeof(BPF_METADATA_PREFIX) - 1) + +enum dump_mode { + DUMP_JITED, + DUMP_XLATED, +}; + +static const bool attach_types[] = { + [BPF_SK_SKB_STREAM_PARSER] = true, + [BPF_SK_SKB_STREAM_VERDICT] = true, + [BPF_SK_SKB_VERDICT] = true, + [BPF_SK_MSG_VERDICT] = true, + [BPF_FLOW_DISSECTOR] = true, + [__MAX_BPF_ATTACH_TYPE] = false, +}; + +/* Textual representations traditionally used by the program and kept around + * for the sake of backwards compatibility. 
+ */ +static const char * const attach_type_strings[] = { + [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", + [BPF_SK_MSG_VERDICT] = "msg_verdict", + [__MAX_BPF_ATTACH_TYPE] = NULL, +}; + +static struct hashmap *prog_table; + +static enum bpf_attach_type parse_attach_type(const char *str) +{ + enum bpf_attach_type type; + + for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++) { + if (attach_types[type]) { + const char *attach_type_str; + + attach_type_str = libbpf_bpf_attach_type_str(type); + if (!strcmp(str, attach_type_str)) + return type; + } + + if (attach_type_strings[type] && + is_prefix(str, attach_type_strings[type])) + return type; + } + + return __MAX_BPF_ATTACH_TYPE; +} + +static int prep_prog_info(struct bpf_prog_info *const info, enum dump_mode mode, + void **info_data, size_t *const info_data_sz) +{ + struct bpf_prog_info holder = {}; + size_t needed = 0; + void *ptr; + + if (mode == DUMP_JITED) { + holder.jited_prog_len = info->jited_prog_len; + needed += info->jited_prog_len; + } else { + holder.xlated_prog_len = info->xlated_prog_len; + needed += info->xlated_prog_len; + } + + holder.nr_jited_ksyms = info->nr_jited_ksyms; + needed += info->nr_jited_ksyms * sizeof(__u64); + + holder.nr_jited_func_lens = info->nr_jited_func_lens; + needed += info->nr_jited_func_lens * sizeof(__u32); + + holder.nr_func_info = info->nr_func_info; + holder.func_info_rec_size = info->func_info_rec_size; + needed += info->nr_func_info * info->func_info_rec_size; + + holder.nr_line_info = info->nr_line_info; + holder.line_info_rec_size = info->line_info_rec_size; + needed += info->nr_line_info * info->line_info_rec_size; + + holder.nr_jited_line_info = info->nr_jited_line_info; + holder.jited_line_info_rec_size = info->jited_line_info_rec_size; + needed += info->nr_jited_line_info * info->jited_line_info_rec_size; + + if (needed > *info_data_sz) { + ptr = realloc(*info_data, needed); + if (!ptr) + return -1; + + *info_data = ptr; + *info_data_sz = needed; + } + ptr = *info_data; + + if (mode == DUMP_JITED) { + holder.jited_prog_insns = ptr_to_u64(ptr); + ptr += holder.jited_prog_len; + } else { + holder.xlated_prog_insns = ptr_to_u64(ptr); + ptr += holder.xlated_prog_len; + } + + holder.jited_ksyms = ptr_to_u64(ptr); + ptr += holder.nr_jited_ksyms * sizeof(__u64); + + holder.jited_func_lens = ptr_to_u64(ptr); + ptr += holder.nr_jited_func_lens * sizeof(__u32); + + holder.func_info = ptr_to_u64(ptr); + ptr += holder.nr_func_info * holder.func_info_rec_size; + + holder.line_info = ptr_to_u64(ptr); + ptr += holder.nr_line_info * holder.line_info_rec_size; + + holder.jited_line_info = ptr_to_u64(ptr); + ptr += holder.nr_jited_line_info * holder.jited_line_info_rec_size; + + *info = holder; + return 0; +} + +static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) +{ + struct timespec real_time_ts, boot_time_ts; + time_t wallclock_secs; + struct tm load_tm; + + buf[--size] = '\0'; + + if (clock_gettime(CLOCK_REALTIME, &real_time_ts) || + clock_gettime(CLOCK_BOOTTIME, &boot_time_ts)) { + perror("Can't read clocks"); + snprintf(buf, size, "%llu", nsecs / 1000000000); + return; + } + + wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + + (real_time_ts.tv_nsec - boot_time_ts.tv_nsec + nsecs) / + 1000000000; + + + if (!localtime_r(&wallclock_secs, &load_tm)) { + snprintf(buf, size, "%llu", nsecs / 1000000000); + return; + } + + if (json_output) + strftime(buf, size, "%s", &load_tm); + else + 
strftime(buf, size, "%FT%T%z", &load_tm); +} + +static void show_prog_maps(int fd, __u32 num_maps) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + __u32 map_ids[num_maps]; + unsigned int i; + int err; + + info.nr_map_ids = num_maps; + info.map_ids = ptr_to_u64(map_ids); + + err = bpf_prog_get_info_by_fd(fd, &info, &len); + if (err || !info.nr_map_ids) + return; + + if (json_output) { + jsonw_name(json_wtr, "map_ids"); + jsonw_start_array(json_wtr); + for (i = 0; i < info.nr_map_ids; i++) + jsonw_uint(json_wtr, map_ids[i]); + jsonw_end_array(json_wtr); + } else { + printf(" map_ids "); + for (i = 0; i < info.nr_map_ids; i++) + printf("%u%s", map_ids[i], + i == info.nr_map_ids - 1 ? "" : ","); + } +} + +static void *find_metadata(int prog_fd, struct bpf_map_info *map_info) +{ + struct bpf_prog_info prog_info; + __u32 prog_info_len; + __u32 map_info_len; + void *value = NULL; + __u32 *map_ids; + int nr_maps; + int key = 0; + int map_fd; + int ret; + __u32 i; + + memset(&prog_info, 0, sizeof(prog_info)); + prog_info_len = sizeof(prog_info); + ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + return NULL; + + if (!prog_info.nr_map_ids) + return NULL; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32)); + if (!map_ids) + return NULL; + + nr_maps = prog_info.nr_map_ids; + memset(&prog_info, 0, sizeof(prog_info)); + prog_info.nr_map_ids = nr_maps; + prog_info.map_ids = ptr_to_u64(map_ids); + prog_info_len = sizeof(prog_info); + + ret = bpf_prog_get_info_by_fd(prog_fd, &prog_info, &prog_info_len); + if (ret) + goto free_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + map_fd = bpf_map_get_fd_by_id(map_ids[i]); + if (map_fd < 0) + goto free_map_ids; + + memset(map_info, 0, sizeof(*map_info)); + map_info_len = sizeof(*map_info); + ret = bpf_map_get_info_by_fd(map_fd, map_info, &map_info_len); + if (ret < 0) { + close(map_fd); + goto free_map_ids; + } + + if (map_info->type != BPF_MAP_TYPE_ARRAY || + map_info->key_size != sizeof(int) || + map_info->max_entries != 1 || + !map_info->btf_value_type_id || + !strstr(map_info->name, ".rodata")) { + close(map_fd); + continue; + } + + value = malloc(map_info->value_size); + if (!value) { + close(map_fd); + goto free_map_ids; + } + + if (bpf_map_lookup_elem(map_fd, &key, value)) { + close(map_fd); + free(value); + value = NULL; + goto free_map_ids; + } + + close(map_fd); + break; + } + +free_map_ids: + free(map_ids); + return value; +} + +static bool has_metadata_prefix(const char *s) +{ + return strncmp(s, BPF_METADATA_PREFIX, BPF_METADATA_PREFIX_LEN) == 0; +} + +static void show_prog_metadata(int fd, __u32 num_maps) +{ + const struct btf_type *t_datasec, *t_var; + struct bpf_map_info map_info; + struct btf_var_secinfo *vsi; + bool printed_header = false; + unsigned int i, vlen; + void *value = NULL; + const char *name; + struct btf *btf; + int err; + + if (!num_maps) + return; + + memset(&map_info, 0, sizeof(map_info)); + value = find_metadata(fd, &map_info); + if (!value) + return; + + btf = btf__load_from_kernel_by_id(map_info.btf_id); + if (!btf) + goto out_free; + + t_datasec = btf__type_by_id(btf, map_info.btf_value_type_id); + if (!btf_is_datasec(t_datasec)) + goto out_free; + + vlen = btf_vlen(t_datasec); + vsi = btf_var_secinfos(t_datasec); + + /* We don't proceed to check the kinds of the elements of the DATASEC. + * The verifier enforces them to be BTF_KIND_VAR. 
+ */ + + if (json_output) { + struct btf_dumper d = { + .btf = btf, + .jw = json_wtr, + .is_plain_text = false, + }; + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + jsonw_name(json_wtr, "metadata"); + jsonw_start_object(json_wtr); + printed_header = true; + } + + jsonw_name(json_wtr, name + BPF_METADATA_PREFIX_LEN); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_end_object(json_wtr); + } else { + json_writer_t *btf_wtr; + struct btf_dumper d = { + .btf = btf, + .is_plain_text = true, + }; + + for (i = 0; i < vlen; i++, vsi++) { + t_var = btf__type_by_id(btf, vsi->type); + name = btf__name_by_offset(btf, t_var->name_off); + + if (!has_metadata_prefix(name)) + continue; + + if (!printed_header) { + printf("\tmetadata:"); + + btf_wtr = jsonw_new(stdout); + if (!btf_wtr) { + p_err("jsonw alloc failed"); + goto out_free; + } + d.jw = btf_wtr, + + printed_header = true; + } + + printf("\n\t\t%s = ", name + BPF_METADATA_PREFIX_LEN); + + jsonw_reset(btf_wtr); + err = btf_dumper_type(&d, t_var->type, value + vsi->offset); + if (err) { + p_err("btf dump failed: %d", err); + break; + } + } + if (printed_header) + jsonw_destroy(&btf_wtr); + } + +out_free: + btf__free(btf); + free(value); +} + +static void print_prog_header_json(struct bpf_prog_info *info, int fd) +{ + const char *prog_type_str; + char prog_name[MAX_PROG_FULL_NAME]; + + jsonw_uint_field(json_wtr, "id", info->id); + prog_type_str = libbpf_bpf_prog_type_str(info->type); + + if (prog_type_str) + jsonw_string_field(json_wtr, "type", prog_type_str); + else + jsonw_uint_field(json_wtr, "type", info->type); + + if (*info->name) { + get_prog_full_name(info, fd, prog_name, sizeof(prog_name)); + jsonw_string_field(json_wtr, "name", prog_name); + } + + jsonw_name(json_wtr, "tag"); + jsonw_printf(json_wtr, "\"" BPF_TAG_FMT "\"", + info->tag[0], info->tag[1], info->tag[2], info->tag[3], + info->tag[4], info->tag[5], info->tag[6], info->tag[7]); + + jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible); + if (info->run_time_ns) { + jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); + jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); + } + if (info->recursion_misses) + jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); +} + +static void print_prog_json(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + jsonw_start_object(json_wtr); + print_prog_header_json(info, fd); + print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); + + if (info->load_time) { + char buf[32]; + + print_boot_time(info->load_time, buf, sizeof(buf)); + + /* Piggy back on load_time, since 0 uid is a valid one */ + jsonw_name(json_wtr, "loaded_at"); + jsonw_printf(json_wtr, "%s", buf); + jsonw_uint_field(json_wtr, "uid", info->created_by_uid); + } + + jsonw_uint_field(json_wtr, "bytes_xlated", info->xlated_prog_len); + + if (info->jited_prog_len) { + jsonw_bool_field(json_wtr, "jited", true); + jsonw_uint_field(json_wtr, "bytes_jited", info->jited_prog_len); + } else { + jsonw_bool_field(json_wtr, "jited", false); + } + + memlock = get_fdinfo(fd, "memlock"); + if (memlock) + jsonw_int_field(json_wtr, "bytes_memlock", atoll(memlock)); + free(memlock); + + if (info->nr_map_ids) + show_prog_maps(fd, info->nr_map_ids); + + if (info->btf_id) + 
jsonw_int_field(json_wtr, "btf_id", info->btf_id); + + if (!hashmap__empty(prog_table)) { + struct hashmap_entry *entry; + + jsonw_name(json_wtr, "pinned"); + jsonw_start_array(json_wtr); + hashmap__for_each_key_entry(prog_table, entry, info->id) + jsonw_string(json_wtr, entry->pvalue); + jsonw_end_array(json_wtr); + } + + emit_obj_refs_json(refs_table, info->id, json_wtr); + + show_prog_metadata(fd, info->nr_map_ids); + + jsonw_end_object(json_wtr); +} + +static void print_prog_header_plain(struct bpf_prog_info *info, int fd) +{ + const char *prog_type_str; + char prog_name[MAX_PROG_FULL_NAME]; + + printf("%u: ", info->id); + prog_type_str = libbpf_bpf_prog_type_str(info->type); + if (prog_type_str) + printf("%s ", prog_type_str); + else + printf("type %u ", info->type); + + if (*info->name) { + get_prog_full_name(info, fd, prog_name, sizeof(prog_name)); + printf("name %s ", prog_name); + } + + printf("tag "); + fprint_hex(stdout, info->tag, BPF_TAG_SIZE, ""); + print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); + printf("%s", info->gpl_compatible ? " gpl" : ""); + if (info->run_time_ns) + printf(" run_time_ns %lld run_cnt %lld", + info->run_time_ns, info->run_cnt); + if (info->recursion_misses) + printf(" recursion_misses %lld", info->recursion_misses); + printf("\n"); +} + +static void print_prog_plain(struct bpf_prog_info *info, int fd) +{ + char *memlock; + + print_prog_header_plain(info, fd); + + if (info->load_time) { + char buf[32]; + + print_boot_time(info->load_time, buf, sizeof(buf)); + + /* Piggy back on load_time, since 0 uid is a valid one */ + printf("\tloaded_at %s uid %u\n", buf, info->created_by_uid); + } + + printf("\txlated %uB", info->xlated_prog_len); + + if (info->jited_prog_len) + printf(" jited %uB", info->jited_prog_len); + else + printf(" not jited"); + + memlock = get_fdinfo(fd, "memlock"); + if (memlock) + printf(" memlock %sB", memlock); + free(memlock); + + if (info->nr_map_ids) + show_prog_maps(fd, info->nr_map_ids); + + if (!hashmap__empty(prog_table)) { + struct hashmap_entry *entry; + + hashmap__for_each_key_entry(prog_table, entry, info->id) + printf("\n\tpinned %s", (char *)entry->pvalue); + } + + if (info->btf_id) + printf("\n\tbtf_id %d", info->btf_id); + + emit_obj_refs_plain(refs_table, info->id, "\n\tpids "); + + printf("\n"); + + show_prog_metadata(fd, info->nr_map_ids); +} + +static int show_prog(int fd) +{ + struct bpf_prog_info info = {}; + __u32 len = sizeof(info); + int err; + + err = bpf_prog_get_info_by_fd(fd, &info, &len); + if (err) { + p_err("can't get prog info: %s", strerror(errno)); + return -1; + } + + if (json_output) + print_prog_json(&info, fd); + else + print_prog_plain(&info, fd); + + return 0; +} + +static int do_show_subset(int argc, char **argv) +{ + int *fds = NULL; + int nb_fds, i; + int err = -1; + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + err = show_prog(fds[i]); + if (err) { + for (; i < nb_fds; i++) + close(fds[i]); + break; + } + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_free: + free(fds); + return err; +} + +static int do_show(int argc, char **argv) +{ + __u32 id = 0; + int err; + int fd; + + if (show_pinned) { + prog_table = hashmap__new(hash_fn_for_key_as_id, + equal_fn_for_key_as_id, NULL); + 
if (IS_ERR(prog_table)) { + p_err("failed to create hashmap for pinned paths"); + return -1; + } + build_pinned_obj_table(prog_table, BPF_OBJ_PROG); + } + build_obj_refs_table(&refs_table, BPF_OBJ_PROG); + + if (argc == 2) + return do_show_subset(argc, argv); + + if (argc) + return BAD_ARG(); + + if (json_output) + jsonw_start_array(json_wtr); + while (true) { + err = bpf_prog_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + p_err("can't get next program: %s%s", strerror(errno), + errno == EINVAL ? " -- kernel too old?" : ""); + err = -1; + break; + } + + fd = bpf_prog_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get prog by id (%u): %s", + id, strerror(errno)); + err = -1; + break; + } + + err = show_prog(fd); + close(fd); + if (err) + break; + } + + if (json_output) + jsonw_end_array(json_wtr); + + delete_obj_refs_table(refs_table); + + if (show_pinned) + delete_pinned_obj_table(prog_table); + + return err; +} + +static int +prog_dump(struct bpf_prog_info *info, enum dump_mode mode, + char *filepath, bool opcodes, bool visual, bool linum) +{ + struct bpf_prog_linfo *prog_linfo = NULL; + const char *disasm_opt = NULL; + struct dump_data dd = {}; + void *func_info = NULL; + struct btf *btf = NULL; + char func_sig[1024]; + unsigned char *buf; + __u32 member_len; + int fd, err = -1; + ssize_t n; + + if (mode == DUMP_JITED) { + if (info->jited_prog_len == 0 || !info->jited_prog_insns) { + p_info("no instructions returned"); + return -1; + } + buf = u64_to_ptr(info->jited_prog_insns); + member_len = info->jited_prog_len; + } else { /* DUMP_XLATED */ + if (info->xlated_prog_len == 0 || !info->xlated_prog_insns) { + p_err("error retrieving insn dump: kernel.kptr_restrict set?"); + return -1; + } + buf = u64_to_ptr(info->xlated_prog_insns); + member_len = info->xlated_prog_len; + } + + if (info->btf_id) { + btf = btf__load_from_kernel_by_id(info->btf_id); + if (!btf) { + p_err("failed to get btf"); + return -1; + } + } + + func_info = u64_to_ptr(info->func_info); + + if (info->nr_line_info) { + prog_linfo = bpf_prog_linfo__new(info); + if (!prog_linfo) + p_info("error in processing bpf_line_info. continue without it."); + } + + if (filepath) { + fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + p_err("can't open file %s: %s", filepath, + strerror(errno)); + goto exit_free; + } + + n = write(fd, buf, member_len); + close(fd); + if (n != (ssize_t)member_len) { + p_err("error writing output file: %s", + n < 0 ? 
strerror(errno) : "short write"); + goto exit_free; + } + + if (json_output) + jsonw_null(json_wtr); + } else if (mode == DUMP_JITED) { + const char *name = NULL; + + if (info->ifindex) { + name = ifindex_to_arch(info->ifindex, info->netns_dev, + info->netns_ino, &disasm_opt); + if (!name) + goto exit_free; + } + + if (info->nr_jited_func_lens && info->jited_func_lens) { + struct kernel_sym *sym = NULL; + struct bpf_func_info *record; + char sym_name[SYM_MAX_NAME]; + unsigned char *img = buf; + __u64 *ksyms = NULL; + __u32 *lens; + __u32 i; + if (info->nr_jited_ksyms) { + kernel_syms_load(&dd); + ksyms = u64_to_ptr(info->jited_ksyms); + } + + if (json_output) + jsonw_start_array(json_wtr); + + lens = u64_to_ptr(info->jited_func_lens); + for (i = 0; i < info->nr_jited_func_lens; i++) { + if (ksyms) { + sym = kernel_syms_search(&dd, ksyms[i]); + if (sym) + sprintf(sym_name, "%s", sym->name); + else + sprintf(sym_name, "0x%016llx", ksyms[i]); + } else { + strcpy(sym_name, "unknown"); + } + + if (func_info) { + record = func_info + i * info->func_info_rec_size; + btf_dumper_type_only(btf, record->type_id, + func_sig, + sizeof(func_sig)); + } + + if (json_output) { + jsonw_start_object(json_wtr); + if (func_info && func_sig[0] != '\0') { + jsonw_name(json_wtr, "proto"); + jsonw_string(json_wtr, func_sig); + } + jsonw_name(json_wtr, "name"); + jsonw_string(json_wtr, sym_name); + jsonw_name(json_wtr, "insns"); + } else { + if (func_info && func_sig[0] != '\0') + printf("%s:\n", func_sig); + printf("%s:\n", sym_name); + } + + if (disasm_print_insn(img, lens[i], opcodes, + name, disasm_opt, btf, + prog_linfo, ksyms[i], i, + linum)) + goto exit_free; + + img += lens[i]; + + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + } + + if (json_output) + jsonw_end_array(json_wtr); + } else { + if (disasm_print_insn(buf, member_len, opcodes, name, + disasm_opt, btf, NULL, 0, 0, + false)) + goto exit_free; + } + } else { + kernel_syms_load(&dd); + dd.nr_jited_ksyms = info->nr_jited_ksyms; + dd.jited_ksyms = u64_to_ptr(info->jited_ksyms); + dd.btf = btf; + dd.func_info = func_info; + dd.finfo_rec_size = info->func_info_rec_size; + dd.prog_linfo = prog_linfo; + + if (json_output) + dump_xlated_json(&dd, buf, member_len, opcodes, linum); + else if (visual) + dump_xlated_cfg(&dd, buf, member_len, opcodes, linum); + else + dump_xlated_plain(&dd, buf, member_len, opcodes, linum); + kernel_syms_destroy(&dd); + } + + err = 0; + +exit_free: + btf__free(btf); + bpf_prog_linfo__free(prog_linfo); + return err; +} + +static int do_dump(int argc, char **argv) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + size_t info_data_sz = 0; + void *info_data = NULL; + char *filepath = NULL; + bool opcodes = false; + bool visual = false; + enum dump_mode mode; + bool linum = false; + int nb_fds, i = 0; + int *fds = NULL; + int err = -1; + + if (is_prefix(*argv, "jited")) { + if (disasm_init()) + return -1; + mode = DUMP_JITED; + } else if (is_prefix(*argv, "xlated")) { + mode = DUMP_XLATED; + } else { + p_err("expected 'xlated' or 'jited', got: %s", *argv); + return -1; + } + NEXT_ARG(); + + if (argc < 2) + usage(); + + fds = malloc(sizeof(int)); + if (!fds) { + p_err("mem alloc failed"); + return -1; + } + nb_fds = prog_parse_fds(&argc, &argv, &fds); + if (nb_fds < 1) + goto exit_free; + + while (argc) { + if (is_prefix(*argv, "file")) { + NEXT_ARG(); + if (!argc) { + p_err("expected file path"); + goto exit_close; + } + if (nb_fds > 1) { + p_err("several programs matched"); + goto 
exit_close; + } + + filepath = *argv; + NEXT_ARG(); + } else if (is_prefix(*argv, "opcodes")) { + opcodes = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "visual")) { + if (nb_fds > 1) { + p_err("several programs matched"); + goto exit_close; + } + + visual = true; + NEXT_ARG(); + } else if (is_prefix(*argv, "linum")) { + linum = true; + NEXT_ARG(); + } else { + usage(); + goto exit_close; + } + } + + if (filepath && (opcodes || visual || linum)) { + p_err("'file' is not compatible with 'opcodes', 'visual', or 'linum'"); + goto exit_close; + } + if (json_output && visual) { + p_err("'visual' is not compatible with JSON output"); + goto exit_close; + } + + if (json_output && nb_fds > 1) + jsonw_start_array(json_wtr); /* root array */ + for (i = 0; i < nb_fds; i++) { + memset(&info, 0, sizeof(info)); + + err = bpf_prog_get_info_by_fd(fds[i], &info, &info_len); + if (err) { + p_err("can't get prog info: %s", strerror(errno)); + break; + } + + err = prep_prog_info(&info, mode, &info_data, &info_data_sz); + if (err) { + p_err("can't grow prog info_data"); + break; + } + + err = bpf_prog_get_info_by_fd(fds[i], &info, &info_len); + if (err) { + p_err("can't get prog info: %s", strerror(errno)); + break; + } + + if (json_output && nb_fds > 1) { + jsonw_start_object(json_wtr); /* prog object */ + print_prog_header_json(&info, fds[i]); + jsonw_name(json_wtr, "insns"); + } else if (nb_fds > 1) { + print_prog_header_plain(&info, fds[i]); + } + + err = prog_dump(&info, mode, filepath, opcodes, visual, linum); + + if (json_output && nb_fds > 1) + jsonw_end_object(json_wtr); /* prog object */ + else if (i != nb_fds - 1 && nb_fds > 1) + printf("\n"); + + if (err) + break; + close(fds[i]); + } + if (json_output && nb_fds > 1) + jsonw_end_array(json_wtr); /* root array */ + +exit_close: + for (; i < nb_fds; i++) + close(fds[i]); +exit_free: + free(info_data); + free(fds); + return err; +} + +static int do_pin(int argc, char **argv) +{ + int err; + + err = do_pin_any(argc, argv, prog_parse_fd); + if (!err && json_output) + jsonw_null(json_wtr); + return err; +} + +struct map_replace { + int idx; + int fd; + char *name; +}; + +static int map_replace_compar(const void *p1, const void *p2) +{ + const struct map_replace *a = p1, *b = p2; + + return a->idx - b->idx; +} + +static int parse_attach_detach_args(int argc, char **argv, int *progfd, + enum bpf_attach_type *attach_type, + int *mapfd) +{ + if (!REQ_ARGS(3)) + return -EINVAL; + + *progfd = prog_parse_fd(&argc, &argv); + if (*progfd < 0) + return *progfd; + + *attach_type = parse_attach_type(*argv); + if (*attach_type == __MAX_BPF_ATTACH_TYPE) { + p_err("invalid attach/detach type"); + return -EINVAL; + } + + if (*attach_type == BPF_FLOW_DISSECTOR) { + *mapfd = 0; + return 0; + } + + NEXT_ARG(); + if (!REQ_ARGS(2)) + return -EINVAL; + + *mapfd = map_parse_fd(&argc, &argv); + if (*mapfd < 0) + return *mapfd; + + return 0; +} + +static int do_attach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, progfd; + int mapfd; + + err = parse_attach_detach_args(argc, argv, + &progfd, &attach_type, &mapfd); + if (err) + return err; + + err = bpf_prog_attach(progfd, mapfd, attach_type, 0); + if (err) { + p_err("failed prog attach to map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + +static int do_detach(int argc, char **argv) +{ + enum bpf_attach_type attach_type; + int err, progfd; + int mapfd; + + err = parse_attach_detach_args(argc, argv, + &progfd, &attach_type, &mapfd); + if (err) + return err; + 
+ err = bpf_prog_detach2(progfd, mapfd, attach_type); + if (err) { + p_err("failed prog detach from map"); + return -EINVAL; + } + + if (json_output) + jsonw_null(json_wtr); + return 0; +} + +static int check_single_stdin(char *file_data_in, char *file_ctx_in) +{ + if (file_data_in && file_ctx_in && + !strcmp(file_data_in, "-") && !strcmp(file_ctx_in, "-")) { + p_err("cannot use standard input for both data_in and ctx_in"); + return -1; + } + + return 0; +} + +static int get_run_data(const char *fname, void **data_ptr, unsigned int *size) +{ + size_t block_size = 256; + size_t buf_size = block_size; + size_t nb_read = 0; + void *tmp; + FILE *f; + + if (!fname) { + *data_ptr = NULL; + *size = 0; + return 0; + } + + if (!strcmp(fname, "-")) + f = stdin; + else + f = fopen(fname, "r"); + if (!f) { + p_err("failed to open %s: %s", fname, strerror(errno)); + return -1; + } + + *data_ptr = malloc(block_size); + if (!*data_ptr) { + p_err("failed to allocate memory for data_in/ctx_in: %s", + strerror(errno)); + goto err_fclose; + } + + while ((nb_read += fread(*data_ptr + nb_read, 1, block_size, f))) { + if (feof(f)) + break; + if (ferror(f)) { + p_err("failed to read data_in/ctx_in from %s: %s", + fname, strerror(errno)); + goto err_free; + } + if (nb_read > buf_size - block_size) { + if (buf_size == UINT32_MAX) { + p_err("data_in/ctx_in is too long (max: %d)", + UINT32_MAX); + goto err_free; + } + /* No space for fread()-ing next chunk; realloc() */ + buf_size *= 2; + tmp = realloc(*data_ptr, buf_size); + if (!tmp) { + p_err("failed to reallocate data_in/ctx_in: %s", + strerror(errno)); + goto err_free; + } + *data_ptr = tmp; + } + } + if (f != stdin) + fclose(f); + + *size = nb_read; + return 0; + +err_free: + free(*data_ptr); + *data_ptr = NULL; +err_fclose: + if (f != stdin) + fclose(f); + return -1; +} + +static void hex_print(void *data, unsigned int size, FILE *f) +{ + size_t i, j; + char c; + + for (i = 0; i < size; i += 16) { + /* Row offset */ + fprintf(f, "%07zx\t", i); + + /* Hexadecimal values */ + for (j = i; j < i + 16 && j < size; j++) + fprintf(f, "%02x%s", *(uint8_t *)(data + j), + j % 2 ? " " : ""); + for (; j < i + 16; j++) + fprintf(f, " %s", j % 2 ? " " : ""); + + /* ASCII values (if relevant), '.' otherwise */ + fprintf(f, "| "); + for (j = i; j < i + 16 && j < size; j++) { + c = *(char *)(data + j); + if (c < ' ' || c > '~') + c = '.'; + fprintf(f, "%c%s", c, j == i + 7 ? 
" " : ""); + } + + fprintf(f, "\n"); + } +} + +static int +print_run_output(void *data, unsigned int size, const char *fname, + const char *json_key) +{ + size_t nb_written; + FILE *f; + + if (!fname) + return 0; + + if (!strcmp(fname, "-")) { + f = stdout; + if (json_output) { + jsonw_name(json_wtr, json_key); + print_data_json(data, size); + } else { + hex_print(data, size, f); + } + return 0; + } + + f = fopen(fname, "w"); + if (!f) { + p_err("failed to open %s: %s", fname, strerror(errno)); + return -1; + } + + nb_written = fwrite(data, 1, size, f); + fclose(f); + if (nb_written != size) { + p_err("failed to write output data/ctx: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int alloc_run_data(void **data_ptr, unsigned int size_out) +{ + *data_ptr = calloc(size_out, 1); + if (!*data_ptr) { + p_err("failed to allocate memory for output data/ctx: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int do_run(int argc, char **argv) +{ + char *data_fname_in = NULL, *data_fname_out = NULL; + char *ctx_fname_in = NULL, *ctx_fname_out = NULL; + const unsigned int default_size = SZ_32K; + void *data_in = NULL, *data_out = NULL; + void *ctx_in = NULL, *ctx_out = NULL; + unsigned int repeat = 1; + int fd, err; + LIBBPF_OPTS(bpf_test_run_opts, test_attr); + + if (!REQ_ARGS(4)) + return -1; + + fd = prog_parse_fd(&argc, &argv); + if (fd < 0) + return -1; + + while (argc) { + if (detect_common_prefix(*argv, "data_in", "data_out", + "data_size_out", NULL)) + return -1; + if (detect_common_prefix(*argv, "ctx_in", "ctx_out", + "ctx_size_out", NULL)) + return -1; + + if (is_prefix(*argv, "data_in")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + data_fname_in = GET_ARG(); + if (check_single_stdin(data_fname_in, ctx_fname_in)) + return -1; + } else if (is_prefix(*argv, "data_out")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + data_fname_out = GET_ARG(); + } else if (is_prefix(*argv, "data_size_out")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + test_attr.data_size_out = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as output data size", + *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "ctx_in")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + ctx_fname_in = GET_ARG(); + if (check_single_stdin(data_fname_in, ctx_fname_in)) + return -1; + } else if (is_prefix(*argv, "ctx_out")) { + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + ctx_fname_out = GET_ARG(); + } else if (is_prefix(*argv, "ctx_size_out")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + test_attr.ctx_size_out = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as output context size", + *argv); + return -1; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "repeat")) { + char *endptr; + + NEXT_ARG(); + if (!REQ_ARGS(1)) + return -1; + + repeat = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as repeat number", + *argv); + return -1; + } + NEXT_ARG(); + } else { + p_err("expected no more arguments, 'data_in', 'data_out', 'data_size_out', 'ctx_in', 'ctx_out', 'ctx_size_out' or 'repeat', got: '%s'?", + *argv); + return -1; + } + } + + err = get_run_data(data_fname_in, &data_in, &test_attr.data_size_in); + if (err) + return -1; + + if (data_in) { + if (!test_attr.data_size_out) + test_attr.data_size_out = default_size; + err = alloc_run_data(&data_out, test_attr.data_size_out); + if (err) + goto free_data_in; + } + + err = get_run_data(ctx_fname_in, &ctx_in, 
&test_attr.ctx_size_in); + if (err) + goto free_data_out; + + if (ctx_in) { + if (!test_attr.ctx_size_out) + test_attr.ctx_size_out = default_size; + err = alloc_run_data(&ctx_out, test_attr.ctx_size_out); + if (err) + goto free_ctx_in; + } + + test_attr.repeat = repeat; + test_attr.data_in = data_in; + test_attr.data_out = data_out; + test_attr.ctx_in = ctx_in; + test_attr.ctx_out = ctx_out; + + err = bpf_prog_test_run_opts(fd, &test_attr); + if (err) { + p_err("failed to run program: %s", strerror(errno)); + goto free_ctx_out; + } + + err = 0; + + if (json_output) + jsonw_start_object(json_wtr); /* root */ + + /* Do not exit on errors occurring when printing output data/context, + * we still want to print return value and duration for program run. + */ + if (test_attr.data_size_out) + err += print_run_output(test_attr.data_out, + test_attr.data_size_out, + data_fname_out, "data_out"); + if (test_attr.ctx_size_out) + err += print_run_output(test_attr.ctx_out, + test_attr.ctx_size_out, + ctx_fname_out, "ctx_out"); + + if (json_output) { + jsonw_uint_field(json_wtr, "retval", test_attr.retval); + jsonw_uint_field(json_wtr, "duration", test_attr.duration); + jsonw_end_object(json_wtr); /* root */ + } else { + fprintf(stdout, "Return value: %u, duration%s: %uns\n", + test_attr.retval, + repeat > 1 ? " (average)" : "", test_attr.duration); + } + +free_ctx_out: + free(ctx_out); +free_ctx_in: + free(ctx_in); +free_data_out: + free(data_out); +free_data_in: + free(data_in); + + return err; +} + +static int +get_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type) +{ + libbpf_print_fn_t print_backup; + int ret; + + ret = libbpf_prog_type_by_name(name, prog_type, expected_attach_type); + if (!ret) + return ret; + + /* libbpf_prog_type_by_name() failed, let's re-run with debug level */ + print_backup = libbpf_set_print(print_all_levels); + ret = libbpf_prog_type_by_name(name, prog_type, expected_attach_type); + libbpf_set_print(print_backup); + + return ret; +} + +static int +auto_attach_program(struct bpf_program *prog, const char *path) +{ + struct bpf_link *link; + int err; + + link = bpf_program__attach(prog); + if (!link) { + p_info("Program %s does not support autoattach, falling back to pinning", + bpf_program__name(prog)); + return bpf_obj_pin(bpf_program__fd(prog), path); + } + + err = bpf_link__pin(link, path); + bpf_link__destroy(link); + return err; +} + +static int +auto_attach_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + char buf[PATH_MAX]; + int err; + + bpf_object__for_each_program(prog, obj) { + err = pathname_concat(buf, sizeof(buf), path, bpf_program__name(prog)); + if (err) + goto err_unpin_programs; + + err = auto_attach_program(prog, buf); + if (err) + goto err_unpin_programs; + } + + return 0; + +err_unpin_programs: + while ((prog = bpf_object__prev_program(obj, prog))) { + if (pathname_concat(buf, sizeof(buf), path, bpf_program__name(prog))) + continue; + + bpf_program__unpin(prog, buf); + } + + return err; +} + +static int load_with_options(int argc, char **argv, bool first_prog_only) +{ + enum bpf_prog_type common_prog_type = BPF_PROG_TYPE_UNSPEC; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts, + .relaxed_maps = relaxed_maps, + ); + enum bpf_attach_type expected_attach_type; + struct map_replace *map_replace = NULL; + struct bpf_program *prog = NULL, *pos; + unsigned int old_map_fds = 0; + const char *pinmaps = NULL; + __u32 xdpmeta_ifindex = 0; + __u32 offload_ifindex 
= 0; + bool auto_attach = false; + struct bpf_object *obj; + struct bpf_map *map; + const char *pinfile; + unsigned int i, j; + const char *file; + int idx, err; + + + if (!REQ_ARGS(2)) + return -1; + file = GET_ARG(); + pinfile = GET_ARG(); + + while (argc) { + if (is_prefix(*argv, "type")) { + NEXT_ARG(); + + if (common_prog_type != BPF_PROG_TYPE_UNSPEC) { + p_err("program type already specified"); + goto err_free_reuse_maps; + } + if (!REQ_ARGS(1)) + goto err_free_reuse_maps; + + err = libbpf_prog_type_by_name(*argv, &common_prog_type, + &expected_attach_type); + if (err < 0) { + /* Put a '/' at the end of type to appease libbpf */ + char *type = malloc(strlen(*argv) + 2); + + if (!type) { + p_err("mem alloc failed"); + goto err_free_reuse_maps; + } + *type = 0; + strcat(type, *argv); + strcat(type, "/"); + + err = get_prog_type_by_name(type, &common_prog_type, + &expected_attach_type); + free(type); + if (err < 0) + goto err_free_reuse_maps; + } + + NEXT_ARG(); + } else if (is_prefix(*argv, "map")) { + void *new_map_replace; + char *endptr, *name; + int fd; + + NEXT_ARG(); + + if (!REQ_ARGS(4)) + goto err_free_reuse_maps; + + if (is_prefix(*argv, "idx")) { + NEXT_ARG(); + + idx = strtoul(*argv, &endptr, 0); + if (*endptr) { + p_err("can't parse %s as IDX", *argv); + goto err_free_reuse_maps; + } + name = NULL; + } else if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + name = *argv; + idx = -1; + } else { + p_err("expected 'idx' or 'name', got: '%s'?", + *argv); + goto err_free_reuse_maps; + } + NEXT_ARG(); + + fd = map_parse_fd(&argc, &argv); + if (fd < 0) + goto err_free_reuse_maps; + + new_map_replace = libbpf_reallocarray(map_replace, + old_map_fds + 1, + sizeof(*map_replace)); + if (!new_map_replace) { + p_err("mem alloc failed"); + goto err_free_reuse_maps; + } + map_replace = new_map_replace; + + map_replace[old_map_fds].idx = idx; + map_replace[old_map_fds].name = name; + map_replace[old_map_fds].fd = fd; + old_map_fds++; + } else if (is_prefix(*argv, "dev")) { + p_info("Warning: 'bpftool prog load [...] 
dev ' syntax is deprecated.\n" + "Going further, please use 'offload_dev ' to offload program to device.\n" + "For applications using XDP hints only, use 'xdpmeta_dev '."); + goto offload_dev; + } else if (is_prefix(*argv, "offload_dev")) { +offload_dev: + NEXT_ARG(); + + if (offload_ifindex) { + p_err("offload_dev already specified"); + goto err_free_reuse_maps; + } else if (xdpmeta_ifindex) { + p_err("xdpmeta_dev and offload_dev are mutually exclusive"); + goto err_free_reuse_maps; + } + if (!REQ_ARGS(1)) + goto err_free_reuse_maps; + + offload_ifindex = if_nametoindex(*argv); + if (!offload_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + goto err_free_reuse_maps; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "xdpmeta_dev")) { + NEXT_ARG(); + + if (xdpmeta_ifindex) { + p_err("xdpmeta_dev already specified"); + goto err_free_reuse_maps; + } else if (offload_ifindex) { + p_err("xdpmeta_dev and offload_dev are mutually exclusive"); + goto err_free_reuse_maps; + } + if (!REQ_ARGS(1)) + goto err_free_reuse_maps; + + xdpmeta_ifindex = if_nametoindex(*argv); + if (!xdpmeta_ifindex) { + p_err("unrecognized netdevice '%s': %s", + *argv, strerror(errno)); + goto err_free_reuse_maps; + } + NEXT_ARG(); + } else if (is_prefix(*argv, "pinmaps")) { + NEXT_ARG(); + + if (!REQ_ARGS(1)) + goto err_free_reuse_maps; + + pinmaps = GET_ARG(); + } else if (is_prefix(*argv, "autoattach")) { + auto_attach = true; + NEXT_ARG(); + } else { + p_err("expected no more arguments, 'type', 'map' or 'dev', got: '%s'?", + *argv); + goto err_free_reuse_maps; + } + } + + set_max_rlimit(); + + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + + obj = bpf_object__open_file(file, &open_opts); + if (!obj) { + p_err("failed to open object file"); + goto err_free_reuse_maps; + } + + bpf_object__for_each_program(pos, obj) { + enum bpf_prog_type prog_type = common_prog_type; + + if (prog_type == BPF_PROG_TYPE_UNSPEC) { + const char *sec_name = bpf_program__section_name(pos); + + err = get_prog_type_by_name(sec_name, &prog_type, + &expected_attach_type); + if (err < 0) + goto err_close_obj; + } + + if (prog_type == BPF_PROG_TYPE_XDP && xdpmeta_ifindex) { + bpf_program__set_flags(pos, BPF_F_XDP_DEV_BOUND_ONLY); + bpf_program__set_ifindex(pos, xdpmeta_ifindex); + } else { + bpf_program__set_ifindex(pos, offload_ifindex); + } + if (bpf_program__type(pos) != prog_type) + bpf_program__set_type(pos, prog_type); + bpf_program__set_expected_attach_type(pos, expected_attach_type); + } + + qsort(map_replace, old_map_fds, sizeof(*map_replace), + map_replace_compar); + + /* After the sort maps by name will be first on the list, because they + * have idx == -1. Resolve them. 
+ */ + j = 0; + while (j < old_map_fds && map_replace[j].name) { + i = 0; + bpf_object__for_each_map(map, obj) { + if (!strcmp(bpf_map__name(map), map_replace[j].name)) { + map_replace[j].idx = i; + break; + } + i++; + } + if (map_replace[j].idx == -1) { + p_err("unable to find map '%s'", map_replace[j].name); + goto err_close_obj; + } + j++; + } + /* Resort if any names were resolved */ + if (j) + qsort(map_replace, old_map_fds, sizeof(*map_replace), + map_replace_compar); + + /* Set ifindex and name reuse */ + j = 0; + idx = 0; + bpf_object__for_each_map(map, obj) { + if (bpf_map__type(map) != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + bpf_map__set_ifindex(map, offload_ifindex); + + if (j < old_map_fds && idx == map_replace[j].idx) { + err = bpf_map__reuse_fd(map, map_replace[j++].fd); + if (err) { + p_err("unable to set up map reuse: %d", err); + goto err_close_obj; + } + + /* Next reuse wants to apply to the same map */ + if (j < old_map_fds && map_replace[j].idx == idx) { + p_err("replacement for map idx %d specified more than once", + idx); + goto err_close_obj; + } + } + + idx++; + } + if (j < old_map_fds) { + p_err("map idx '%d' not used", map_replace[j].idx); + goto err_close_obj; + } + + err = bpf_object__load(obj); + if (err) { + p_err("failed to load object file"); + goto err_close_obj; + } + + err = mount_bpffs_for_pin(pinfile, !first_prog_only); + if (err) + goto err_close_obj; + + if (first_prog_only) { + prog = bpf_object__next_program(obj, NULL); + if (!prog) { + p_err("object file doesn't contain any bpf program"); + goto err_close_obj; + } + + if (auto_attach) + err = auto_attach_program(prog, pinfile); + else + err = bpf_obj_pin(bpf_program__fd(prog), pinfile); + if (err) { + p_err("failed to pin program %s", + bpf_program__section_name(prog)); + goto err_close_obj; + } + } else { + if (auto_attach) + err = auto_attach_programs(obj, pinfile); + else + err = bpf_object__pin_programs(obj, pinfile); + if (err) { + p_err("failed to pin all programs"); + goto err_close_obj; + } + } + + if (pinmaps) { + err = bpf_object__pin_maps(obj, pinmaps); + if (err) { + p_err("failed to pin all maps"); + goto err_unpin; + } + } + + if (json_output) + jsonw_null(json_wtr); + + bpf_object__close(obj); + for (i = 0; i < old_map_fds; i++) + close(map_replace[i].fd); + free(map_replace); + + return 0; + +err_unpin: + if (first_prog_only) + unlink(pinfile); + else + bpf_object__unpin_programs(obj, pinfile); +err_close_obj: + bpf_object__close(obj); +err_free_reuse_maps: + for (i = 0; i < old_map_fds; i++) + close(map_replace[i].fd); + free(map_replace); + return -1; +} + +static int count_open_fds(void) +{ + DIR *dp = opendir("/proc/self/fd"); + struct dirent *de; + int cnt = -3; + + if (!dp) + return -1; + + while ((de = readdir(dp))) + cnt++; + + closedir(dp); + return cnt; +} + +static int try_loader(struct gen_loader_opts *gen) +{ + struct bpf_load_and_run_opts opts = {}; + struct bpf_loader_ctx *ctx; + int ctx_sz = sizeof(*ctx) + 64 * max(sizeof(struct bpf_map_desc), + sizeof(struct bpf_prog_desc)); + int log_buf_sz = (1u << 24) - 1; + int err, fds_before, fd_delta; + char *log_buf = NULL; + + ctx = alloca(ctx_sz); + memset(ctx, 0, ctx_sz); + ctx->sz = ctx_sz; + if (verifier_logs) { + ctx->log_level = 1 + 2 + 4; + ctx->log_size = log_buf_sz; + log_buf = malloc(log_buf_sz); + if (!log_buf) + return -ENOMEM; + ctx->log_buf = (long) log_buf; + } + opts.ctx = ctx; + opts.data = gen->data; + opts.data_sz = gen->data_sz; + opts.insns = gen->insns; + opts.insns_sz = gen->insns_sz; + fds_before = 
count_open_fds(); + err = bpf_load_and_run(&opts); + fd_delta = count_open_fds() - fds_before; + if (err < 0 || verifier_logs) { + fprintf(stderr, "err %d\n%s\n%s", err, opts.errstr, log_buf); + if (fd_delta && err < 0) + fprintf(stderr, "loader prog leaked %d FDs\n", + fd_delta); + } + free(log_buf); + return err; +} + +static int do_loader(int argc, char **argv) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts); + DECLARE_LIBBPF_OPTS(gen_loader_opts, gen); + struct bpf_object *obj; + const char *file; + int err = 0; + + if (!REQ_ARGS(1)) + return -1; + file = GET_ARG(); + + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + + obj = bpf_object__open_file(file, &open_opts); + if (!obj) { + p_err("failed to open object file"); + goto err_close_obj; + } + + err = bpf_object__gen_loader(obj, &gen); + if (err) + goto err_close_obj; + + err = bpf_object__load(obj); + if (err) { + p_err("failed to load object file"); + goto err_close_obj; + } + + if (verifier_logs) { + struct dump_data dd = {}; + + kernel_syms_load(&dd); + dump_xlated_plain(&dd, (void *)gen.insns, gen.insns_sz, false, false); + kernel_syms_destroy(&dd); + } + err = try_loader(&gen); +err_close_obj: + bpf_object__close(obj); + return err; +} + +static int do_load(int argc, char **argv) +{ + if (use_loader) + return do_loader(argc, argv); + return load_with_options(argc, argv, true); +} + +static int do_loadall(int argc, char **argv) +{ + return load_with_options(argc, argv, false); +} + +#ifdef BPFTOOL_WITHOUT_SKELETONS + +static int do_profile(int argc, char **argv) +{ + p_err("bpftool prog profile command is not supported. Please build bpftool with clang >= 10.0.0"); + return 0; +} + +#else /* BPFTOOL_WITHOUT_SKELETONS */ + +#include "profiler.skel.h" + +struct profile_metric { + const char *name; + struct bpf_perf_event_value val; + struct perf_event_attr attr; + bool selected; + + /* calculate ratios like instructions per cycle */ + const int ratio_metric; /* 0 for N/A, 1 for index 0 (cycles) */ + const char *ratio_desc; + const float ratio_mul; +} metrics[] = { + { + .name = "cycles", + .attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .exclude_user = 1, + }, + }, + { + .name = "instructions", + .attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_INSTRUCTIONS, + .exclude_user = 1, + }, + .ratio_metric = 1, + .ratio_desc = "insns per cycle", + .ratio_mul = 1.0, + }, + { + .name = "l1d_loads", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), + .exclude_user = 1, + }, + }, + { + .name = "llc_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "LLC misses per million insns", + .ratio_mul = 1e6, + }, + { + .name = "itlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_ITLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user = 1 + }, + .ratio_metric = 2, + .ratio_desc = "itlb misses per million insns", + .ratio_mul = 1e6, + }, + { + .name = "dtlb_misses", + .attr = { + .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_DTLB | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), + .exclude_user 
= 1 + }, + .ratio_metric = 2, + .ratio_desc = "dtlb misses per million insns", + .ratio_mul = 1e6, + }, +}; + +static __u64 profile_total_count; + +#define MAX_NUM_PROFILE_METRICS 4 + +static int profile_parse_metrics(int argc, char **argv) +{ + unsigned int metric_cnt; + int selected_cnt = 0; + unsigned int i; + + metric_cnt = ARRAY_SIZE(metrics); + + while (argc > 0) { + for (i = 0; i < metric_cnt; i++) { + if (is_prefix(argv[0], metrics[i].name)) { + if (!metrics[i].selected) + selected_cnt++; + metrics[i].selected = true; + break; + } + } + if (i == metric_cnt) { + p_err("unknown metric %s", argv[0]); + return -1; + } + NEXT_ARG(); + } + if (selected_cnt > MAX_NUM_PROFILE_METRICS) { + p_err("too many (%d) metrics, please specify no more than %d metrics at at time", + selected_cnt, MAX_NUM_PROFILE_METRICS); + return -1; + } + return selected_cnt; +} + +static void profile_read_values(struct profiler_bpf *obj) +{ + __u32 m, cpu, num_cpu = obj->rodata->num_cpu; + int reading_map_fd, count_map_fd; + __u64 counts[num_cpu]; + __u32 key = 0; + int err; + + reading_map_fd = bpf_map__fd(obj->maps.accum_readings); + count_map_fd = bpf_map__fd(obj->maps.counts); + if (reading_map_fd < 0 || count_map_fd < 0) { + p_err("failed to get fd for map"); + return; + } + + err = bpf_map_lookup_elem(count_map_fd, &key, counts); + if (err) { + p_err("failed to read count_map: %s", strerror(errno)); + return; + } + + profile_total_count = 0; + for (cpu = 0; cpu < num_cpu; cpu++) + profile_total_count += counts[cpu]; + + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + struct bpf_perf_event_value values[num_cpu]; + + if (!metrics[m].selected) + continue; + + err = bpf_map_lookup_elem(reading_map_fd, &key, values); + if (err) { + p_err("failed to read reading_map: %s", + strerror(errno)); + return; + } + for (cpu = 0; cpu < num_cpu; cpu++) { + metrics[m].val.counter += values[cpu].counter; + metrics[m].val.enabled += values[cpu].enabled; + metrics[m].val.running += values[cpu].running; + } + key++; + } +} + +static void profile_print_readings_json(void) +{ + __u32 m; + + jsonw_start_array(json_wtr); + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + if (!metrics[m].selected) + continue; + jsonw_start_object(json_wtr); + jsonw_string_field(json_wtr, "metric", metrics[m].name); + jsonw_lluint_field(json_wtr, "run_cnt", profile_total_count); + jsonw_lluint_field(json_wtr, "value", metrics[m].val.counter); + jsonw_lluint_field(json_wtr, "enabled", metrics[m].val.enabled); + jsonw_lluint_field(json_wtr, "running", metrics[m].val.running); + + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); +} + +static void profile_print_readings_plain(void) +{ + __u32 m; + + printf("\n%18llu %-20s\n", profile_total_count, "run_cnt"); + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + struct bpf_perf_event_value *val = &metrics[m].val; + int r; + + if (!metrics[m].selected) + continue; + printf("%18llu %-20s", val->counter, metrics[m].name); + + r = metrics[m].ratio_metric - 1; + if (r >= 0 && metrics[r].selected && + metrics[r].val.counter > 0) { + printf("# %8.2f %-30s", + val->counter * metrics[m].ratio_mul / + metrics[r].val.counter, + metrics[m].ratio_desc); + } else { + printf("%-41s", ""); + } + + if (val->enabled > val->running) + printf("(%4.2f%%)", + val->running * 100.0 / val->enabled); + printf("\n"); + } +} + +static void profile_print_readings(void) +{ + if (json_output) + profile_print_readings_json(); + else + profile_print_readings_plain(); +} + +static char *profile_target_name(int tgt_fd) +{ + struct 
bpf_func_info func_info; + struct bpf_prog_info info = {}; + __u32 info_len = sizeof(info); + const struct btf_type *t; + __u32 func_info_rec_size; + struct btf *btf = NULL; + char *name = NULL; + int err; + + err = bpf_prog_get_info_by_fd(tgt_fd, &info, &info_len); + if (err) { + p_err("failed to get info for prog FD %d", tgt_fd); + goto out; + } + + if (info.btf_id == 0) { + p_err("prog FD %d doesn't have valid btf", tgt_fd); + goto out; + } + + func_info_rec_size = info.func_info_rec_size; + if (info.nr_func_info == 0) { + p_err("found 0 func_info for prog FD %d", tgt_fd); + goto out; + } + + memset(&info, 0, sizeof(info)); + info.nr_func_info = 1; + info.func_info_rec_size = func_info_rec_size; + info.func_info = ptr_to_u64(&func_info); + + err = bpf_prog_get_info_by_fd(tgt_fd, &info, &info_len); + if (err) { + p_err("failed to get func_info for prog FD %d", tgt_fd); + goto out; + } + + btf = btf__load_from_kernel_by_id(info.btf_id); + if (!btf) { + p_err("failed to load btf for prog FD %d", tgt_fd); + goto out; + } + + t = btf__type_by_id(btf, func_info.type_id); + if (!t) { + p_err("btf %d doesn't have type %d", + info.btf_id, func_info.type_id); + goto out; + } + name = strdup(btf__name_by_offset(btf, t->name_off)); +out: + btf__free(btf); + return name; +} + +static struct profiler_bpf *profile_obj; +static int profile_tgt_fd = -1; +static char *profile_tgt_name; +static int *profile_perf_events; +static int profile_perf_event_cnt; + +static void profile_close_perf_events(struct profiler_bpf *obj) +{ + int i; + + for (i = profile_perf_event_cnt - 1; i >= 0; i--) + close(profile_perf_events[i]); + + free(profile_perf_events); + profile_perf_event_cnt = 0; +} + +static int profile_open_perf_event(int mid, int cpu, int map_fd) +{ + int pmu_fd; + + pmu_fd = syscall(__NR_perf_event_open, &metrics[mid].attr, + -1 /*pid*/, cpu, -1 /*group_fd*/, 0); + if (pmu_fd < 0) { + if (errno == ENODEV) { + p_info("cpu %d may be offline, skip %s profiling.", + cpu, metrics[mid].name); + profile_perf_event_cnt++; + return 0; + } + return -1; + } + + if (bpf_map_update_elem(map_fd, + &profile_perf_event_cnt, + &pmu_fd, BPF_ANY) || + ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0)) { + close(pmu_fd); + return -1; + } + + profile_perf_events[profile_perf_event_cnt++] = pmu_fd; + return 0; +} + +static int profile_open_perf_events(struct profiler_bpf *obj) +{ + unsigned int cpu, m; + int map_fd; + + profile_perf_events = calloc( + sizeof(int), obj->rodata->num_cpu * obj->rodata->num_metric); + if (!profile_perf_events) { + p_err("failed to allocate memory for perf_event array: %s", + strerror(errno)); + return -1; + } + map_fd = bpf_map__fd(obj->maps.events); + if (map_fd < 0) { + p_err("failed to get fd for events map"); + return -1; + } + + for (m = 0; m < ARRAY_SIZE(metrics); m++) { + if (!metrics[m].selected) + continue; + for (cpu = 0; cpu < obj->rodata->num_cpu; cpu++) { + if (profile_open_perf_event(m, cpu, map_fd)) { + p_err("failed to create event %s on cpu %d", + metrics[m].name, cpu); + return -1; + } + } + } + return 0; +} + +static void profile_print_and_cleanup(void) +{ + profile_close_perf_events(profile_obj); + profile_read_values(profile_obj); + profile_print_readings(); + profiler_bpf__destroy(profile_obj); + + close(profile_tgt_fd); + free(profile_tgt_name); +} + +static void int_exit(int signo) +{ + profile_print_and_cleanup(); + exit(0); +} + +static int do_profile(int argc, char **argv) +{ + int num_metric, num_cpu, err = -1; + struct bpf_program *prog; + unsigned long duration; + char 
*endptr; + + /* we at least need two args for the prog and one metric */ + if (!REQ_ARGS(3)) + return -EINVAL; + + /* parse target fd */ + profile_tgt_fd = prog_parse_fd(&argc, &argv); + if (profile_tgt_fd < 0) { + p_err("failed to parse fd"); + return -1; + } + + /* parse profiling optional duration */ + if (argc > 2 && is_prefix(argv[0], "duration")) { + NEXT_ARG(); + duration = strtoul(*argv, &endptr, 0); + if (*endptr) + usage(); + NEXT_ARG(); + } else { + duration = UINT_MAX; + } + + num_metric = profile_parse_metrics(argc, argv); + if (num_metric <= 0) + goto out; + + num_cpu = libbpf_num_possible_cpus(); + if (num_cpu <= 0) { + p_err("failed to identify number of CPUs"); + goto out; + } + + profile_obj = profiler_bpf__open(); + if (!profile_obj) { + p_err("failed to open and/or load BPF object"); + goto out; + } + + profile_obj->rodata->num_cpu = num_cpu; + profile_obj->rodata->num_metric = num_metric; + + /* adjust map sizes */ + bpf_map__set_max_entries(profile_obj->maps.events, num_metric * num_cpu); + bpf_map__set_max_entries(profile_obj->maps.fentry_readings, num_metric); + bpf_map__set_max_entries(profile_obj->maps.accum_readings, num_metric); + bpf_map__set_max_entries(profile_obj->maps.counts, 1); + + /* change target name */ + profile_tgt_name = profile_target_name(profile_tgt_fd); + if (!profile_tgt_name) + goto out; + + bpf_object__for_each_program(prog, profile_obj->obj) { + err = bpf_program__set_attach_target(prog, profile_tgt_fd, + profile_tgt_name); + if (err) { + p_err("failed to set attach target\n"); + goto out; + } + } + + set_max_rlimit(); + err = profiler_bpf__load(profile_obj); + if (err) { + p_err("failed to load profile_obj"); + goto out; + } + + err = profile_open_perf_events(profile_obj); + if (err) + goto out; + + err = profiler_bpf__attach(profile_obj); + if (err) { + p_err("failed to attach profile_obj"); + goto out; + } + signal(SIGINT, int_exit); + + sleep(duration); + profile_print_and_cleanup(); + return 0; + +out: + profile_close_perf_events(profile_obj); + if (profile_obj) + profiler_bpf__destroy(profile_obj); + close(profile_tgt_fd); + free(profile_tgt_name); + return err; +} + +#endif /* BPFTOOL_WITHOUT_SKELETONS */ + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [PROG]\n" + " %1$s %2$s dump xlated PROG [{ file FILE | [opcodes] [linum] [visual] }]\n" + " %1$s %2$s dump jited PROG [{ file FILE | [opcodes] [linum] }]\n" + " %1$s %2$s pin PROG FILE\n" + " %1$s %2$s { load | loadall } OBJ PATH \\\n" + " [type TYPE] [{ offload_dev | xdpmeta_dev } NAME] \\\n" + " [map { idx IDX | name NAME } MAP]\\\n" + " [pinmaps MAP_DIR]\n" + " [autoattach]\n" + " %1$s %2$s attach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s detach PROG ATTACH_TYPE [MAP]\n" + " %1$s %2$s run PROG \\\n" + " data_in FILE \\\n" + " [data_out FILE [data_size_out L]] \\\n" + " [ctx_in FILE [ctx_out FILE [ctx_size_out M]]] \\\n" + " [repeat N]\n" + " %1$s %2$s profile PROG [duration DURATION] METRICs\n" + " %1$s %2$s tracelog\n" + " %1$s %2$s help\n" + "\n" + " " HELP_SPEC_MAP "\n" + " " HELP_SPEC_PROGRAM "\n" + " TYPE := { socket | kprobe | kretprobe | classifier | action |\n" + " tracepoint | raw_tracepoint | xdp | perf_event | cgroup/skb |\n" + " cgroup/sock | cgroup/dev | lwt_in | lwt_out | lwt_xmit |\n" + " lwt_seg6local | sockops | sk_skb | sk_msg | lirc_mode2 |\n" + " sk_reuseport | flow_dissector | cgroup/sysctl |\n" + " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 
|\n" + " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" + " cgroup/getpeername4 | cgroup/getpeername6 |\n" + " cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n" + " cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n" + " cgroup/getsockopt | cgroup/setsockopt | cgroup/sock_release |\n" + " struct_ops | fentry | fexit | freplace | sk_lookup }\n" + " ATTACH_TYPE := { sk_msg_verdict | sk_skb_verdict | sk_skb_stream_verdict |\n" + " sk_skb_stream_parser | flow_dissector }\n" + " METRIC := { cycles | instructions | l1d_loads | llc_misses | itlb_misses | dtlb_misses }\n" + " " HELP_SPEC_OPTIONS " |\n" + " {-f|--bpffs} | {-m|--mapcompat} | {-n|--nomount} |\n" + " {-L|--use-loader} }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { "dump", do_dump }, + { "pin", do_pin }, + { "load", do_load }, + { "loadall", do_loadall }, + { "attach", do_attach }, + { "detach", do_detach }, + { "tracelog", do_tracelog }, + { "run", do_run }, + { "profile", do_profile }, + { 0 } +}; + +int do_prog(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/third_party/bpftool/src/skeleton/pid_iter.bpf.c b/third_party/bpftool/src/skeleton/pid_iter.bpf.c new file mode 100644 index 0000000..26004f0 --- /dev/null +++ b/third_party/bpftool/src/skeleton/pid_iter.bpf.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include "pid_iter.h" + +/* keep in sync with the definition in main.h */ +enum bpf_obj_type { + BPF_OBJ_UNKNOWN, + BPF_OBJ_PROG, + BPF_OBJ_MAP, + BPF_OBJ_LINK, + BPF_OBJ_BTF, +}; + +struct bpf_perf_link___local { + struct bpf_link link; + struct file *perf_file; +} __attribute__((preserve_access_index)); + +struct perf_event___local { + u64 bpf_cookie; +} __attribute__((preserve_access_index)); + +enum bpf_link_type___local { + BPF_LINK_TYPE_PERF_EVENT___local = 7, +}; + +extern const void bpf_link_fops __ksym; +extern const void bpf_map_fops __ksym; +extern const void bpf_prog_fops __ksym; +extern const void btf_fops __ksym; + +const volatile enum bpf_obj_type obj_type = BPF_OBJ_UNKNOWN; + +static __always_inline __u32 get_obj_id(void *ent, enum bpf_obj_type type) +{ + switch (type) { + case BPF_OBJ_PROG: + return BPF_CORE_READ((struct bpf_prog *)ent, aux, id); + case BPF_OBJ_MAP: + return BPF_CORE_READ((struct bpf_map *)ent, id); + case BPF_OBJ_BTF: + return BPF_CORE_READ((struct btf *)ent, id); + case BPF_OBJ_LINK: + return BPF_CORE_READ((struct bpf_link *)ent, id); + default: + return 0; + } +} + +/* could be used only with BPF_LINK_TYPE_PERF_EVENT links */ +static __u64 get_bpf_cookie(struct bpf_link *link) +{ + struct bpf_perf_link___local *perf_link; + struct perf_event___local *event; + + perf_link = container_of(link, struct bpf_perf_link___local, link); + event = BPF_CORE_READ(perf_link, perf_file, private_data); + return BPF_CORE_READ(event, bpf_cookie); +} + +SEC("iter/task_file") +int iter(struct bpf_iter__task_file *ctx) +{ + struct file *file = ctx->file; + struct task_struct *task = ctx->task; + struct pid_iter_entry e; + const void *fops; + + if (!file || !task) + return 0; + + switch (obj_type) { + case BPF_OBJ_PROG: + fops = &bpf_prog_fops; + break; + case BPF_OBJ_MAP: + fops = &bpf_map_fops; + break; + case BPF_OBJ_BTF: + fops = &btf_fops; + break; + case BPF_OBJ_LINK: + fops = &bpf_link_fops; + break; + default: + 
return 0; + } + + if (file->f_op != fops) + return 0; + + __builtin_memset(&e, 0, sizeof(e)); + e.pid = task->tgid; + e.id = get_obj_id(file->private_data, obj_type); + + if (obj_type == BPF_OBJ_LINK && + bpf_core_enum_value_exists(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { + struct bpf_link *link = (struct bpf_link *) file->private_data; + + if (link->type == bpf_core_enum_value(enum bpf_link_type___local, + BPF_LINK_TYPE_PERF_EVENT___local)) { + e.has_bpf_cookie = true; + e.bpf_cookie = get_bpf_cookie(link); + } + } + + bpf_probe_read_kernel_str(&e.comm, sizeof(e.comm), + task->group_leader->comm); + bpf_seq_write(ctx->meta->seq, &e, sizeof(e)); + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/third_party/bpftool/src/skeleton/pid_iter.h b/third_party/bpftool/src/skeleton/pid_iter.h new file mode 100644 index 0000000..bbb570d --- /dev/null +++ b/third_party/bpftool/src/skeleton/pid_iter.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2020 Facebook */ +#ifndef __PID_ITER_H +#define __PID_ITER_H + +struct pid_iter_entry { + __u32 id; + int pid; + __u64 bpf_cookie; + bool has_bpf_cookie; + char comm[16]; +}; + +#endif diff --git a/third_party/bpftool/src/skeleton/profiler.bpf.c b/third_party/bpftool/src/skeleton/profiler.bpf.c new file mode 100644 index 0000000..2f80edc --- /dev/null +++ b/third_party/bpftool/src/skeleton/profiler.bpf.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2020 Facebook +#include +#include +#include + +struct bpf_perf_event_value___local { + __u64 counter; + __u64 enabled; + __u64 running; +} __attribute__((preserve_access_index)); + +/* map of perf event fds, num_cpu * num_metric entries */ +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +/* readings at fentry */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value___local)); +} fentry_readings SEC(".maps"); + +/* accumulated readings */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value___local)); +} accum_readings SEC(".maps"); + +/* sample counts, one per cpu */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u64)); +} counts SEC(".maps"); + +const volatile __u32 num_cpu = 1; +const volatile __u32 num_metric = 1; +#define MAX_NUM_MATRICS 4 + +SEC("fentry/XXX") +int BPF_PROG(fentry_XXX) +{ + struct bpf_perf_event_value___local *ptrs[MAX_NUM_MATRICS]; + u32 key = bpf_get_smp_processor_id(); + u32 i; + + /* look up before reading, to reduce error */ + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + u32 flag = i; + + ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag); + if (!ptrs[i]) + return 0; + } + + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + struct bpf_perf_event_value___local reading; + int err; + + err = bpf_perf_event_read_value(&events, key, (void *)&reading, + sizeof(reading)); + if (err) + return 0; + *(ptrs[i]) = reading; + key += num_cpu; + } + + return 0; +} + +static inline void +fexit_update_maps(u32 id, struct bpf_perf_event_value___local *after) +{ + struct bpf_perf_event_value___local *before, diff; + + before = bpf_map_lookup_elem(&fentry_readings, &id); + /* only 
account samples with a valid fentry_reading */ + if (before && before->counter) { + struct bpf_perf_event_value___local *accum; + + diff.counter = after->counter - before->counter; + diff.enabled = after->enabled - before->enabled; + diff.running = after->running - before->running; + + accum = bpf_map_lookup_elem(&accum_readings, &id); + if (accum) { + accum->counter += diff.counter; + accum->enabled += diff.enabled; + accum->running += diff.running; + } + } +} + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value___local readings[MAX_NUM_MATRICS]; + u32 cpu = bpf_get_smp_processor_id(); + u32 i, zero = 0; + int err; + u64 *count; + + /* read all events before updating the maps, to reduce error */ + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { + err = bpf_perf_event_read_value(&events, cpu + i * num_cpu, + (void *)(readings + i), + sizeof(*readings)); + if (err) + return 0; + } + count = bpf_map_lookup_elem(&counts, &zero); + if (count) { + *count += 1; + for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) + fexit_update_maps(i, &readings[i]); + } + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/third_party/bpftool/src/struct_ops.c b/third_party/bpftool/src/struct_ops.c new file mode 100644 index 0000000..3ebc9fe --- /dev/null +++ b/third_party/bpftool/src/struct_ops.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2020 Facebook */ + +#include +#include +#include + +#include + +#include +#include +#include + +#include "json_writer.h" +#include "main.h" + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" + +static const struct btf_type *map_info_type; +static __u32 map_info_alloc_len; +static struct btf *btf_vmlinux; +static __s32 map_info_type_id; + +struct res { + unsigned int nr_maps; + unsigned int nr_errs; +}; + +static const struct btf *get_btf_vmlinux(void) +{ + if (btf_vmlinux) + return btf_vmlinux; + + btf_vmlinux = libbpf_find_kernel_btf(); + if (!btf_vmlinux) + p_err("struct_ops requires kernel CONFIG_DEBUG_INFO_BTF=y"); + + return btf_vmlinux; +} + +static const char *get_kern_struct_ops_name(const struct bpf_map_info *info) +{ + const struct btf *kern_btf; + const struct btf_type *t; + const char *st_ops_name; + + kern_btf = get_btf_vmlinux(); + if (!kern_btf) + return ""; + + t = btf__type_by_id(kern_btf, info->btf_vmlinux_value_type_id); + st_ops_name = btf__name_by_offset(kern_btf, t->name_off); + st_ops_name += strlen(STRUCT_OPS_VALUE_PREFIX); + + return st_ops_name; +} + +static __s32 get_map_info_type_id(void) +{ + const struct btf *kern_btf; + + if (map_info_type_id) + return map_info_type_id; + + kern_btf = get_btf_vmlinux(); + if (!kern_btf) + return 0; + + map_info_type_id = btf__find_by_name_kind(kern_btf, "bpf_map_info", + BTF_KIND_STRUCT); + if (map_info_type_id < 0) { + p_err("can't find bpf_map_info from btf_vmlinux"); + return map_info_type_id; + } + map_info_type = btf__type_by_id(kern_btf, map_info_type_id); + + /* Ensure map_info_alloc() has at least what the bpftool needs */ + map_info_alloc_len = map_info_type->size; + if (map_info_alloc_len < sizeof(struct bpf_map_info)) + map_info_alloc_len = sizeof(struct bpf_map_info); + + return map_info_type_id; +} + +/* If the subcmd needs to print out the bpf_map_info, + * it should always call map_info_alloc to allocate + * a bpf_map_info object instead of allocating it + * on the stack. + * + * map_info_alloc() will take the running kernel's btf + * into account. i.e. 
it will consider the + * sizeof(struct bpf_map_info) of the running kernel. + * + * It will enable the "struct_ops" cmd to print the latest + * "struct bpf_map_info". + * + * [ Recall that "struct_ops" requires the kernel's btf to + * be available ] + */ +static struct bpf_map_info *map_info_alloc(__u32 *alloc_len) +{ + struct bpf_map_info *info; + + if (get_map_info_type_id() < 0) + return NULL; + + info = calloc(1, map_info_alloc_len); + if (!info) + p_err("mem alloc failed"); + else + *alloc_len = map_info_alloc_len; + + return info; +} + +/* It iterates all struct_ops maps of the system. + * It returns the fd in "*res_fd" and map_info in "*info". + * In the very first iteration, info->id should be 0. + * An optional map "*name" filter can be specified. + * The filter can be made more flexible in the future. + * e.g. filter by kernel-struct-ops-name, regex-name, glob-name, ...etc. + * + * Return value: + * 1: A struct_ops map found. It is returned in "*res_fd" and "*info". + * The caller can continue to call get_next in the future. + * 0: No struct_ops map is returned. + * All struct_ops map has been found. + * -1: Error and the caller should abort the iteration. + */ +static int get_next_struct_ops_map(const char *name, int *res_fd, + struct bpf_map_info *info, __u32 info_len) +{ + __u32 id = info->id; + int err, fd; + + while (true) { + err = bpf_map_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) + return 0; + p_err("can't get next map: %s", strerror(errno)); + return -1; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + p_err("can't get map by id (%u): %s", + id, strerror(errno)); + return -1; + } + + err = bpf_map_get_info_by_fd(fd, info, &info_len); + if (err) { + p_err("can't get map info: %s", strerror(errno)); + close(fd); + return -1; + } + + if (info->type == BPF_MAP_TYPE_STRUCT_OPS && + (!name || !strcmp(name, info->name))) { + *res_fd = fd; + return 1; + } + close(fd); + } +} + +static int cmd_retval(const struct res *res, bool must_have_one_map) +{ + if (res->nr_errs || (!res->nr_maps && must_have_one_map)) + return -1; + + return 0; +} + +/* "data" is the work_func private storage */ +typedef int (*work_func)(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr); + +/* Find all struct_ops map in the system. + * Filter out by "name" (if specified). + * Then call "func(fd, info, data, wtr)" on each struct_ops map found. + */ +static struct res do_search(const char *name, work_func func, void *data, + struct json_writer *wtr) +{ + struct bpf_map_info *info; + struct res res = {}; + __u32 info_len; + int fd, err; + + info = map_info_alloc(&info_len); + if (!info) { + res.nr_errs++; + return res; + } + + if (wtr) + jsonw_start_array(wtr); + while ((err = get_next_struct_ops_map(name, &fd, info, info_len)) == 1) { + res.nr_maps++; + err = func(fd, info, data, wtr); + if (err) + res.nr_errs++; + close(fd); + } + if (wtr) + jsonw_end_array(wtr); + + if (err) + res.nr_errs++; + + if (!wtr && name && !res.nr_errs && !res.nr_maps) + /* It is not printing empty []. + * Thus, needs to specifically say nothing found + * for "name" here. + */ + p_err("no struct_ops found for %s", name); + else if (!wtr && json_output && !res.nr_errs) + /* The "func()" above is not writing any json (i.e. !wtr + * test here). + * + * However, "-j" is enabled and there is no errs here, + * so call json_null() as the current convention of + * other cmds. 
+ */ + jsonw_null(json_wtr); + + free(info); + return res; +} + +static struct res do_one_id(const char *id_str, work_func func, void *data, + struct json_writer *wtr) +{ + struct bpf_map_info *info; + struct res res = {}; + unsigned long id; + __u32 info_len; + char *endptr; + int fd; + + id = strtoul(id_str, &endptr, 0); + if (*endptr || !id || id > UINT32_MAX) { + p_err("invalid id %s", id_str); + res.nr_errs++; + return res; + } + + fd = bpf_map_get_fd_by_id(id); + if (fd < 0) { + p_err("can't get map by id (%lu): %s", id, strerror(errno)); + res.nr_errs++; + return res; + } + + info = map_info_alloc(&info_len); + if (!info) { + res.nr_errs++; + goto done; + } + + if (bpf_map_get_info_by_fd(fd, info, &info_len)) { + p_err("can't get map info: %s", strerror(errno)); + res.nr_errs++; + goto done; + } + + if (info->type != BPF_MAP_TYPE_STRUCT_OPS) { + p_err("%s id %u is not a struct_ops map", info->name, info->id); + res.nr_errs++; + goto done; + } + + res.nr_maps++; + + if (func(fd, info, data, wtr)) + res.nr_errs++; + else if (!wtr && json_output) + /* The "func()" above is not writing any json (i.e. !wtr + * test here). + * + * However, "-j" is enabled and there is no errs here, + * so call json_null() as the current convention of + * other cmds. + */ + jsonw_null(json_wtr); + +done: + free(info); + close(fd); + + return res; +} + +static struct res do_work_on_struct_ops(const char *search_type, + const char *search_term, + work_func func, void *data, + struct json_writer *wtr) +{ + if (search_type) { + if (is_prefix(search_type, "id")) + return do_one_id(search_term, func, data, wtr); + else if (!is_prefix(search_type, "name")) + usage(); + } + + return do_search(search_term, func, data, wtr); +} + +static int __do_show(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + if (wtr) { + jsonw_start_object(wtr); + jsonw_uint_field(wtr, "id", info->id); + jsonw_string_field(wtr, "name", info->name); + jsonw_string_field(wtr, "kernel_struct_ops", + get_kern_struct_ops_name(info)); + jsonw_end_object(wtr); + } else { + printf("%u: %-15s %-32s\n", info->id, info->name, + get_kern_struct_ops_name(info)); + } + + return 0; +} + +static int do_show(int argc, char **argv) +{ + const char *search_type = NULL, *search_term = NULL; + struct res res; + + if (argc && argc != 2) + usage(); + + if (argc == 2) { + search_type = GET_ARG(); + search_term = GET_ARG(); + } + + res = do_work_on_struct_ops(search_type, search_term, __do_show, + NULL, json_wtr); + + return cmd_retval(&res, !!search_term); +} + +static int __do_dump(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + struct btf_dumper *d = (struct btf_dumper *)data; + const struct btf_type *struct_ops_type; + const struct btf *kern_btf = d->btf; + const char *struct_ops_name; + int zero = 0; + void *value; + + /* note: d->jw == wtr */ + + kern_btf = d->btf; + + /* The kernel supporting BPF_MAP_TYPE_STRUCT_OPS must have + * btf_vmlinux_value_type_id. 
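+	 * That type id is used below to resolve the struct_ops type name from the kernel BTF.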
+ */ + struct_ops_type = btf__type_by_id(kern_btf, + info->btf_vmlinux_value_type_id); + struct_ops_name = btf__name_by_offset(kern_btf, + struct_ops_type->name_off); + value = calloc(1, info->value_size); + if (!value) { + p_err("mem alloc failed"); + return -1; + } + + if (bpf_map_lookup_elem(fd, &zero, value)) { + p_err("can't lookup struct_ops map %s id %u", + info->name, info->id); + free(value); + return -1; + } + + jsonw_start_object(wtr); + jsonw_name(wtr, "bpf_map_info"); + btf_dumper_type(d, map_info_type_id, (void *)info); + jsonw_end_object(wtr); + + jsonw_start_object(wtr); + jsonw_name(wtr, struct_ops_name); + btf_dumper_type(d, info->btf_vmlinux_value_type_id, value); + jsonw_end_object(wtr); + + free(value); + + return 0; +} + +static int do_dump(int argc, char **argv) +{ + const char *search_type = NULL, *search_term = NULL; + json_writer_t *wtr = json_wtr; + const struct btf *kern_btf; + struct btf_dumper d = {}; + struct res res; + + if (argc && argc != 2) + usage(); + + if (argc == 2) { + search_type = GET_ARG(); + search_term = GET_ARG(); + } + + kern_btf = get_btf_vmlinux(); + if (!kern_btf) + return -1; + + if (!json_output) { + wtr = jsonw_new(stdout); + if (!wtr) { + p_err("can't create json writer"); + return -1; + } + jsonw_pretty(wtr, true); + } + + d.btf = kern_btf; + d.jw = wtr; + d.is_plain_text = !json_output; + d.prog_id_as_func_ptr = true; + + res = do_work_on_struct_ops(search_type, search_term, __do_dump, &d, + wtr); + + if (!json_output) + jsonw_destroy(&wtr); + + return cmd_retval(&res, !!search_term); +} + +static int __do_unregister(int fd, const struct bpf_map_info *info, void *data, + struct json_writer *wtr) +{ + int zero = 0; + + if (bpf_map_delete_elem(fd, &zero)) { + p_err("can't unload %s %s id %u: %s", + get_kern_struct_ops_name(info), info->name, + info->id, strerror(errno)); + return -1; + } + + p_info("Unregistered %s %s id %u", + get_kern_struct_ops_name(info), info->name, + info->id); + + return 0; +} + +static int do_unregister(int argc, char **argv) +{ + const char *search_type, *search_term; + struct res res; + + if (argc != 2) + usage(); + + search_type = GET_ARG(); + search_term = GET_ARG(); + + res = do_work_on_struct_ops(search_type, search_term, + __do_unregister, NULL, NULL); + + return cmd_retval(&res, true); +} + +static int pin_link(struct bpf_link *link, const char *pindir, + const char *name) +{ + char pinfile[PATH_MAX]; + int err; + + err = pathname_concat(pinfile, sizeof(pinfile), pindir, name); + if (err) + return -1; + + return bpf_link__pin(link, pinfile); +} + +static int do_register(int argc, char **argv) +{ + LIBBPF_OPTS(bpf_object_open_opts, open_opts); + __u32 link_info_len = sizeof(struct bpf_link_info); + struct bpf_link_info link_info = {}; + struct bpf_map_info info = {}; + __u32 info_len = sizeof(info); + int nr_errs = 0, nr_maps = 0; + const char *linkdir = NULL; + struct bpf_object *obj; + struct bpf_link *link; + struct bpf_map *map; + const char *file; + + if (argc != 1 && argc != 2) + usage(); + + file = GET_ARG(); + if (argc == 1) + linkdir = GET_ARG(); + + if (linkdir && mount_bpffs_for_pin(linkdir, true)) { + p_err("can't mount bpffs for pinning"); + return -1; + } + + if (verifier_logs) + /* log_level1 + log_level2 + stats, but not stable UAPI */ + open_opts.kernel_log_level = 1 + 2 + 4; + + obj = bpf_object__open_file(file, &open_opts); + if (!obj) + return -1; + + set_max_rlimit(); + + if (bpf_object__load(obj)) { + bpf_object__close(obj); + return -1; + } + + bpf_object__for_each_map(map, obj) { + 
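+		/* attach each struct_ops map found in the object; other map types are skipped */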
if (bpf_map__type(map) != BPF_MAP_TYPE_STRUCT_OPS) + continue; + + link = bpf_map__attach_struct_ops(map); + if (!link) { + p_err("can't register struct_ops %s: %s", + bpf_map__name(map), strerror(errno)); + nr_errs++; + continue; + } + nr_maps++; + + if (bpf_map_get_info_by_fd(bpf_map__fd(map), &info, + &info_len)) { + /* Not p_err. The struct_ops was attached + * successfully. + */ + p_info("Registered %s but can't find id: %s", + bpf_map__name(map), strerror(errno)); + goto clean_link; + } + if (!(bpf_map__map_flags(map) & BPF_F_LINK)) { + p_info("Registered %s %s id %u", + get_kern_struct_ops_name(&info), + info.name, + info.id); + goto clean_link; + } + if (bpf_link_get_info_by_fd(bpf_link__fd(link), + &link_info, + &link_info_len)) { + p_err("Registered %s but can't find link id: %s", + bpf_map__name(map), strerror(errno)); + nr_errs++; + goto clean_link; + } + if (linkdir && pin_link(link, linkdir, info.name)) { + p_err("can't pin link %u for %s: %s", + link_info.id, info.name, + strerror(errno)); + nr_errs++; + goto clean_link; + } + p_info("Registered %s %s map id %u link id %u", + get_kern_struct_ops_name(&info), + info.name, info.id, link_info.id); + +clean_link: + bpf_link__disconnect(link); + bpf_link__destroy(link); + } + + bpf_object__close(obj); + + if (nr_errs) + return -1; + + if (!nr_maps) { + p_err("no struct_ops found in %s", file); + return -1; + } + + if (json_output) + jsonw_null(json_wtr); + + return 0; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %1$s %2$s { show | list } [STRUCT_OPS_MAP]\n" + " %1$s %2$s dump [STRUCT_OPS_MAP]\n" + " %1$s %2$s register OBJ [LINK_DIR]\n" + " %1$s %2$s unregister STRUCT_OPS_MAP\n" + " %1$s %2$s help\n" + "\n" + " STRUCT_OPS_MAP := [ id STRUCT_OPS_MAP_ID | name STRUCT_OPS_MAP_NAME ]\n" + " " HELP_SPEC_OPTIONS " }\n" + "", + bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "register", do_register }, + { "unregister", do_unregister }, + { "dump", do_dump }, + { "help", do_help }, + { 0 } +}; + +int do_struct_ops(int argc, char **argv) +{ + int err; + + err = cmd_select(cmds, argc, argv, do_help); + + btf__free(btf_vmlinux); + + return err; +} diff --git a/third_party/bpftool/src/tracelog.c b/third_party/bpftool/src/tracelog.c new file mode 100644 index 0000000..bf1f022 --- /dev/null +++ b/third_party/bpftool/src/tracelog.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2015-2017 Daniel Borkmann */ +/* Copyright (c) 2018 Netronome Systems, Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "main.h" + +#ifndef TRACEFS_MAGIC +# define TRACEFS_MAGIC 0x74726163 +#endif + +#define _textify(x) #x +#define textify(x) _textify(x) + +FILE *trace_pipe_fd; +char *buff; + +static int validate_tracefs_mnt(const char *mnt, unsigned long magic) +{ + struct statfs st_fs; + + if (statfs(mnt, &st_fs) < 0) + return -ENOENT; + if ((unsigned long)st_fs.f_type != magic) + return -ENOENT; + + return 0; +} + +static bool +find_tracefs_mnt_single(unsigned long magic, char *mnt, const char *mntpt) +{ + size_t src_len; + + if (validate_tracefs_mnt(mntpt, magic)) + return false; + + src_len = strlen(mntpt); + if (src_len + 1 >= PATH_MAX) { + p_err("tracefs mount point name too long"); + return false; + } + + strcpy(mnt, mntpt); + return true; +} + +static bool get_tracefs_pipe(char *mnt) +{ + static const char * const known_mnts[] = { + "/sys/kernel/debug/tracing", + "/sys/kernel/tracing", + "/tracing", + "/trace", + }; + const char *pipe_name = "/trace_pipe"; + const char *fstype = "tracefs"; + char type[100], format[32]; + const char * const *ptr; + bool found = false; + FILE *fp; + + for (ptr = known_mnts; ptr < known_mnts + ARRAY_SIZE(known_mnts); ptr++) + if (find_tracefs_mnt_single(TRACEFS_MAGIC, mnt, *ptr)) + goto exit_found; + + fp = fopen("/proc/mounts", "r"); + if (!fp) + return false; + + /* Allow room for NULL terminating byte and pipe file name */ + snprintf(format, sizeof(format), "%%*s %%%zds %%99s %%*s %%*d %%*d\\n", + PATH_MAX - strlen(pipe_name) - 1); + while (fscanf(fp, format, mnt, type) == 2) + if (strcmp(type, fstype) == 0) { + found = true; + break; + } + fclose(fp); + + /* The string from fscanf() might be truncated, check mnt is valid */ + if (found && validate_tracefs_mnt(mnt, TRACEFS_MAGIC)) + goto exit_found; + + if (block_mount) + return false; + + p_info("could not find tracefs, attempting to mount it now"); + /* Most of the time, tracefs is automatically mounted by debugfs at + * /sys/kernel/debug/tracing when we try to access it. If we could not + * find it, it is likely that debugfs is not mounted. Let's give one + * attempt at mounting just tracefs at /sys/kernel/tracing. 
+ */ + strcpy(mnt, known_mnts[1]); + if (mount_tracefs(mnt)) + return false; + +exit_found: + strcat(mnt, pipe_name); + return true; +} + +static void exit_tracelog(int signum) +{ + fclose(trace_pipe_fd); + free(buff); + + if (json_output) { + jsonw_end_array(json_wtr); + jsonw_destroy(&json_wtr); + } + + exit(0); +} + +int do_tracelog(int argc, char **argv) +{ + const struct sigaction act = { + .sa_handler = exit_tracelog + }; + char trace_pipe[PATH_MAX]; + size_t buff_len = 0; + + if (json_output) + jsonw_start_array(json_wtr); + + if (!get_tracefs_pipe(trace_pipe)) + return -1; + + trace_pipe_fd = fopen(trace_pipe, "r"); + if (!trace_pipe_fd) { + p_err("could not open trace pipe: %s", strerror(errno)); + return -1; + } + + sigaction(SIGHUP, &act, NULL); + sigaction(SIGINT, &act, NULL); + sigaction(SIGTERM, &act, NULL); + while (1) { + ssize_t ret; + + ret = getline(&buff, &buff_len, trace_pipe_fd); + if (ret <= 0) { + p_err("failed to read content from trace pipe: %s", + strerror(errno)); + break; + } + if (json_output) + jsonw_string(json_wtr, buff); + else + printf("%s", buff); + } + + fclose(trace_pipe_fd); + free(buff); + return -1; +} diff --git a/third_party/bpftool/src/xlated_dumper.c b/third_party/bpftool/src/xlated_dumper.c new file mode 100644 index 0000000..da608e1 --- /dev/null +++ b/third_party/bpftool/src/xlated_dumper.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (C) 2018 Netronome Systems, Inc. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "disasm.h" +#include "json_writer.h" +#include "main.h" +#include "xlated_dumper.h" + +static int kernel_syms_cmp(const void *sym_a, const void *sym_b) +{ + return ((struct kernel_sym *)sym_a)->address - + ((struct kernel_sym *)sym_b)->address; +} + +void kernel_syms_load(struct dump_data *dd) +{ + struct kernel_sym *sym; + char buff[256]; + void *tmp, *address; + FILE *fp; + + fp = fopen("/proc/kallsyms", "r"); + if (!fp) + return; + + while (fgets(buff, sizeof(buff), fp)) { + tmp = libbpf_reallocarray(dd->sym_mapping, dd->sym_count + 1, + sizeof(*dd->sym_mapping)); + if (!tmp) { +out: + free(dd->sym_mapping); + dd->sym_mapping = NULL; + fclose(fp); + return; + } + dd->sym_mapping = tmp; + sym = &dd->sym_mapping[dd->sym_count]; + if (sscanf(buff, "%p %*c %s", &address, sym->name) != 2) + continue; + sym->address = (unsigned long)address; + if (!strcmp(sym->name, "__bpf_call_base")) { + dd->address_call_base = sym->address; + /* sysctl kernel.kptr_restrict was set */ + if (!sym->address) + goto out; + } + if (sym->address) + dd->sym_count++; + } + + fclose(fp); + + qsort(dd->sym_mapping, dd->sym_count, + sizeof(*dd->sym_mapping), kernel_syms_cmp); +} + +void kernel_syms_destroy(struct dump_data *dd) +{ + free(dd->sym_mapping); +} + +struct kernel_sym *kernel_syms_search(struct dump_data *dd, + unsigned long key) +{ + struct kernel_sym sym = { + .address = key, + }; + + return dd->sym_mapping ? + bsearch(&sym, dd->sym_mapping, dd->sym_count, + sizeof(*dd->sym_mapping), kernel_syms_cmp) : NULL; +} + +static void __printf(2, 3) print_insn(void *private_data, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); +} + +static void __printf(2, 3) +print_insn_for_graph(void *private_data, const char *fmt, ...) 
+{ + char buf[64], *p; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + p = buf; + while (*p != '\0') { + if (*p == '\n') { + memmove(p + 3, p, strlen(buf) + 1 - (p - buf)); + /* Align each instruction dump row left. */ + *p++ = '\\'; + *p++ = 'l'; + /* Output multiline concatenation. */ + *p++ = '\\'; + } else if (*p == '<' || *p == '>' || *p == '|' || *p == '&') { + memmove(p + 1, p, strlen(buf) + 1 - (p - buf)); + /* Escape special character. */ + *p++ = '\\'; + } + + p++; + } + + printf("%s", buf); +} + +static void __printf(2, 3) +print_insn_json(void *private_data, const char *fmt, ...) +{ + unsigned int l = strlen(fmt); + char chomped_fmt[l]; + va_list args; + + va_start(args, fmt); + if (l > 0) { + strncpy(chomped_fmt, fmt, l - 1); + chomped_fmt[l - 1] = '\0'; + } + jsonw_vprintf_enquote(json_wtr, chomped_fmt, args); + va_end(args); +} + +static const char *print_call_pcrel(struct dump_data *dd, + struct kernel_sym *sym, + unsigned long address, + const struct bpf_insn *insn) +{ + if (!dd->nr_jited_ksyms) + /* Do not show address for interpreted programs */ + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d", insn->off); + else if (sym) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d#%s", insn->off, sym->name); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d#0x%lx", insn->off, address); + return dd->scratch_buff; +} + +static const char *print_call_helper(struct dump_data *dd, + struct kernel_sym *sym, + unsigned long address) +{ + if (sym) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%s", sym->name); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "0x%lx", address); + return dd->scratch_buff; +} + +static const char *print_call(void *private_data, + const struct bpf_insn *insn) +{ + struct dump_data *dd = private_data; + unsigned long address = dd->address_call_base + insn->imm; + struct kernel_sym *sym; + + if (insn->src_reg == BPF_PSEUDO_CALL && + (__u32) insn->imm < dd->nr_jited_ksyms && dd->jited_ksyms) + address = dd->jited_ksyms[insn->imm]; + + sym = kernel_syms_search(dd, address); + if (insn->src_reg == BPF_PSEUDO_CALL) + return print_call_pcrel(dd, sym, address, insn); + else + return print_call_helper(dd, sym, address); +} + +static const char *print_imm(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm) +{ + struct dump_data *dd = private_data; + + if (insn->src_reg == BPF_PSEUDO_MAP_FD) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "map[id:%u]", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "map[id:%u][0]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "map[idx:%u]+%u", insn->imm, (insn + 1)->imm); + else if (insn->src_reg == BPF_PSEUDO_FUNC) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "subprog[%+d]", insn->imm); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "0x%llx", (unsigned long long)full_imm); + return dd->scratch_buff; +} + +void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum) +{ + const struct bpf_prog_linfo *prog_linfo = dd->prog_linfo; + const struct bpf_insn_cbs cbs = { + .cb_print = print_insn_json, + .cb_call = print_call, + .cb_imm = print_imm, + .private_data = dd, + }; + struct bpf_func_info *record; + struct bpf_insn *insn = buf; + struct btf *btf = 
dd->btf; + bool double_insn = false; + unsigned int nr_skip = 0; + char func_sig[1024]; + unsigned int i; + + jsonw_start_array(json_wtr); + record = dd->func_info; + for (i = 0; i < len / sizeof(*insn); i++) { + if (double_insn) { + double_insn = false; + continue; + } + double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); + + jsonw_start_object(json_wtr); + + if (btf && record) { + if (record->insn_off == i) { + btf_dumper_type_only(btf, record->type_id, + func_sig, + sizeof(func_sig)); + if (func_sig[0] != '\0') { + jsonw_name(json_wtr, "proto"); + jsonw_string(json_wtr, func_sig); + } + record = (void *)record + dd->finfo_rec_size; + } + } + + if (prog_linfo) { + const struct bpf_line_info *linfo; + + linfo = bpf_prog_linfo__lfind(prog_linfo, i, nr_skip); + if (linfo) { + btf_dump_linfo_json(btf, linfo, linum); + nr_skip++; + } + } + + jsonw_name(json_wtr, "disasm"); + print_bpf_insn(&cbs, insn + i, true); + + if (opcodes) { + jsonw_name(json_wtr, "opcodes"); + jsonw_start_object(json_wtr); + + jsonw_name(json_wtr, "code"); + jsonw_printf(json_wtr, "\"0x%02hhx\"", insn[i].code); + + jsonw_name(json_wtr, "src_reg"); + jsonw_printf(json_wtr, "\"0x%hhx\"", insn[i].src_reg); + + jsonw_name(json_wtr, "dst_reg"); + jsonw_printf(json_wtr, "\"0x%hhx\"", insn[i].dst_reg); + + jsonw_name(json_wtr, "off"); + print_hex_data_json((uint8_t *)(&insn[i].off), 2); + + jsonw_name(json_wtr, "imm"); + if (double_insn && i < len - 1) + print_hex_data_json((uint8_t *)(&insn[i].imm), + 12); + else + print_hex_data_json((uint8_t *)(&insn[i].imm), + 4); + jsonw_end_object(json_wtr); + } + jsonw_end_object(json_wtr); + } + jsonw_end_array(json_wtr); +} + +void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum) +{ + const struct bpf_prog_linfo *prog_linfo = dd->prog_linfo; + const struct bpf_insn_cbs cbs = { + .cb_print = print_insn, + .cb_call = print_call, + .cb_imm = print_imm, + .private_data = dd, + }; + struct bpf_func_info *record; + struct bpf_insn *insn = buf; + struct btf *btf = dd->btf; + unsigned int nr_skip = 0; + bool double_insn = false; + char func_sig[1024]; + unsigned int i; + + record = dd->func_info; + for (i = 0; i < len / sizeof(*insn); i++) { + if (double_insn) { + double_insn = false; + continue; + } + + if (btf && record) { + if (record->insn_off == i) { + btf_dumper_type_only(btf, record->type_id, + func_sig, + sizeof(func_sig)); + if (func_sig[0] != '\0') + printf("%s:\n", func_sig); + record = (void *)record + dd->finfo_rec_size; + } + } + + if (prog_linfo) { + const struct bpf_line_info *linfo; + + linfo = bpf_prog_linfo__lfind(prog_linfo, i, nr_skip); + if (linfo) { + btf_dump_linfo_plain(btf, linfo, "; ", + linum); + nr_skip++; + } + } + + double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); + + printf("% 4d: ", i); + print_bpf_insn(&cbs, insn + i, true); + + if (opcodes) { + printf(" "); + fprint_hex(stdout, insn + i, 8, " "); + if (double_insn && i < len - 1) { + printf(" "); + fprint_hex(stdout, insn + i + 1, 8, " "); + } + printf("\n"); + } + } +} + +void dump_xlated_for_graph(struct dump_data *dd, void *buf_start, void *buf_end, + unsigned int start_idx, + bool opcodes, bool linum) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = print_insn_for_graph, + .cb_call = print_call, + .cb_imm = print_imm, + .private_data = dd, + }; + const struct bpf_prog_linfo *prog_linfo = dd->prog_linfo; + const struct bpf_line_info *last_linfo = NULL; + struct bpf_func_info *record = dd->func_info; + struct bpf_insn *insn_start = 
buf_start; + struct bpf_insn *insn_end = buf_end; + struct bpf_insn *cur = insn_start; + struct btf *btf = dd->btf; + bool double_insn = false; + char func_sig[1024]; + + for (; cur <= insn_end; cur++) { + unsigned int insn_off; + + if (double_insn) { + double_insn = false; + continue; + } + double_insn = cur->code == (BPF_LD | BPF_IMM | BPF_DW); + + insn_off = (unsigned int)(cur - insn_start + start_idx); + if (btf && record) { + if (record->insn_off == insn_off) { + btf_dumper_type_only(btf, record->type_id, + func_sig, + sizeof(func_sig)); + if (func_sig[0] != '\0') + printf("; %s:\\l\\\n", func_sig); + record = (void *)record + dd->finfo_rec_size; + } + } + + if (prog_linfo) { + const struct bpf_line_info *linfo; + + linfo = bpf_prog_linfo__lfind(prog_linfo, insn_off, 0); + if (linfo && linfo != last_linfo) { + btf_dump_linfo_dotlabel(btf, linfo, linum); + last_linfo = linfo; + } + } + + printf("%d: ", insn_off); + print_bpf_insn(&cbs, cur, true); + + if (opcodes) { + printf("\\ \\ \\ \\ "); + fprint_hex(stdout, cur, 8, " "); + if (double_insn && cur <= insn_end - 1) { + printf(" "); + fprint_hex(stdout, cur + 1, 8, " "); + } + printf("\\l\\\n"); + } + + if (cur != insn_end) + printf("| "); + } +} diff --git a/third_party/bpftool/src/xlated_dumper.h b/third_party/bpftool/src/xlated_dumper.h new file mode 100644 index 0000000..9a94637 --- /dev/null +++ b/third_party/bpftool/src/xlated_dumper.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (C) 2018 Netronome Systems, Inc. */ + +#ifndef __BPF_TOOL_XLATED_DUMPER_H +#define __BPF_TOOL_XLATED_DUMPER_H + +#define SYM_MAX_NAME 256 + +struct bpf_prog_linfo; + +struct kernel_sym { + unsigned long address; + char name[SYM_MAX_NAME]; +}; + +struct dump_data { + unsigned long address_call_base; + struct kernel_sym *sym_mapping; + __u32 sym_count; + __u64 *jited_ksyms; + __u32 nr_jited_ksyms; + struct btf *btf; + void *func_info; + __u32 finfo_rec_size; + const struct bpf_prog_linfo *prog_linfo; + char scratch_buff[SYM_MAX_NAME + 8]; +}; + +void kernel_syms_load(struct dump_data *dd); +void kernel_syms_destroy(struct dump_data *dd); +struct kernel_sym *kernel_syms_search(struct dump_data *dd, unsigned long key); +void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum); +void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, + bool opcodes, bool linum); +void dump_xlated_for_graph(struct dump_data *dd, void *buf, void *buf_end, + unsigned int start_index, + bool opcodes, bool linum); + +#endif diff --git a/third_party/libbpf/.gitattributes b/third_party/libbpf/.gitattributes new file mode 100644 index 0000000..c899af4 --- /dev/null +++ b/third_party/libbpf/.gitattributes @@ -0,0 +1 @@ +assets/** export-ignore diff --git a/third_party/libbpf/.readthedocs.yaml b/third_party/libbpf/.readthedocs.yaml new file mode 100644 index 0000000..803dfa2 --- /dev/null +++ b/third_party/libbpf/.readthedocs.yaml @@ -0,0 +1,22 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: docs/conf.py + +formats: + - htmlzip + - pdf + - epub + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: docs/sphinx/requirements.txt \ No newline at end of file diff --git 
a/third_party/libbpf/BPF-CHECKPOINT-COMMIT b/third_party/libbpf/BPF-CHECKPOINT-COMMIT new file mode 100644 index 0000000..ec5e4be --- /dev/null +++ b/third_party/libbpf/BPF-CHECKPOINT-COMMIT @@ -0,0 +1 @@ +496720b7cfb6574a8f6f4d434f23e3d1e6cfaeb9 diff --git a/third_party/libbpf/CHECKPOINT-COMMIT b/third_party/libbpf/CHECKPOINT-COMMIT new file mode 100644 index 0000000..130cfa6 --- /dev/null +++ b/third_party/libbpf/CHECKPOINT-COMMIT @@ -0,0 +1 @@ +a3e7e6b17946f48badce98d7ac360678a0ea7393 diff --git a/third_party/libbpf/LICENSE b/third_party/libbpf/LICENSE new file mode 100644 index 0000000..d38fed3 --- /dev/null +++ b/third_party/libbpf/LICENSE @@ -0,0 +1 @@ +LGPL-2.1 OR BSD-2-Clause diff --git a/third_party/libbpf/LICENSE.BSD-2-Clause b/third_party/libbpf/LICENSE.BSD-2-Clause new file mode 100644 index 0000000..bce40aa --- /dev/null +++ b/third_party/libbpf/LICENSE.BSD-2-Clause @@ -0,0 +1,32 @@ +Valid-License-Identifier: BSD-2-Clause +SPDX-URL: https://spdx.org/licenses/BSD-2-Clause.html +Usage-Guide: + To use the BSD 2-clause "Simplified" License put the following SPDX + tag/value pair into a comment according to the placement guidelines in + the licensing rules documentation: + SPDX-License-Identifier: BSD-2-Clause +License-Text: + +Copyright (c) 2015 The Libbpf Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/libbpf/LICENSE.LGPL-2.1 b/third_party/libbpf/LICENSE.LGPL-2.1 new file mode 100644 index 0000000..27bb434 --- /dev/null +++ b/third_party/libbpf/LICENSE.LGPL-2.1 @@ -0,0 +1,503 @@ +Valid-License-Identifier: LGPL-2.1 +Valid-License-Identifier: LGPL-2.1+ +SPDX-URL: https://spdx.org/licenses/LGPL-2.1.html +Usage-Guide: + To use this license in source code, put one of the following SPDX + tag/value pairs into a comment according to the placement + guidelines in the licensing rules documentation. + For 'GNU Lesser General Public License (LGPL) version 2.1 only' use: + SPDX-License-Identifier: LGPL-2.1 + For 'GNU Lesser General Public License (LGPL) version 2.1 or any later + version' use: + SPDX-License-Identifier: LGPL-2.1+ +License-Text: + +GNU LESSER GENERAL PUBLIC LICENSE +Version 2.1, February 1999 + +Copyright (C) 1991, 1999 Free Software Foundation, Inc. 
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts as +the successor of the GNU Library Public License, version 2, hence the +version number 2.1.] + +Preamble + +The licenses for most software are designed to take away your freedom to +share and change it. By contrast, the GNU General Public Licenses are +intended to guarantee your freedom to share and change free software--to +make sure the software is free for all its users. + +This license, the Lesser General Public License, applies to some specially +designated software packages--typically libraries--of the Free Software +Foundation and other authors who decide to use it. You can use it too, but +we suggest you first think carefully about whether this license or the +ordinary General Public License is the better strategy to use in any +particular case, based on the explanations below. + +When we speak of free software, we are referring to freedom of use, not +price. Our General Public Licenses are designed to make sure that you have +the freedom to distribute copies of free software (and charge for this +service if you wish); that you receive source code or can get it if you +want it; that you can change the software and use pieces of it in new free +programs; and that you are informed that you can do these things. + +To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for you if +you distribute copies of the library or if you modify it. + +For example, if you distribute copies of the library, whether gratis or for +a fee, you must give the recipients all the rights that we gave you. You +must make sure that they, too, receive or can get the source code. If you +link other code with the library, you must provide complete object files to +the recipients, so that they can relink them with the library after making +changes to the library and recompiling it. And you must show them these +terms so they know their rights. + +We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + +To protect each distributor, we want to make it very clear that there is no +warranty for the free library. Also, if the library is modified by someone +else and passed on, the recipients should know that what they have is not +the original version, so that the original author's reputation will not be +affected by problems that might be introduced by others. + +Finally, software patents pose a constant threat to the existence of any +free program. We wish to make sure that a company cannot effectively +restrict the users of a free program by obtaining a restrictive license +from a patent holder. Therefore, we insist that any patent license obtained +for a version of the library must be consistent with the full freedom of +use specified in this license. + +Most GNU software, including some libraries, is covered by the ordinary GNU +General Public License. This license, the GNU Lesser General Public +License, applies to certain designated libraries, and is quite different +from the ordinary General Public License. 
We use this license for certain +libraries in order to permit linking those libraries into non-free +programs. + +When a program is linked with a library, whether statically or using a +shared library, the combination of the two is legally speaking a combined +work, a derivative of the original library. The ordinary General Public +License therefore permits such linking only if the entire combination fits +its criteria of freedom. The Lesser General Public License permits more lax +criteria for linking other code with the library. + +We call this license the "Lesser" General Public License because it does +Less to protect the user's freedom than the ordinary General Public +License. It also provides other free software developers Less of an +advantage over competing non-free programs. These disadvantages are the +reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + +For example, on rare occasions, there may be a special need to encourage +the widest possible use of a certain library, so that it becomes a de-facto +standard. To achieve this, non-free programs must be allowed to use the +library. A more frequent case is that a free library does the same job as +widely used non-free libraries. In this case, there is little to gain by +limiting the free library to free software only, so we use the Lesser +General Public License. + +In other cases, permission to use a particular library in non-free programs +enables a greater number of people to use a large body of free +software. For example, permission to use the GNU C Library in non-free +programs enables many more people to use the whole GNU operating system, as +well as its variant, the GNU/Linux operating system. + +Although the Lesser General Public License is Less protective of the users' +freedom, it does ensure that the user of a program that is linked with the +Library has the freedom and the wherewithal to run that program using a +modified version of the Library. + +The precise terms and conditions for copying, distribution and modification +follow. Pay close attention to the difference between a "work based on the +library" and a "work that uses the library". The former contains code +derived from the library, whereas the latter must be combined with the +library in order to run. + +TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +0. This License Agreement applies to any software library or other program + which contains a notice placed by the copyright holder or other + authorized party saying it may be distributed under the terms of this + Lesser General Public License (also called "this License"). Each + licensee is addressed as "you". + + A "library" means a collection of software functions and/or data + prepared so as to be conveniently linked with application programs + (which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work which + has been distributed under these terms. A "work based on the Library" + means either the Library or any derivative work under copyright law: + that is to say, a work containing the Library or a portion of it, either + verbatim or with modifications and/or translated straightforwardly into + another language. (Hereinafter, translation is included without + limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for making + modifications to it. 
For a library, complete source code means all the + source code for all modules it contains, plus any associated interface + definition files, plus the scripts used to control compilation and + installation of the library. + + Activities other than copying, distribution and modification are not + covered by this License; they are outside its scope. The act of running + a program using the Library is not restricted, and output from such a + program is covered only if its contents constitute a work based on the + Library (independent of the use of the Library in a tool for writing + it). Whether that is true depends on what the Library does and what the + program that uses the Library does. + +1. You may copy and distribute verbatim copies of the Library's complete + source code as you receive it, in any medium, provided that you + conspicuously and appropriately publish on each copy an appropriate + copyright notice and disclaimer of warranty; keep intact all the notices + that refer to this License and to the absence of any warranty; and + distribute a copy of this License along with the Library. + + You may charge a fee for the physical act of transferring a copy, and + you may at your option offer warranty protection in exchange for a fee. + +2. You may modify your copy or copies of the Library or any portion of it, + thus forming a work based on the Library, and copy and distribute such + modifications or work under the terms of Section 1 above, provided that + you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices stating + that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no charge to + all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a table + of data to be supplied by an application program that uses the + facility, other than as an argument passed when the facility is + invoked, then you must make a good faith effort to ensure that, in + the event an application does not supply such function or table, the + facility still operates, and performs whatever part of its purpose + remains meaningful. + + (For example, a function in a library to compute square roots has a + purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must be + optional: if the application does not supply it, the square root + function must still compute square roots.) + + These requirements apply to the modified work as a whole. If + identifiable sections of that work are not derived from the Library, and + can be reasonably considered independent and separate works in + themselves, then this License, and its terms, do not apply to those + sections when you distribute them as separate works. But when you + distribute the same sections as part of a whole which is a work based on + the Library, the distribution of the whole must be on the terms of this + License, whose permissions for other licensees extend to the entire + whole, and thus to each and every part regardless of who wrote it. + + Thus, it is not the intent of this section to claim rights or contest + your rights to work written entirely by you; rather, the intent is to + exercise the right to control the distribution of derivative or + collective works based on the Library. 
+ + In addition, mere aggregation of another work not based on the Library + with the Library (or with a work based on the Library) on a volume of a + storage or distribution medium does not bring the other work under the + scope of this License. + +3. You may opt to apply the terms of the ordinary GNU General Public + License instead of this License to a given copy of the Library. To do + this, you must alter all the notices that refer to this License, so that + they refer to the ordinary GNU General Public License, version 2, + instead of to this License. (If a newer version than version 2 of the + ordinary GNU General Public License has appeared, then you can specify + that version instead if you wish.) Do not make any other change in these + notices. + + Once this change is made in a given copy, it is irreversible for that + copy, so the ordinary GNU General Public License applies to all + subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of the + Library into a program that is not a library. + +4. You may copy and distribute the Library (or a portion or derivative of + it, under Section 2) in object code or executable form under the terms + of Sections 1 and 2 above provided that you accompany it with the + complete corresponding machine-readable source code, which must be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange. + + If distribution of object code is made by offering access to copy from a + designated place, then offering equivalent access to copy the source + code from the same place satisfies the requirement to distribute the + source code, even though third parties are not compelled to copy the + source along with the object code. + +5. A program that contains no derivative of any portion of the Library, but + is designed to work with the Library by being compiled or linked with + it, is called a "work that uses the Library". Such a work, in isolation, + is not a derivative work of the Library, and therefore falls outside the + scope of this License. + + However, linking a "work that uses the Library" with the Library creates + an executable that is a derivative of the Library (because it contains + portions of the Library), rather than a "work that uses the + library". The executable is therefore covered by this License. Section 6 + states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file + that is part of the Library, the object code for the work may be a + derivative work of the Library even though the source code is + not. Whether this is true is especially significant if the work can be + linked without the Library, or if the work is itself a library. The + threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data structure + layouts and accessors, and small macros and small inline functions (ten + lines or less in length), then the use of the object file is + unrestricted, regardless of whether it is legally a derivative + work. (Executables containing this object code plus portions of the + Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may + distribute the object code for the work under the terms of Section + 6. 
Any executables containing that work also fall under Section 6, + whether or not they are linked directly with the Library itself. + +6. As an exception to the Sections above, you may also combine or link a + "work that uses the Library" with the Library to produce a work + containing portions of the Library, and distribute that work under terms + of your choice, provided that the terms permit modification of the work + for the customer's own use and reverse engineering for debugging such + modifications. + + You must give prominent notice with each copy of the work that the + Library is used in it and that the Library and its use are covered by + this License. You must supply a copy of this License. If the work during + execution displays copyright notices, you must include the copyright + notice for the Library among them, as well as a reference directing the + user to the copy of this License. Also, you must do one of these things: + + a) Accompany the work with the complete corresponding machine-readable + source code for the Library including whatever changes were used in + the work (which must be distributed under Sections 1 and 2 above); + and, if the work is an executable linked with the Library, with the + complete machine-readable "work that uses the Library", as object + code and/or source code, so that the user can modify the Library and + then relink to produce a modified executable containing the modified + Library. (It is understood that the user who changes the contents of + definitions files in the Library will not necessarily be able to + recompile the application to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a copy + of the library already present on the user's computer system, rather + than copying library functions into the executable, and (2) will + operate properly with a modified version of the library, if the user + installs one, as long as the modified version is interface-compatible + with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least three + years, to give the same user the materials specified in Subsection + 6a, above, for a charge no more than the cost of performing this + distribution. + + d) If distribution of the work is made by offering access to copy from a + designated place, offer equivalent access to copy the above specified + materials from the same place. + + e) Verify that the user has already received a copy of these materials + or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the Library" + must include any data and utility programs needed for reproducing the + executable from it. However, as a special exception, the materials to be + distributed need not include anything that is normally distributed (in + either source or binary form) with the major components (compiler, + kernel, and so on) of the operating system on which the executable runs, + unless that component itself accompanies the executable. + + It may happen that this requirement contradicts the license restrictions + of other proprietary libraries that do not normally accompany the + operating system. Such a contradiction means you cannot use both them + and the Library together in an executable that you distribute. + +7. 
You may place library facilities that are a work based on the Library + side-by-side in a single library together with other library facilities + not covered by this License, and distribute such a combined library, + provided that the separate distribution of the work based on the Library + and of the other library facilities is otherwise permitted, and provided + that you do these two things: + + a) Accompany the combined library with a copy of the same work based on + the Library, uncombined with any other library facilities. This must + be distributed under the terms of the Sections above. + + b) Give prominent notice with the combined library of the fact that part + of it is a work based on the Library, and explaining where to find + the accompanying uncombined form of the same work. + +8. You may not copy, modify, sublicense, link with, or distribute the + Library except as expressly provided under this License. Any attempt + otherwise to copy, modify, sublicense, link with, or distribute the + Library is void, and will automatically terminate your rights under this + License. However, parties who have received copies, or rights, from you + under this License will not have their licenses terminated so long as + such parties remain in full compliance. + +9. You are not required to accept this License, since you have not signed + it. However, nothing else grants you permission to modify or distribute + the Library or its derivative works. These actions are prohibited by law + if you do not accept this License. Therefore, by modifying or + distributing the Library (or any work based on the Library), you + indicate your acceptance of this License to do so, and all its terms and + conditions for copying, distributing or modifying the Library or works + based on it. + +10. Each time you redistribute the Library (or any work based on the + Library), the recipient automatically receives a license from the + original licensor to copy, distribute, link with or modify the Library + subject to these terms and conditions. You may not impose any further + restrictions on the recipients' exercise of the rights granted + herein. You are not responsible for enforcing compliance by third + parties with this License. + +11. If, as a consequence of a court judgment or allegation of patent + infringement or for any other reason (not limited to patent issues), + conditions are imposed on you (whether by court order, agreement or + otherwise) that contradict the conditions of this License, they do not + excuse you from the conditions of this License. If you cannot + distribute so as to satisfy simultaneously your obligations under this + License and any other pertinent obligations, then as a consequence you + may not distribute the Library at all. For example, if a patent license + would not permit royalty-free redistribution of the Library by all + those who receive copies directly or indirectly through you, then the + only way you could satisfy both it and this License would be to refrain + entirely from distribution of the Library. + + If any portion of this section is held invalid or unenforceable under + any particular circumstance, the balance of the section is intended to + apply, and the section as a whole is intended to apply in other + circumstances. 
+ + It is not the purpose of this section to induce you to infringe any + patents or other property right claims or to contest validity of any + such claims; this section has the sole purpose of protecting the + integrity of the free software distribution system which is implemented + by public license practices. Many people have made generous + contributions to the wide range of software distributed through that + system in reliance on consistent application of that system; it is up + to the author/donor to decide if he or she is willing to distribute + software through any other system and a licensee cannot impose that + choice. + + This section is intended to make thoroughly clear what is believed to + be a consequence of the rest of this License. + +12. If the distribution and/or use of the Library is restricted in certain + countries either by patents or by copyrighted interfaces, the original + copyright holder who places the Library under this License may add an + explicit geographical distribution limitation excluding those + countries, so that distribution is permitted only in or among countries + not thus excluded. In such case, this License incorporates the + limitation as if written in the body of this License. + +13. The Free Software Foundation may publish revised and/or new versions of + the Lesser General Public License from time to time. Such new versions + will be similar in spirit to the present version, but may differ in + detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the Library + specifies a version number of this License which applies to it and "any + later version", you have the option of following the terms and + conditions either of that version or of any later version published by + the Free Software Foundation. If the Library does not specify a license + version number, you may choose any version ever published by the Free + Software Foundation. + +14. If you wish to incorporate parts of the Library into other free + programs whose distribution conditions are incompatible with these, + write to the author to ask for permission. For software which is + copyrighted by the Free Software Foundation, write to the Free Software + Foundation; we sometimes make exceptions for this. Our decision will be + guided by the two goals of preserving the free status of all + derivatives of our free software and of promoting the sharing and reuse + of software generally. + +NO WARRANTY + +15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY + FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN + OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES + PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER + EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE + ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH + YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL + NECESSARY SERVICING, REPAIR OR CORRECTION. + +16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING + WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR + REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR + DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL + DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY + (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED + INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF + THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR + OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +END OF TERMS AND CONDITIONS + +How to Apply These Terms to Your New Libraries + +If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + +To apply these terms, attach the following notices to the library. It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + +one line to give the library's name and an idea of what it does. +Copyright (C) year name of author + +This library is free software; you can redistribute it and/or modify it +under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at +your option) any later version. + +This library is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this library; if not, write to the Free Software Foundation, +Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add +information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + +Yoyodyne, Inc., hereby disclaims all copyright interest in +the library `Frob' (a library for tweaking knobs) written +by James Random Hacker. + +signature of Ty Coon, 1 April 1990 +Ty Coon, President of Vice +That's all there is to it! 
diff --git a/third_party/libbpf/assets/libbpf-logo-compact-darkbg.png b/third_party/libbpf/assets/libbpf-logo-compact-darkbg.png new file mode 100644 index 0000000..aaef60e Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-compact-darkbg.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-compact-mono.png b/third_party/libbpf/assets/libbpf-logo-compact-mono.png new file mode 100644 index 0000000..51daed1 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-compact-mono.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-compact.png b/third_party/libbpf/assets/libbpf-logo-compact.png new file mode 100644 index 0000000..99ae3ab Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-compact.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sideways-darkbg.png b/third_party/libbpf/assets/libbpf-logo-sideways-darkbg.png new file mode 100644 index 0000000..00d6d83 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sideways-darkbg.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sideways-mono.png b/third_party/libbpf/assets/libbpf-logo-sideways-mono.png new file mode 100644 index 0000000..82f1ce5 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sideways-mono.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sideways.png b/third_party/libbpf/assets/libbpf-logo-sideways.png new file mode 100644 index 0000000..48186c3 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sideways.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sparse-darkbg.png b/third_party/libbpf/assets/libbpf-logo-sparse-darkbg.png new file mode 100644 index 0000000..5e57430 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sparse-darkbg.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sparse-mono.png b/third_party/libbpf/assets/libbpf-logo-sparse-mono.png new file mode 100644 index 0000000..b02d432 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sparse-mono.png differ diff --git a/third_party/libbpf/assets/libbpf-logo-sparse.png b/third_party/libbpf/assets/libbpf-logo-sparse.png new file mode 100644 index 0000000..a306588 Binary files /dev/null and b/third_party/libbpf/assets/libbpf-logo-sparse.png differ diff --git a/third_party/libbpf/ci/diffs/.keep b/third_party/libbpf/ci/diffs/.keep new file mode 100644 index 0000000..e69de29 diff --git a/third_party/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch b/third_party/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch new file mode 100644 index 0000000..2f8d84f --- /dev/null +++ b/third_party/libbpf/ci/diffs/0001-s390-define-RUNTIME_DISCARD_EXIT-to-fix-link-error-w.patch @@ -0,0 +1,70 @@ +From 6fba14e2ed9d159f76b23fa5c16f3ea99acbc003 Mon Sep 17 00:00:00 2001 +From: Masahiro Yamada +Date: Thu, 5 Jan 2023 12:13:06 +0900 +Subject: [PATCH] s390: define RUNTIME_DISCARD_EXIT to fix link error with GNU + ld < 2.36 + +Nathan Chancellor reports that the s390 vmlinux fails to link with +GNU ld < 2.36 since commit 99cb0d917ffa ("arch: fix broken BuildID +for arm64 and riscv"). + +It happens for defconfig, or more specifically for CONFIG_EXPOLINE=y. 
+ + $ s390x-linux-gnu-ld --version | head -n1 + GNU ld (GNU Binutils for Debian) 2.35.2 + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- allnoconfig + $ ./scripts/config -e CONFIG_EXPOLINE + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- olddefconfig + $ make -s ARCH=s390 CROSS_COMPILE=s390x-linux-gnu- + `.exit.text' referenced in section `.s390_return_reg' of drivers/base/dd.o: defined in discarded section `.exit.text' of drivers/base/dd.o + make[1]: *** [scripts/Makefile.vmlinux:34: vmlinux] Error 1 + make: *** [Makefile:1252: vmlinux] Error 2 + +arch/s390/kernel/vmlinux.lds.S wants to keep EXIT_TEXT: + + .exit.text : { + EXIT_TEXT + } + +But, at the same time, EXIT_TEXT is thrown away by DISCARD because +s390 does not define RUNTIME_DISCARD_EXIT. + +I still do not understand why the latter wins after 99cb0d917ffa, +but defining RUNTIME_DISCARD_EXIT seems correct because the comment +line in arch/s390/kernel/vmlinux.lds.S says: + + /* + * .exit.text is discarded at runtime, not link time, + * to deal with references from __bug_table + */ + +Nathan also found that binutils commit 21401fc7bf67 ("Duplicate output +sections in scripts") cured this issue, so we cannot reproduce it with +binutils 2.36+, but it is better to not rely on it. + +Fixes: 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") +Link: https://lore.kernel.org/all/Y7Jal56f6UBh1abE@dev-arch.thelio-3990X/ +Reported-by: Nathan Chancellor +Signed-off-by: Masahiro Yamada +Link: https://lore.kernel.org/r/20230105031306.1455409-1-masahiroy@kernel.org +Signed-off-by: Heiko Carstens +--- + arch/s390/kernel/vmlinux.lds.S | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S +index 5ea3830af0cc..6e101e6f499d 100644 +--- a/arch/s390/kernel/vmlinux.lds.S ++++ b/arch/s390/kernel/vmlinux.lds.S +@@ -17,6 +17,8 @@ + /* Handle ro_after_init data on our own. */ + #define RO_AFTER_INIT_DATA + ++#define RUNTIME_DISCARD_EXIT ++ + #define EMITS_PT_NOTE + + #include +-- +2.30.2 + diff --git a/third_party/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch b/third_party/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch new file mode 100644 index 0000000..64bf9ff --- /dev/null +++ b/third_party/libbpf/ci/diffs/0001-selftests-bpf-Check-whether-to-run-selftest.patch @@ -0,0 +1,37 @@ +From ff8be5401b359e23ec2b74184034082564bac7c5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Daniel=20M=C3=BCller?= +Date: Thu, 25 May 2023 16:04:20 -0700 +Subject: [PATCH] selftests/bpf: Check whether to run selftest +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The sockopt test invokes test__start_subtest and then unconditionally +asserts the success. That means that even if deny-listed, any test will +still run and potentially fail. +Evaluate the return value of test__start_subtest() to achieve the +desired behavior, as other tests do. 
+ +Signed-off-by: Daniel Müller +--- + tools/testing/selftests/bpf/prog_tests/sockopt.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c +index 33dd45..9e6a5e 100644 +--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c ++++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c +@@ -1060,7 +1060,9 @@ void test_sockopt(void) + return; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { +- test__start_subtest(tests[i].descr); ++ if (!test__start_subtest(tests[i].descr)) ++ continue; ++ + ASSERT_OK(run_test(cgroup_fd, &tests[i]), tests[i].descr); + } + +-- +2.34.1 + diff --git a/third_party/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch b/third_party/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch new file mode 100644 index 0000000..3ee42c6 --- /dev/null +++ b/third_party/libbpf/ci/diffs/0001-selftests-bpf-Select-CONFIG_FUNCTION_ERROR_INJECTION.patch @@ -0,0 +1,46 @@ +From a8dfde09c90109e3a98af54847e91bde7dc2d5c2 Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Tue, 13 Dec 2022 14:05:00 -0800 +Subject: [PATCH] selftests/bpf: Select CONFIG_FUNCTION_ERROR_INJECTION +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +BPF selftests require CONFIG_FUNCTION_ERROR_INJECTION to work. However, +CONFIG_FUNCTION_ERROR_INJECTION is no longer 'y' by default after recent +changes. As a result, we are seeing errors like the following from BPF CI: + + bpf_testmod_test_read() is not modifiable + __x64_sys_setdomainname is not sleepable + __x64_sys_getpgid is not sleepable + +Fix this by explicitly selecting CONFIG_FUNCTION_ERROR_INJECTION in the +selftest config. + +Fixes: a4412fdd49dc ("error-injection: Add prompt for function error injection") +Reported-by: Daniel Müller +Signed-off-by: Song Liu +Signed-off-by: Andrii Nakryiko +Signed-off-by: Daniel Borkmann +Acked-by: Daniel Müller +Link: https://lore.kernel.org/bpf/20221213220500.3427947-1-song@kernel.org +Signed-off-by: Daniel Müller +--- + tools/testing/selftests/bpf/config | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config +index 612f69..63cd4a 100644 +--- a/tools/testing/selftests/bpf/config ++++ b/tools/testing/selftests/bpf/config +@@ -16,6 +16,7 @@ CONFIG_CRYPTO_USER_API_HASH=y + CONFIG_DYNAMIC_FTRACE=y + CONFIG_FPROBE=y + CONFIG_FTRACE_SYSCALLS=y ++CONFIG_FUNCTION_ERROR_INJECTION=y + CONFIG_FUNCTION_TRACER=y + CONFIG_GENEVE=y + CONFIG_IKCONFIG=y +-- +2.30.2 + diff --git a/third_party/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch b/third_party/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch new file mode 100644 index 0000000..9547c62 --- /dev/null +++ b/third_party/libbpf/ci/diffs/0001-tracing-fprobe-Initialize-ret-valiable-to-fix-smatch.patch @@ -0,0 +1,68 @@ +From d3484f640bc82cff459beb85a00f7ebab20f0a41 Mon Sep 17 00:00:00 2001 +From: "Masami Hiramatsu (Google)" +Date: Sun, 9 Apr 2023 11:28:31 +0900 +Subject: [PATCH] tracing: fprobe: Initialize ret valiable to fix smatch error + +The commit 39d954200bf6 ("fprobe: Skip exit_handler if entry_handler returns +!0") introduced a hidden dependency of 'ret' local variable in the +fprobe_handler(), Smatch warns the `ret` can be accessed without +initialization. + + kernel/trace/fprobe.c:59 fprobe_handler() + error: uninitialized symbol 'ret'. 
+ +kernel/trace/fprobe.c + 49 fpr->entry_ip = ip; + 50 if (fp->entry_data_size) + 51 entry_data = fpr->data; + 52 } + 53 + 54 if (fp->entry_handler) + 55 ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data); + +ret is only initialized if there is an ->entry_handler + + 56 + 57 /* If entry_handler returns !0, nmissed is not counted. */ + 58 if (rh) { + +rh is only true if there is an ->exit_handler. Presumably if you have +and ->exit_handler that means you also have a ->entry_handler but Smatch +is not smart enough to figure it out. + +--> 59 if (ret) + ^^^ +Warning here. + + 60 rethook_recycle(rh); + 61 else + 62 rethook_hook(rh, ftrace_get_regs(fregs), true); + 63 } + 64 out: + 65 ftrace_test_recursion_unlock(bit); + 66 } + +Reported-by: Dan Carpenter +Link: https://lore.kernel.org/all/85429a5c-a4b9-499e-b6c0-cbd313291c49@kili.mountain +Fixes: 39d954200bf6 ("fprobe: Skip exit_handler if entry_handler returns !0") +Signed-off-by: Masami Hiramatsu (Google) +--- + kernel/trace/fprobe.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c +index 9abb3905bc8e..293184227394 100644 +--- a/kernel/trace/fprobe.c ++++ b/kernel/trace/fprobe.c +@@ -27,7 +27,7 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct rethook_node *rh = NULL; + struct fprobe *fp; + void *entry_data = NULL; +- int bit, ret; ++ int bit, ret = 0; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) +-- +2.34.1 + diff --git a/third_party/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch b/third_party/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch new file mode 100644 index 0000000..b97dba0 --- /dev/null +++ b/third_party/libbpf/ci/diffs/0001-veth-take-into-account-peer-device-for-NETDEV_XDP_AC.patch @@ -0,0 +1,83 @@ +From 8267fc71abb2dc47338570e56dd3473a58313fce Mon Sep 17 00:00:00 2001 +From: Lorenzo Bianconi +Date: Mon, 17 Apr 2023 23:53:22 +0200 +Subject: [PATCH] veth: take into account peer device for + NETDEV_XDP_ACT_NDO_XMIT xdp_features flag + +For veth pairs, NETDEV_XDP_ACT_NDO_XMIT is supported by the current +device if the peer one is running a XDP program or if it has GRO enabled. +Fix the xdp_features flags reporting considering peer device and not +current one for NETDEV_XDP_ACT_NDO_XMIT. 
+ +Fixes: fccca038f300 ("veth: take into account device reconfiguration for xdp_features flag") +Signed-off-by: Lorenzo Bianconi +Link: https://lore.kernel.org/r/4f1ca6f6f6b42ae125bfdb5c7782217c83968b2e.1681767806.git.lorenzo@kernel.org +Signed-off-by: Alexei Starovoitov +--- + drivers/net/veth.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/veth.c b/drivers/net/veth.c +index e1b38fbf1dd9..4b3c6647edc6 100644 +--- a/drivers/net/veth.c ++++ b/drivers/net/veth.c +@@ -1262,11 +1262,12 @@ static void veth_set_xdp_features(struct net_device *dev) + + peer = rtnl_dereference(priv->peer); + if (peer && peer->real_num_tx_queues <= dev->real_num_rx_queues) { ++ struct veth_priv *priv_peer = netdev_priv(peer); + xdp_features_t val = NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_RX_SG; + +- if (priv->_xdp_prog || veth_gro_requested(dev)) ++ if (priv_peer->_xdp_prog || veth_gro_requested(peer)) + val |= NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_NDO_XMIT_SG; + xdp_set_features_flag(dev, val); +@@ -1504,19 +1505,23 @@ static int veth_set_features(struct net_device *dev, + { + netdev_features_t changed = features ^ dev->features; + struct veth_priv *priv = netdev_priv(dev); ++ struct net_device *peer; + int err; + + if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) + return 0; + ++ peer = rtnl_dereference(priv->peer); + if (features & NETIF_F_GRO) { + err = veth_napi_enable(dev); + if (err) + return err; + +- xdp_features_set_redirect_target(dev, true); ++ if (peer) ++ xdp_features_set_redirect_target(peer, true); + } else { +- xdp_features_clear_redirect_target(dev); ++ if (peer) ++ xdp_features_clear_redirect_target(peer); + veth_napi_del(dev); + } + return 0; +@@ -1598,13 +1603,13 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, + peer->max_mtu = max_mtu; + } + +- xdp_features_set_redirect_target(dev, true); ++ xdp_features_set_redirect_target(peer, true); + } + + if (old_prog) { + if (!prog) { +- if (!veth_gro_requested(dev)) +- xdp_features_clear_redirect_target(dev); ++ if (peer && !veth_gro_requested(dev)) ++ xdp_features_clear_redirect_target(peer); + + if (dev->flags & IFF_UP) + veth_disable_xdp(dev); +-- +2.34.1 + diff --git a/third_party/libbpf/ci/managers/debian.sh b/third_party/libbpf/ci/managers/debian.sh new file mode 100755 index 0000000..3d3f860 --- /dev/null +++ b/third_party/libbpf/ci/managers/debian.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +PHASES=(${@:-SETUP RUN RUN_ASAN CLEANUP}) +DEBIAN_RELEASE="${DEBIAN_RELEASE:-testing}" +CONT_NAME="${CONT_NAME:-libbpf-debian-$DEBIAN_RELEASE}" +ENV_VARS="${ENV_VARS:-}" +DOCKER_RUN="${DOCKER_RUN:-docker run}" +REPO_ROOT="${REPO_ROOT:-$PWD}" +ADDITIONAL_DEPS=(pkgconf) +EXTRA_CFLAGS="" +EXTRA_LDFLAGS="" + +function info() { + echo -e "\033[33;1m$1\033[0m" +} + +function error() { + echo -e "\033[31;1m$1\033[0m" +} + +function docker_exec() { + docker exec $ENV_VARS $CONT_NAME "$@" +} + +set -eu + +source "$(dirname $0)/travis_wait.bash" + +for phase in "${PHASES[@]}"; do + case $phase in + SETUP) + info "Setup phase" + info "Using Debian $DEBIAN_RELEASE" + + docker --version + + docker pull debian:$DEBIAN_RELEASE + info "Starting container $CONT_NAME" + $DOCKER_RUN -v $REPO_ROOT:/build:rw \ + -w /build --privileged=true --name $CONT_NAME \ + -dit --net=host debian:$DEBIAN_RELEASE /bin/bash + echo -e "::group::Build Env Setup" + docker_exec bash -c "echo deb-src http://deb.debian.org/debian $DEBIAN_RELEASE main 
>>/etc/apt/sources.list" + docker_exec apt-get -y update + docker_exec apt-get -y install aptitude + docker_exec aptitude -y install make libz-dev libelf-dev + docker_exec aptitude -y install "${ADDITIONAL_DEPS[@]}" + echo -e "::endgroup::" + ;; + RUN|RUN_CLANG|RUN_CLANG14|RUN_CLANG15|RUN_CLANG16|RUN_GCC10|RUN_GCC11|RUN_GCC12|RUN_ASAN|RUN_CLANG_ASAN|RUN_GCC10_ASAN) + CC="cc" + if [[ "$phase" =~ "RUN_CLANG(\d+)(_ASAN)?" ]]; then + ENV_VARS="-e CC=clang-${BASH_REMATCH[1]} -e CXX=clang++-${BASH_REMATCH[1]}" + CC="clang-${BASH_REMATCH[1]}" + elif [[ "$phase" = *"CLANG"* ]]; then + ENV_VARS="-e CC=clang -e CXX=clang++" + CC="clang" + elif [[ "$phase" =~ "RUN_GCC(\d+)(_ASAN)?" ]]; then + ENV_VARS="-e CC=gcc-${BASH_REMATCH[1]} -e CXX=g++-${BASH_REMATCH[1]}" + CC="gcc-${BASH_REMATCH[1]}" + fi + if [[ "$phase" = *"ASAN"* ]]; then + EXTRA_CFLAGS="${EXTRA_CFLAGS} -fsanitize=address,undefined" + EXTRA_LDFLAGS="${EXTRA_LDFLAGS} -fsanitize=address,undefined" + fi + if [[ "$CC" != "cc" ]]; then + docker_exec aptitude -y install "$CC" + else + docker_exec aptitude -y install gcc + fi + docker_exec mkdir build install + docker_exec ${CC} --version + info "build" + docker_exec make -j$((4*$(nproc))) EXTRA_CFLAGS="${EXTRA_CFLAGS}" EXTRA_LDFLAGS="${EXTRA_LDFLAGS}" -C ./src -B OBJDIR=../build + info "ldd build/libbpf.so:" + docker_exec ldd build/libbpf.so + if ! docker_exec ldd build/libbpf.so | grep -q libelf; then + error "No reference to libelf.so in libbpf.so!" + exit 1 + fi + info "install" + docker_exec make -j$((4*$(nproc))) -C src OBJDIR=../build DESTDIR=../install install + info "link binary" + docker_exec bash -c "EXTRA_CFLAGS=\"${EXTRA_CFLAGS}\" EXTRA_LDFLAGS=\"${EXTRA_LDFLAGS}\" ./ci/managers/test_compile.sh" + ;; + CLEANUP) + info "Cleanup phase" + docker stop $CONT_NAME + docker rm -f $CONT_NAME + ;; + *) + echo >&2 "Unknown phase '$phase'" + exit 1 + esac +done diff --git a/third_party/libbpf/ci/managers/test_compile.sh b/third_party/libbpf/ci/managers/test_compile.sh new file mode 100755 index 0000000..094ba3e --- /dev/null +++ b/third_party/libbpf/ci/managers/test_compile.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euox pipefail + +EXTRA_CFLAGS=${EXTRA_CFLAGS:-} +EXTRA_LDFLAGS=${EXTRA_LDFLAGS:-} + +cat << EOF > main.c +#include +int main() { + return bpf_object__open(0) < 0; +} +EOF + +# static linking +${CC:-cc} ${EXTRA_CFLAGS} ${EXTRA_LDFLAGS} -o main -I./include/uapi -I./install/usr/include main.c ./build/libbpf.a -lelf -lz diff --git a/third_party/libbpf/ci/managers/travis_wait.bash b/third_party/libbpf/ci/managers/travis_wait.bash new file mode 100644 index 0000000..acf6ad1 --- /dev/null +++ b/third_party/libbpf/ci/managers/travis_wait.bash @@ -0,0 +1,61 @@ +# This was borrowed from https://github.com/travis-ci/travis-build/tree/master/lib/travis/build/bash +# to get around https://github.com/travis-ci/travis-ci/issues/9979. It should probably be removed +# as soon as Travis CI has started to provide an easy way to export the functions to bash scripts. + +travis_jigger() { + local cmd_pid="${1}" + shift + local timeout="${1}" + shift + local count=0 + + echo -e "\\n" + + while [[ "${count}" -lt "${timeout}" ]]; do + count="$((count + 1))" + echo -ne "Still running (${count} of ${timeout}): ${*}\\r" + sleep 60 + done + + echo -e "\\n${ANSI_RED}Timeout (${timeout} minutes) reached. 
Terminating \"${*}\"${ANSI_RESET}\\n" + kill -9 "${cmd_pid}" +} + +travis_wait() { + local timeout="${1}" + + if [[ "${timeout}" =~ ^[0-9]+$ ]]; then + shift + else + timeout=20 + fi + + local cmd=("${@}") + local log_file="travis_wait_${$}.log" + + "${cmd[@]}" &>"${log_file}" & + local cmd_pid="${!}" + + travis_jigger "${!}" "${timeout}" "${cmd[@]}" & + local jigger_pid="${!}" + local result + + { + set +e + wait "${cmd_pid}" 2>/dev/null + result="${?}" + ps -p"${jigger_pid}" &>/dev/null && kill "${jigger_pid}" + set -e + } + + if [[ "${result}" -eq 0 ]]; then + echo -e "\\n${ANSI_GREEN}The command ${cmd[*]} exited with ${result}.${ANSI_RESET}" + else + echo -e "\\n${ANSI_RED}The command ${cmd[*]} exited with ${result}.${ANSI_RESET}" + fi + + echo -e "\\n${ANSI_GREEN}Log:${ANSI_RESET}\\n" + cat "${log_file}" + + return "${result}" +} diff --git a/third_party/libbpf/ci/managers/ubuntu.sh b/third_party/libbpf/ci/managers/ubuntu.sh new file mode 100755 index 0000000..7fe1b3f --- /dev/null +++ b/third_party/libbpf/ci/managers/ubuntu.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -eux + +RELEASE="focal" + +apt-get update +apt-get install -y pkg-config + +source "$(dirname $0)/travis_wait.bash" + +cd $REPO_ROOT + +EXTRA_CFLAGS="-Werror -Wall -fsanitize=address,undefined" +EXTRA_LDFLAGS="-Werror -Wall -fsanitize=address,undefined" +mkdir build install +cc --version +make -j$((4*$(nproc))) EXTRA_CFLAGS="${EXTRA_CFLAGS}" EXTRA_LDFLAGS="${EXTRA_LDFLAGS}" -C ./src -B OBJDIR=../build +ldd build/libbpf.so +if ! ldd build/libbpf.so | grep -q libelf; then + echo "FAIL: No reference to libelf.so in libbpf.so!" + exit 1 +fi +make -j$((4*$(nproc))) -C src OBJDIR=../build DESTDIR=../install install +EXTRA_CFLAGS=${EXTRA_CFLAGS} EXTRA_LDFLAGS=${EXTRA_LDFLAGS} $(dirname $0)/test_compile.sh diff --git a/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 b/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 new file mode 100644 index 0000000..ee0d3db --- /dev/null +++ b/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-4.9.0 @@ -0,0 +1,8 @@ +# btf_dump -- need to disable data dump sub-tests +core_retro +cpu_mask +hashmap +legacy_printk +perf_buffer +section_names + diff --git a/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 b/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 new file mode 100644 index 0000000..998f036 --- /dev/null +++ b/third_party/libbpf/ci/vmtest/configs/ALLOWLIST-5.5.0 @@ -0,0 +1,51 @@ +# attach_probe +autoload +bpf_verif_scale +cgroup_attach_autodetach +cgroup_attach_override +core_autosize +core_extern +core_read_macros +core_reloc +core_retro +cpu_mask +endian +get_branch_snapshot +get_stackid_cannot_attach +global_data +global_data_init +global_func_args +hashmap +legacy_printk +linked_funcs +linked_maps +map_lock +obj_name +perf_buffer +perf_event_stackmap +pinning +pkt_md_access +probe_user +queue_stack_map +raw_tp_writable_reject_nbd_invalid +raw_tp_writable_test_run +rdonly_maps +section_names +signal_pending +skeleton +sockmap_ktls +sockopt +spinlock +stacktrace_map +stacktrace_map_raw_tp +static_linked +task_fd_query_rawtp +task_fd_query_tp +tc_bpf +tcp_estats +tcp_rtt +tp_attach_query +usdt/urand_pid_attach +xdp +xdp_noinline +xdp_perf diff --git a/third_party/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 b/third_party/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 new file mode 100644 index 0000000..2688771 --- /dev/null +++ b/third_party/libbpf/ci/vmtest/configs/DENYLIST-5.5.0 @@ -0,0 +1,120 @@ +# This file is not used and is there for historic purposes only. +# See ALLOWLIST-5.5.0 instead. 
+ +# PERMANENTLY DISABLED +align # verifier output format changed +atomics # new atomic operations (v5.12+) +atomic_bounds # new atomic operations (v5.12+) +bind_perm # changed semantics of return values (v5.12+) +bpf_cookie # 5.15+ +bpf_iter # bpf_iter support is missing +bpf_obj_id # bpf_link support missing for GET_OBJ_INFO, GET_FD_BY_ID, etc +bpf_tcp_ca # STRUCT_OPS is missing +btf_map_in_map # inner map leak fixed in 5.8 +btf_skc_cls_ingress # v5.10+ functionality +cg_storage_multi # v5.9+ functionality +cgroup_attach_multi # BPF_F_REPLACE_PROG missing +cgroup_link # LINK_CREATE is missing +cgroup_skb_sk_lookup # bpf_sk_lookup_tcp() helper is missing +check_mtu # missing BPF helper (v5.12+) +cls_redirect # bpf_csum_level() helper is missing +connect_force_port # cgroup/get{peer,sock}name{4,6} support is missing +d_path # v5.10+ feature +enable_stats # BPF_ENABLE_STATS support is missing +fentry_fexit # bpf_prog_test_tracing missing +fentry_test # bpf_prog_test_tracing missing +fexit_bpf2bpf # freplace is missing +fexit_sleep # relies on bpf_trampoline fix in 5.12+ +fexit_test # bpf_prog_test_tracing missing +flow_dissector # bpf_link-based flow dissector is in 5.8+ +flow_dissector_reattach +for_each # v5.12+ +get_func_ip_test # v5.15+ +get_stack_raw_tp # exercising BPF verifier bug causing infinite loop +hash_large_key # v5.11+ +ima # v5.11+ +kfree_skb # 32-bit pointer arith in test_pkt_access +ksyms # __start_BTF has different name +kfunc_call # v5.13+ +link_pinning # bpf_link is missing +linked_vars # v5.13+ +load_bytes_relative # new functionality in 5.8 +lookup_and_delete # v5.14+ +map_init # per-CPU LRU missing +map_ptr # test uses BPF_MAP_TYPE_RINGBUF, added in 5.8 +metadata # v5.10+ +migrate_reuseport # v5.14+ +mmap # 5.5 kernel is too permissive with re-mmaping +modify_return # fmod_ret support is missing +module_attach # module BTF support missing (v5.11+) +netcnt +netns_cookie # v5.15+ +ns_current_pid_tgid # bpf_get_ns_current_pid_tgid() helper is missing +pe_preserve_elems # v5.10+ +perf_branches # bpf_read_branch_records() helper is missing +perf_link # v5.15+ +pkt_access # 32-bit pointer arith in test_pkt_access +probe_read_user_str # kernel bug with garbage bytes at the end +prog_run_xattr # 32-bit pointer arith in test_pkt_access +raw_tp_test_run # v5.10+ +recursion # v5.12+ +ringbuf # BPF_MAP_TYPE_RINGBUF is supported in 5.8+ + +# bug in verifier w/ tracking references +#reference_tracking/classifier/sk_lookup_success +reference_tracking + +select_reuseport # UDP support is missing +send_signal # bpf_send_signal_thread() helper is missing +sk_assign # bpf_sk_assign helper missing +sk_lookup # v5.9+ +sk_storage_tracing # missing bpf_sk_storage_get() helper +skb_ctx # ctx_{size, }_{in, out} in BPF_PROG_TEST_RUN is missing +skb_helpers # helpers added in 5.8+ +skeleton # creates too big ARRAY map +snprintf # v5.13+ +snprintf_btf # v5.10+ +sock_fields # v5.10+ +socket_cookie # v5.12+ +sockmap_basic # uses new socket fields, 5.8+ +sockmap_listen # no listen socket supportin SOCKMAP +sockopt/getsockopt: ignore >PAGE_SIZE optlen +sockopt/setsockopt: ignore >PAGE_SIZE optlen +sockopt_sk +sockopt_qos_to_cc # v5.15+ +stacktrace_build_id # v5.9+ +stack_var_off # v5.12+ +syscall # v5.14+ +task_local_storage # v5.12+ +task_pt_regs # v5.15+ +tcp_hdr_options # v5.10+, new TCP header options feature in BPF +tcpbpf_user # LINK_CREATE is missing +tc_redirect # v5.14+ +test_bpffs # v5.10+, new CONFIG_BPF_PRELOAD=y and CONFIG_BPF_PRELOAD_UMG=y|m +test_bprm_opts # v5.11+ +test_global_funcs 
# kernel doesn't support BTF linkage=global on FUNCs +test_local_storage # v5.10+ feature +test_lsm # no BPF_LSM support +test_overhead # no fmod_ret support +test_profiler # needs verifier logic improvements from v5.10+ +test_skb_pkt_end # v5.11+ +timer # v5.15+ +timer_mim # v5.15+ +trace_ext # v5.10+ +trace_printk # v5.14+ +trampoline_count # v5.12+ have lower allowed limits +udp_limit # no cgroup/sock_release BPF program type (5.9+) +varlen # verifier bug fixed in later kernels +vmlinux # hrtimer_nanosleep() signature changed incompatibly +xdp_adjust_tail # new XDP functionality added in 5.8 +xdp_attach # IFLA_XDP_EXPECTED_FD support is missing +xdp_bonding # v5.15+ +xdp_bpf2bpf # freplace is missing +xdp_context_test_run # v5.15+ +xdp_cpumap_attach # v5.9+ +xdp_devmap_attach # new feature in 5.8 +xdp_link # v5.9+ + +# SUBTESTS FAILING (block entire test until blocking subtests works properly) +btf # "size check test", "func (Non zero vlen)" +tailcalls # tailcall_bpf2bpf_1, tailcall_bpf2bpf_2, tailcall_bpf2bpf_3 diff --git a/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest b/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest new file mode 100644 index 0000000..1984883 --- /dev/null +++ b/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest @@ -0,0 +1,4 @@ +decap_sanity # weird failure with decap_sanity_ns netns already existing, TBD +bpf_nf/tc-bpf-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +bpf_nf/xdp-ct # test consistently failing on x86: https://github.com/libbpf/libbpf/pull/698#issuecomment-1590341200 +kprobe_multi_bench_attach # suspected to cause crashes in CI diff --git a/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x b/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x new file mode 100644 index 0000000..e97d7ba --- /dev/null +++ b/third_party/libbpf/ci/vmtest/configs/DENYLIST-latest.s390x @@ -0,0 +1,4 @@ +# TEMPORARY +sockmap_listen/sockhash VSOCK test_vsock_redir +usdt/basic # failing verifier due to bounds check after LLVM update +usdt/multispec # same as above diff --git a/third_party/libbpf/ci/vmtest/helpers.sh b/third_party/libbpf/ci/vmtest/helpers.sh new file mode 100755 index 0000000..c44d098 --- /dev/null +++ b/third_party/libbpf/ci/vmtest/helpers.sh @@ -0,0 +1,38 @@ +# shellcheck shell=bash + +# $1 - start or end +# $2 - fold identifier, no spaces +# $3 - fold section description +foldable() { + local YELLOW='\033[1;33m' + local NOCOLOR='\033[0m' + if [ $1 = "start" ]; then + line="::group::$2" + if [ ! 
-z "${3:-}" ]; then + line="$line - ${YELLOW}$3${NOCOLOR}" + fi + else + line="::endgroup::" + fi + echo -e "$line" +} + +__print() { + local TITLE="" + if [[ -n $2 ]]; then + TITLE=" title=$2" + fi + echo "::$1${TITLE}::$3" +} + +# $1 - title +# $2 - message +print_error() { + __print error $1 $2 +} + +# $1 - title +# $2 - message +print_notice() { + __print notice $1 $2 +} diff --git a/third_party/libbpf/ci/vmtest/run_selftests.sh b/third_party/libbpf/ci/vmtest/run_selftests.sh new file mode 100755 index 0000000..0e5ac17 --- /dev/null +++ b/third_party/libbpf/ci/vmtest/run_selftests.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +set -euo pipefail + +source $(cd $(dirname $0) && pwd)/helpers.sh + +ARCH=$(uname -m) + +STATUS_FILE=/exitstatus + +read_lists() { + (for path in "$@"; do + if [[ -s "$path" ]]; then + cat "$path" + fi; + done) | cut -d'#' -f1 | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -s '\n' ',' +} + +test_progs() { + if [[ "${KERNEL}" != '4.9.0' ]]; then + foldable start test_progs "Testing test_progs" + # "&& true" does not change the return code (it is not executed + # if the Python script fails), but it prevents exiting on a + # failure due to the "set -e". + ./test_progs ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true + echo "test_progs:$?" >> "${STATUS_FILE}" + foldable end test_progs + fi +} + +test_progs_no_alu32() { + foldable start test_progs-no_alu32 "Testing test_progs-no_alu32" + ./test_progs-no_alu32 ${DENYLIST:+-d"$DENYLIST"} ${ALLOWLIST:+-a"$ALLOWLIST"} && true + echo "test_progs-no_alu32:$?" >> "${STATUS_FILE}" + foldable end test_progs-no_alu32 +} + +test_maps() { + if [[ "${KERNEL}" == 'latest' ]]; then + foldable start test_maps "Testing test_maps" + ./test_maps && true + echo "test_maps:$?" >> "${STATUS_FILE}" + foldable end test_maps + fi +} + +test_verifier() { + if [[ "${KERNEL}" == 'latest' ]]; then + foldable start test_verifier "Testing test_verifier" + ./test_verifier && true + echo "test_verifier:$?" >> "${STATUS_FILE}" + foldable end test_verifier + fi +} + +foldable end vm_init + +foldable start kernel_config "Kconfig" + +zcat /proc/config.gz + +foldable end kernel_config + + +configs_path=/${PROJECT_NAME}/selftests/bpf +local_configs_path=${PROJECT_NAME}/vmtest/configs +DENYLIST=$(read_lists \ + "$configs_path/DENYLIST" \ + "$configs_path/DENYLIST.${ARCH}" \ + "$local_configs_path/DENYLIST-${KERNEL}" \ + "$local_configs_path/DENYLIST-${KERNEL}.${ARCH}" \ +) +ALLOWLIST=$(read_lists \ + "$configs_path/ALLOWLIST" \ + "$configs_path/ALLOWLIST.${ARCH}" \ + "$local_configs_path/ALLOWLIST-${KERNEL}" \ + "$local_configs_path/ALLOWLIST-${KERNEL}.${ARCH}" \ +) + +echo "DENYLIST: ${DENYLIST}" +echo "ALLOWLIST: ${ALLOWLIST}" + +cd ${PROJECT_NAME}/selftests/bpf + +if [ $# -eq 0 ]; then + test_progs + test_progs_no_alu32 + # test_maps + test_verifier +else + for test_name in "$@"; do + "${test_name}" + done +fi diff --git a/third_party/libbpf/docs/.gitignore b/third_party/libbpf/docs/.gitignore new file mode 100644 index 0000000..dd08fae --- /dev/null +++ b/third_party/libbpf/docs/.gitignore @@ -0,0 +1,2 @@ +sphinx/build +sphinx/doxygen/build \ No newline at end of file diff --git a/third_party/libbpf/docs/api.rst b/third_party/libbpf/docs/api.rst new file mode 100644 index 0000000..7a8e709 --- /dev/null +++ b/third_party/libbpf/docs/api.rst @@ -0,0 +1,93 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. _api: + +.. 
toctree:: Table of Contents + + +LIBBPF API +========== + +Error Handling +-------------- + +When libbpf is used in "libbpf 1.0 mode", API functions can return errors in one of two ways. + +You can set "libbpf 1.0" mode with the following line: + +.. code-block:: + + libbpf_set_strict_mode(LIBBPF_STRICT_DIRECT_ERRS | LIBBPF_STRICT_CLEAN_PTRS); + +If the function returns an error code directly, it uses 0 to indicate success +and a negative error code to indicate what caused the error. In this case the +error code should be checked directly from the return, you do not need to check +errno. + +For example: + +.. code-block:: + + err = some_libbpf_api_with_error_return(...); + if (err < 0) { + /* Handle error accordingly */ + } + +If the function returns a pointer, it will return NULL to indicate there was +an error. In this case errno should be checked for the error code. + +For example: + +.. code-block:: + + ptr = some_libbpf_api_returning_ptr(); + if (!ptr) { + /* note no minus sign for EINVAL and E2BIG below */ + if (errno == EINVAL) { + /* handle EINVAL error */ + } else if (errno == E2BIG) { + /* handle E2BIG error */ + } + } + +libbpf.h +-------- +.. doxygenfile:: libbpf.h + :project: libbpf + :sections: func define public-type enum + +bpf.h +----- +.. doxygenfile:: bpf.h + :project: libbpf + :sections: func define public-type enum + +btf.h +----- +.. doxygenfile:: btf.h + :project: libbpf + :sections: func define public-type enum + +xsk.h +----- +.. doxygenfile:: xsk.h + :project: libbpf + :sections: func define public-type enum + +bpf_tracing.h +------------- +.. doxygenfile:: bpf_tracing.h + :project: libbpf + :sections: func define public-type enum + +bpf_core_read.h +--------------- +.. doxygenfile:: bpf_core_read.h + :project: libbpf + :sections: func define public-type enum + +bpf_endian.h +------------ +.. doxygenfile:: bpf_endian.h + :project: libbpf + :sections: func define public-type enum diff --git a/third_party/libbpf/docs/conf.py b/third_party/libbpf/docs/conf.py new file mode 100644 index 0000000..1d8714e --- /dev/null +++ b/third_party/libbpf/docs/conf.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import os +import subprocess + +project = "libbpf" + +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.imgmath', + 'sphinx.ext.todo', + 'breathe', +] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' + +if read_the_docs_build: + subprocess.call('cd sphinx ; make clean', shell=True) + subprocess.call('cd sphinx/doxygen ; doxygen', shell=True) + +html_theme = 'sphinx_rtd_theme' + +breathe_projects = { "libbpf": "./sphinx/doxygen/build/xml/" } +breathe_default_project = "libbpf" +breathe_show_define_initializer = True +breathe_show_enumvalue_initializer = True diff --git a/third_party/libbpf/docs/index.rst b/third_party/libbpf/docs/index.rst new file mode 100644 index 0000000..7545a20 --- /dev/null +++ b/third_party/libbpf/docs/index.rst @@ -0,0 +1,33 @@ +.. 
SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. _libbpf: + +====== +libbpf +====== + +If you are looking to develop BPF applications using the libbpf library, this +directory contains important documentation that you should read. + +To get started, it is recommended to begin with the :doc:`libbpf Overview +` document, which provides a high-level understanding of the +libbpf APIs and their usage. This will give you a solid foundation to start +exploring and utilizing the various features of libbpf to develop your BPF +applications. + +.. toctree:: + :maxdepth: 1 + + libbpf_overview + API Documentation + program_types + libbpf_naming_convention + libbpf_build + + +All general BPF questions, including kernel functionality, libbpf APIs and their +application, should be sent to bpf@vger.kernel.org mailing list. You can +`subscribe `_ to the mailing list +search its `archive `_. Please search the archive +before asking new questions. It may be that this was already addressed or +answered before. diff --git a/third_party/libbpf/docs/libbpf_build.rst b/third_party/libbpf/docs/libbpf_build.rst new file mode 100644 index 0000000..8e8c23e --- /dev/null +++ b/third_party/libbpf/docs/libbpf_build.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +Building libbpf +=============== + +libelf and zlib are internal dependencies of libbpf and thus are required to link +against and must be installed on the system for applications to work. +pkg-config is used by default to find libelf, and the program called +can be overridden with PKG_CONFIG. + +If using pkg-config at build time is not desired, it can be disabled by +setting NO_PKG_CONFIG=1 when calling make. + +To build both static libbpf.a and shared libbpf.so: + +.. code-block:: bash + + $ cd src + $ make + +To build only static libbpf.a library in directory build/ and install them +together with libbpf headers in a staging directory root/: + +.. code-block:: bash + + $ cd src + $ mkdir build root + $ BUILD_STATIC_ONLY=y OBJDIR=build DESTDIR=root make install + +To build both static libbpf.a and shared libbpf.so against a custom libelf +dependency installed in /build/root/ and install them together with libbpf +headers in a build directory /build/root/: + +.. code-block:: bash + + $ cd src + $ PKG_CONFIG_PATH=/build/root/lib64/pkgconfig DESTDIR=/build/root make \ No newline at end of file diff --git a/third_party/libbpf/docs/libbpf_naming_convention.rst b/third_party/libbpf/docs/libbpf_naming_convention.rst new file mode 100644 index 0000000..b5b41b6 --- /dev/null +++ b/third_party/libbpf/docs/libbpf_naming_convention.rst @@ -0,0 +1,193 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +API naming convention +===================== + +libbpf API provides access to a few logically separated groups of +functions and types. Every group has its own naming convention +described here. It's recommended to follow these conventions whenever a +new function or type is added to keep libbpf API clean and consistent. + +All types and functions provided by libbpf API should have one of the +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``btf_dump_``, +``ring_buffer_``, ``perf_buffer_``. + +System call wrappers +-------------------- + +System call wrappers are simple wrappers for commands supported by +sys_bpf system call. These wrappers should go to ``bpf.h`` header file +and map one to one to corresponding commands. 
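+
+A minimal usage sketch of one such wrapper, ``bpf_map_lookup_elem()``
+(illustrative only; it assumes ``map_fd`` refers to an already created map
+with 32-bit keys and values):
+
+.. code-block:: c
+
+    #include <bpf/bpf.h>
+
+    /* Look up the value stored under key 0; the wrapper issues the
+     * corresponding sys_bpf command and returns 0 on success or a
+     * negative value on failure. */
+    int lookup_key_zero(int map_fd, __u32 *value)
+    {
+        __u32 key = 0;
+
+        return bpf_map_lookup_elem(map_fd, &key, value);
+    }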
+ +For example ``bpf_map_lookup_elem`` wraps ``BPF_MAP_LOOKUP_ELEM`` +command of sys_bpf, ``bpf_prog_attach`` wraps ``BPF_PROG_ATTACH``, etc. + +Objects +------- + +Another class of types and functions provided by libbpf API is "objects" +and functions to work with them. Objects are high-level abstractions +such as BPF program or BPF map. They're represented by corresponding +structures such as ``struct bpf_object``, ``struct bpf_program``, +``struct bpf_map``, etc. + +Structures are forward declared and access to their fields should be +provided via corresponding getters and setters rather than directly. + +These objects are associated with corresponding parts of ELF object that +contains compiled BPF programs. + +For example ``struct bpf_object`` represents ELF object itself created +from an ELF file or from a buffer, ``struct bpf_program`` represents a +program in ELF object and ``struct bpf_map`` is a map. + +Functions that work with an object have names built from object name, +double underscore and part that describes function purpose. + +For example ``bpf_object__open`` consists of the name of corresponding +object, ``bpf_object``, double underscore and ``open`` that defines the +purpose of the function to open ELF file and create ``bpf_object`` from +it. + +All objects and corresponding functions other than BTF related should go +to ``libbpf.h``. BTF types and functions should go to ``btf.h``. + +Auxiliary functions +------------------- + +Auxiliary functions and types that don't fit well in any of categories +described above should have ``libbpf_`` prefix, e.g. +``libbpf_get_error`` or ``libbpf_prog_type_by_name``. + +ABI +--- + +libbpf can be both linked statically or used as DSO. To avoid possible +conflicts with other libraries an application is linked with, all +non-static libbpf symbols should have one of the prefixes mentioned in +API documentation above. See API naming convention to choose the right +name for a new symbol. + +Symbol visibility +----------------- + +libbpf follow the model when all global symbols have visibility "hidden" +by default and to make a symbol visible it has to be explicitly +attributed with ``LIBBPF_API`` macro. For example: + +.. code-block:: c + + LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); + +This prevents from accidentally exporting a symbol, that is not supposed +to be a part of ABI what, in turn, improves both libbpf developer- and +user-experiences. + +ABI versioning +-------------- + +To make future ABI extensions possible libbpf ABI is versioned. +Versioning is implemented by ``libbpf.map`` version script that is +passed to linker. + +Version name is ``LIBBPF_`` prefix + three-component numeric version, +starting from ``0.0.1``. + +Every time ABI is being changed, e.g. because a new symbol is added or +semantic of existing symbol is changed, ABI version should be bumped. +This bump in ABI version is at most once per kernel development cycle. + +For example, if current state of ``libbpf.map`` is: + +.. code-block:: none + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + +, and a new symbol ``bpf_func_c`` is being introduced, then +``libbpf.map`` should be changed like this: + +.. code-block:: none + + LIBBPF_0.0.1 { + global: + bpf_func_a; + bpf_func_b; + local: + \*; + }; + LIBBPF_0.0.2 { + global: + bpf_func_c; + } LIBBPF_0.0.1; + +, where new version ``LIBBPF_0.0.2`` depends on the previous +``LIBBPF_0.0.1``. 
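+
+In the public header, the new symbol from the example above would also be
+declared with the ``LIBBPF_API`` attribute described earlier, so that the
+otherwise hidden symbol is actually exported (sketch only; ``bpf_func_c``
+and its signature are placeholders):
+
+.. code-block:: c
+
+    /* Placeholder prototype for the hypothetical new symbol; LIBBPF_API
+     * marks it visible, while libbpf.map assigns it to LIBBPF_0.0.2. */
+    LIBBPF_API int bpf_func_c(int fd);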
+ +Format of version script and ways to handle ABI changes, including +incompatible ones, described in details in [1]. + +Stand-alone build +------------------- + +Under https://github.com/libbpf/libbpf there is a (semi-)automated +mirror of the mainline's version of libbpf for a stand-alone build. + +However, all changes to libbpf's code base must be upstreamed through +the mainline kernel tree. + + +API documentation convention +============================ + +The libbpf API is documented via comments above definitions in +header files. These comments can be rendered by doxygen and sphinx +for well organized html output. This section describes the +convention in which these comments should be formatted. + +Here is an example from btf.h: + +.. code-block:: c + + /** + * @brief **btf__new()** creates a new instance of a BTF object from the raw + * bytes of an ELF's BTF section + * @param data raw bytes + * @param size number of bytes passed in `data` + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ + +The comment must start with a block comment of the form '/\*\*'. + +The documentation always starts with a @brief directive. This line is a short +description about this API. It starts with the name of the API, denoted in bold +like so: **api_name**. Please include an open and close parenthesis if this is a +function. Follow with the short description of the API. A longer form description +can be added below the last directive, at the bottom of the comment. + +Parameters are denoted with the @param directive, there should be one for each +parameter. If this is a function with a non-void return, use the @return directive +to document it. + +License +------------------- + +libbpf is dual-licensed under LGPL 2.1 and BSD 2-Clause. + +Links +------------------- + +[1] https://www.akkadia.org/drepper/dsohowto.pdf + (Chapter 3. Maintaining APIs and ABIs). diff --git a/third_party/libbpf/docs/libbpf_overview.rst b/third_party/libbpf/docs/libbpf_overview.rst new file mode 100644 index 0000000..f36a2d4 --- /dev/null +++ b/third_party/libbpf/docs/libbpf_overview.rst @@ -0,0 +1,228 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +libbpf Overview +=============== + +libbpf is a C-based library containing a BPF loader that takes compiled BPF +object files and prepares and loads them into the Linux kernel. libbpf takes the +heavy lifting of loading, verifying, and attaching BPF programs to various +kernel hooks, allowing BPF application developers to focus only on BPF program +correctness and performance. + +The following are the high-level features supported by libbpf: + +* Provides high-level and low-level APIs for user space programs to interact + with BPF programs. The low-level APIs wrap all the bpf system call + functionality, which is useful when users need more fine-grained control + over the interactions between user space and BPF programs. +* Provides overall support for the BPF object skeleton generated by bpftool. + The skeleton file simplifies the process for the user space programs to access + global variables and work with BPF programs. 
+* Provides BPF-side APIS, including BPF helper definitions, BPF maps support, + and tracing helpers, allowing developers to simplify BPF code writing. +* Supports BPF CO-RE mechanism, enabling BPF developers to write portable + BPF programs that can be compiled once and run across different kernel + versions. + +This document will delve into the above concepts in detail, providing a deeper +understanding of the capabilities and advantages of libbpf and how it can help +you develop BPF applications efficiently. + +BPF App Lifecycle and libbpf APIs +================================== + +A BPF application consists of one or more BPF programs (either cooperating or +completely independent), BPF maps, and global variables. The global +variables are shared between all BPF programs, which allows them to cooperate on +a common set of data. libbpf provides APIs that user space programs can use to +manipulate the BPF programs by triggering different phases of a BPF application +lifecycle. + +The following section provides a brief overview of each phase in the BPF life +cycle: + +* **Open phase**: In this phase, libbpf parses the BPF + object file and discovers BPF maps, BPF programs, and global variables. After + a BPF app is opened, user space apps can make additional adjustments + (setting BPF program types, if necessary; pre-setting initial values for + global variables, etc.) before all the entities are created and loaded. + +* **Load phase**: In the load phase, libbpf creates BPF + maps, resolves various relocations, and verifies and loads BPF programs into + the kernel. At this point, libbpf validates all the parts of a BPF application + and loads the BPF program into the kernel, but no BPF program has yet been + executed. After the load phase, it’s possible to set up the initial BPF map + state without racing with the BPF program code execution. + +* **Attachment phase**: In this phase, libbpf + attaches BPF programs to various BPF hook points (e.g., tracepoints, kprobes, + cgroup hooks, network packet processing pipeline, etc.). During this + phase, BPF programs perform useful work such as processing + packets, or updating BPF maps and global variables that can be read from user + space. + +* **Tear down phase**: In the tear down phase, + libbpf detaches BPF programs and unloads them from the kernel. BPF maps are + destroyed, and all the resources used by the BPF app are freed. + +BPF Object Skeleton File +======================== + +BPF skeleton is an alternative interface to libbpf APIs for working with BPF +objects. Skeleton code abstract away generic libbpf APIs to significantly +simplify code for manipulating BPF programs from user space. Skeleton code +includes a bytecode representation of the BPF object file, simplifying the +process of distributing your BPF code. With BPF bytecode embedded, there are no +extra files to deploy along with your application binary. + +You can generate the skeleton header file ``(.skel.h)`` for a specific object +file by passing the BPF object to the bpftool. 
The generated BPF skeleton +provides the following custom functions that correspond to the BPF lifecycle, +each of them prefixed with the specific object name: + +* ``__open()`` – creates and opens BPF application (```` stands for + the specific bpf object name) +* ``__load()`` – instantiates, loads,and verifies BPF application parts +* ``__attach()`` – attaches all auto-attachable BPF programs (it’s + optional, you can have more control by using libbpf APIs directly) +* ``__destroy()`` – detaches all BPF programs and + frees up all used resources + +Using the skeleton code is the recommended way to work with bpf programs. Keep +in mind, BPF skeleton provides access to the underlying BPF object, so whatever +was possible to do with generic libbpf APIs is still possible even when the BPF +skeleton is used. It's an additive convenience feature, with no syscalls, and no +cumbersome code. + +Other Advantages of Using Skeleton File +--------------------------------------- + +* BPF skeleton provides an interface for user space programs to work with BPF + global variables. The skeleton code memory maps global variables as a struct + into user space. The struct interface allows user space programs to initialize + BPF programs before the BPF load phase and fetch and update data from user + space afterward. + +* The ``skel.h`` file reflects the object file structure by listing out the + available maps, programs, etc. BPF skeleton provides direct access to all the + BPF maps and BPF programs as struct fields. This eliminates the need for + string-based lookups with ``bpf_object_find_map_by_name()`` and + ``bpf_object_find_program_by_name()`` APIs, reducing errors due to BPF source + code and user-space code getting out of sync. + +* The embedded bytecode representation of the object file ensures that the + skeleton and the BPF object file are always in sync. + +BPF Helpers +=========== + +libbpf provides BPF-side APIs that BPF programs can use to interact with the +system. The BPF helpers definition allows developers to use them in BPF code as +any other plain C function. For example, there are helper functions to print +debugging messages, get the time since the system was booted, interact with BPF +maps, manipulate network packets, etc. + +For a complete description of what the helpers do, the arguments they take, and +the return value, see the `bpf-helpers +`_ man page. + +BPF CO-RE (Compile Once – Run Everywhere) +========================================= + +BPF programs work in the kernel space and have access to kernel memory and data +structures. One limitation that BPF applications come across is the lack of +portability across different kernel versions and configurations. `BCC +`_ is one of the solutions for BPF +portability. However, it comes with runtime overhead and a large binary size +from embedding the compiler with the application. + +libbpf steps up the BPF program portability by supporting the BPF CO-RE concept. +BPF CO-RE brings together BTF type information, libbpf, and the compiler to +produce a single executable binary that you can run on multiple kernel versions +and configurations. + +To make BPF programs portable libbpf relies on the BTF type information of the +running kernel. Kernel also exposes this self-describing authoritative BTF +information through ``sysfs`` at ``/sys/kernel/btf/vmlinux``. 
+ +You can generate the BTF information for the running kernel with the following +command: + +:: + + $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h + +The command generates a ``vmlinux.h`` header file with all kernel types +(:doc:`BTF types <../btf>`) that the running kernel uses. Including +``vmlinux.h`` in your BPF program eliminates dependency on system-wide kernel +headers. + +libbpf enables portability of BPF programs by looking at the BPF program’s +recorded BTF type and relocation information and matching them to BTF +information (vmlinux) provided by the running kernel. libbpf then resolves and +matches all the types and fields, and updates necessary offsets and other +relocatable data to ensure that BPF program’s logic functions correctly for a +specific kernel on the host. BPF CO-RE concept thus eliminates overhead +associated with BPF development and allows developers to write portable BPF +applications without modifications and runtime source code compilation on the +target machine. + +The following code snippet shows how to read the parent field of a kernel +``task_struct`` using BPF CO-RE and libbf. The basic helper to read a field in a +CO-RE relocatable manner is ``bpf_core_read(dst, sz, src)``, which will read +``sz`` bytes from the field referenced by ``src`` into the memory pointed to by +``dst``. + +.. code-block:: C + :emphasize-lines: 6 + + //... + struct task_struct *task = (void *)bpf_get_current_task(); + struct task_struct *parent_task; + int err; + + err = bpf_core_read(&parent_task, sizeof(void *), &task->parent); + if (err) { + /* handle error */ + } + + /* parent_task contains the value of task->parent pointer */ + +In the code snippet, we first get a pointer to the current ``task_struct`` using +``bpf_get_current_task()``. We then use ``bpf_core_read()`` to read the parent +field of task struct into the ``parent_task`` variable. ``bpf_core_read()`` is +just like ``bpf_probe_read_kernel()`` BPF helper, except it records information +about the field that should be relocated on the target kernel. i.e, if the +``parent`` field gets shifted to a different offset within +``struct task_struct`` due to some new field added in front of it, libbpf will +automatically adjust the actual offset to the proper value. + +Getting Started with libbpf +=========================== + +Check out the `libbpf-bootstrap `_ +repository with simple examples of using libbpf to build various BPF +applications. + +See also `libbpf API documentation +`_. + +libbpf and Rust +=============== + +If you are building BPF applications in Rust, it is recommended to use the +`Libbpf-rs `_ library instead of bindgen +bindings directly to libbpf. Libbpf-rs wraps libbpf functionality in +Rust-idiomatic interfaces and provides libbpf-cargo plugin to handle BPF code +compilation and skeleton generation. Using Libbpf-rs will make building user +space part of the BPF application easier. Note that the BPF program themselves +must still be written in plain C. + +Additional Documentation +======================== + +* `Program types and ELF Sections `_ +* `API naming convention `_ +* `Building libbpf `_ +* `API documentation Convention `_ diff --git a/third_party/libbpf/docs/program_types.rst b/third_party/libbpf/docs/program_types.rst new file mode 100644 index 0000000..ad4d4d5 --- /dev/null +++ b/third_party/libbpf/docs/program_types.rst @@ -0,0 +1,203 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +.. 
_program_types_and_elf: + +Program Types and ELF Sections +============================== + +The table below lists the program types, their attach types where relevant and the ELF section +names supported by libbpf for them. The ELF section names follow these rules: + +- ``type`` is an exact match, e.g. ``SEC("socket")`` +- ``type+`` means it can be either exact ``SEC("type")`` or well-formed ``SEC("type/extras")`` + with a '``/``' separator between ``type`` and ``extras``. + +When ``extras`` are specified, they provide details of how to auto-attach the BPF program. The +format of ``extras`` depends on the program type, e.g. ``SEC("tracepoint//")`` +for tracepoints or ``SEC("usdt/::")`` for USDT probes. The extras are +described in more detail in the footnotes. + + ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| Program Type | Attach Type | ELF Section Name | Sleepable | ++===========================================+========================================+==================================+===========+ +| ``BPF_PROG_TYPE_CGROUP_DEVICE`` | ``BPF_CGROUP_DEVICE`` | ``cgroup/dev`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SKB`` | | ``cgroup/skb`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_EGRESS`` | ``cgroup_skb/egress`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_INGRESS`` | ``cgroup_skb/ingress`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCKOPT`` | ``BPF_CGROUP_GETSOCKOPT`` | ``cgroup/getsockopt`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_SETSOCKOPT`` | ``cgroup/setsockopt`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCK_ADDR`` | ``BPF_CGROUP_INET4_BIND`` | ``cgroup/bind4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_CONNECT`` | ``cgroup/connect4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_GETPEERNAME`` | ``cgroup/getpeername4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET4_GETSOCKNAME`` | ``cgroup/getsockname4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_BIND`` | ``cgroup/bind6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_CONNECT`` | ``cgroup/connect6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_GETPEERNAME`` | ``cgroup/getpeername6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_GETSOCKNAME`` | ``cgroup/getsockname6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP4_RECVMSG`` | ``cgroup/recvmsg4`` | | ++ 
+----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP4_SENDMSG`` | ``cgroup/sendmsg4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP6_RECVMSG`` | ``cgroup/recvmsg6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_UDP6_SENDMSG`` | ``cgroup/sendmsg6`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SOCK`` | ``BPF_CGROUP_INET4_POST_BIND`` | ``cgroup/post_bind4`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET6_POST_BIND`` | ``cgroup/post_bind6`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_SOCK_CREATE`` | ``cgroup/sock_create`` | | ++ + +----------------------------------+-----------+ +| | | ``cgroup/sock`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_CGROUP_INET_SOCK_RELEASE`` | ``cgroup/sock_release`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_CGROUP_SYSCTL`` | ``BPF_CGROUP_SYSCTL`` | ``cgroup/sysctl`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_EXT`` | | ``freplace+`` [#fentry]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_FLOW_DISSECTOR`` | ``BPF_FLOW_DISSECTOR`` | ``flow_dissector`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_KPROBE`` | | ``kprobe+`` [#kprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretprobe+`` [#kprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``ksyscall+`` [#ksyscall]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretsyscall+`` [#ksyscall]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe+`` [#uprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``uprobe.s+`` [#uprobe]_ | Yes | ++ + +----------------------------------+-----------+ +| | | ``uretprobe+`` [#uprobe]_ | | ++ + +----------------------------------+-----------+ +| | | ``uretprobe.s+`` [#uprobe]_ | Yes | ++ + +----------------------------------+-----------+ +| | | ``usdt+`` [#usdt]_ | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_KPROBE_MULTI`` | ``kprobe.multi+`` [#kpmulti]_ | | ++ + +----------------------------------+-----------+ +| | | ``kretprobe.multi+`` [#kpmulti]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LIRC_MODE2`` | ``BPF_LIRC_MODE2`` | ``lirc_mode2`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LSM`` | ``BPF_LSM_CGROUP`` | ``lsm_cgroup+`` | | ++ 
+----------------------------------------+----------------------------------+-----------+ +| | ``BPF_LSM_MAC`` | ``lsm+`` [#lsm]_ | | ++ + +----------------------------------+-----------+ +| | | ``lsm.s+`` [#lsm]_ | Yes | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_IN`` | | ``lwt_in`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_OUT`` | | ``lwt_out`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` | | ``lwt_seg6local`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_LWT_XMIT`` | | ``lwt_xmit`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_PERF_EVENT`` | | ``perf_event`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE`` | | ``raw_tp.w+`` [#rawtp]_ | | ++ + +----------------------------------+-----------+ +| | | ``raw_tracepoint.w+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_RAW_TRACEPOINT`` | | ``raw_tp+`` [#rawtp]_ | | ++ + +----------------------------------+-----------+ +| | | ``raw_tracepoint+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SCHED_ACT`` | | ``action`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SCHED_CLS`` | | ``classifier`` | | ++ + +----------------------------------+-----------+ +| | | ``tc`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_LOOKUP`` | ``BPF_SK_LOOKUP`` | ``sk_lookup`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_MSG`` | ``BPF_SK_MSG_VERDICT`` | ``sk_msg`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_REUSEPORT`` | ``BPF_SK_REUSEPORT_SELECT_OR_MIGRATE`` | ``sk_reuseport/migrate`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_REUSEPORT_SELECT`` | ``sk_reuseport`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SK_SKB`` | | ``sk_skb`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_SKB_STREAM_PARSER`` | ``sk_skb/stream_parser`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_SK_SKB_STREAM_VERDICT`` | ``sk_skb/stream_verdict`` | | 
++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SOCKET_FILTER`` | | ``socket`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SOCK_OPS`` | ``BPF_CGROUP_SOCK_OPS`` | ``sockops`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_STRUCT_OPS`` | | ``struct_ops+`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_SYSCALL`` | | ``syscall`` | Yes | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_TRACEPOINT`` | | ``tp+`` [#tp]_ | | ++ + +----------------------------------+-----------+ +| | | ``tracepoint+`` [#tp]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_TRACING`` | ``BPF_MODIFY_RETURN`` | ``fmod_ret+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fmod_ret.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_FENTRY`` | ``fentry+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fentry.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_FEXIT`` | ``fexit+`` [#fentry]_ | | ++ + +----------------------------------+-----------+ +| | | ``fexit.s+`` [#fentry]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_ITER`` | ``iter+`` [#iter]_ | | ++ + +----------------------------------+-----------+ +| | | ``iter.s+`` [#iter]_ | Yes | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_TRACE_RAW_TP`` | ``tp_btf+`` [#fentry]_ | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ +| ``BPF_PROG_TYPE_XDP`` | ``BPF_XDP_CPUMAP`` | ``xdp.frags/cpumap`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp/cpumap`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_XDP_DEVMAP`` | ``xdp.frags/devmap`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp/devmap`` | | ++ +----------------------------------------+----------------------------------+-----------+ +| | ``BPF_XDP`` | ``xdp.frags`` | | ++ + +----------------------------------+-----------+ +| | | ``xdp`` | | ++-------------------------------------------+----------------------------------------+----------------------------------+-----------+ + + +.. rubric:: Footnotes + +.. [#fentry] The ``fentry`` attach format is ``fentry[.s]/``. +.. [#kprobe] The ``kprobe`` attach format is ``kprobe/[+]``. Valid + characters for ``function`` are ``a-zA-Z0-9_.`` and ``offset`` must be a valid + non-negative integer. +.. [#ksyscall] The ``ksyscall`` attach format is ``ksyscall/``. +.. [#uprobe] The ``uprobe`` attach format is ``uprobe[.s]/:[+]``. +.. [#usdt] The ``usdt`` attach format is ``usdt/::``. +.. 
[#kpmulti] The ``kprobe.multi`` attach format is ``kprobe.multi/`` where ``pattern`` + supports ``*`` and ``?`` wildcards. Valid characters for pattern are + ``a-zA-Z0-9_.*?``. +.. [#lsm] The ``lsm`` attachment format is ``lsm[.s]/``. +.. [#rawtp] The ``raw_tp`` attach format is ``raw_tracepoint[.w]/``. +.. [#tp] The ``tracepoint`` attach format is ``tracepoint//``. +.. [#iter] The ``iter`` attach format is ``iter[.s]/``. diff --git a/third_party/libbpf/docs/sphinx/Makefile b/third_party/libbpf/docs/sphinx/Makefile new file mode 100644 index 0000000..5dc39c5 --- /dev/null +++ b/third_party/libbpf/docs/sphinx/Makefile @@ -0,0 +1,9 @@ +SPHINXBUILD ?= sphinx-build +SOURCEDIR = ../src +BUILDDIR = build + +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" + +%: + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" diff --git a/third_party/libbpf/docs/sphinx/doxygen/Doxyfile b/third_party/libbpf/docs/sphinx/doxygen/Doxyfile new file mode 100644 index 0000000..b04c115 --- /dev/null +++ b/third_party/libbpf/docs/sphinx/doxygen/Doxyfile @@ -0,0 +1,277 @@ +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "libbpf" +PROJECT_NUMBER = +PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = ./build +CREATE_SUBDIRS = NO +ALLOW_UNICODE_NAMES = NO +OUTPUT_LANGUAGE = English +OUTPUT_TEXT_DIRECTION = None +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +JAVADOC_BANNER = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +PYTHON_DOCSTRING = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 4 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +OPTIMIZE_OUTPUT_SLICE = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +TOC_INCLUDE_HEADINGS = 5 +AUTOLINK_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +GROUP_NESTED_COMPOUNDS = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +LOOKUP_CACHE_SIZE = 0 +NUM_PROC_THREADS = 1 +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PRIV_VIRTUAL = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +RESOLVE_UNNAMED_PARAMS = YES +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +HIDE_COMPOUND_REFERENCE= NO +SHOW_INCLUDE_FILES = YES +SHOW_GROUPED_MEMB_INC = NO +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO +SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = +CITE_BIB_FILES = +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_AS_ERROR = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +INPUT = ../../../src +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.c \ + *.h +RECURSIVE = NO +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = ___* +EXAMPLE_PATH = +EXAMPLE_PATTERNS = * 
+EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +USE_MDFILE_AS_MAINPAGE = YES +SOURCE_BROWSER = NO +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +SOURCE_TOOLTIPS = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +ALPHABETICAL_INDEX = YES +IGNORE_PREFIX = +GENERATE_HTML = NO +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_EXTRA_STYLESHEET = +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = NO +HTML_DYNAMIC_MENUS = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = NO +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +HTML_FORMULA_FORMAT = png +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +FORMULA_MACROFILE = +USE_MATHJAX = NO +MATHJAX_FORMAT = HTML-CSS +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 +MATHJAX_EXTENSIONS = +MATHJAX_CODEFILE = +SEARCHENGINE = YES +SERVER_BASED_SEARCH = NO +EXTERNAL_SEARCH = NO +SEARCHENGINE_URL = +SEARCHDATA_FILE = searchdata.xml +EXTERNAL_SEARCH_ID = +EXTRA_SEARCH_MAPPINGS = +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = +MAKEINDEX_CMD_NAME = makeindex +LATEX_MAKEINDEX_CMD = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4 +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_FILES = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain +LATEX_TIMESTAMP = NO +LATEX_EMOJI_DIRECTORY = +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +RTF_SOURCE_CODE = NO +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_SUBDIR = +MAN_LINKS = NO +GENERATE_XML = YES +XML_OUTPUT = xml +XML_PROGRAMLISTING = YES +XML_NS_MEMB_FILE_SCOPE = NO +GENERATE_DOCBOOK = NO +DOCBOOK_OUTPUT = docbook +DOCBOOK_PROGRAMLISTING = NO +GENERATE_AUTOGEN_DEF = NO +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = YES +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = NO +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +EXTERNAL_PAGES = YES +CLASS_DIAGRAMS = YES +DIA_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = Helvetica +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +DOT_UML_DETAILS = NO +DOT_WRAP_THRESHOLD = 17 +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = NO +CALLER_GRAPH = NO 
+GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DIAFILE_DIRS = +PLANTUML_JAR_PATH = +PLANTUML_CFG_FILE = +PLANTUML_INCLUDE_PATH = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/third_party/libbpf/docs/sphinx/requirements.txt b/third_party/libbpf/docs/sphinx/requirements.txt new file mode 100644 index 0000000..188f51e --- /dev/null +++ b/third_party/libbpf/docs/sphinx/requirements.txt @@ -0,0 +1 @@ +breathe \ No newline at end of file diff --git a/third_party/libbpf/fuzz/bpf-object-fuzzer.c b/third_party/libbpf/fuzz/bpf-object-fuzzer.c new file mode 100644 index 0000000..89286e2 --- /dev/null +++ b/third_party/libbpf/fuzz/bpf-object-fuzzer.c @@ -0,0 +1,23 @@ +#include "libbpf.h" + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + return 0; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + struct bpf_object *obj = NULL; + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); + int err; + + libbpf_set_print(libbpf_print_fn); + + opts.object_name = "fuzz-object"; + obj = bpf_object__open_mem(data, size, &opts); + err = libbpf_get_error(obj); + if (err) + return 0; + + bpf_object__close(obj); + return 0; +} diff --git a/third_party/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip b/third_party/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip new file mode 100644 index 0000000..602b381 Binary files /dev/null and b/third_party/libbpf/fuzz/bpf-object-fuzzer_seed_corpus.zip differ diff --git a/third_party/libbpf/include/asm/barrier.h b/third_party/libbpf/include/asm/barrier.h new file mode 100644 index 0000000..1fc6aee --- /dev/null +++ b/third_party/libbpf/include/asm/barrier.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#include + +#endif diff --git a/third_party/libbpf/include/linux/compiler.h b/third_party/libbpf/include/linux/compiler.h new file mode 100644 index 0000000..26336dc --- /dev/null +++ b/third_party/libbpf/include/linux/compiler.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_COMPILER_H +#define __LINUX_COMPILER_H + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#define barrier() asm volatile("" ::: "memory") + +#if defined(__x86_64__) + +# define smp_rmb() barrier() +# define smp_wmb() barrier() +# define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc") + +# define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + barrier(); \ + ___p; \ +}) + +#elif defined(__aarch64__) + +# define smp_rmb() asm volatile("dmb ishld" ::: "memory") +# define smp_wmb() asm volatile("dmb ishst" ::: "memory") +# define smp_mb() asm volatile("dmb ish" ::: "memory") + +#endif + +#ifndef smp_mb +# define smp_mb() __sync_synchronize() +#endif + +#ifndef smp_rmb +# define smp_rmb() smp_mb() +#endif + +#ifndef smp_wmb +# define smp_wmb() smp_mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define 
smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + smp_mb(); \ + ___p; \ +}) +#endif + +#endif /* __LINUX_COMPILER_H */ diff --git a/third_party/libbpf/include/linux/err.h b/third_party/libbpf/include/linux/err.h new file mode 100644 index 0000000..1b1dafb --- /dev/null +++ b/third_party/libbpf/include/linux/err.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_ERR_H +#define __LINUX_ERR_H + +#include +#include + +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * ERR_PTR(long error_) +{ + return (void *) error_; +} + +static inline long PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline bool IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline bool IS_ERR_OR_NULL(const void *ptr) +{ + return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long PTR_ERR_OR_ZERO(const void *ptr) +{ + return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; +} + +#endif diff --git a/third_party/libbpf/include/linux/filter.h b/third_party/libbpf/include/linux/filter.h new file mode 100644 index 0000000..e7e3373 --- /dev/null +++ b/third_party/libbpf/include/linux/filter.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_FILTER_H +#define __LINUX_FILTER_H + +#include + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((FUNC) - BPF_FUNC_unspec) }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_LD_IMM64_RAW_FULL(DST, SRC, OFF1, OFF2, IMM1, IMM2) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF1, \ + .imm = IMM1 }), \ + ((struct bpf_insn) { \ + .code 
= 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF2, \ + .imm = IMM2 }) + +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_FD, 0, 0, \ + MAP_FD, 0) + +#define BPF_LD_MAP_VALUE(DST, MAP_FD, VALUE_OFF) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \ + MAP_FD, VALUE_OFF) + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#endif diff --git a/third_party/libbpf/include/linux/kernel.h b/third_party/libbpf/include/linux/kernel.h new file mode 100644 index 0000000..a4a7a9d --- /dev/null +++ b/third_party/libbpf/include/linux/kernel.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_KERNEL_H +#define __LINUX_KERNEL_H + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#ifndef max +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) +#endif + +#ifndef min +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) +#endif + +#ifndef roundup +#define roundup(x, y) ( \ +{ \ + const typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#endif + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +#endif diff --git a/third_party/libbpf/include/linux/list.h b/third_party/libbpf/include/linux/list.h new file mode 100644 index 0000000..fc91c34 --- /dev/null +++ b/third_party/libbpf/include/linux/list.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_LIST_H +#define __LINUX_LIST_H + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) + + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. 
+ * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +#endif diff --git a/third_party/libbpf/include/linux/overflow.h b/third_party/libbpf/include/linux/overflow.h new file mode 100644 index 0000000..53d7580 --- /dev/null +++ b/third_party/libbpf/include/linux/overflow.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#ifdef __GNUC__ +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 50100 +#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 +#endif +#endif + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. 
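+ *
+ * Usage sketch (an added note, not part of the upstream header):
+ * check_mul_overflow() stores a * b into *d and evaluates to true when the
+ * multiplication would overflow, so callers typically bail out on a true
+ * result, e.g. for arbitrary unsigned operands nmemb and size:
+ *
+ *	size_t total;
+ *
+ *	if (check_mul_overflow(nmemb, size, &total))
+ *		return NULL;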
+ */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (__u64)__a * (__u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +#endif diff --git a/third_party/libbpf/include/linux/ring_buffer.h b/third_party/libbpf/include/linux/ring_buffer.h new file mode 100644 index 0000000..fc4677b --- /dev/null +++ b/third_party/libbpf/include/linux/ring_buffer.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +static inline __u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ + return smp_load_acquire(&base->data_head); +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + __u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ diff --git a/third_party/libbpf/include/linux/types.h b/third_party/libbpf/include/linux/types.h new file mode 100644 index 0000000..b15252a --- /dev/null +++ b/third_party/libbpf/include/linux/types.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_TYPES_H +#define __LINUX_TYPES_H + +#include +#include +#include + +#include + +#include +#include + +#define __bitwise__ +#define __bitwise __bitwise__ + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +#ifndef __aligned_u64 +# define __aligned_u64 __u64 __attribute__((aligned(8))) +#endif + +struct list_head { + struct list_head *next, *prev; +}; + +#endif diff --git a/third_party/libbpf/include/uapi/linux/bpf.h b/third_party/libbpf/include/uapi/linux/bpf.h new file mode 100644 index 0000000..60a9d59 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/bpf.h @@ -0,0 +1,7198 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +/* jmp encodings */ +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ +}; + +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; +}; + +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. 
Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). 
+ * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. 
+ * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. 
+ * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map + * types, the *flags* argument needs to be set to 0, but for other + * map types, it may be specified as: + * + * **BPF_F_LOCK** + * Look up and delete the value of a spin-locked map + * without returning the lock. This must be specified if + * the elements contain a spinlock. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * * **BPF_MAP_TYPE_HASH** + * * **BPF_MAP_TYPE_PERCPU_HASH** + * * **BPF_MAP_TYPE_LRU_HASH** + * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. 
The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. 
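+ *
+ *		A minimal sketch of a batched delete through the *batch*
+ *		anonymous struct of **union bpf_attr** (the *map_fd* and the
+ *		4-byte key type used here are illustrative assumptions):
+ *
+ *		::
+ *
+ *			__u32 keys[64];		// filled by the caller
+ *			union bpf_attr attr = {};
+ *
+ *			attr.batch.map_fd = map_fd;
+ *			attr.batch.keys = (__u64)(unsigned long)keys;
+ *			attr.batch.count = 64;	// in: # of keys, out: # deleted
+ *			syscall(__NR_bpf, BPF_MAP_DELETE_BATCH,
+ *				&attr, sizeof(attr));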
+ * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. 
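+ *
+ *		A minimal sketch, assuming *link_fd* was obtained from a
+ *		previous **BPF_LINK_CREATE** for an iterator program:
+ *
+ *		::
+ *
+ *			union bpf_attr attr = {};
+ *			char buf[4096];
+ *			int iter_fd;
+ *
+ *			attr.iter_create.link_fd = link_fd;
+ *			iter_fd = syscall(__NR_bpf, BPF_ITER_CREATE,
+ *					  &attr, sizeof(attr));
+ *			// each read() runs the iterator program again
+ *			while (read(iter_fd, buf, sizeof(buf)) > 0)
+ *				;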
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, + BPF_PROG_RUN = BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. 
+ */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, +}; + +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, + BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + BPF_STRUCT_OPS, + BPF_NETFILTER, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, + + MAX_BPF_LINK_TYPE, +}; + +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No 
further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) +#define BPF_F_REPLACE (1U << 2) + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. 
However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX] + * insn[0].imm: map fd or fd_idx + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP + */ +#define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_IDX 5 + +/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE + * insn[0].imm: map fd or fd_idx + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ +#define BPF_PSEUDO_MAP_VALUE 2 +#define BPF_PSEUDO_MAP_IDX_VALUE 6 + +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
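+ *
+ * As an illustration, a hand-assembled 64-bit immediate load that refers to
+ * a map by file descriptor spans two instructions (a sketch; *map_fd* is
+ * assumed to come from an earlier BPF_MAP_CREATE):
+ *
+ *	struct bpf_insn insns[] = {
+ *		{ .code = BPF_LD | BPF_DW | BPF_IMM, .dst_reg = BPF_REG_1,
+ *		  .src_reg = BPF_PSEUDO_MAP_FD, .imm = map_fd },
+ *		{ 0 },	// second half of the 64-bit immediate (imm = 0)
+ *	};
+ *
+ * The verifier rewrites such a pair into the kernel address of the map and
+ * tracks the destination register as CONST_PTR_TO_MAP.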
+ */ +#define BPF_PSEUDO_FUNC 4 + +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 + +/* flags for BPF_MAP_UPDATE_ELEM command */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; + +/* flags for BPF_MAP_CREATE command */ +enum { + BPF_F_NO_PREALLOC = (1U << 0), +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ + BPF_F_NO_COMMON_LRU = (1U << 1), +/* Specify numa node during map creation */ + BPF_F_NUMA_NODE = (1U << 2), + +/* Flags for accessing BPF object from syscall side. */ + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), + +/* Flag for stack_map, store build_id+offset instead of pointer */ + BPF_F_STACK_BUILD_ID = (1U << 5), + +/* Zero-initialize hash function seed. This should only be used for testing. */ + BPF_F_ZERO_SEED = (1U << 6), + +/* Flags for accessing BPF object from program side. */ + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), + +/* Clone map from listener for newly accepted socket */ + BPF_F_CLONE = (1U << 9), + +/* Enable memory-mapping BPF map */ + BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), +}; + +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are always returned 0. 
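+ *
+ * A minimal sketch of such a query (the *cgroup_fd*, the attach type and
+ * the array size below are illustrative assumptions):
+ *
+ *	__u32 prog_ids[64];
+ *	union bpf_attr attr = {};
+ *
+ *	attr.query.target_fd = cgroup_fd;
+ *	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
+ *	attr.query.query_flags = BPF_F_QUERY_EFFECTIVE;
+ *	attr.query.prog_ids = (__u64)(unsigned long)prog_ids;
+ *	attr.query.prog_cnt = 64;
+ *	syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
+ *	// attr.query.prog_cnt now holds the number of effective programs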
+ */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) + +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + +#define BPF_OBJ_NAME_LEN 16U + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + */ + __u64 map_extra; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). 
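+ *
+ * For example (a sketch; *insns*, *insn_cnt* and the surrounding loader
+ * code are assumed to exist), a BPF_PROG_TYPE_CGROUP_SOCK_ADDR program
+ * announces the hook it will later be attached to:
+ *
+ *	union bpf_attr attr = {};
+ *	int prog_fd;
+ *
+ *	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+ *	attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
+ *	attr.insns = (__u64)(unsigned long)insns;
+ *	attr.insn_cnt = insn_cnt;
+ *	attr.license = (__u64)(unsigned long)"GPL";
+ *	prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));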
+ */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __u32 target_fd; /* container object to attach to */ + __u32 attach_bpf_fd; /* eBPF program to attach */ + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; /* previously attached eBPF + * program to replace if + * BPF_F_REPLACE is used + */ + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __u32 target_fd; /* container object to query */ + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + __u32 prog_cnt; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). 
+ * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; + }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; + __u32 flags; /* extra flags */ + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; + } link_update; + + struct { + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + +} __attribute__((aligned(8))); + +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. 
Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * Return + * Current *ktime*. + * + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/tracing/trace* from TraceFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. 
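+ *
+ *		As a short sketch for a tracing program (the *pid* value is
+ *		assumed to have been obtained, e.g. from
+ *		**bpf_get_current_pid_tgid**\ ()):
+ *
+ *		::
+ *
+ *			char fmt[] = "hello from pid %d\n";
+ *
+ *			bpf_trace_printk(fmt, sizeof(fmt), pid);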
+ * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. 
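+ *
+ *		For instance, a sketch of rewriting the IPv4 destination
+ *		address of a TC-handled packet and patching the header
+ *		checksum (*old_ip* and *new_ip* are assumed **__be32**
+ *		values; ETH_HLEN and **struct iphdr** come from the usual
+ *		network headers):
+ *
+ *		::
+ *
+ *			int csum_off = ETH_HLEN + offsetof(struct iphdr, check);
+ *			int dst_off  = ETH_HLEN + offsetof(struct iphdr, daddr);
+ *
+ *			bpf_l3_csum_replace(skb, csum_off, old_ip, new_ip, 4);
+ *			bpf_skb_store_bytes(skb, dst_off, &new_ip, 4, 0);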
+ * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. 
This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + * + * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * long bpf_get_current_comm(void *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. + * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". 
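+ *
+ *		A sketch of typical use from a TC egress classifier (what is
+ *		done with the returned value is an illustrative assumption):
+ *
+ *		::
+ *
+ *			__u32 classid = bpf_get_cgroup_classid(skb);
+ *
+ *			if (classid == 0)
+ *				return TC_ACT_OK;   // task not tagged by net_cls
+ *			// apply a per-cgroup policy keyed on classid ...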
+ * Return + * The classid, or 0 for the default unconfigured classid. + * + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * + * long bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). 
Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. 
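+ *
+ *		A minimal sketch of emitting a custom struct from a tracing
+ *		program (the *events* map of type
+ *		**BPF_MAP_TYPE_PERF_EVENT_ARRAY** and the event layout are
+ *		illustrative assumptions):
+ *
+ *		::
+ *
+ *			struct event_t {
+ *				__u32 pid;
+ *				char comm[16];
+ *			} e = {};
+ *
+ *			e.pid = bpf_get_current_pid_tgid() >> 32;
+ *			bpf_get_current_comm(e.comm, sizeof(e.comm));
+ *			bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
+ *					      &e, sizeof(e));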
+ * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. 
+ * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. 
+ * **PACKET_MULTICAST**
+ * Send packet to group.
+ * **PACKET_OTHERHOST**
+ * Send packet to someone else.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * Description
+ * Check whether *skb* is a descendant of the cgroup2 held by
+ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return
+ * The return value depends on the result of the test, and can be:
+ *
+ * * 0, if the *skb* failed the cgroup2 descendant test.
+ * * 1, if the *skb* succeeded the cgroup2 descendant test.
+ * * A negative error code, if an error occurred.
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * Description
+ * Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * not set, in particular if the hash was cleared due to mangling,
+ * recompute this hash. Later accesses to the hash can be done
+ * directly with *skb*\ **->hash**.
+ *
+ * Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * protocol with **bpf_skb_change_proto**\ (), or calling
+ * **bpf_skb_store_bytes**\ () with the
+ * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * the hash and to trigger a new computation for the next call to
+ * **bpf_get_hash_recalc**\ ().
+ * Return
+ * The 32-bit hash.
+ *
+ * u64 bpf_get_current_task(void)
+ * Description
+ * Get the current task.
+ * Return
+ * A pointer to the current task struct.
+ *
+ * long bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * Description
+ * Attempt in a safe way to write *len* bytes from the buffer
+ * *src* to *dst* in memory. It only works for threads that are in
+ * user context, and *dst* must be a valid user space address.
+ *
+ * This helper should not be used to implement any kind of
+ * security mechanism because of TOC-TOU attacks, but rather to
+ * debug, divert, and manipulate execution of semi-cooperative
+ * processes.
+ *
+ * Keep in mind that this feature is meant for experiments, and it
+ * has a risk of crashing the system and running programs.
+ * Therefore, when an eBPF program using this helper is attached,
+ * a warning including PID and process name is printed to kernel
+ * logs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * Description
+ * Check whether the probe is being run in the context of a given
+ * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return
+ * The return value depends on the result of the test, and can be:
+ *
+ * * 1, if current task belongs to the cgroup2.
+ * * 0, if current task does not belong to the cgroup2.
+ * * A negative error code, if an error occurred.
+ *
+ * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * Description
+ * Resize (trim or grow) the packet associated to *skb* to the
+ * new *len*. The *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * The basic idea is that the helper performs the needed work to
+ * change the size of the packet, then the eBPF program rewrites
+ * the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * and others. This helper is a slow path utility intended for
+ * replies with control messages. 
And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. + * + * long bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. 
The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * Return + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** context. + * Return + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** context. + * Return + * A 8-byte long unique number. 
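+ *
+ * For the *skb* variant of **bpf_get_socket_cookie**\ () above, a
+ * minimal sketch (program type, section name and libbpf-style
+ * macros are assumptions, not part of this description) could look
+ * like:
+ *
+ * ::
+ *
+ *     SEC("cgroup_skb/egress")
+ *     int tag_per_socket(struct __sk_buff *skb)
+ *     {
+ *         __u64 cookie = bpf_get_socket_cookie(skb);
+ *
+ *         if (cookie)
+ *             bpf_printk("socket cookie: %llu", cookie);
+ *         return 1; // let the packet pass
+ *     }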
+ *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *sk*, but gets socket from a BTF **struct sock**. This helper
+ * also works for sleepable programs.
+ * Return
+ * A 8-byte long unique number or 0 if *sk* is NULL.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * Description
+ * Get the owner UID of the socket associated to *skb*.
+ * Return
+ * The owner UID of the socket associated to *skb*. If the socket
+ * is **NULL**, or if it is not a full socket (i.e. if it is a
+ * time-wait or a request socket instead), **overflowuid** value
+ * is returned (note that **overflowuid** might also be the actual
+ * UID value for the socket).
+ *
+ * long bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * Description
+ * Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * to value *hash*.
+ * Return
+ * 0
+ *
+ * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
+ * Description
+ * Emulate a call to **setsockopt()** on the socket associated to
+ * *bpf_socket*, which must be a full socket. The *level* at
+ * which the option resides and the name *optname* of the option
+ * must be specified, see **setsockopt(2)** for more information.
+ * The option value of length *optlen* is pointed by *optval*.
+ *
+ * *bpf_socket* should be one of the following:
+ *
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * This helper actually implements a subset of **setsockopt()**.
+ * It supports the following *level*\ s:
+ *
+ * * **SOL_SOCKET**, which supports the following *optname*\ s:
+ * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
+ * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**,
+ * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**.
+ * * **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
+ * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
+ * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
+ * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
+ * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
+ * **TCP_BPF_RTO_MIN**.
+ * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * * **IPPROTO_IPV6**, which supports the following *optname*\ s:
+ * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
+ * Description
+ * Grow or shrink the room for data in the packet associated to
+ * *skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * By default, the helper will reset any offloaded checksum
+ * indicator of the skb to CHECKSUM_NONE. This can be avoided
+ * by the following flag:
+ *
+ * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
+ * checksum data of the skb to CHECKSUM_NONE.
+ *
+ * There are two supported modes at this time:
+ *
+ * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ * (room space is added or removed between the layer 2 and
+ * layer 3 headers).
+ *
+ * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * (room space is added or removed between the layer 3 and
+ * layer 4 headers). 
+ * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * Return + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + * + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. 
+ * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. 
In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. 
As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. 
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. 
The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. 
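+ *
+ * As an illustrative sketch only (section name, buffer size and
+ * libbpf-style macros are assumptions, not part of this
+ * description), a kprobe program could capture the current kernel
+ * stack into a local buffer:
+ *
+ * ::
+ *
+ *     SEC("kprobe/do_unlinkat")
+ *     int capture_stack(struct pt_regs *ctx)
+ *     {
+ *         __u64 stack[32];
+ *         long len;
+ *
+ *         // up to 32 kernel frames; pass BPF_F_USER_STACK in
+ *         // flags to collect a user stack instead
+ *         len = bpf_get_stack(ctx, stack, sizeof(stack), 0);
+ *         if (len < 0)
+ *             return 0;
+ *         bpf_printk("copied %ld bytes of stack", len);
+ *         return 0;
+ *     }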
+ * + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. 
+ * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. 
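+ *
+ * As an illustrative sketch only for the **bpf_sk_redirect_hash**\ ()
+ * helper described above (map layout, key fields and section name
+ * are assumptions, not part of this description), a stream verdict
+ * program could look like:
+ *
+ * ::
+ *
+ *     struct sock_key { __u32 dip; __u32 dport; };
+ *
+ *     struct {
+ *         __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ *         __uint(max_entries, 1024);
+ *         __type(key, struct sock_key);
+ *         __type(value, __u64);
+ *     } sock_hash SEC(".maps");
+ *
+ *     SEC("sk_skb/stream_verdict")
+ *     int redirect_by_hash(struct __sk_buff *skb)
+ *     {
+ *         struct sock_key key = {
+ *             .dip = skb->remote_ip4,
+ *             .dport = skb->remote_port,
+ *         };
+ *
+ *         // deliver to the socket stored under this key, ingress side
+ *         return bpf_sk_redirect_hash(skb, &sock_hash, &key, BPF_F_INGRESS);
+ *     }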
+ * + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". 
+ * Return + * 0 + * + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + * + * void *bpf_get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * Return + * A pointer to the local storage area. + * + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. 
If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. 
+ * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_sk_release(void *sock) + * Description + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). 
+ * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * long bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. 
The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + * + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. 
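+ *
+ *      Editor's illustrative sketch, not part of the upstream helper
+ *      documentation: a minimal **BPF_PROG_TYPE_CGROUP_SYSCTL** program
+ *      using **bpf_sysctl_get_name**\ (). It assumes vmlinux.h and
+ *      bpf_helpers.h are included; the program name is made up.
+ *
+ *      ::
+ *
+ *          SEC("cgroup/sysctl")
+ *          int sysctl_logger(struct bpf_sysctl *ctx)
+ *          {
+ *              char name[64];
+ *
+ *              // Copy only the base name, e.g. "tcp_mem" rather
+ *              // than "net/ipv4/tcp_mem".
+ *              if (bpf_sysctl_get_name(ctx, name, sizeof(name),
+ *                                      BPF_F_SYSCTL_BASE_NAME) < 0)
+ *                  return 1;
+ *              bpf_printk("sysctl accessed: %s", name);
+ *              return 1;   // 1 == allow the sysctl access
+ *          }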
+ * + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. 
+ * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + * + * long bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. 
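+ *
+ *      Editor's illustrative sketch, not part of the upstream helper
+ *      documentation: reading kernel memory with
+ *      **bpf_probe_read_kernel**\ () from a kprobe. vmlinux.h and
+ *      bpf_helpers.h are assumed; the attach point is only an example.
+ *
+ *      ::
+ *
+ *          SEC("kprobe/do_nanosleep")
+ *          int read_comm(struct pt_regs *ctx)
+ *          {
+ *              struct task_struct *task;
+ *              char comm[16] = {};
+ *
+ *              task = (struct task_struct *)bpf_get_current_task();
+ *              // Safe kernel-memory read: returns a negative error
+ *              // instead of faulting on a bad address.
+ *              bpf_probe_read_kernel(comm, sizeof(comm), &task->comm);
+ *              bpf_printk("comm=%s", comm);
+ *              return 0;
+ *          }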
+ * + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + * + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies + * + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. 
If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * Return + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. 
When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * Return + * Current *ktime*. + * + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. 
For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(void *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. 
+ * Return + * 0 on success, or a negative error in case of failure. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. 
+ * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. 
It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. 
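+ *
+ *      Editor's illustrative sketch, not part of the upstream helper
+ *      documentation: pairing **bpf_reserve_hdr_opt**\ () and
+ *      **bpf_store_hdr_opt**\ () in a **BPF_PROG_TYPE_SOCK_OPS** program,
+ *      reusing the experimental kind 254 / magic 0xeB9F layout described
+ *      above. Enabling the two callbacks with
+ *      **bpf_sock_ops_cb_flags_set**\ () is omitted for brevity.
+ *
+ *      ::
+ *
+ *          SEC("sockops")
+ *          int tcp_hdr_opt(struct bpf_sock_ops *skops)
+ *          {
+ *              __u8 opt[4] = { 254, 4, 0xeB, 0x9F };
+ *
+ *              switch (skops->op) {
+ *              case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+ *                  // Ask for room for the whole option.
+ *                  bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
+ *                  break;
+ *              case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+ *                  // Write kind, kind-length and the 2-byte magic.
+ *                  bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
+ *                  break;
+ *              }
+ *              return 1;
+ *          }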
+ * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. 
+ * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. 
+ * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
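+ *
+ *      Editor's illustrative sketch, not part of the upstream helper
+ *      documentation: **bpf_task_storage_get**\ () together with
+ *      **bpf_get_current_task_btf**\ (), assuming an LSM attach point
+ *      (CONFIG_BPF_LSM) plus vmlinux.h, bpf_helpers.h and bpf_tracing.h;
+ *      map and program names are made up.
+ *
+ *      ::
+ *
+ *          struct {
+ *              __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ *              __uint(map_flags, BPF_F_NO_PREALLOC);
+ *              __type(key, int);
+ *              __type(value, __u64);
+ *          } first_open SEC(".maps");
+ *
+ *          SEC("lsm/file_open")
+ *          int BPF_PROG(trace_open, struct file *file)
+ *          {
+ *              struct task_struct *task = bpf_get_current_task_btf();
+ *              __u64 *ts;
+ *
+ *              // Create a zero-initialized per-task slot on first use.
+ *              ts = bpf_task_storage_get(&first_open, task, NULL,
+ *                                        BPF_LOCAL_STORAGE_GET_F_CREATE);
+ *              if (ts && !*ts)
+ *                  *ts = bpf_ktime_get_coarse_ns();
+ *              return 0;
+ *          }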
+ * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. 
+ * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. + * + * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) + * Description + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * Return + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + * + * long bpf_sys_close(u32 fd) + * Description + * Execute close syscall for given FD. + * Return + * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. 
+ * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * Return + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). 
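A minimal sketch of how the bpf_timer_* helpers above fit together: the timer lives inside a map value, is initialized once, given a callback, and then armed. This assumes vmlinux.h and libbpf's bpf_helpers.h; the map name, attach point, and callback are illustrative only, not part of this header.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

#ifndef CLOCK_MONOTONIC
#define CLOCK_MONOTONIC 1   /* clockid carried in the low 4 bits of bpf_timer_init() flags */
#endif

struct elem {
    struct bpf_timer t;
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, int);
    __type(value, struct elem);
} timer_map SEC(".maps");

/* callback shape expected by bpf_timer_set_callback(): (map, key, value) */
static int timer_cb(void *map, int *key, struct elem *val)
{
    bpf_printk("timer fired");
    return 0;
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int arm_timer(void *ctx)
{
    int key = 0;
    struct elem *val = bpf_map_lookup_elem(&timer_map, &key);

    if (!val)
        return 0;
    bpf_timer_init(&val->t, &timer_map, CLOCK_MONOTONIC);
    bpf_timer_set_callback(&val->t, timer_cb);
    bpf_timer_start(&val->t, 1000000000 /* 1s, relative */, 0);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";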
+ * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. + * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + * + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * Return + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. 
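bpf_find_vma() above follows the same callback convention as the other iteration helpers; a short sketch of looking up the vma that contains a user-supplied address, assuming vmlinux.h and libbpf's bpf_helpers.h. The attach point and the target_addr global are hypothetical placeholders.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

__u64 target_addr;  /* hypothetical: filled in by user space before the program runs */

/* documented callback shape: (task, vma, callback_ctx) */
static long vma_cb(struct task_struct *task, struct vm_area_struct *vma, void *data)
{
    __u64 *vm_start = data;

    *vm_start = vma->vm_start;
    return 0;
}

SEC("kprobe/do_nanosleep")
int find_vma_example(void *ctx)
{
    struct task_struct *task = bpf_get_current_task_btf();
    __u64 vm_start = 0;
    long ret;

    ret = bpf_find_vma(task, target_addr, vma_cb, &vm_start, 0);
    if (!ret)
        bpf_printk("addr %llx is in a vma starting at %llx", target_addr, vm_start);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";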
+ * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * Return + * The number of argument registers of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * Return + * The BPF program's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * Return + * 0 on success, or a negative error in case of failure. 
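A minimal sketch of the bpf_loop() contract described above: the callback runs once per iteration with a stack-resident context, and returning 1 stops early. It assumes vmlinux.h and libbpf's bpf_helpers.h; names and the attach point are illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct loop_ctx {
    __u64 sum;
};

/* documented callback shape: long (*callback_fn)(u32 index, void *ctx) */
static long add_one(__u32 index, void *ctx)
{
    struct loop_ctx *lc = ctx;

    lc->sum += index + 1;
    return 0;   /* 0 = continue with the next iteration, 1 = stop */
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int loop_example(void *ctx)
{
    struct loop_ctx lc = {};
    long n;

    n = bpf_loop(100, add_one, &lc, 0);  /* returns the number of loops performed */
    bpf_printk("ran %ld iterations, sum 1..100 = %llu", n, lc.sum);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";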
+ * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + * + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) + * Description + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. 
+ * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + * + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + * + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. 
They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. 
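The dynptr-based ring buffer helpers above pair every reserve with a mandatory submit or discard, which the verifier enforces on all paths. A sketch of that flow, assuming vmlinux.h and libbpf's bpf_helpers.h; the map, event layout, and attach point are illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct event {
    __u32 pid;
};

struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 4096);  /* must be a page-aligned power of two */
} rb SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_getpid")
int dynptr_example(void *ctx)
{
    struct bpf_dynptr ptr;
    struct event *e;

    /* a submit or discard must follow the reserve even when it fails */
    if (bpf_ringbuf_reserve_dynptr(&rb, sizeof(struct event), 0, &ptr)) {
        bpf_ringbuf_discard_dynptr(&ptr, 0);
        return 0;
    }

    e = bpf_dynptr_data(&ptr, 0, sizeof(struct event));
    if (e)
        e->pid = bpf_get_current_pid_tgid() >> 32;
    bpf_ringbuf_submit_dynptr(&ptr, 0);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";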
+ * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. 
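bpf_user_ringbuf_drain(), documented above, uses the same callback pattern but consumes samples published from user space; each sample is handed to the callback as a dynptr. A sketch of the kernel-side consumer, assuming vmlinux.h and libbpf's bpf_helpers.h; the map name, sample layout, and attach point are illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_USER_RINGBUF);
    __uint(max_entries, 4096);
} user_rb SEC(".maps");

/* documented callback shape: long (*callback_fn)(const struct bpf_dynptr *dynptr, void *ctx) */
static long drain_cb(struct bpf_dynptr *dynptr, void *ctx)
{
    __u64 *drained = ctx;
    __u32 first_word = 0;

    /* copy the first 4 bytes of each user-space sample, if the sample is large enough */
    bpf_dynptr_read(&first_word, sizeof(first_word), dynptr, 0, 0);
    (*drained)++;
    return 0;   /* keep draining, up to BPF_MAX_USER_RINGBUF_SAMPLES */
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int drain_example(void *ctx)
{
    __u64 drained = 0;
    long ret;

    ret = bpf_user_ringbuf_drain(&user_rb, drain_cb, &drained, 0);
    bpf_printk("drain returned %ld, callback ran %llu times", ret, drained);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";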
+ */ +#define ___BPF_FUNC_MAPPER(FN, ctx...) \ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, 
##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + 
FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ + /* */ + +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, +enum bpf_func_id { + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; + +/* BPF_FUNC_l4_csum_replace flags. */ +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +enum { + BPF_F_INGRESS = (1ULL << 0), +}; + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; + +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), +/* flags used by BPF_FUNC_get_stackid only. */ + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), +/* flags used by BPF_FUNC_get_stack only. */ + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), +}; + +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, +/* BPF_FUNC_perf_event_output for sk_buff input context. */ + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; + +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + +/* BPF_FUNC_skb_adjust_room flags. 
*/ +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), +}; + +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; + +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; + +/* BPF_FUNC__storage_get flags */ +enum { + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; + +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + +/* Mode for BPF_FUNC_skb_adjust_room helper. */ +enum bpf_adj_room_mode { + BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, +}; + +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, +}; + +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + +enum { + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ +}; + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; + + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... 
*/ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; + __u8 tstamp_type; + __u32 :24; /* Padding, future use. */ + __u64 hwtstamp; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; + __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; +}; + +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + __u16 ext; /* Padding, future use. */ + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; + __u32 mark; + __u32 priority; + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; + __s32 rx_queue_mapping; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. 
*/ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +}; + +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + +struct bpf_xdp_sock { + __u32 queue_id; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, + XDP_REDIRECT, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; + __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ +}; + +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. 
+ */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +enum sk_action { + SK_DROP = 0, + SK_PASS, +}; + +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ +}; + +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. 
+ */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); +}; + +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ + __u64 netns_dev; + __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; + __u64 recursion_misses; + __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 btf_vmlinux_value_type_id; + __u64 netns_dev; + __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 :32; /* alignment pad */ + __u64 map_extra; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; +} __attribute__((aligned(8))); + +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ + union { + struct { + __u32 map_id; + } map; + }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; + }; + } iter; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; + struct { + __u32 ifindex; + } xdp; + struct { + __u32 map_id; + } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; +} __attribute__((aligned(8))); + +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. 
+ * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __bpf_md_ptr(struct bpf_sock *, sk); +}; + +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ + __u64 skb_hwtstamp; +}; + +/* Definitions for bpf_sock_ops_cb_flags */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. 
The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. 
+ * Arg1: old_state + * Arg2: new_state + */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + + BPF_TCP_MAX_STATES /* Leave at the end! */ +}; + +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. 
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ +}; + +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; + +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; + +struct bpf_cgroup_dev_ctx { + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; + __u32 major; + __u32 minor; +}; + +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), +}; + +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + +struct bpf_fib_lookup { + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + + /* output: MTU value */ + __u16 mtu_result; + }; + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ + + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; + }; + + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route + */ + union { + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. 
+ */ + __u32 tbid; + }; + + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; + +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; + __u32 flags; + __be32 flow_label; +}; + +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + +struct bpf_spin_lock { + __u32 val; +}; + +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_dynptr { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_head { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_root { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 :64; + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + +struct bpf_refcount { + __u32 :32; +} __attribute__((aligned(4))); + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ +}; + +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ +struct bpf_sk_lookup { + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ + __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ +}; + +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. + */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ +}; + +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). 
It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), +}; + +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/third_party/libbpf/include/uapi/linux/bpf_common.h b/third_party/libbpf/include/uapi/linux/bpf_common.h new file mode 100644 index 0000000..ee97668 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/bpf_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_BPF_COMMON_H__ +#define _UAPI__LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git 
a/third_party/libbpf/include/uapi/linux/btf.h b/third_party/libbpf/include/uapi/linux/btf.h new file mode 100644 index 0000000..ec1798b --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/btf.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x000fffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x00ffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union, enum, fwd and enum64 + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) + +enum { + BTF_KIND_UNKN = 0, /* Unknown */ + BTF_KIND_INT = 1, /* Integer */ + BTF_KIND_PTR = 2, /* Pointer */ + BTF_KIND_ARRAY = 3, /* Array */ + BTF_KIND_STRUCT = 4, /* Struct */ + BTF_KIND_UNION = 5, /* Union */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ + BTF_KIND_FWD = 7, /* Forward */ + BTF_KIND_TYPEDEF = 8, /* Typedef */ + BTF_KIND_VOLATILE = 9, /* Volatile */ + BTF_KIND_CONST = 10, /* Const */ + BTF_KIND_RESTRICT = 11, /* Restrict */ + BTF_KIND_FUNC = 12, /* Function */ + BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ + BTF_KIND_VAR = 14, /* Variable */ + BTF_KIND_DATASEC = 15, /* Section */ + BTF_KIND_FLOAT = 16, /* Floating point */ + BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ + + NR_BTF_KINDS, + BTF_KIND_MAX = NR_BTF_KINDS - 1, +}; + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). 
+ */ +struct btf_enum { + __u32 name_off; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name_off; + __u32 type; + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; +}; + +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, +}; + +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + +/* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe + * additional information related to the tag applied location. + * If component_idx == -1, the tag is applied to a struct, union, + * variable or function. Otherwise, it is applied to a struct/union + * member or a func argument, and component_idx indicates which member + * or argument (0 ... vlen-1). + */ +struct btf_decl_tag { + __s32 component_idx; +}; + +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/third_party/libbpf/include/uapi/linux/fcntl.h b/third_party/libbpf/include/uapi/linux/fcntl.h new file mode 100644 index 0000000..e8c07da --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/fcntl.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_FCNTL_H +#define _UAPI_LINUX_FCNTL_H + +#include +#include + +#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) +#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) + +/* + * Cancel a blocking posix lock; internal use only until we expose an + * asynchronous lock api to userspace: + */ +#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5) + +/* Create a file descriptor with FD_CLOEXEC set. */ +#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6) + +/* + * Request nofications on a directory. + * See below for events that may be notified. 
+ */ +#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) + +/* + * Set and get of pipe page size array + */ +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) + +/* + * Set/Get seals + */ +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +/* + * Types of seals + */ +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ +/* (1U << 31) is reserved for signed error codes */ + +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. + */ +#define RWH_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + +/* + * Types of directory notifications that may be requested. + */ +#define DN_ACCESS 0x00000001 /* File accessed */ +#define DN_MODIFY 0x00000002 /* File modified */ +#define DN_CREATE 0x00000004 /* File created */ +#define DN_DELETE 0x00000008 /* File removed */ +#define DN_RENAME 0x00000010 /* File renamed */ +#define DN_ATTRIB 0x00000020 /* File changed attibutes */ +#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ + +/* + * The constants AT_REMOVEDIR and AT_EACCESS have the same value. AT_EACCESS is + * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to + * unlinkat. The two functions do completely different things and therefore, + * the flags can be allowed to overlap. For example, passing AT_REMOVEDIR to + * faccessat would be undefined behavior and thus treating it equivalent to + * AT_EACCESS is valid undefined behavior. + */ +#define AT_FDCWD -100 /* Special value used to indicate + openat should use the current + working directory. */ +#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ +#define AT_EACCESS 0x200 /* Test access permitted for + effective IDs, not real IDs. */ +#define AT_REMOVEDIR 0x200 /* Remove directory instead of + unlinking file. */ +#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. 
*/ +#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ +#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ + +#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */ +#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */ +#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */ +#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */ + +#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ + +#endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/third_party/libbpf/include/uapi/linux/if_link.h b/third_party/libbpf/include/uapi/linux/if_link.h new file mode 100644 index 0000000..39e659c --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/if_link.h @@ -0,0 +1,1284 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include +#include + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; + __u32 collisions; + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; +}; + +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. 
+ * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. 
+ * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + */ +struct rtnl_link_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + __u64 rx_nohandler; +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. 
+ * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + __IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. 
tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? */ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority */ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + 
IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; +}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + +/* 
IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* VXLAN section */ +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, + IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + 
IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * [IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] 
+ * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... + * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + IFLA_HSR_SLAVE2, + IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. 
+ * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. 
arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/third_party/libbpf/include/uapi/linux/if_xdp.h b/third_party/libbpf/include/uapi/linux/if_xdp.h new file mode 100644 index 0000000..a78a809 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/if_xdp.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ +/* If this option is set, the driver might go sleep and in that case + * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be + * set. If it is set, the application need to explicitly wake up the + * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are + * running the driver and the application on the same core, you should + * use this option so that the kernel will yield to the user space + * application. 
+ */ +#define XDP_USE_NEED_WAKEUP (1 << 3) + +/* Flags for xsk_umem_config flags */ +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +/* XDP_RING flags */ +#define XDP_RING_NEED_WAKEUP (1 << 0) + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; + __u64 flags; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for other reasons */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ +}; + +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Masks for unaligned chunks mode */ +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 +#define XSK_UNALIGNED_BUF_ADDR_MASK \ + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +#endif /* _LINUX_IF_XDP_H */ diff --git a/third_party/libbpf/include/uapi/linux/netdev.h b/third_party/libbpf/include/uapi/linux/netdev.h new file mode 100644 index 0000000..639524b --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/netdev.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NETDEV_H +#define _UAPI_LINUX_NETDEV_H + +#define NETDEV_FAMILY_NAME "netdev" +#define NETDEV_FAMILY_VERSION 1 + +/** + * enum netdev_xdp_act + * @NETDEV_XDP_ACT_BASIC: XDP feautues set supported by all drivers + * (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX) + * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT + * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements + * ndo_xdp_xmit callback. + * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP + * in zero copy mode. + * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw + * offloading. + * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear + * XDP buffer support in the driver napi callback. + * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements + * non-linear XDP buffer support in ndo_xdp_xmit callback. 
+ */ +enum netdev_xdp_act { + NETDEV_XDP_ACT_BASIC = 1, + NETDEV_XDP_ACT_REDIRECT = 2, + NETDEV_XDP_ACT_NDO_XMIT = 4, + NETDEV_XDP_ACT_XSK_ZEROCOPY = 8, + NETDEV_XDP_ACT_HW_OFFLOAD = 16, + NETDEV_XDP_ACT_RX_SG = 32, + NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + NETDEV_XDP_ACT_MASK = 127, +}; + +enum { + NETDEV_A_DEV_IFINDEX = 1, + NETDEV_A_DEV_PAD, + NETDEV_A_DEV_XDP_FEATURES, + + __NETDEV_A_DEV_MAX, + NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) +}; + +enum { + NETDEV_CMD_DEV_GET = 1, + NETDEV_CMD_DEV_ADD_NTF, + NETDEV_CMD_DEV_DEL_NTF, + NETDEV_CMD_DEV_CHANGE_NTF, + + __NETDEV_CMD_MAX, + NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) +}; + +#define NETDEV_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_NETDEV_H */ diff --git a/third_party/libbpf/include/uapi/linux/netlink.h b/third_party/libbpf/include/uapi/linux/netlink.h new file mode 100644 index 0000000..0a4d733 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/netlink.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include +#include /* for __kernel_sa_family_t */ +#include + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Sending process port ID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. 
*/ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. 
*/ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. + */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. 
+ * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/third_party/libbpf/include/uapi/linux/openat2.h b/third_party/libbpf/include/uapi/linux/openat2.h new file mode 100644 index 0000000..a5feb76 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/openat2.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ + +#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/third_party/libbpf/include/uapi/linux/perf_event.h b/third_party/libbpf/include/uapi/linux/perf_event.h new file mode 100644 index 0000000..3767543 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/perf_event.h @@ -0,0 +1,1448 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. 
+ * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _UAPI_LINUX_PERF_EVENT_H +#define _UAPI_LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, + PERF_COUNT_HW_REF_CPU_CYCLES = 9, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + PERF_COUNT_HW_CACHE_NODE = 6, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, + PERF_COUNT_SW_EMULATION_FAULTS = 8, + PERF_COUNT_SW_DUMMY = 9, + PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. 
+ */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, + PERF_SAMPLE_REGS_USER = 1U << 12, + PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, + PERF_SAMPLE_TRANSACTION = 1U << 17, + PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ +}; + +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ + PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ + + PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ + PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << 
PERF_SAMPLE_BRANCH_COND_SHIFT, + + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_MAX, +}; + +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + +/* + * Values to determine ABI of the registers dump. + */ +enum perf_sample_regs_abi { + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, +}; + +/* + * Values for the memory transaction event qualifier, mostly for + * abort events. Multiple bits can be set. 
+ */ +enum { + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + + PERF_TXN_MAX = (1 << 8), /* non-ABI */ + + /* bits 32..63 are reserved for the abort code */ + + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, + + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ +#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ + /* add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + * + * @sample_max_stack: Max number of frame pointers in a callchain, + * should be < /proc/sys/kernel/perf_event_max_stack + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. 
+ */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + /* + * precise_ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ + + exclude_host : 1, /* don't count in host */ + exclude_guest : 1, /* don't count in guest */ + + exclude_callchain_kernel : 1, /* exclude kernel callchains */ + exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ + comm_exec : 1, /* flag comm events that are due to an exec */ + use_clockid : 1, /* use @clockid for time fields */ + context_switch : 1, /* context switch data */ + write_backward : 1, /* Write ring buffer from end to beginning */ + namespaces : 1, /* include namespaces data */ + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + aux_output : 1, /* generate AUX records instead of events */ + cgroup : 1, /* include cgroup events */ + text_poke : 1, /* include text poke events */ + build_id : 1, /* use build id in mmap2 events */ + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + + __u32 bp_type; + union { + __u64 bp_addr; + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ + __u64 config2; /* extension of config1 */ + }; + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + + /* + * Defines set of user regs to dump on samples. + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_user; + + /* + * Defines size of the user stack to dump on samples. + */ + __u32 sample_stack_user; + + __s32 clockid; + /* + * Defines set of regs to dump for each sample + * state captured on: + * - precise = 0: PMU interrupt + * - precise > 0: sampled instruction + * + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u16 sample_max_stack; + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. 
+ * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. + */ + __u64 sig_data; + + __u64 config3; /* extension of config2 */ +}; + +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. + */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[]; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq, time_mult, time_shift, index, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; + * + * do { + * seq = pc->lock; + * barrier() + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * index = pc->index; + * count = pc->offset; + * if (pc->cap_user_rdpmc && index) { + * width = pc->pmc_width; + * pmc = rdpmc(index - 1); + * } + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + union { + __u64 capabilities; + struct { + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; + }; + }; + + /* + * If cap_user_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. 
This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & (((u64)1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if index), improving the scaling: + * + * enabled += delta; + * if (index) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; + __u64 time_offset; + /* + * If cap_usr_time_zero, the hardware clock (e.g. TSC) can be calculated + * from sample timestamps. + * + * time = timestamp - time_zero; + * quot = time / time_mult; + * rem = time % time_mult; + * cyc = (quot << time_shift) + (rem << time_shift) / time_mult; + * + * And vice versa: + * + * quot = cyc >> time_shift; + * rem = cyc & (((u64)1 << time_shift) - 1); + * timestamp = time_zero + quot * time_mult + + * ((rem * time_mult) >> time_shift); + */ + __u64 time_zero; + + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; + + /* + * Hole for extension of the self monitor capabilities + */ + + __u8 __reserved[116*8]; /* align to 1k. */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. 
+ */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; +}; + +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) + +/* + * Indicates that /proc/PID/maps parsing are truncated by time out. + */ +#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12) +/* + * Following PERF_RECORD_MISC_* are used on different + * events, so can reuse the same bit position: + * + * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events + * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) + * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events + */ +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) +/* + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. + */ +#define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) +/* + * Reserve the last bit to indicate some extended misc field + */ +#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +}; + +enum perf_event_type { + + /* + * If perf_event_attr.sample_id_all is set then all event types will + * have the sample_type selected fields related to where/when + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. 
+ * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. + */ + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. 
+ * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 nr; + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK + * + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 transaction; } && PERF_SAMPLE_TRANSACTION + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE + * }; + */ + PERF_RECORD_SAMPLE = 9, + + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; + * u32 prot, flags; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + + /* + * Records the dropped/lost sample number. + * + * struct { + * struct perf_event_header header; + * + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST_SAMPLES = 13, + + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. 
+ * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + + /* + * Data written to the AUX area by hardware due to aux_output, may need + * to be matched to the event by an architecture-specific hardware ID. + * This records the hardware ID, but requires sample_id to provide the + * event ID. e.g. Intel PT uses this record to disambiguate PEBS-via-PT + * records from multiple events. + * + * struct { + * struct perf_event_header header; + * u64 hw_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. 
+ */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ + +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ + +#if defined(__LITTLE_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_blk:3, /* access blocked */ + mem_hops:3, /* hop level */ + mem_rsvd:18; + }; +}; +#elif defined(__BIG_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ + mem_blk:3, /* access blocked */ + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_dtlb:7, /* tlb access */ + mem_lock:2, /* lock instr */ + mem_snoop:5, /* snoop mode */ + mem_lvl:14, /* memory hierarchy level */ + mem_op:5; /* type of opcode */ + }; +}; +#else +#error "Unknown endianness" +#endif + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* + * PERF_MEM_LVL_* namespace being depricated to some extent in the + * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace inorder to not break defined ABIs. 
+ * + * memory hierarchy (memory level, hit or miss) + */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 */ +#define PERF_MEM_LVL_L3 0x40 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ +/* 5-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + +#define PERF_MEM_S(a, s) \ + (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. 
+ * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) + * type: branch type + * spec: branch speculation info (or 0 if not supported) + */ +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + cycles:16, /* cycle count to last branch */ + type:4, /* branch type */ + spec:2, /* branch speculation info */ + new_type:4, /* additional branch type */ + priv:3, /* privilege level */ + reserved:31; +}; + +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + +#endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/third_party/libbpf/include/uapi/linux/pkt_cls.h b/third_party/libbpf/include/uapi/linux/pkt_cls.h new file mode 100644 index 0000000..3faee01 --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/pkt_cls.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include +#include + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. 
+ */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + + +/* RSVP filter */ + +enum { + TCA_RSVP_UNSPEC, + TCA_RSVP_CLASSID, + TCA_RSVP_DST, + TCA_RSVP_SRC, + TCA_RSVP_PINFO, + TCA_RSVP_POLICE, + TCA_RSVP_ACT, + __TCA_RSVP_MAX +}; + +#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) + +struct tc_rsvp_gpi { + __u32 key; + __u32 mask; + int offset; +}; + +struct tc_rsvp_pinfo { + struct tc_rsvp_gpi dpi; + struct tc_rsvp_gpi spi; + __u8 protocol; + __u8 tunnelid; + __u8 tunnelhdr; + __u8 pad; +}; + +/* ROUTE filter */ + +enum { + 
TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* TC index filter */ + +enum { + TCA_TCINDEX_UNSPEC, + TCA_TCINDEX_HASH, + TCA_TCINDEX_MASK, + TCA_TCINDEX_SHIFT, + TCA_TCINDEX_FALL_THROUGH, + TCA_TCINDEX_CLASSID, + TCA_TCINDEX_POLICE, + TCA_TCINDEX_ACT, + __TCA_TCINDEX_MAX +}; + +#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct 
in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX 
(__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/third_party/libbpf/include/uapi/linux/pkt_sched.h b/third_party/libbpf/include/uapi/linux/pkt_sched.h new file mode 100644 index 0000000..5c903ab --- /dev/null +++ b/third_party/libbpf/include/uapi/linux/pkt_sched.h @@ -0,0 +1,1164 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include <linux/types.h> + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. 
qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). 
+ * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + 
TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + TCA_HTB_OFFLOAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' 
*/ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + + +/* CBQ section */ + +#define TC_CBQ_MAXPRIO 8 +#define TC_CBQ_MAXLEVEL 8 +#define TC_CBQ_DEF_EWMA 5 + +struct tc_cbq_lssopt { + unsigned char change; + unsigned char flags; +#define TCF_CBQ_LSS_BOUNDED 1 +#define TCF_CBQ_LSS_ISOLATED 2 + unsigned char ewma_log; + unsigned char level; +#define TCF_CBQ_LSS_FLAGS 1 +#define TCF_CBQ_LSS_EWMA 2 +#define TCF_CBQ_LSS_MAXIDLE 4 +#define TCF_CBQ_LSS_MINIDLE 8 +#define TCF_CBQ_LSS_OFFTIME 0x10 +#define TCF_CBQ_LSS_AVPKT 0x20 + __u32 maxidle; + __u32 minidle; + __u32 offtime; + __u32 avpkt; +}; + +struct tc_cbq_wrropt { + unsigned char flags; + unsigned char priority; + unsigned char cpriority; + unsigned char __reserved; + __u32 allot; + __u32 weight; +}; + +struct tc_cbq_ovl { + unsigned char strategy; +#define TC_CBQ_OVL_CLASSIC 0 +#define TC_CBQ_OVL_DELAY 1 +#define TC_CBQ_OVL_LOWPRIO 2 +#define TC_CBQ_OVL_DROP 3 +#define TC_CBQ_OVL_RCLASSIC 4 + unsigned char priority2; + __u16 pad; + __u32 penalty; +}; + +struct tc_cbq_police { + unsigned char police; + unsigned char __res1; + unsigned short __res2; +}; + +struct tc_cbq_fopt { + __u32 split; + __u32 defmap; + __u32 defchange; +}; + +struct tc_cbq_xstats { + __u32 borrows; + __u32 overactions; + __s32 avgidle; + __s32 undertime; +}; + +enum { + TCA_CBQ_UNSPEC, + TCA_CBQ_LSSOPT, + TCA_CBQ_WRROPT, + TCA_CBQ_FOPT, + TCA_CBQ_OVL_STRATEGY, + TCA_CBQ_RATE, + TCA_CBQ_RTAB, + TCA_CBQ_POLICE, + __TCA_CBQ_MAX, +}; + +#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) + +/* dsmark section */ + +enum { + TCA_DSMARK_UNSPEC, + TCA_DSMARK_INDICES, + TCA_DSMARK_DEFAULT_INDEX, + TCA_DSMARK_SET_TC_INDEX, + TCA_DSMARK_MASK, + TCA_DSMARK_VALUE, + __TCA_DSMARK_MAX, +}; + +#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) + +/* ATM section */ + +enum { + TCA_ATM_UNSPEC, + TCA_ATM_FD, /* file/socket descriptor */ + TCA_ATM_PTR, /* pointer to descriptor - later */ + TCA_ATM_HDR, /* LL header */ + TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ + TCA_ATM_ADDR, /* PVC address (for output only) */ + TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ + __TCA_ATM_MAX, +}; + +#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* 
delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. 
+ */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* 
obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u32 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + 
__TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif diff --git a/third_party/libbpf/scripts/build-fuzzers.sh b/third_party/libbpf/scripts/build-fuzzers.sh new file mode 100755 index 0000000..5171e95 --- /dev/null +++ b/third_party/libbpf/scripts/build-fuzzers.sh @@ -0,0 +1,82 @@ +#!/bin/bash +set -eux + +SANITIZER=${SANITIZER:-address} +flags="-O1 -fno-omit-frame-pointer -g -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=$SANITIZER -fsanitize=fuzzer-no-link" + +export 
CC=${CC:-clang} +export CFLAGS=${CFLAGS:-$flags} + +export CXX=${CXX:-clang++} +export CXXFLAGS=${CXXFLAGS:-$flags} + +cd "$(dirname -- "$0")/.." + +export OUT=${OUT:-"$(pwd)/out"} +mkdir -p "$OUT" + +export LIB_FUZZING_ENGINE=${LIB_FUZZING_ENGINE:--fsanitize=fuzzer} + +# libelf is compiled with _FORTIFY_SOURCE by default and it +# isn't compatible with MSan. It was borrowed +# from https://github.com/google/oss-fuzz/pull/7422 +if [[ "$SANITIZER" == memory ]]; then + CFLAGS+=" -U_FORTIFY_SOURCE" + CXXFLAGS+=" -U_FORTIFY_SOURCE" +fi + +# The alignment check is turned off by default on OSS-Fuzz/CFLite so it should be +# turned on explicitly there. It was borrowed from +# https://github.com/google/oss-fuzz/pull/7092 +if [[ "$SANITIZER" == undefined ]]; then + additional_ubsan_checks=alignment + UBSAN_FLAGS="-fsanitize=$additional_ubsan_checks -fno-sanitize-recover=$additional_ubsan_checks" + CFLAGS+=" $UBSAN_FLAGS" + CXXFLAGS+=" $UBSAN_FLAGS" +fi + +# Ideally libelf should be built using release tarballs available +# at https://sourceware.org/elfutils/ftp/. Unfortunately sometimes they +# fail to compile (for example, elfutils-0.185 fails to compile with LDFLAGS enabled +# due to https://bugs.gentoo.org/794601) so let's just point the script to +# commits referring to versions of libelf that actually can be built +rm -rf elfutils +git clone git://sourceware.org/git/elfutils.git +( +cd elfutils +git checkout 67a187d4c1790058fc7fd218317851cb68bb087c +git log --oneline -1 + +# ASan isn't compatible with -Wl,--no-undefined: https://github.com/google/sanitizers/issues/380 +sed -i 's/^\(NO_UNDEFINED=\).*/\1/' configure.ac + +# ASan isn't compatible with -Wl,-z,defs either: +# https://clang.llvm.org/docs/AddressSanitizer.html#usage +sed -i 's/^\(ZDEFS_LDFLAGS=\).*/\1/' configure.ac + +if [[ "$SANITIZER" == undefined ]]; then + # That's basically what --enable-sanitize-undefined does to turn off unaligned access + # elfutils heavily relies on on i386/x86_64 but without changing compiler flags along the way + sed -i 's/\(check_undefined_val\)=[0-9]/\1=1/' configure.ac +fi + +autoreconf -i -f +if ! 
./configure --enable-maintainer-mode --disable-debuginfod --disable-libdebuginfod \ + --disable-demangler --without-bzlib --without-lzma --without-zstd \ + CC="$CC" CFLAGS="-Wno-error $CFLAGS" CXX="$CXX" CXXFLAGS="-Wno-error $CXXFLAGS" LDFLAGS="$CFLAGS"; then + cat config.log + exit 1 +fi + +make -C config -j$(nproc) V=1 +make -C lib -j$(nproc) V=1 +make -C libelf -j$(nproc) V=1 +) + +make -C src BUILD_STATIC_ONLY=y V=1 clean +make -C src -j$(nproc) CFLAGS="-I$(pwd)/elfutils/libelf $CFLAGS" BUILD_STATIC_ONLY=y V=1 + +$CC $CFLAGS -Isrc -Iinclude -Iinclude/uapi -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -c fuzz/bpf-object-fuzzer.c -o bpf-object-fuzzer.o +$CXX $CXXFLAGS $LIB_FUZZING_ENGINE bpf-object-fuzzer.o src/libbpf.a "$(pwd)/elfutils/libelf/libelf.a" -l:libz.a -o "$OUT/bpf-object-fuzzer" + +cp fuzz/bpf-object-fuzzer_seed_corpus.zip "$OUT" diff --git a/third_party/libbpf/scripts/coverity.sh b/third_party/libbpf/scripts/coverity.sh new file mode 100755 index 0000000..99e4809 --- /dev/null +++ b/third_party/libbpf/scripts/coverity.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Taken from: https://scan.coverity.com/scripts/travisci_build_coverity_scan.sh +# Local changes are annotated with "#[local]" + +set -e + +# Environment check +echo -e "\033[33;1mNote: COVERITY_SCAN_PROJECT_NAME and COVERITY_SCAN_TOKEN are available on Project Settings page on scan.coverity.com\033[0m" +[ -z "$COVERITY_SCAN_PROJECT_NAME" ] && echo "ERROR: COVERITY_SCAN_PROJECT_NAME must be set" && exit 1 +[ -z "$COVERITY_SCAN_NOTIFICATION_EMAIL" ] && echo "ERROR: COVERITY_SCAN_NOTIFICATION_EMAIL must be set" && exit 1 +[ -z "$COVERITY_SCAN_BRANCH_PATTERN" ] && echo "ERROR: COVERITY_SCAN_BRANCH_PATTERN must be set" && exit 1 +[ -z "$COVERITY_SCAN_BUILD_COMMAND" ] && echo "ERROR: COVERITY_SCAN_BUILD_COMMAND must be set" && exit 1 +[ -z "$COVERITY_SCAN_TOKEN" ] && echo "ERROR: COVERITY_SCAN_TOKEN must be set" && exit 1 + +PLATFORM=`uname` +#[local] Use /var/tmp for TOOL_ARCHIVE and TOOL_BASE, as on certain systems +# /tmp is tmpfs and is sometimes too small to handle all necessary tooling +TOOL_ARCHIVE=/var//tmp/cov-analysis-${PLATFORM}.tgz +TOOL_URL=https://scan.coverity.com/download/${PLATFORM} +TOOL_BASE=/var/tmp/coverity-scan-analysis +UPLOAD_URL="https://scan.coverity.com/builds" +SCAN_URL="https://scan.coverity.com" + +# Do not run on pull requests +if [ "${TRAVIS_PULL_REQUEST}" = "true" ]; then + echo -e "\033[33;1mINFO: Skipping Coverity Analysis: branch is a pull request.\033[0m" + exit 0 +fi + +# Verify this branch should run +IS_COVERITY_SCAN_BRANCH=`ruby -e "puts '${TRAVIS_BRANCH}' =~ /\\A$COVERITY_SCAN_BRANCH_PATTERN\\z/ ? 1 : 0"` +if [ "$IS_COVERITY_SCAN_BRANCH" = "1" ]; then + echo -e "\033[33;1mCoverity Scan configured to run on branch ${TRAVIS_BRANCH}\033[0m" +else + echo -e "\033[33;1mCoverity Scan NOT configured to run on branch ${TRAVIS_BRANCH}\033[0m" + exit 1 +fi + +# Verify upload is permitted +AUTH_RES=`curl -s --form project="$COVERITY_SCAN_PROJECT_NAME" --form token="$COVERITY_SCAN_TOKEN" $SCAN_URL/api/upload_permitted` +if [ "$AUTH_RES" = "Access denied" ]; then + echo -e "\033[33;1mCoverity Scan API access denied. 
Check COVERITY_SCAN_PROJECT_NAME and COVERITY_SCAN_TOKEN.\033[0m" + exit 1 +else + AUTH=`echo $AUTH_RES | ruby -e "require 'rubygems'; require 'json'; puts JSON[STDIN.read]['upload_permitted']"` + if [ "$AUTH" = "true" ]; then + echo -e "\033[33;1mCoverity Scan analysis authorized per quota.\033[0m" + else + WHEN=`echo $AUTH_RES | ruby -e "require 'rubygems'; require 'json'; puts JSON[STDIN.read]['next_upload_permitted_at']"` + echo -e "\033[33;1mCoverity Scan analysis NOT authorized until $WHEN.\033[0m" + exit 0 + fi +fi + +if [ ! -d $TOOL_BASE ]; then + # Download Coverity Scan Analysis Tool + if [ ! -e $TOOL_ARCHIVE ]; then + echo -e "\033[33;1mDownloading Coverity Scan Analysis Tool...\033[0m" + wget -nv -O $TOOL_ARCHIVE $TOOL_URL --post-data "project=$COVERITY_SCAN_PROJECT_NAME&token=$COVERITY_SCAN_TOKEN" + fi + + # Extract Coverity Scan Analysis Tool + echo -e "\033[33;1mExtracting Coverity Scan Analysis Tool...\033[0m" + mkdir -p $TOOL_BASE + pushd $TOOL_BASE + tar xzf $TOOL_ARCHIVE + popd +fi + +TOOL_DIR=`find $TOOL_BASE -type d -name 'cov-analysis*'` +export PATH=$TOOL_DIR/bin:$PATH + +# Build +echo -e "\033[33;1mRunning Coverity Scan Analysis Tool...\033[0m" +COV_BUILD_OPTIONS="" +#COV_BUILD_OPTIONS="--return-emit-failures 8 --parse-error-threshold 85" +RESULTS_DIR="cov-int" +eval "${COVERITY_SCAN_BUILD_COMMAND_PREPEND}" +COVERITY_UNSUPPORTED=1 cov-build --dir $RESULTS_DIR $COV_BUILD_OPTIONS $COVERITY_SCAN_BUILD_COMMAND +cov-import-scm --dir $RESULTS_DIR --scm git --log $RESULTS_DIR/scm_log.txt 2>&1 + +# Upload results +echo -e "\033[33;1mTarring Coverity Scan Analysis results...\033[0m" +RESULTS_ARCHIVE=analysis-results.tgz +tar czf $RESULTS_ARCHIVE $RESULTS_DIR +SHA=`git rev-parse --short HEAD` + +echo -e "\033[33;1mUploading Coverity Scan Analysis results...\033[0m" +response=$(curl \ + --silent --write-out "\n%{http_code}\n" \ + --form project=$COVERITY_SCAN_PROJECT_NAME \ + --form token=$COVERITY_SCAN_TOKEN \ + --form email=$COVERITY_SCAN_NOTIFICATION_EMAIL \ + --form file=@$RESULTS_ARCHIVE \ + --form version=$SHA \ + --form description="Travis CI build" \ + $UPLOAD_URL) +status_code=$(echo "$response" | sed -n '$p') +#[local] Coverity used to return 201 on success, but it's 200 now +# See https://github.com/systemd/systemd/blob/master/tools/coverity.sh#L145 +if [ "$status_code" != "200" ]; then + TEXT=$(echo "$response" | sed '$d') + echo -e "\033[33;1mCoverity Scan upload failed: $TEXT.\033[0m" + exit 1 +fi diff --git a/third_party/libbpf/scripts/sync-kernel.sh b/third_party/libbpf/scripts/sync-kernel.sh new file mode 100755 index 0000000..55ffcd1 --- /dev/null +++ b/third_party/libbpf/scripts/sync-kernel.sh @@ -0,0 +1,355 @@ +#!/bin/bash + +usage () { + echo "USAGE: ./sync-kernel.sh <libbpf-repo> <linux-repo> <bpf-branch>" + echo "" + echo "Set BPF_NEXT_BASELINE to override bpf-next tree commit, otherwise read from <libbpf-repo>/CHECKPOINT-COMMIT." + echo "Set BPF_BASELINE to override bpf tree commit, otherwise read from <libbpf-repo>/BPF-CHECKPOINT-COMMIT." + echo "Set MANUAL_MODE to 1 to manually control every cherry-picked commit." 
+ exit 1 +} + +set -eu + +LIBBPF_REPO=${1-""} +LINUX_REPO=${2-""} +BPF_BRANCH=${3-""} +BASELINE_COMMIT=${BPF_NEXT_BASELINE:-$(cat ${LIBBPF_REPO}/CHECKPOINT-COMMIT)} +BPF_BASELINE_COMMIT=${BPF_BASELINE:-$(cat ${LIBBPF_REPO}/BPF-CHECKPOINT-COMMIT)} + +if [ -z "${LIBBPF_REPO}" ] || [ -z "${LINUX_REPO}" ]; then + echo "Error: libbpf or linux repos are not specified" + usage +fi +if [ -z "${BPF_BRANCH}" ]; then + echo "Error: linux's bpf tree branch is not specified" + usage +fi +if [ -z "${BASELINE_COMMIT}" ] || [ -z "${BPF_BASELINE_COMMIT}" ]; then + echo "Error: bpf or bpf-next baseline commits are not provided" + usage +fi + +SUFFIX=$(date --utc +%Y-%m-%dT%H-%M-%S.%3NZ) +WORKDIR=$(pwd) +TMP_DIR=$(mktemp -d) + +trap "cd ${WORKDIR}; exit" INT TERM EXIT + +declare -A PATH_MAP +PATH_MAP=( \ + [tools/lib/bpf]=src \ + [tools/include/uapi/linux/bpf_common.h]=include/uapi/linux/bpf_common.h \ + [tools/include/uapi/linux/bpf.h]=include/uapi/linux/bpf.h \ + [tools/include/uapi/linux/btf.h]=include/uapi/linux/btf.h \ + [tools/include/uapi/linux/fcntl.h]=include/uapi/linux/fcntl.h \ + [tools/include/uapi/linux/openat2.h]=include/uapi/linux/openat2.h \ + [tools/include/uapi/linux/if_link.h]=include/uapi/linux/if_link.h \ + [tools/include/uapi/linux/if_xdp.h]=include/uapi/linux/if_xdp.h \ + [tools/include/uapi/linux/netdev.h]=include/uapi/linux/netdev.h \ + [tools/include/uapi/linux/netlink.h]=include/uapi/linux/netlink.h \ + [tools/include/uapi/linux/pkt_cls.h]=include/uapi/linux/pkt_cls.h \ + [tools/include/uapi/linux/pkt_sched.h]=include/uapi/linux/pkt_sched.h \ + [include/uapi/linux/perf_event.h]=include/uapi/linux/perf_event.h \ + [Documentation/bpf/libbpf]=docs \ +) + +LIBBPF_PATHS=("${!PATH_MAP[@]}" ":^tools/lib/bpf/Makefile" ":^tools/lib/bpf/Build" ":^tools/lib/bpf/.gitignore" ":^tools/include/tools/libc_compat.h") +LIBBPF_VIEW_PATHS=("${PATH_MAP[@]}") +LIBBPF_VIEW_EXCLUDE_REGEX='^src/(Makefile|Build|test_libbpf\.c|bpf_helper_defs\.h|\.gitignore)$|^docs/(\.gitignore|api\.rst|conf\.py)$|^docs/sphinx/.*' +LINUX_VIEW_EXCLUDE_REGEX='^include/tools/libc_compat.h$' + +LIBBPF_TREE_FILTER="mkdir -p __libbpf/include/uapi/linux __libbpf/include/tools && "$'\\\n' +for p in "${!PATH_MAP[@]}"; do + LIBBPF_TREE_FILTER+="git mv -kf ${p} __libbpf/${PATH_MAP[${p}]} && "$'\\\n' +done +LIBBPF_TREE_FILTER+="git rm --ignore-unmatch -f __libbpf/src/{Makefile,Build,test_libbpf.c,.gitignore} >/dev/null" + +cd_to() +{ + cd ${WORKDIR} && cd "$1" +} + +# Output brief single-line commit description +# $1 - commit ref +commit_desc() +{ + git log -n1 --pretty='%h ("%s")' $1 +} + +# Create commit single-line signature, which consists of: +# - full commit subject +# - author date in ISO8601 format +# - full commit body with newlines replaced with vertical bars (|) +# - shortstat appended at the end +# The idea is that this single-line signature is good enough to make final +# decision about whether two commits are the same, across different repos. 
+# $1 - commit ref +# $2 - paths filter +commit_signature() +{ + local ref=$1 + shift + git show --pretty='("%s")|%aI|%b' --shortstat $ref -- "${@-.}" | tr '\n' '|' +} + +# Cherry-pick commits touching libbpf-related files +# $1 - baseline_tag +# $2 - tip_tag +cherry_pick_commits() +{ + local manual_mode=${MANUAL_MODE:-0} + local baseline_tag=$1 + local tip_tag=$2 + local new_commits + local signature + local should_skip + local synced_cnt + local manual_check + local libbpf_conflict_cnt + local desc + + new_commits=$(git rev-list --no-merges --topo-order --reverse ${baseline_tag}..${tip_tag} -- "${LIBBPF_PATHS[@]}") + for new_commit in ${new_commits}; do + desc="$(commit_desc ${new_commit})" + signature="$(commit_signature ${new_commit} "${LIBBPF_PATHS[@]}")" + synced_cnt=$(grep -F "${signature}" ${TMP_DIR}/libbpf_commits.txt | wc -l) + manual_check=0 + if ((${synced_cnt} > 0)); then + # commit with the same subject is already in libbpf, but it's + # not 100% the same commit, so check with user + echo "Commit '${desc}' is synced into libbpf as:" + grep -F "${signature}" ${TMP_DIR}/libbpf_commits.txt | \ + cut -d'|' -f1 | sed -e 's/^/- /' + if ((${manual_mode} != 1 && ${synced_cnt} == 1)); then + echo "Skipping '${desc}' due to unique match..." + continue + fi + if ((${synced_cnt} > 1)); then + echo "'${desc} matches multiple commits, please, double-check!" + manual_check=1 + fi + fi + if ((${manual_mode} == 1 || ${manual_check} == 1)); then + read -p "Do you want to skip '${desc}'? [y/N]: " should_skip + case "${should_skip}" in + "y" | "Y") + echo "Skipping '${desc}'..." + continue + ;; + esac + fi + # commit hasn't been synced into libbpf yet + echo "Picking '${desc}'..." + if ! git cherry-pick ${new_commit} &>/dev/null; then + echo "Warning! Cherry-picking '${desc} failed, checking if it's non-libbpf files causing problems..." + libbpf_conflict_cnt=$(git diff --name-only --diff-filter=U -- "${LIBBPF_PATHS[@]}" | wc -l) + conflict_cnt=$(git diff --name-only | wc -l) + prompt_resolution=1 + + if ((${libbpf_conflict_cnt} == 0)); then + echo "Looks like only non-libbpf files have conflicts, ignoring..." + if ((${conflict_cnt} == 0)); then + echo "Empty cherry-pick, skipping it..." + git cherry-pick --abort + continue + fi + + git add . + # GIT_EDITOR=true to avoid editor popping up to edit commit message + if ! GIT_EDITOR=true git cherry-pick --continue &>/dev/null; then + echo "Error! That still failed! Please resolve manually." + else + echo "Success! All cherry-pick conflicts were resolved for '${desc}'!" + prompt_resolution=0 + fi + fi + + if ((${prompt_resolution} == 1)); then + read -p "Error! Cherry-picking '${desc}' failed, please fix manually and press to proceed..." + fi + fi + # Append signature of just cherry-picked commit to avoid + # potentially cherry-picking the same commit twice later when + # processing bpf tree commits. At this point we don't know yet + # the final commit sha in libbpf repo, so we record Linux SHA + # instead as LINUX_. + echo LINUX_$(git log --pretty='%h' -n1) "${signature}" >> ${TMP_DIR}/libbpf_commits.txt + done +} + +cleanup() +{ + echo "Cleaning up..." + rm -r ${TMP_DIR} + cd_to ${LINUX_REPO} + git checkout ${TIP_SYM_REF} + git branch -D ${BASELINE_TAG} ${TIP_TAG} ${BPF_BASELINE_TAG} ${BPF_TIP_TAG} \ + ${SQUASH_BASE_TAG} ${SQUASH_TIP_TAG} ${VIEW_TAG} || true + + cd_to . + echo "DONE." +} + + +cd_to ${LIBBPF_REPO} +GITHUB_ABS_DIR=$(pwd) +echo "Dumping existing libbpf commit signatures..." 
+for h in $(git log --pretty='%h' -n500); do + echo $h "$(commit_signature $h)" >> ${TMP_DIR}/libbpf_commits.txt +done + +# Use current kernel repo HEAD as a source of patches +cd_to ${LINUX_REPO} +LINUX_ABS_DIR=$(pwd) +TIP_SYM_REF=$(git symbolic-ref -q --short HEAD || git rev-parse HEAD) +TIP_COMMIT=$(git rev-parse HEAD) +BPF_TIP_COMMIT=$(git rev-parse ${BPF_BRANCH}) +BASELINE_TAG=libbpf-baseline-${SUFFIX} +TIP_TAG=libbpf-tip-${SUFFIX} +BPF_BASELINE_TAG=libbpf-bpf-baseline-${SUFFIX} +BPF_TIP_TAG=libbpf-bpf-tip-${SUFFIX} +VIEW_TAG=libbpf-view-${SUFFIX} +LIBBPF_SYNC_TAG=libbpf-sync-${SUFFIX} + +# Squash state of kernel repo at baseline into single commit +SQUASH_BASE_TAG=libbpf-squash-base-${SUFFIX} +SQUASH_TIP_TAG=libbpf-squash-tip-${SUFFIX} +SQUASH_COMMIT=$(git commit-tree ${BASELINE_COMMIT}^{tree} -m "BASELINE SQUASH ${BASELINE_COMMIT}") + +echo "WORKDIR: ${WORKDIR}" +echo "LINUX REPO: ${LINUX_REPO}" +echo "LIBBPF REPO: ${LIBBPF_REPO}" +echo "TEMP DIR: ${TMP_DIR}" +echo "SUFFIX: ${SUFFIX}" +echo "BASE COMMIT: '$(commit_desc ${BASELINE_COMMIT})'" +echo "TIP COMMIT: '$(commit_desc ${TIP_COMMIT})'" +echo "BPF BASE COMMIT: '$(commit_desc ${BPF_BASELINE_COMMIT})'" +echo "BPF TIP COMMIT: '$(commit_desc ${BPF_TIP_COMMIT})'" +echo "SQUASH COMMIT: ${SQUASH_COMMIT}" +echo "BASELINE TAG: ${BASELINE_TAG}" +echo "TIP TAG: ${TIP_TAG}" +echo "BPF BASELINE TAG: ${BPF_BASELINE_TAG}" +echo "BPF TIP TAG: ${BPF_TIP_TAG}" +echo "SQUASH BASE TAG: ${SQUASH_BASE_TAG}" +echo "SQUASH TIP TAG: ${SQUASH_TIP_TAG}" +echo "VIEW TAG: ${VIEW_TAG}" +echo "LIBBPF SYNC TAG: ${LIBBPF_SYNC_TAG}" +echo "PATCHES: ${TMP_DIR}/patches" + +git branch ${BASELINE_TAG} ${BASELINE_COMMIT} +git branch ${TIP_TAG} ${TIP_COMMIT} +git branch ${BPF_BASELINE_TAG} ${BPF_BASELINE_COMMIT} +git branch ${BPF_TIP_TAG} ${BPF_TIP_COMMIT} +git branch ${SQUASH_BASE_TAG} ${SQUASH_COMMIT} +git checkout -b ${SQUASH_TIP_TAG} ${SQUASH_COMMIT} + +# Cherry-pick new commits onto squashed baseline commit +cherry_pick_commits ${BASELINE_TAG} ${TIP_TAG} +cherry_pick_commits ${BPF_BASELINE_TAG} ${BPF_TIP_TAG} + +# Move all libbpf files into __libbpf directory. +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --tree-filter "${LIBBPF_TREE_FILTER}" ${SQUASH_TIP_TAG} ${SQUASH_BASE_TAG} +# Make __libbpf a new root directory +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch --prune-empty -f --subdirectory-filter __libbpf ${SQUASH_TIP_TAG} ${SQUASH_BASE_TAG} + +# If there are no new commits with libbpf-related changes, bail out +COMMIT_CNT=$(git rev-list --count ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG}) +if ((${COMMIT_CNT} <= 0)); then + echo "No new changes to apply, we are done!" + cleanup + exit 2 +fi + +# Exclude baseline commit and generate nice cover letter with summary +git format-patch --no-signature ${SQUASH_BASE_TAG}..${SQUASH_TIP_TAG} --cover-letter -o ${TMP_DIR}/patches + +# Now is time to re-apply libbpf-related linux patches to libbpf repo +cd_to ${LIBBPF_REPO} +git checkout -b ${LIBBPF_SYNC_TAG} + +for patch in $(ls -1 ${TMP_DIR}/patches | tail -n +2); do + if ! git am -3 --committer-date-is-author-date "${TMP_DIR}/patches/${patch}"; then + if ! patch -p1 --merge < "${TMP_DIR}/patches/${patch}"; then + read -p "Applying ${TMP_DIR}/patches/${patch} failed, please resolve manually and press to proceed..." 
+ fi + git am --continue + fi +done + +# Generate bpf_helper_defs.h and commit, if anything changed +# restore Linux tip to use bpf_doc.py +cd_to ${LINUX_REPO} +git checkout ${TIP_TAG} +# re-generate bpf_helper_defs.h +cd_to ${LIBBPF_REPO} +"${LINUX_ABS_DIR}/scripts/bpf_doc.py" --header \ + --file include/uapi/linux/bpf.h > src/bpf_helper_defs.h +# if anything changed, commit it +helpers_changes=$(git status --porcelain src/bpf_helper_defs.h | wc -l) +if ((${helpers_changes} == 1)); then + git add src/bpf_helper_defs.h + git commit -s -m "sync: auto-generate latest BPF helpers + +Latest changes to BPF helper definitions. +" -- src/bpf_helper_defs.h +fi + +# Use generated cover-letter as a template for "sync commit" with +# baseline and checkpoint commits from kernel repo (and leave summary +# from cover letter intact, of course) +echo ${TIP_COMMIT} > CHECKPOINT-COMMIT && \ +echo ${BPF_TIP_COMMIT} > BPF-CHECKPOINT-COMMIT && \ +git add CHECKPOINT-COMMIT && \ +git add BPF-CHECKPOINT-COMMIT && \ +awk '/\*\*\* BLURB HERE \*\*\*/ {p=1} p' ${TMP_DIR}/patches/0000-cover-letter.patch | \ +sed "s/\*\*\* BLURB HERE \*\*\*/\ +sync: latest libbpf changes from kernel\n\ +\n\ +Syncing latest libbpf commits from kernel repository.\n\ +Baseline bpf-next commit: ${BASELINE_COMMIT}\n\ +Checkpoint bpf-next commit: ${TIP_COMMIT}\n\ +Baseline bpf commit: ${BPF_BASELINE_COMMIT}\n\ +Checkpoint bpf commit: ${BPF_TIP_COMMIT}/" | \ +git commit -s --file=- + +echo "SUCCESS! ${COMMIT_CNT} commits synced." + +echo "Verifying Linux's and Github's libbpf state" + +cd_to ${LINUX_REPO} +git checkout -b ${VIEW_TAG} ${TIP_COMMIT} +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --tree-filter "${LIBBPF_TREE_FILTER}" ${VIEW_TAG}^..${VIEW_TAG} +FILTER_BRANCH_SQUELCH_WARNING=1 git filter-branch -f --subdirectory-filter __libbpf ${VIEW_TAG}^..${VIEW_TAG} +git ls-files -- "${LIBBPF_VIEW_PATHS[@]}" | grep -v -E "${LINUX_VIEW_EXCLUDE_REGEX}" > ${TMP_DIR}/linux-view.ls + +cd_to ${LIBBPF_REPO} +git ls-files -- "${LIBBPF_VIEW_PATHS[@]}" | grep -v -E "${LIBBPF_VIEW_EXCLUDE_REGEX}" > ${TMP_DIR}/github-view.ls + +echo "Comparing list of files..." +diff -u ${TMP_DIR}/linux-view.ls ${TMP_DIR}/github-view.ls +echo "Comparing file contents..." +CONSISTENT=1 +for F in $(cat ${TMP_DIR}/linux-view.ls); do + if ! diff -u "${LINUX_ABS_DIR}/${F}" "${GITHUB_ABS_DIR}/${F}"; then + echo "${LINUX_ABS_DIR}/${F} and ${GITHUB_ABS_DIR}/${F} are different!" + CONSISTENT=0 + fi +done +if ((${CONSISTENT} == 1)); then + echo "Great! Content is identical!" +else + ignore_inconsistency=n + echo "Unfortunately, there are some inconsistencies, please double check." + read -p "Does everything look good? [y/N]: " ignore_inconsistency + case "${ignore_inconsistency}" in + "y" | "Y") + echo "Ok, proceeding..." + ;; + *) + echo "Oops, exiting with error..." 
+ exit 4 + esac +fi + +cleanup diff --git a/third_party/libbpf/src/.gitignore b/third_party/libbpf/src/.gitignore new file mode 100644 index 0000000..0473afb --- /dev/null +++ b/third_party/libbpf/src/.gitignore @@ -0,0 +1,6 @@ +*.o +*.a +/libbpf.pc +/libbpf.so* +/staticobjs +/sharedobjs diff --git a/third_party/libbpf/src/Makefile b/third_party/libbpf/src/Makefile new file mode 100644 index 0000000..52188db --- /dev/null +++ b/third_party/libbpf/src/Makefile @@ -0,0 +1,183 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' "$(1)" "$(2)" "$(if $(3), $(3))"; +endif + +LIBBPF_MAJOR_VERSION := 1 +LIBBPF_MINOR_VERSION := 3 +LIBBPF_PATCH_VERSION := 0 +LIBBPF_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).$(LIBBPF_PATCH_VERSION) +LIBBPF_MAJMIN_VERSION := $(LIBBPF_MAJOR_VERSION).$(LIBBPF_MINOR_VERSION).0 +LIBBPF_MAP_VERSION := $(shell grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | sort -rV | head -n1 | cut -d'_' -f2) +ifneq ($(LIBBPF_MAJMIN_VERSION), $(LIBBPF_MAP_VERSION)) +$(error Libbpf release ($(LIBBPF_VERSION)) and map ($(LIBBPF_MAP_VERSION)) versions are out of sync!) +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +TOPDIR = .. + +INCLUDES := -I. -I$(TOPDIR)/include -I$(TOPDIR)/include/uapi +ALL_CFLAGS := $(INCLUDES) + +SHARED_CFLAGS += -fPIC -fvisibility=hidden -DSHARED + +CFLAGS ?= -g -O2 -Werror -Wall -std=gnu89 +ALL_CFLAGS += $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(EXTRA_CFLAGS) +ALL_LDFLAGS += $(LDFLAGS) $(EXTRA_LDFLAGS) + +ifdef NO_PKG_CONFIG + ALL_LDFLAGS += -lelf -lz +else + PKG_CONFIG ?= pkg-config + ALL_CFLAGS += $(shell $(PKG_CONFIG) --cflags libelf zlib) + ALL_LDFLAGS += $(shell $(PKG_CONFIG) --libs libelf zlib) +endif + +OBJDIR ?= . 
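# Note: a typical out-of-tree static build drives this Makefile with something
# like `make BUILD_STATIC_ONLY=1 OBJDIR=/tmp/libbpf DESTDIR=/tmp/libbpf-root install`
# (both paths are placeholders). NO_PKG_CONFIG=1 can be set on systems without
# pkg-config; libelf and zlib are then linked as plain -lelf -lz.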
+SHARED_OBJDIR := $(OBJDIR)/sharedobjs +STATIC_OBJDIR := $(OBJDIR)/staticobjs +OBJS := bpf.o btf.o libbpf.o libbpf_errno.o netlink.o \ + nlattr.o str_error.o libbpf_probes.o bpf_prog_linfo.o \ + btf_dump.o hashmap.o ringbuf.o strset.o linker.o gen_loader.o \ + relo_core.o usdt.o zip.o +SHARED_OBJS := $(addprefix $(SHARED_OBJDIR)/,$(OBJS)) +STATIC_OBJS := $(addprefix $(STATIC_OBJDIR)/,$(OBJS)) + +STATIC_LIBS := $(OBJDIR)/libbpf.a +ifndef BUILD_STATIC_ONLY + SHARED_LIBS := $(OBJDIR)/libbpf.so \ + $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + VERSION_SCRIPT := libbpf.map +endif + +HEADERS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h \ + bpf_helpers.h bpf_helper_defs.h bpf_tracing.h \ + bpf_endian.h bpf_core_read.h skel_internal.h libbpf_version.h \ + usdt.bpf.h +UAPI_HEADERS := $(addprefix $(TOPDIR)/include/uapi/linux/,\ + bpf.h bpf_common.h btf.h) + +PC_FILE := $(OBJDIR)/libbpf.pc + +INSTALL = install + +DESTDIR ?= + +HOSTARCH = $(firstword $(subst -, ,$(shell $(CC) -dumpmachine))) +ifeq ($(filter-out %64 %64be %64eb %64le %64el s390x, $(HOSTARCH)),) + LIBSUBDIR := lib64 +else + LIBSUBDIR := lib +endif + +# By default let the pc file itself use ${prefix} in includedir/libdir so that +# the prefix can be overridden at runtime (eg: --define-prefix) +ifndef LIBDIR + LIBDIR_PC := $$\{prefix\}/$(LIBSUBDIR) +else + LIBDIR_PC := $(LIBDIR) +endif +PREFIX ?= /usr +LIBDIR ?= $(PREFIX)/$(LIBSUBDIR) +INCLUDEDIR ?= $(PREFIX)/include +UAPIDIR ?= $(PREFIX)/include + +TAGS_PROG := $(if $(shell which etags 2>/dev/null),etags,ctags) + +all: $(STATIC_LIBS) $(SHARED_LIBS) $(PC_FILE) + +$(OBJDIR)/libbpf.a: $(STATIC_OBJS) + $(call msg,AR,$@) + $(Q)$(AR) rcs $@ $^ + +$(OBJDIR)/libbpf.so: $(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_MAJOR_VERSION): $(OBJDIR)/libbpf.so.$(LIBBPF_VERSION) + $(Q)ln -sf $(^F) $@ + +$(OBJDIR)/libbpf.so.$(LIBBPF_VERSION): $(SHARED_OBJS) + $(call msg,CC,$@) + $(Q)$(CC) -shared -Wl,--version-script=$(VERSION_SCRIPT) \ + -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \ + $^ $(ALL_LDFLAGS) -o $@ + +$(OBJDIR)/libbpf.pc: force + $(Q)sed -e "s|@PREFIX@|$(PREFIX)|" \ + -e "s|@LIBDIR@|$(LIBDIR_PC)|" \ + -e "s|@VERSION@|$(LIBBPF_VERSION)|" \ + < libbpf.pc.template > $@ + +$(STATIC_OBJDIR) $(SHARED_OBJDIR): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +$(STATIC_OBJDIR)/%.o: %.c | $(STATIC_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +$(SHARED_OBJDIR)/%.o: %.c | $(SHARED_OBJDIR) + $(call msg,CC,$@) + $(Q)$(CC) $(ALL_CFLAGS) $(SHARED_CFLAGS) $(CPPFLAGS) -c $< -o $@ + +define do_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)$(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR)$2' +endef + +# Preserve symlinks at installation. +define do_s_install + $(call msg,INSTALL,$1) + $(Q)if [ ! -d '$(DESTDIR)$2' ]; then \ + $(INSTALL) -d -m 755 '$(DESTDIR)$2'; \ + fi; + $(Q)cp -fR $1 '$(DESTDIR)$2' +endef + +install: all install_headers install_pkgconfig + $(call do_s_install,$(STATIC_LIBS) $(SHARED_LIBS),$(LIBDIR)) + +install_headers: + $(call do_install,$(HEADERS),$(INCLUDEDIR)/bpf,644) + +# UAPI headers can be installed by a different package so they're not installed +# in by install rule. 
+install_uapi_headers: + $(call do_install,$(UAPI_HEADERS),$(UAPIDIR)/linux,644) + +install_pkgconfig: $(PC_FILE) + $(call do_install,$(PC_FILE),$(LIBDIR)/pkgconfig,644) + +clean: + $(call msg,CLEAN) + $(Q)rm -rf *.o *.a *.so *.so.* *.pc $(SHARED_OBJDIR) $(STATIC_OBJDIR) + +.PHONY: cscope tags force +cscope: + $(call msg,CSCOPE) + $(Q)ls *.c *.h > cscope.files + $(Q)cscope -b -q -f cscope.out + +tags: + $(call msg,CTAGS) + $(Q)rm -f TAGS tags + $(Q)ls *.c *.h | xargs $(TAGS_PROG) -a + +force: diff --git a/third_party/libbpf/src/bpf.c b/third_party/libbpf/src/bpf.c new file mode 100644 index 0000000..3b0da19 --- /dev/null +++ b/third_party/libbpf/src/bpf.c @@ -0,0 +1,1211 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * common eBPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +/* + * When building perf, unistd.h is overridden. __NR_bpf is + * required to be defined explicitly. + */ +#ifndef __NR_bpf +# if defined(__i386__) +# define __NR_bpf 357 +# elif defined(__x86_64__) +# define __NR_bpf 321 +# elif defined(__aarch64__) +# define __NR_bpf 280 +# elif defined(__sparc__) +# define __NR_bpf 349 +# elif defined(__s390__) +# define __NR_bpf 351 +# elif defined(__arc__) +# define __NR_bpf 280 +# elif defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# else +# error __NR_bpf not defined. libbpf does not support your arch. +# endif +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + return syscall(__NR_bpf, cmd, attr, size); +} + +static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ + int fd; + + fd = sys_bpf(cmd, attr, size); + return ensure_good_fd(fd); +} + +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) +{ + int fd; + + do { + fd = sys_bpf_fd(BPF_PROG_LOAD, attr, size); + } while (fd < 0 && errno == EAGAIN && --attempts > 0); + + return fd; +} + +/* Probe whether kernel switched from memlock-based (RLIMIT_MEMLOCK) to + * memcg-based memory accounting for BPF maps and progs. This was done in [0]. + * We use the support for bpf_ktime_get_coarse_ns() helper, which was added in + * the same 5.11 Linux release ([1]), to detect memcg-based accounting for BPF. 
+ * + * [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/ + * [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") + */ +int probe_memcg_account(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + struct bpf_insn insns[] = { + BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), + BPF_EXIT_INSN(), + }; + size_t insn_cnt = ARRAY_SIZE(insns); + union bpf_attr attr; + int prog_fd; + + /* attempt loading freplace trying to use custom BTF */ + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = insn_cnt; + attr.license = ptr_to_u64("GPL"); + + prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); + if (prog_fd >= 0) { + close(prog_fd); + return 1; + } + return 0; +} + +static bool memlock_bumped; +static rlim_t memlock_rlim = RLIM_INFINITY; + +int libbpf_set_memlock_rlim(size_t memlock_bytes) +{ + if (memlock_bumped) + return libbpf_err(-EBUSY); + + memlock_rlim = memlock_bytes; + return 0; +} + +int bump_rlimit_memlock(void) +{ + struct rlimit rlim; + + /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ + if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) + return 0; + + memlock_bumped = true; + + /* zero memlock_rlim_max disables auto-bumping RLIMIT_MEMLOCK */ + if (memlock_rlim == 0) + return 0; + + rlim.rlim_cur = rlim.rlim_max = memlock_rlim; + if (setrlimit(RLIMIT_MEMLOCK, &rlim)) + return -errno; + + return 0; +} + +int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_map_create_opts)) + return libbpf_err(-EINVAL); + + attr.map_type = map_type; + if (map_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + attr.btf_fd = OPTS_GET(opts, btf_fd, 0); + attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0); + attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0); + attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0); + + attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0); + attr.map_flags = OPTS_GET(opts, map_flags, 0); + attr.map_extra = OPTS_GET(opts, map_extra, 0); + attr.numa_node = OPTS_GET(opts, numa_node, 0); + attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); + + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +static void * +alloc_zero_tailing_info(const void *orecord, __u32 cnt, + __u32 actual_rec_size, __u32 expected_rec_size) +{ + __u64 info_len = (__u64)actual_rec_size * cnt; + void *info, *nrecord; + int i; + + info = malloc(info_len); + if (!info) + return NULL; + + /* zero out bytes kernel does not understand */ + nrecord = info; + for (i = 0; i < cnt; i++) { + memcpy(nrecord, orecord, expected_rec_size); + memset(nrecord + expected_rec_size, 0, + actual_rec_size - expected_rec_size); + orecord += actual_rec_size; + nrecord += actual_rec_size; + } + + return info; +} + +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *opts) +{ + const size_t attr_sz = 
offsetofend(union bpf_attr, log_true_size); + void *finfo = NULL, *linfo = NULL; + const char *func_info, *line_info; + __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; + __u32 func_info_rec_size, line_info_rec_size; + int fd, attempts; + union bpf_attr attr; + char *log_buf; + + bump_rlimit_memlock(); + + if (!OPTS_VALID(opts, bpf_prog_load_opts)) + return libbpf_err(-EINVAL); + + attempts = OPTS_GET(opts, attempts, 0); + if (attempts < 0) + return libbpf_err(-EINVAL); + if (attempts == 0) + attempts = PROG_LOAD_ATTEMPTS; + + memset(&attr, 0, attr_sz); + + attr.prog_type = prog_type; + attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0); + + attr.prog_btf_fd = OPTS_GET(opts, prog_btf_fd, 0); + attr.prog_flags = OPTS_GET(opts, prog_flags, 0); + attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); + attr.kern_version = OPTS_GET(opts, kern_version, 0); + + if (prog_name && kernel_supports(NULL, FEAT_PROG_NAME)) + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + attr.license = ptr_to_u64(license); + + if (insn_cnt > UINT_MAX) + return libbpf_err(-E2BIG); + + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)insn_cnt; + + attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); + attach_btf_obj_fd = OPTS_GET(opts, attach_btf_obj_fd, 0); + + if (attach_prog_fd && attach_btf_obj_fd) + return libbpf_err(-EINVAL); + + attr.attach_btf_id = OPTS_GET(opts, attach_btf_id, 0); + if (attach_prog_fd) + attr.attach_prog_fd = attach_prog_fd; + else + attr.attach_btf_obj_fd = attach_btf_obj_fd; + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (!!log_buf != !!log_size) + return libbpf_err(-EINVAL); + + func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); + func_info = OPTS_GET(opts, func_info, NULL); + attr.func_info_rec_size = func_info_rec_size; + attr.func_info = ptr_to_u64(func_info); + attr.func_info_cnt = OPTS_GET(opts, func_info_cnt, 0); + + line_info_rec_size = OPTS_GET(opts, line_info_rec_size, 0); + line_info = OPTS_GET(opts, line_info, NULL); + attr.line_info_rec_size = line_info_rec_size; + attr.line_info = ptr_to_u64(line_info); + attr.line_info_cnt = OPTS_GET(opts, line_info_cnt, 0); + + attr.fd_array = ptr_to_u64(OPTS_GET(opts, fd_array, NULL)); + + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = log_level; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + if (fd >= 0) + return fd; + + /* After bpf_prog_load, the kernel may modify certain attributes + * to give user space a hint how to deal with loading failure. + * Check to see whether we can make some changes and load again. 
+ */ + while (errno == E2BIG && (!finfo || !linfo)) { + if (!finfo && attr.func_info_cnt && + attr.func_info_rec_size < func_info_rec_size) { + /* try with corrected func info records */ + finfo = alloc_zero_tailing_info(func_info, + attr.func_info_cnt, + func_info_rec_size, + attr.func_info_rec_size); + if (!finfo) { + errno = E2BIG; + goto done; + } + + attr.func_info = ptr_to_u64(finfo); + attr.func_info_rec_size = func_info_rec_size; + } else if (!linfo && attr.line_info_cnt && + attr.line_info_rec_size < line_info_rec_size) { + linfo = alloc_zero_tailing_info(line_info, + attr.line_info_cnt, + line_info_rec_size, + attr.line_info_rec_size); + if (!linfo) { + errno = E2BIG; + goto done; + } + + attr.line_info = ptr_to_u64(linfo); + attr.line_info_rec_size = line_info_rec_size; + } else { + break; + } + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + if (fd >= 0) + goto done; + } + + if (log_level == 0 && log_buf) { + /* log_level == 0 with non-NULL log_buf requires retrying on error + * with log_level == 1 and log_buf/log_buf_size set, to get details of + * failure + */ + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_size; + attr.log_level = 1; + + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); + } +done: + /* free() doesn't affect errno, so we don't need to restore it */ + free(finfo); + free(linfo); + return libbpf_err_errno(fd); +} + +int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem(int fd, const void *key) 
+{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.flags = flags; + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_get_next_key(int fd, const void *key, void *next_key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, next_key); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.next_key = ptr_to_u64(next_key); + + ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_freeze(int fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + + ret = sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +static int bpf_map_batch_common(int cmd, int fd, void *in_batch, + void *out_batch, void *keys, void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, batch); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_map_batch_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.batch.map_fd = fd; + attr.batch.in_batch = ptr_to_u64(in_batch); + attr.batch.out_batch = ptr_to_u64(out_batch); + attr.batch.keys = ptr_to_u64(keys); + attr.batch.values = ptr_to_u64(values); + attr.batch.count = *count; + attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); + attr.batch.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(cmd, &attr, attr_sz); + *count = attr.batch.count; + + return libbpf_err_errno(ret); +} + +int bpf_map_delete_batch(int fd, const void *keys, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, NULL, + NULL, (void *)keys, NULL, count, opts); +} + +int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, in_batch, + out_batch, keys, values, count, opts); +} + +int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH, + fd, in_batch, out_batch, keys, values, + count, opts); +} + +int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *count, + const struct bpf_map_batch_opts *opts) +{ + return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH, fd, NULL, NULL, + (void *)keys, (void *)values, count, opts); +} + +int bpf_obj_pin_opts(int fd, const char *pathname, const struct bpf_obj_pin_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_obj_pin_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); + attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, 
file_flags, 0); + attr.bpf_fd = fd; + + ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_obj_pin(int fd, const char *pathname) +{ + return bpf_obj_pin_opts(fd, pathname, NULL); +} + +int bpf_obj_get(const char *pathname) +{ + return bpf_obj_get_opts(pathname, NULL); +} + +int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_obj_get_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); + attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); + + fd = sys_bpf_fd(BPF_OBJ_GET, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) +{ + DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = flags, + ); + + return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts); +} + +int bpf_prog_attach_opts(int prog_fd, int target_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_attach_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.attach_flags = OPTS_GET(opts, flags, 0); + attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); + + ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, replace_bpf_fd); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.target_fd = target_fd; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create); + __u32 target_btf_id, iter_info_len; + union bpf_attr attr; + int fd, err; + + if (!OPTS_VALID(opts, bpf_link_create_opts)) + return libbpf_err(-EINVAL); + + iter_info_len = OPTS_GET(opts, iter_info_len, 0); + target_btf_id = OPTS_GET(opts, target_btf_id, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (iter_info_len || target_btf_id) { + if (iter_info_len && target_btf_id) + return libbpf_err(-EINVAL); + if (!OPTS_ZEROED(opts, target_btf_id)) + return libbpf_err(-EINVAL); + } + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + attr.link_create.flags = OPTS_GET(opts, flags, 0); + + if (target_btf_id) { + attr.link_create.target_btf_id = target_btf_id; + goto proceed; + } + + switch (attach_type) { + case BPF_TRACE_ITER: + attr.link_create.iter_info = 
ptr_to_u64(OPTS_GET(opts, iter_info, (void *)0)); + attr.link_create.iter_info_len = iter_info_len; + break; + case BPF_PERF_EVENT: + attr.link_create.perf_event.bpf_cookie = OPTS_GET(opts, perf_event.bpf_cookie, 0); + if (!OPTS_ZEROED(opts, perf_event)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_KPROBE_MULTI: + attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); + attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); + attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); + attr.link_create.kprobe_multi.addrs = ptr_to_u64(OPTS_GET(opts, kprobe_multi.addrs, 0)); + attr.link_create.kprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, kprobe_multi.cookies, 0)); + if (!OPTS_ZEROED(opts, kprobe_multi)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: + attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); + if (!OPTS_ZEROED(opts, tracing)) + return libbpf_err(-EINVAL); + break; + case BPF_NETFILTER: + attr.link_create.netfilter.pf = OPTS_GET(opts, netfilter.pf, 0); + attr.link_create.netfilter.hooknum = OPTS_GET(opts, netfilter.hooknum, 0); + attr.link_create.netfilter.priority = OPTS_GET(opts, netfilter.priority, 0); + attr.link_create.netfilter.flags = OPTS_GET(opts, netfilter.flags, 0); + if (!OPTS_ZEROED(opts, netfilter)) + return libbpf_err(-EINVAL); + break; + default: + if (!OPTS_ZEROED(opts, flags)) + return libbpf_err(-EINVAL); + break; + } +proceed: + fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz); + if (fd >= 0) + return fd; + /* we'll get EINVAL if LINK_CREATE doesn't support attaching fentry + * and other similar programs + */ + err = -errno; + if (err != -EINVAL) + return libbpf_err(err); + + /* if user used features not supported by + * BPF_RAW_TRACEPOINT_OPEN command, then just give up immediately + */ + if (attr.link_create.target_fd || attr.link_create.target_btf_id) + return libbpf_err(err); + if (!OPTS_ZEROED(opts, sz)) + return libbpf_err(err); + + /* otherwise, for few select kinds of programs that can be + * attached using BPF_RAW_TRACEPOINT_OPEN command, try that as + * a fallback for older kernels + */ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + case BPF_LSM_MAC: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + return bpf_raw_tracepoint_open(NULL, prog_fd); + default: + return libbpf_err(err); + } +} + +int bpf_link_detach(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_detach); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.link_detach.link_fd = link_fd; + + ret = sys_bpf(BPF_LINK_DETACH, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_update); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_link_update_opts)) + return libbpf_err(-EINVAL); + + if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_update.link_fd = link_fd; + attr.link_update.new_prog_fd = new_prog_fd; + attr.link_update.flags = OPTS_GET(opts, flags, 0); + if (OPTS_GET(opts, old_prog_fd, 0)) + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + else if (OPTS_GET(opts, old_map_fd, 0)) + attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0); + + ret = 
sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_iter_create(int link_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, iter_create); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.iter_create.link_fd = link_fd; + + fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, query); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + + attr.query.target_fd = target_fd; + attr.query.attach_type = type; + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.prog_cnt = OPTS_GET(opts, prog_cnt, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + + ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); + + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, prog_cnt, attr.query.prog_cnt); + + return libbpf_err_errno(ret); +} + +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +{ + LIBBPF_OPTS(bpf_prog_query_opts, opts); + int ret; + + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; + + ret = bpf_prog_query_opts(target_fd, type, &opts); + + if (attach_flags) + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; + + return libbpf_err_errno(ret); +} + +int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, test); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_test_run_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.batch_size = OPTS_GET(opts, batch_size, 0); + attr.test.cpu = OPTS_GET(opts, cpu, 0); + attr.test.flags = OPTS_GET(opts, flags, 0); + attr.test.repeat = OPTS_GET(opts, repeat, 0); + attr.test.duration = OPTS_GET(opts, duration, 0); + attr.test.ctx_size_in = OPTS_GET(opts, ctx_size_in, 0); + attr.test.ctx_size_out = OPTS_GET(opts, ctx_size_out, 0); + attr.test.data_size_in = OPTS_GET(opts, data_size_in, 0); + attr.test.data_size_out = OPTS_GET(opts, data_size_out, 0); + attr.test.ctx_in = ptr_to_u64(OPTS_GET(opts, ctx_in, NULL)); + attr.test.ctx_out = ptr_to_u64(OPTS_GET(opts, ctx_out, NULL)); + attr.test.data_in = ptr_to_u64(OPTS_GET(opts, data_in, NULL)); + attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL)); + + ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, attr_sz); + + OPTS_SET(opts, data_size_out, attr.test.data_size_out); + OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out); + OPTS_SET(opts, duration, attr.test.duration); + OPTS_SET(opts, retval, attr.test.retval); + + return libbpf_err_errno(ret); +} + +static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.start_id = start_id; + + err = sys_bpf(cmd, &attr, attr_sz); + if (!err) + *next_id = attr.next_id; + + return libbpf_err_errno(err); +} + +int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID); +} + +int 
bpf_map_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID); +} + +int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID); +} + +int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) +{ + return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); +} + +int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_get_fd_by_id(__u32 id) +{ + return bpf_prog_get_fd_by_id_opts(id, NULL); +} + +int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.map_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_map_get_fd_by_id(__u32 id) +{ + return bpf_map_get_fd_by_id_opts(id, NULL); +} + +int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.btf_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_get_fd_by_id(__u32 id) +{ + return bpf_btf_get_fd_by_id_opts(id, NULL); +} + +int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.link_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); + + fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_link_get_fd_by_id(__u32 id) +{ + return bpf_link_get_fd_by_id_opts(id, NULL); +} + +int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) +{ + const size_t attr_sz = offsetofend(union bpf_attr, info); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.info.bpf_fd = bpf_fd; + attr.info.info_len = *info_len; + attr.info.info = ptr_to_u64(info); + + err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); + if (!err) + *info_len = attr.info.info_len; + return libbpf_err_errno(err); +} + +int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(prog_fd, info, info_len); +} + +int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(map_fd, info, info_len); +} + +int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(btf_fd, info, info_len); +} + +int bpf_link_get_info_by_fd(int 
link_fd, struct bpf_link_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(link_fd, info, info_len); +} + +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = ptr_to_u64(name); + attr.raw_tracepoint.prog_fd = prog_fd; + + fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, btf_log_true_size); + union bpf_attr attr; + char *log_buf; + size_t log_size; + __u32 log_level; + int fd; + + bump_rlimit_memlock(); + + memset(&attr, 0, attr_sz); + + if (!OPTS_VALID(opts, bpf_btf_load_opts)) + return libbpf_err(-EINVAL); + + log_buf = OPTS_GET(opts, log_buf, NULL); + log_size = OPTS_GET(opts, log_size, 0); + log_level = OPTS_GET(opts, log_level, 0); + + if (log_size > UINT_MAX) + return libbpf_err(-EINVAL); + if (log_size && !log_buf) + return libbpf_err(-EINVAL); + + attr.btf = ptr_to_u64(btf_data); + attr.btf_size = btf_size; + /* log_level == 0 and log_buf != NULL means "try loading without + * log_buf, but retry with log_buf and log_level=1 on error", which is + * consistent across low-level and high-level BTF and program loading + * APIs within libbpf and provides a sensible behavior in practice + */ + if (log_level) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = log_level; + } + + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + if (fd < 0 && log_buf && log_level == 0) { + attr.btf_log_buf = ptr_to_u64(log_buf); + attr.btf_log_size = (__u32)log_size; + attr.btf_log_level = 1; + fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); + } + + OPTS_SET(opts, log_true_size, attr.btf_log_true_size); + return libbpf_err_errno(fd); +} + +int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, + __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, + __u64 *probe_addr) +{ + const size_t attr_sz = offsetofend(union bpf_attr, task_fd_query); + union bpf_attr attr; + int err; + + memset(&attr, 0, attr_sz); + attr.task_fd_query.pid = pid; + attr.task_fd_query.fd = fd; + attr.task_fd_query.flags = flags; + attr.task_fd_query.buf = ptr_to_u64(buf); + attr.task_fd_query.buf_len = *buf_len; + + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, attr_sz); + + *buf_len = attr.task_fd_query.buf_len; + *prog_id = attr.task_fd_query.prog_id; + *fd_type = attr.task_fd_query.fd_type; + *probe_offset = attr.task_fd_query.probe_offset; + *probe_addr = attr.task_fd_query.probe_addr; + + return libbpf_err_errno(err); +} + +int bpf_enable_stats(enum bpf_stats_type type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, enable_stats); + union bpf_attr attr; + int fd; + + memset(&attr, 0, attr_sz); + attr.enable_stats.type = type; + + fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, attr_sz); + return libbpf_err_errno(fd); +} + +int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_bind_map); + union bpf_attr attr; + int ret; + + if (!OPTS_VALID(opts, bpf_prog_bind_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_bind_map.prog_fd = prog_fd; + attr.prog_bind_map.map_fd = map_fd; + attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); + + ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, 
attr_sz); + return libbpf_err_errno(ret); +} diff --git a/third_party/libbpf/src/bpf.h b/third_party/libbpf/src/bpf.h new file mode 100644 index 0000000..c676295 --- /dev/null +++ b/third_party/libbpf/src/bpf.h @@ -0,0 +1,564 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common BPF ELF operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License (not later!) + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see + */ +#ifndef __LIBBPF_BPF_H +#define __LIBBPF_BPF_H + +#include +#include +#include +#include + +#include "libbpf_common.h" +#include "libbpf_legacy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int libbpf_set_memlock_rlim(size_t memlock_bytes); + +struct bpf_map_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 btf_fd; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + + __u32 inner_map_fd; + __u32 map_flags; + __u64 map_extra; + + __u32 numa_node; + __u32 map_ifindex; +}; +#define bpf_map_create_opts__last_field map_ifindex + +LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries, + const struct bpf_map_create_opts *opts); + +struct bpf_prog_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* libbpf can retry BPF_PROG_LOAD command if bpf() syscall returns + * -EAGAIN. This field determines how many attempts libbpf has to + * make. If not specified, libbpf will use default value of 5. + */ + int attempts; + + enum bpf_attach_type expected_attach_type; + __u32 prog_btf_fd; + __u32 prog_flags; + __u32 prog_ifindex; + __u32 kern_version; + + __u32 attach_btf_id; + __u32 attach_prog_fd; + __u32 attach_btf_obj_fd; + + const int *fd_array; + + /* .BTF.ext func info data */ + const void *func_info; + __u32 func_info_cnt; + __u32 func_info_rec_size; + + /* .BTF.ext line info data */ + const void *line_info; + __u32 line_info_cnt; + __u32 line_info_rec_size; + + /* verifier log options */ + __u32 log_level; + __u32 log_size; + char *log_buf; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. 
+ */ + __u32 log_true_size; + size_t :0; +}; +#define bpf_prog_load_opts__last_field log_true_size + +LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *opts); + +/* Flags to direct loading requirements */ +#define MAPS_RELAX_COMPAT 0x01 + +/* Recommended log buffer size */ +#define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ + +struct bpf_btf_load_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + /* kernel log options */ + char *log_buf; + __u32 log_level; + __u32 log_size; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + size_t :0; +}; +#define bpf_btf_load_opts__last_field log_true_size + +LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, + struct bpf_btf_load_opts *opts); + +LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, + __u64 flags); + +LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, + __u64 flags); +LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, + void *value); +LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, + void *value, __u64 flags); +LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags); +LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); +LIBBPF_API int bpf_map_freeze(int fd); + +struct bpf_map_batch_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 elem_flags; + __u64 flags; +}; +#define bpf_map_batch_opts__last_field flags + + +/** + * @brief **bpf_map_delete_batch()** allows for batch deletion of multiple + * elements in a BPF map. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param count input and output parameter; on input **count** represents the + * number of elements in the map to delete in batch; + * on output if a non-EFAULT error is returned, **count** represents the number of deleted + * elements if the output **count** value is not equal to the input **count** value + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch deletion works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. + * + * The parameter *in_batch* is the address of the first element in the batch to read. + * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent + * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate + * that the batched lookup starts from the beginning of the map. 
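To make the in_batch/out_batch protocol described above concrete, here is a minimal caller-side sketch. It assumes an already-created hash map with __u32 keys and __u64 values; dump_map_batched, map_fd and the batch size of 64 are illustrative placeholders, not values taken from this patch:

#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

/* Sketch: iterate over all entries of a hash map with __u32 keys and
 * __u64 values using the batch lookup API. map_fd is a placeholder for
 * any valid map file descriptor. */
static int dump_map_batched(int map_fd)
{
	__u32 keys[64];
	__u64 values[64];
	__u32 out_batch, count;
	void *in = NULL;	/* NULL: start from the beginning of the map */
	LIBBPF_OPTS(bpf_map_batch_opts, opts);
	int err;

	do {
		count = 64;	/* in: room in keys/values; out: entries read */
		err = bpf_map_lookup_batch(map_fd, in, &out_batch,
					   keys, values, &count, &opts);
		if (err && errno != ENOENT)
			return -errno;	/* real failure */
		for (__u32 i = 0; i < count; i++)
			printf("key %u -> %llu\n", keys[i],
			       (unsigned long long)values[i]);
		in = &out_batch;	/* resume where the last call stopped */
	} while (!err);		/* ENOENT signals the whole map was read */

	return 0;
}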
+ * + * The *keys* and *values* are output parameters which must point to memory large enough to + * hold *count* items based on the key and value size of the map *map_fd*. The *keys* + * buffer must be of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * indicate that the batched lookup starts from the beginning of the map. + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array large enough for *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read in batch; on output it's the number of elements that were + * successfully read. + * If a non-EFAULT error is returned, count will be set as the number of elements + * that were read before the error occurred. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch lookup works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, + void *keys, void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_lookup_and_delete_batch()** allows for batch lookup and deletion + * of BPF map elements where each element is deleted after being retrieved. + * + * @param fd BPF map file descriptor + * @param in_batch address of the first element in batch to read, can pass NULL to + * get address of the first element in *out_batch* + * @param out_batch output parameter that should be passed to next call as *in_batch* + * @param keys pointer to an array of *count* keys + * @param values pointer to an array large enough for *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to read and delete in batch; on output it represents the number of + * elements that were successfully read and deleted + * If a non-**EFAULT** error code is returned and if the output **count** value + * is not equal to the input **count** value, up to **count** elements may + * have been deleted. + * if **EFAULT** is returned up to *count* elements may have been deleted without + * being returned via the *keys* and *values* output parameters. + * @param opts options for configuring the way the batch lookup and delete works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, + void *out_batch, void *keys, + void *values, __u32 *count, + const struct bpf_map_batch_opts *opts); + +/** + * @brief **bpf_map_update_batch()** updates multiple elements in a map + * by specifying keys and their corresponding values. + * + * The *keys* and *values* parameters must point to memory large enough + * to hold *count* items based on the key and value size of the map. + * + * The *opts* parameter can be used to control how *bpf_map_update_batch()* + * should handle keys that either do or do not already exist in the map. 
+ * In particular the *flags* parameter of *bpf_map_batch_opts* can be + * one of the following: + * + * Note that *count* is an input and output parameter, where on output it + * represents how many elements were successfully updated. Also note that if + * **EFAULT** then *count* should not be trusted to be correct. + * + * **BPF_ANY** + * Create new elements or update existing. + * + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * + * **BPF_EXIST** + * Update existing elements. + * + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * @param fd BPF map file descriptor + * @param keys pointer to an array of *count* keys + * @param values pointer to an array of *count* values + * @param count input and output parameter; on input it's the number of elements + * in the map to update in batch; on output if a non-EFAULT error is returned, + * **count** represents the number of updated elements if the output **count** + * value is not equal to the input **count** value. + * If EFAULT is returned, **count** should not be trusted to be correct. + * @param opts options for configuring the way the batch update works + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values, + __u32 *count, + const struct bpf_map_batch_opts *opts); + +struct bpf_obj_pin_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_pin_opts__last_field path_fd + +LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); +LIBBPF_API int bpf_obj_pin_opts(int fd, const char *pathname, + const struct bpf_obj_pin_opts *opts); + +struct bpf_obj_get_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_get_opts__last_field path_fd + +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_obj_get_opts(const char *pathname, + const struct bpf_obj_get_opts *opts); + +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + unsigned int flags; + int replace_prog_fd; +}; +#define bpf_prog_attach_opts__last_field replace_prog_fd + +LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, + enum bpf_attach_type type, unsigned int flags); +LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); +LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); +LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, + enum bpf_attach_type type); + +union bpf_iter_link_info; /* defined in up-to-date linux/bpf.h */ +struct bpf_link_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + union bpf_iter_link_info *iter_info; + __u32 iter_info_len; + __u32 target_btf_id; + union { + struct { + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + const char **syms; + const unsigned long *addrs; + const __u64 *cookies; + } kprobe_multi; + struct { + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + }; + size_t :0; +}; +#define bpf_link_create_opts__last_field kprobe_multi.cookies + +LIBBPF_API int bpf_link_create(int 
prog_fd, int target_fd, + enum bpf_attach_type attach_type, + const struct bpf_link_create_opts *opts); + +LIBBPF_API int bpf_link_detach(int link_fd); + +struct bpf_link_update_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; /* extra flags */ + __u32 old_prog_fd; /* expected old program FD */ + __u32 old_map_fd; /* expected old map FD */ +}; +#define bpf_link_update_opts__last_field old_map_fd + +LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, + const struct bpf_link_update_opts *opts); + +LIBBPF_API int bpf_iter_create(int link_fd); + +struct bpf_prog_test_run_attr { + int prog_fd; + int repeat; + const void *data_in; + __u32 data_size_in; + void *data_out; /* optional */ + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out */ + __u32 retval; /* out: return code of the BPF program */ + __u32 duration; /* out: average per repetition in ns */ + const void *ctx_in; /* optional */ + __u32 ctx_size_in; + void *ctx_out; /* optional */ + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out */ +}; + +LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); +LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); + +struct bpf_get_fd_by_id_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 open_flags; /* permissions requested for the operation on fd */ + size_t :0; +}; +#define bpf_get_fd_by_id_opts__last_field open_flags + +LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); +LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +/** + * @brief **bpf_prog_get_info_by_fd()** obtains information about the BPF + * program corresponding to *prog_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param prog_fd BPF program file descriptor + * @param info pointer to **struct bpf_prog_info** that will be populated with + * BPF program information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len); + +/** + * @brief **bpf_map_get_info_by_fd()** obtains information about the BPF + * map corresponding to *map_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. 
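A minimal caller-side sketch of this getter pattern; print_map_info is a hypothetical helper and map_fd is a placeholder for any valid map file descriptor:

#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>

/* Sketch: fetch and print basic metadata of the map behind map_fd.
 * info_len goes in as sizeof(info) and comes back as the number of
 * bytes the kernel actually populated. */
static int print_map_info(int map_fd)
{
	struct bpf_map_info info;
	__u32 info_len = sizeof(info);
	int err;

	memset(&info, 0, sizeof(info));
	err = bpf_map_get_info_by_fd(map_fd, &info, &info_len);
	if (err)
		return err;	/* negative error code; errno is set too */

	printf("map '%s': type %u, key %u B, value %u B, max_entries %u\n",
	       info.name, info.type, info.key_size, info.value_size,
	       info.max_entries);
	return 0;
}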
+ * + * @param map_fd BPF map file descriptor + * @param info pointer to **struct bpf_map_info** that will be populated with + * BPF map information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the + * BTF object corresponding to *btf_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param btf_fd BTF object file descriptor + * @param info pointer to **struct bpf_btf_info** that will be populated with + * BTF object information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the BPF + * link corresponding to *link_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. + * + * @param link_fd BPF link file descriptor + * @param info pointer to **struct bpf_link_info** that will be populated with + * BPF link information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + __u32 prog_cnt; /* input+output argument */ + __u32 *prog_attach_flags; +}; +#define bpf_prog_query_opts__last_field prog_attach_flags + +LIBBPF_API int bpf_prog_query_opts(int target_fd, + enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); +LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, + __u32 query_flags, __u32 *attach_flags, + __u32 *prog_ids, __u32 *prog_cnt); + +LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); +LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, + __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, + __u64 *probe_offset, __u64 *probe_addr); + +#ifdef __cplusplus +/* forward-declaring enums in C++ isn't compatible with pure C enums, so + * instead define bpf_enable_stats() as accepting int as an input + */ +LIBBPF_API int bpf_enable_stats(int type); +#else +enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ +LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +#endif + +struct bpf_prog_bind_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; +}; +#define bpf_prog_bind_opts__last_field flags + +LIBBPF_API int bpf_prog_bind_map(int prog_fd, int map_fd, + const struct bpf_prog_bind_opts *opts); + +struct bpf_test_run_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const void *data_in; /* optional */ + void *data_out; /* optional */ + __u32 
data_size_in; + __u32 data_size_out; /* in: max length of data_out + * out: length of data_out + */ + const void *ctx_in; /* optional */ + void *ctx_out; /* optional */ + __u32 ctx_size_in; + __u32 ctx_size_out; /* in: max length of ctx_out + * out: length of cxt_out + */ + __u32 retval; /* out: return code of the BPF program */ + int repeat; + __u32 duration; /* out: average per repetition in ns */ + __u32 flags; + __u32 cpu; + __u32 batch_size; +}; +#define bpf_test_run_opts__last_field batch_size + +LIBBPF_API int bpf_prog_test_run_opts(int prog_fd, + struct bpf_test_run_opts *opts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BPF_H */ diff --git a/third_party/libbpf/src/bpf_core_read.h b/third_party/libbpf/src/bpf_core_read.h new file mode 100644 index 0000000..1ac57bb --- /dev/null +++ b/third_party/libbpf/src/bpf_core_read.h @@ -0,0 +1,484 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_CORE_READ_H__ +#define __BPF_CORE_READ_H__ + +/* + * enum bpf_field_info_kind is passed as a second argument into + * __builtin_preserve_field_info() built-in to get a specific aspect of + * a field, captured as a first argument. __builtin_preserve_field_info(field, + * info_kind) returns __u32 integer and produces BTF field relocation, which + * is understood and processed by libbpf during BPF object loading. See + * selftests/bpf for examples. + */ +enum bpf_field_info_kind { + BPF_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_FIELD_BYTE_SIZE = 1, + BPF_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_FIELD_SIGNED = 3, + BPF_FIELD_LSHIFT_U64 = 4, + BPF_FIELD_RSHIFT_U64 = 5, +}; + +/* second argument to __builtin_btf_type_id() built-in */ +enum bpf_type_id_kind { + BPF_TYPE_ID_LOCAL = 0, /* BTF type ID in local program */ + BPF_TYPE_ID_TARGET = 1, /* BTF type ID in target kernel */ +}; + +/* second argument to __builtin_preserve_type_info() built-in */ +enum bpf_type_info_kind { + BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ + BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ +}; + +/* second argument to __builtin_preserve_enum_value() built-in */ +enum bpf_enum_value_kind { + BPF_ENUMVAL_EXISTS = 0, /* enum value existence in kernel */ + BPF_ENUMVAL_VALUE = 1, /* enum value value relocation */ +}; + +#define __CORE_RELO(src, field, info) \ + __builtin_preserve_field_info((src)->field, BPF_FIELD_##info) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst, \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#else +/* semantics of LSHIFT_64 assumes loading values into low-ordered bytes, so + * for big-endian we need to adjust destination pointer accordingly, based on + * field byte size + */ +#define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ + bpf_probe_read_kernel( \ + (void *)dst + (8 - __CORE_RELO(src, fld, BYTE_SIZE)), \ + __CORE_RELO(src, fld, BYTE_SIZE), \ + (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) +#endif + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * All this is done in relocatable manner, so bitfield changes such as + * signedness, bit size, offset changes, this will be handled automatically. + * This version of macro is using bpf_probe_read_kernel() to read underlying + * integer storage. 
Macro functions as an expression and its return type is + * bpf_probe_read_kernel()'s return value: 0, on success, <0 on error. + */ +#define BPF_CORE_READ_BITFIELD_PROBED(s, field) ({ \ + unsigned long long val = 0; \ + \ + __CORE_BITFIELD_PROBE_READ(&val, s, field); \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +/* + * Extract bitfield, identified by s->field, and return its value as u64. + * This version of macro is using direct memory reads and should be used from + * BPF program types that support such functionality (e.g., typed raw + * tracepoints). + */ +#define BPF_CORE_READ_BITFIELD(s, field) ({ \ + const void *p = (const void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ + unsigned long long val; \ + \ + /* This is a so-called barrier_var() operation that makes specified \ + * variable "a black box" for optimizing compiler. \ + * It forces compiler to perform BYTE_OFFSET relocation on p and use \ + * its calculated value in the switch below, instead of applying \ + * the same relocation 4 times for each individual memory load. \ + */ \ + asm volatile("" : "=r"(p) : "0"(p)); \ + \ + switch (__CORE_RELO(s, field, BYTE_SIZE)) { \ + case 1: val = *(const unsigned char *)p; break; \ + case 2: val = *(const unsigned short *)p; break; \ + case 4: val = *(const unsigned int *)p; break; \ + case 8: val = *(const unsigned long long *)p; break; \ + } \ + val <<= __CORE_RELO(s, field, LSHIFT_U64); \ + if (__CORE_RELO(s, field, SIGNED)) \ + val = ((long long)val) >> __CORE_RELO(s, field, RSHIFT_U64); \ + else \ + val = val >> __CORE_RELO(s, field, RSHIFT_U64); \ + val; \ +}) + +#define ___bpf_field_ref1(field) (field) +#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field) +#define ___bpf_field_ref(args...) \ + ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) + +/* + * Convenience macro to check that field actually exists in target kernel's. + * Returns: + * 1, if matching field is present in target kernel; + * 0, if no matching field found. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_exists(p->my_field); + * - field reference through type and field names: + * bpf_core_field_exists(struct my_type, my_field). + */ +#define bpf_core_field_exists(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_EXISTS) + +/* + * Convenience macro to get the byte size of a field. Works for integers, + * struct/unions, pointers, arrays, and enums. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_size(p->my_field); + * - field reference through type and field names: + * bpf_core_field_size(struct my_type, my_field). + */ +#define bpf_core_field_size(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_SIZE) + +/* + * Convenience macro to get field's byte offset. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_offset(p->my_field); + * - field reference through type and field names: + * bpf_core_field_offset(struct my_type, my_field). + */ +#define bpf_core_field_offset(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_OFFSET) + +/* + * Convenience macro to get BTF type ID of a specified type, using a local BTF + * information. 
Return 32-bit unsigned integer with type ID from program's own + * BTF. Always succeeds. + */ +#define bpf_core_type_id_local(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_LOCAL) + +/* + * Convenience macro to get BTF type ID of a target kernel's type that matches + * specified local type. + * Returns: + * - valid 32-bit unsigned type ID in kernel BTF; + * - 0, if no matching type was found in a target kernel BTF. + */ +#define bpf_core_type_id_kernel(type) \ + __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) exists in a target kernel. + * Returns: + * 1, if such type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_exists(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_MATCHES) + +/* + * Convenience macro to get the byte size of a provided named type + * (struct/union/enum/typedef) in a target kernel. + * Returns: + * >= 0 size (in bytes), if type is present in target kernel's BTF; + * 0, if no matching type is found. + */ +#define bpf_core_type_size(type) \ + __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_SIZE) + +/* + * Convenience macro to check that provided enumerator value is defined in + * a target kernel. + * Returns: + * 1, if specified enum type and its enumerator value are present in target + * kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) + +/* + * Convenience macro to get the integer value of an enumerator value in + * a target kernel. + * Returns: + * 64-bit value, if specified enum type and its enumerator value are + * present in target kernel's BTF; + * 0, if no matching enum and/or enum value within that enum is found. + */ +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) + +/* + * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures + * offset relocation for source address using __builtin_preserve_access_index() + * built-in, provided by Clang. + * + * __builtin_preserve_access_index() takes as an argument an expression of + * taking an address of a field within struct/union. It makes compiler emit + * a relocation, which records BTF type ID describing root struct/union and an + * accessor string which describes exact embedded field that was used to take + * an address. See detailed description of this relocation format and + * semantics in comments to struct bpf_field_reloc in libbpf_internal.h. + * + * This relocation allows libbpf to adjust BPF instruction to use correct + * actual field offset, based on target kernel BTF type that matches original + * (local) BTF, used to record relocation. + */ +#define bpf_core_read(dst, sz, src) \ + bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. 
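+ *
+ * Illustrative sketch (not from the upstream comments), assuming *uts* is a
+ * user-space pointer of kernel UAPI type struct __kernel_timespec * (the type
+ * is known to kernel BTF, so a CO-RE field relocation can be emitted):
+ *
+ *	long long sec = 0;
+ *
+ *	if (bpf_core_read_user(&sec, sizeof(sec), &uts->tv_sec))
+ *		return 0;	/* read from user memory failed */
+ *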
*/ +#define bpf_core_read_user(dst, sz, src) \ + bpf_probe_read_user(dst, sz, (const void *)__builtin_preserve_access_index(src)) +/* + * bpf_core_read_str() is a thin wrapper around bpf_probe_read_str() + * additionally emitting BPF CO-RE field relocation for specified source + * argument. + */ +#define bpf_core_read_str(dst, sz, src) \ + bpf_probe_read_kernel_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +/* NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. */ +#define bpf_core_read_user_str(dst, sz, src) \ + bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) + +#define ___concat(a, b) a ## b +#define ___apply(fn, n) ___concat(fn, n) +#define ___nth(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, __11, N, ...) N + +/* + * return number of provided arguments; used for switch-based variadic macro + * definitions (see ___last, ___arrow, etc below) + */ +#define ___narg(...) ___nth(_, ##__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +/* + * return 0 if no arguments are passed, N - otherwise; used for + * recursively-defined macros to specify termination (0) case, and generic + * (N) case (e.g., ___read_ptrs, ___core_read) + */ +#define ___empty(...) ___nth(_, ##__VA_ARGS__, N, N, N, N, N, N, N, N, N, N, 0) + +#define ___last1(x) x +#define ___last2(a, x) x +#define ___last3(a, b, x) x +#define ___last4(a, b, c, x) x +#define ___last5(a, b, c, d, x) x +#define ___last6(a, b, c, d, e, x) x +#define ___last7(a, b, c, d, e, f, x) x +#define ___last8(a, b, c, d, e, f, g, x) x +#define ___last9(a, b, c, d, e, f, g, h, x) x +#define ___last10(a, b, c, d, e, f, g, h, i, x) x +#define ___last(...) ___apply(___last, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___nolast2(a, _) a +#define ___nolast3(a, b, _) a, b +#define ___nolast4(a, b, c, _) a, b, c +#define ___nolast5(a, b, c, d, _) a, b, c, d +#define ___nolast6(a, b, c, d, e, _) a, b, c, d, e +#define ___nolast7(a, b, c, d, e, f, _) a, b, c, d, e, f +#define ___nolast8(a, b, c, d, e, f, g, _) a, b, c, d, e, f, g +#define ___nolast9(a, b, c, d, e, f, g, h, _) a, b, c, d, e, f, g, h +#define ___nolast10(a, b, c, d, e, f, g, h, i, _) a, b, c, d, e, f, g, h, i +#define ___nolast(...) ___apply(___nolast, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___arrow1(a) a +#define ___arrow2(a, b) a->b +#define ___arrow3(a, b, c) a->b->c +#define ___arrow4(a, b, c, d) a->b->c->d +#define ___arrow5(a, b, c, d, e) a->b->c->d->e +#define ___arrow6(a, b, c, d, e, f) a->b->c->d->e->f +#define ___arrow7(a, b, c, d, e, f, g) a->b->c->d->e->f->g +#define ___arrow8(a, b, c, d, e, f, g, h) a->b->c->d->e->f->g->h +#define ___arrow9(a, b, c, d, e, f, g, h, i) a->b->c->d->e->f->g->h->i +#define ___arrow10(a, b, c, d, e, f, g, h, i, j) a->b->c->d->e->f->g->h->i->j +#define ___arrow(...) ___apply(___arrow, ___narg(__VA_ARGS__))(__VA_ARGS__) + +#define ___type(...) typeof(___arrow(__VA_ARGS__)) + +#define ___read(read_fn, dst, src_type, src, accessor) \ + read_fn((void *)(dst), sizeof(*(dst)), &((src_type)(src))->accessor) + +/* "recursively" read a sequence of inner pointers using local __t var */ +#define ___rd_first(fn, src, a) ___read(fn, &__t, ___type(src), src, a); +#define ___rd_last(fn, ...) \ + ___read(fn, &__t, ___type(___nolast(__VA_ARGS__)), __t, ___last(__VA_ARGS__)); +#define ___rd_p1(fn, ...) const void *__t; ___rd_first(fn, __VA_ARGS__) +#define ___rd_p2(fn, ...) ___rd_p1(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p3(fn, ...) 
___rd_p2(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p4(fn, ...) ___rd_p3(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p5(fn, ...) ___rd_p4(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p6(fn, ...) ___rd_p5(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p7(fn, ...) ___rd_p6(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p8(fn, ...) ___rd_p7(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___rd_p9(fn, ...) ___rd_p8(fn, ___nolast(__VA_ARGS__)) ___rd_last(fn, __VA_ARGS__) +#define ___read_ptrs(fn, src, ...) \ + ___apply(___rd_p, ___narg(__VA_ARGS__))(fn, src, __VA_ARGS__) + +#define ___core_read0(fn, fn_ptr, dst, src, a) \ + ___read(fn, dst, ___type(src), src, a); +#define ___core_readN(fn, fn_ptr, dst, src, ...) \ + ___read_ptrs(fn_ptr, src, ___nolast(__VA_ARGS__)) \ + ___read(fn, dst, ___type(src, ___nolast(__VA_ARGS__)), __t, \ + ___last(__VA_ARGS__)); +#define ___core_read(fn, fn_ptr, dst, src, a, ...) \ + ___apply(___core_read, ___empty(__VA_ARGS__))(fn, fn_ptr, dst, \ + src, a, ##__VA_ARGS__) + +/* + * BPF_CORE_READ_INTO() is a more performance-conscious variant of + * BPF_CORE_READ(), in which final field is read into user-provided storage. + * See BPF_CORE_READ() below for more details on general usage. + */ +#define BPF_CORE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_INTO() */ +#define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_USER_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_user, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ_STR_INTO() does same "pointer chasing" as + * BPF_CORE_READ() for intermediate pointers, but then executes (and returns + * corresponding error code) bpf_core_read_str() for final string read. + */ +#define BPF_CORE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_str, bpf_core_read, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Variant of BPF_CORE_READ_STR_INTO() for reading from user-space memory. + * + * NOTE: see comments for BPF_CORE_READ_USER() about the proper types use. + */ +#define BPF_CORE_READ_USER_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_core_read_user_str, bpf_core_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ +#define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ + ___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER_STR_INTO(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER_STR_INTO(dst, src, a, ...) 
({ \ + ___core_read(bpf_probe_read_user_str, bpf_probe_read_user, \ + dst, (src), a, ##__VA_ARGS__) \ +}) + +/* + * BPF_CORE_READ() is used to simplify BPF CO-RE relocatable read, especially + * when there are few pointer chasing steps. + * E.g., what in non-BPF world (or in BPF w/ BCC) would be something like: + * int x = s->a.b.c->d.e->f->g; + * can be succinctly achieved using BPF_CORE_READ as: + * int x = BPF_CORE_READ(s, a.b.c, d.e, f, g); + * + * BPF_CORE_READ will decompose above statement into 4 bpf_core_read (BPF + * CO-RE relocatable bpf_probe_read_kernel() wrapper) calls, logically + * equivalent to: + * 1. const void *__t = s->a.b.c; + * 2. __t = __t->d.e; + * 3. __t = __t->f; + * 4. return __t->g; + * + * Equivalence is logical, because there is a heavy type casting/preservation + * involved, as well as all the reads are happening through + * bpf_probe_read_kernel() calls using __builtin_preserve_access_index() to + * emit CO-RE relocations. + * + * N.B. Only up to 9 "field accessors" are supported, which should be more + * than enough for any practical purpose. + */ +#define BPF_CORE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Variant of BPF_CORE_READ() for reading from user-space memory. + * + * NOTE: all the source types involved are still *kernel types* and need to + * exist in kernel (or kernel module) BTF, otherwise CO-RE relocation will + * fail. Custom user types are not relocatable with CO-RE. + * The typical situation in which BPF_CORE_READ_USER() might be used is to + * read kernel UAPI types from the user-space memory passed in as a syscall + * input argument. + */ +#define BPF_CORE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_CORE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* Non-CO-RE variant of BPF_CORE_READ() */ +#define BPF_PROBE_READ(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +/* + * Non-CO-RE variant of BPF_CORE_READ_USER(). + * + * As no CO-RE relocations are emitted, source types can be arbitrary and are + * not restricted to kernel types only. + */ +#define BPF_PROBE_READ_USER(src, a, ...) ({ \ + ___type((src), a, ##__VA_ARGS__) __r; \ + BPF_PROBE_READ_USER_INTO(&__r, (src), a, ##__VA_ARGS__); \ + __r; \ +}) + +#endif + diff --git a/third_party/libbpf/src/bpf_endian.h b/third_party/libbpf/src/bpf_endian.h new file mode 100644 index 0000000..ec9db4f --- /dev/null +++ b/third_party/libbpf/src/bpf_endian.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_ENDIAN__ +#define __BPF_ENDIAN__ + +/* + * Isolate byte #n and put it into byte #m, for __u##b type. 
+ * E.g., moving byte #6 (nnnnnnnn) into byte #1 (mmmmmmmm) for __u64: + * 1) xxxxxxxx nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx + * 2) nnnnnnnn xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx mmmmmmmm xxxxxxxx 00000000 + * 3) 00000000 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn + * 4) 00000000 00000000 00000000 00000000 00000000 00000000 nnnnnnnn 00000000 + */ +#define ___bpf_mvb(x, b, n, m) ((__u##b)(x) << (b-(n+1)*8) >> (b-8) << (m*8)) + +#define ___bpf_swab16(x) ((__u16)( \ + ___bpf_mvb(x, 16, 0, 1) | \ + ___bpf_mvb(x, 16, 1, 0))) + +#define ___bpf_swab32(x) ((__u32)( \ + ___bpf_mvb(x, 32, 0, 3) | \ + ___bpf_mvb(x, 32, 1, 2) | \ + ___bpf_mvb(x, 32, 2, 1) | \ + ___bpf_mvb(x, 32, 3, 0))) + +#define ___bpf_swab64(x) ((__u64)( \ + ___bpf_mvb(x, 64, 0, 7) | \ + ___bpf_mvb(x, 64, 1, 6) | \ + ___bpf_mvb(x, 64, 2, 5) | \ + ___bpf_mvb(x, 64, 3, 4) | \ + ___bpf_mvb(x, 64, 4, 3) | \ + ___bpf_mvb(x, 64, 5, 2) | \ + ___bpf_mvb(x, 64, 6, 1) | \ + ___bpf_mvb(x, 64, 7, 0))) + +/* LLVM's BPF target selects the endianness of the CPU + * it compiles on, or the user specifies (bpfel/bpfeb), + * respectively. The used __BYTE_ORDER__ is defined by + * the compiler, we cannot rely on __BYTE_ORDER from + * libc headers, since it doesn't reflect the actual + * requested byte order. + * + * Note, LLVM's BPF target has different __builtin_bswapX() + * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE + * in bpfel and bpfeb case, which means below, that we map + * to cpu_to_be16(). We could use it unconditionally in BPF + * case, but better not rely on it, so that this header here + * can be used from application and BPF program side, which + * use different targets. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define __bpf_ntohs(x) __builtin_bswap16(x) +# define __bpf_htons(x) __builtin_bswap16(x) +# define __bpf_constant_ntohs(x) ___bpf_swab16(x) +# define __bpf_constant_htons(x) ___bpf_swab16(x) +# define __bpf_ntohl(x) __builtin_bswap32(x) +# define __bpf_htonl(x) __builtin_bswap32(x) +# define __bpf_constant_ntohl(x) ___bpf_swab32(x) +# define __bpf_constant_htonl(x) ___bpf_swab32(x) +# define __bpf_be64_to_cpu(x) __builtin_bswap64(x) +# define __bpf_cpu_to_be64(x) __builtin_bswap64(x) +# define __bpf_constant_be64_to_cpu(x) ___bpf_swab64(x) +# define __bpf_constant_cpu_to_be64(x) ___bpf_swab64(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define __bpf_ntohs(x) (x) +# define __bpf_htons(x) (x) +# define __bpf_constant_ntohs(x) (x) +# define __bpf_constant_htons(x) (x) +# define __bpf_ntohl(x) (x) +# define __bpf_htonl(x) (x) +# define __bpf_constant_ntohl(x) (x) +# define __bpf_constant_htonl(x) (x) +# define __bpf_be64_to_cpu(x) (x) +# define __bpf_cpu_to_be64(x) (x) +# define __bpf_constant_be64_to_cpu(x) (x) +# define __bpf_constant_cpu_to_be64(x) (x) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" +#endif + +#define bpf_htons(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htons(x) : __bpf_htons(x)) +#define bpf_ntohs(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohs(x) : __bpf_ntohs(x)) +#define bpf_htonl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_htonl(x) : __bpf_htonl(x)) +#define bpf_ntohl(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_ntohl(x) : __bpf_ntohl(x)) +#define bpf_cpu_to_be64(x) \ + (__builtin_constant_p(x) ? \ + __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x)) +#define bpf_be64_to_cpu(x) \ + (__builtin_constant_p(x) ? 
\ + __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x)) + +#endif /* __BPF_ENDIAN__ */ diff --git a/third_party/libbpf/src/bpf_gen_internal.h b/third_party/libbpf/src/bpf_gen_internal.h new file mode 100644 index 0000000..fdf4440 --- /dev/null +++ b/third_party/libbpf/src/bpf_gen_internal.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __BPF_GEN_INTERNAL_H +#define __BPF_GEN_INTERNAL_H + +#include "bpf.h" + +struct ksym_relo_desc { + const char *name; + int kind; + int insn_idx; + bool is_weak; + bool is_typeless; + bool is_ld64; +}; + +struct ksym_desc { + const char *name; + int ref; + int kind; + union { + /* used for kfunc */ + int off; + /* used for typeless ksym */ + bool typeless; + }; + int insn; + bool is_ld64; +}; + +struct bpf_gen { + struct gen_loader_opts *opts; + void *data_start; + void *data_cur; + void *insn_start; + void *insn_cur; + ssize_t cleanup_label; + __u32 nr_progs; + __u32 nr_maps; + int log_level; + int error; + struct ksym_relo_desc *relos; + int relo_cnt; + struct bpf_core_relo *core_relos; + int core_relo_cnt; + char attach_target[128]; + int attach_kind; + struct ksym_desc *ksyms; + __u32 nr_ksyms; + int fd_array; + int nr_fd_array; +}; + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps); +void bpf_gen__free(struct bpf_gen *gen); +void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size); +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx); +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx); +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u32 value_size); +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, bool is_ld64, int kind, int insn_idx); +void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); + +#endif diff --git a/third_party/libbpf/src/bpf_helper_defs.h b/third_party/libbpf/src/bpf_helper_defs.h new file mode 100644 index 0000000..e338dbe --- /dev/null +++ b/third_party/libbpf/src/bpf_helper_defs.h @@ -0,0 +1,4752 @@ +/* This is auto-generated file. See bpf_doc.py for details. 
*/ + +/* Forward declarations of BPF structs */ +struct bpf_fib_lookup; +struct bpf_sk_lookup; +struct bpf_perf_event_data; +struct bpf_perf_event_value; +struct bpf_pidns_info; +struct bpf_redir_neigh; +struct bpf_sock; +struct bpf_sock_addr; +struct bpf_sock_ops; +struct bpf_sock_tuple; +struct bpf_spin_lock; +struct bpf_sysctl; +struct bpf_tcp_sock; +struct bpf_tunnel_key; +struct bpf_xfrm_state; +struct linux_binprm; +struct pt_regs; +struct sk_reuseport_md; +struct sockaddr; +struct tcphdr; +struct seq_file; +struct tcp6_sock; +struct tcp_sock; +struct tcp_timewait_sock; +struct tcp_request_sock; +struct udp6_sock; +struct unix_sock; +struct task_struct; +struct cgroup; +struct __sk_buff; +struct sk_msg_md; +struct xdp_md; +struct path; +struct btf_ptr; +struct inode; +struct socket; +struct file; +struct bpf_timer; +struct mptcp_sock; +struct bpf_dynptr; +struct iphdr; +struct ipv6hdr; + +/* + * bpf_map_lookup_elem + * + * Perform a lookup in *map* for an entry associated to *key*. + * + * Returns + * Map value associated to *key*, or **NULL** if no entry was + * found. + */ +static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; + +/* + * bpf_map_update_elem + * + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; + +/* + * bpf_map_delete_elem + * + * Delete entry with *key* from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; + +/* + * bpf_probe_read + * + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; + +/* + * bpf_ktime_get_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; + +/* + * bpf_trace_printk + * + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/tracing/trace* from TraceFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. 
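+ *
+ * For example (an illustrative sketch, not part of the original helper
+ * description), printing the calling process' PID (tgid) from a tracing
+ * program:
+ *
+ * ::
+ *
+ *	char fmt[] = "hello pid %d\n";
+ *	__u32 pid = bpf_get_current_pid_tgid() >> 32;
+ *
+ *	bpf_trace_printk(fmt, sizeof(fmt), pid);
+ *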
+ * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; + +/* + * bpf_get_prandom_u32 + * + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * + * Returns + * A random 32-bit unsigned value. + */ +static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; + +/* + * bpf_get_smp_processor_id + * + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * + * Returns + * The SMP id of the processor running the program. + */ +static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; + +/* + * bpf_skb_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. 
Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; + +/* + * bpf_l3_csum_replace + * + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; + +/* + * bpf_l4_csum_replace + * + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. 
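+ *
+ * As an illustrative sketch (not part of the original description): after
+ * rewriting a TCP destination port in place, the L4 checksum can be fixed
+ * up incrementally. Here *l4_off* (the offset of the TCP header in the
+ * packet) is assumed to have been computed earlier in the program:
+ *
+ * ::
+ *
+ *	__be16 old_port, new_port = bpf_htons(8080);
+ *
+ *	bpf_skb_load_bytes(skb, l4_off + offsetof(struct tcphdr, dest),
+ *			   &old_port, sizeof(old_port));
+ *	bpf_l4_csum_replace(skb, l4_off + offsetof(struct tcphdr, check),
+ *			    old_port, new_port, sizeof(new_port));
+ *	bpf_skb_store_bytes(skb, l4_off + offsetof(struct tcphdr, dest),
+ *			    &new_port, sizeof(new_port), 0);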
+ */ +static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; + +/* + * bpf_tail_call + * + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; + +/* + * bpf_clone_redirect + * + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; + +/* + * bpf_get_current_pid_tgid + * + * Get the current pid and tgid. + * + * Returns + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + */ +static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; + +/* + * bpf_get_current_uid_gid + * + * Get the current uid and gid. + * + * Returns + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. 
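+ *
+ * For instance (an illustrative sketch, not part of the original text), the
+ * two halves can be split out as follows:
+ *
+ * ::
+ *
+ *	__u64 ug = bpf_get_current_uid_gid();
+ *	__u32 uid = (__u32)ug;
+ *	__u32 gid = ug >> 32;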
+ */ +static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; + +/* + * bpf_get_current_comm + * + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; + +/* + * bpf_get_cgroup_classid + * + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * + * Returns + * The classid, or 0 for the default unconfigured classid. + */ +static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; + +/* + * bpf_skb_vlan_push + * + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; + +/* + * bpf_skb_vlan_pop + * + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; + +/* + * bpf_skb_get_tunnel_key + * + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. 
+ * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; + +/* + * bpf_skb_set_tunnel_key + * + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; + +/* + * bpf_perf_event_read + * + * Read the value of a perf event counter. 
This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * + * Returns + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + */ +static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; + +/* + * bpf_redirect + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * + * Returns + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + */ +static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; + +/* + * bpf_get_route_realm + * + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. 
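+ *
+ * A small illustrative sketch (not part of the original description),
+ * assuming realm 42 was assigned to some routes with **ip route ... realm
+ * 42**, dropping traffic routed through them on TC egress:
+ *
+ * ::
+ *
+ *	if (bpf_get_route_realm(skb) == 42)
+ *		return TC_ACT_SHOT;
+ *
+ *	return TC_ACT_OK;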
+ * + * Returns + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + */ +static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; + +/* + * bpf_perf_event_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; + +/* + * bpf_skb_load_bytes + * + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; + +/* + * bpf_get_stackid + * + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. 
The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The positive or null stack id on success, or a negative error + * in case of failure. + */ +static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; + +/* + * bpf_csum_diff + * + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * + * Returns + * The checksum result, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; + +/* + * bpf_skb_get_tunnel_opt + * + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * + * Returns + * The size of the option data retrieved. 
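+ *
+ * A short usage sketch (assuming a TC program attached to a tunnel
+ * device running in collect-metadata mode):
+ *
+ * ::
+ *
+ *     __u8 opts[64];
+ *     long opt_len;
+ *
+ *     opt_len = bpf_skb_get_tunnel_opt(skb, opts, sizeof(opts));
+ *     if (opt_len < 0)
+ *         return TC_ACT_OK; /* no tunnel option data present */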
+ */ +static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; + +/* + * bpf_skb_set_tunnel_opt + * + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; + +/* + * bpf_skb_change_proto + * + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; + +/* + * bpf_skb_change_type + * + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; + +/* + * bpf_skb_under_cgroup + * + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + */ +static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; + +/* + * bpf_get_hash_recalc + * + * Retrieve the hash of the packet, *skb*\ **->hash**. 
If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * + * Returns + * The 32-bit hash. + */ +static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; + +/* + * bpf_get_current_task + * + * Get the current task. + * + * Returns + * A pointer to the current task struct. + */ +static __u64 (*bpf_get_current_task)(void) = (void *) 35; + +/* + * bpf_probe_write_user + * + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; + +/* + * bpf_current_task_under_cgroup + * + * Check whether the probe is being run in the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * + * Returns + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + */ +static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; + +/* + * bpf_skb_change_tail + * + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure.
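+ *
+ * A short usage sketch (assuming a TC program that turns the packet
+ * into a fixed-size 64-byte control reply before rewriting it):
+ *
+ * ::
+ *
+ *     if (bpf_skb_change_tail(skb, 64, 0))
+ *         return TC_ACT_SHOT;
+ *     /* all packet pointers must be re-validated after this call */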
+ */ +static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; + +/* + * bpf_skb_pull_data + * + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; + +/* + * bpf_csum_update + * + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * + * Returns + * The checksum on success, or a negative error code in case of + * failure. + */ +static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; + +/* + * bpf_set_hash_invalid + * + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * + * Returns + * void. + */ +static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; + +/* + * bpf_get_numa_node_id + * + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * + * Returns + * The id of current NUMA node. 
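+ *
+ * A short usage sketch (assuming a socket filter attached with
+ * **SO_ATTACH_REUSEPORT_EBPF**, and a reuseport group whose socket
+ * indices were arranged to match NUMA node ids):
+ *
+ * ::
+ *
+ *     SEC("socket")
+ *     int select_local_numa(struct __sk_buff *skb)
+ *     {
+ *         /* the return value selects the socket index in the group */
+ *         return bpf_get_numa_node_id();
+ *     }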
+ */ +static long (*bpf_get_numa_node_id)(void) = (void *) 42; + +/* + * bpf_skb_change_head + * + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; + +/* + * bpf_xdp_adjust_head + * + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; + +/* + * bpf_probe_read_str + * + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; + +/* + * bpf_get_socket_cookie + * + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * + * Returns + * An 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + */ +static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; + +/* + * bpf_get_socket_uid + * + * Get the owner UID of the socket associated to *skb*. + * + * Returns + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + */ +static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; + +/* + * bpf_set_hash + * + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*.
+ * + * Returns + * 0 + */ +static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; + +/* + * bpf_setsockopt + * + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; + +/* + * bpf_skb_adjust_room + * + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. 
+ * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; + +/* + * bpf_redirect_map + * + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * + * Returns + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + */ +static long (*bpf_redirect_map)(void *map, __u64 key, __u64 flags) = (void *) 51; + +/* + * bpf_sk_redirect_map + * + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; + +/* + * bpf_sock_map_update + * + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; + +/* + * bpf_xdp_adjust_meta + * + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). 
Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; + +/* + * bpf_perf_event_read_value + * + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. 
To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; + +/* + * bpf_perf_prog_read_value + * + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; + +/* + * bpf_getsockopt + * + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; + +/* + * bpf_override_return + * + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * + * Returns + * 0 + */ +static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; + +/* + * bpf_sock_ops_cb_flags_set + * + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. 
+ * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * + * Returns + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + */ +static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; + +/* + * bpf_msg_redirect_map + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; + +/* + * bpf_msg_apply_bytes + * + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. 
If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * + * Returns + * 0 + */ +static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; + +/* + * bpf_msg_cork_bytes + * + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * + * Returns + * 0 + */ +static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; + +/* + * bpf_msg_pull_data + * + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; + +/* + * bpf_bind + * + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. 
This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; + +/* + * bpf_xdp_adjust_tail + * + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; + +/* + * bpf_skb_get_xfrm_state + * + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; + +/* + * bpf_get_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. 
+ * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; + +/* + * bpf_skb_load_bytes_relative + * + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; + +/* + * bpf_fib_lookup + * + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. 
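+ *
+ * A short usage sketch (assuming an XDP program that has already
+ * validated an IPv4 header *iph*, and that defines AF_INET itself
+ * since kernel socket headers are not included in BPF objects):
+ *
+ * ::
+ *
+ *     struct bpf_fib_lookup fib = {};
+ *     int rc;
+ *
+ *     fib.family   = AF_INET;
+ *     fib.ipv4_src = iph->saddr;
+ *     fib.ipv4_dst = iph->daddr;
+ *     fib.ifindex  = ctx->ingress_ifindex;
+ *
+ *     rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), 0);
+ *     if (rc == BPF_FIB_LKUP_RET_SUCCESS)
+ *         /* rewrite the MAC addresses from fib.smac/fib.dmac, then: */
+ *         return bpf_redirect(fib.ifindex, 0);
+ *     return XDP_PASS;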
+ * + * Returns + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + */ +static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; + +/* + * bpf_sock_hash_update + * + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; + +/* + * bpf_msg_redirect_hash + * + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; + +/* + * bpf_sk_redirect_hash + * + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * + * Returns + * **SK_PASS** on success, or **SK_DROP** on error. + */ +static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; + +/* + * bpf_lwt_push_encap + * + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. 
+ * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; + +/* + * bpf_lwt_seg6_store_bytes + * + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; + +/* + * bpf_lwt_seg6_adjust_srh + * + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; + +/* + * bpf_lwt_seg6_action + * + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. 
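+ *
+ * A short usage sketch (assuming a seg6local End.X route backed by a
+ * BPF program, with the next-hop address *nh6* of type
+ * **struct in6_addr** filled in earlier):
+ *
+ * ::
+ *
+ *     if (bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
+ *                             &nh6, sizeof(nh6)) < 0)
+ *         return BPF_DROP;
+ *     return BPF_REDIRECT;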
+ * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; + +/* + * bpf_rc_repeat + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; + +/* + * bpf_rc_keydown + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available if the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; + +/* + * bpf_skb_cgroup_id + * + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; + +/* + * bpf_get_current_cgroup_id + * + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * + * Returns + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + */ +static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; + +/* + * bpf_get_local_storage + * + * Get the pointer to the local storage area.
+ * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * + * Returns + * A pointer to the local storage area. + */ +static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; + +/* + * bpf_sk_select_reuseport + * + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; + +/* + * bpf_skb_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; + +/* + * bpf_sk_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. 
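+ *
+ * A short usage sketch (assuming a TC program that has filled a
+ * **struct bpf_sock_tuple** *tuple* for an IPv4 flow):
+ *
+ * ::
+ *
+ *     struct bpf_sock *sk;
+ *
+ *     sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
+ *                            BPF_F_CURRENT_NETNS, 0);
+ *     if (sk)
+ *         bpf_sk_release(sk);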
+ */ +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; + +/* + * bpf_sk_lookup_udp + * + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; + +/* + * bpf_sk_release + * + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_sk_release)(void *sock) = (void *) 86; + +/* + * bpf_map_push_elem + * + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; + +/* + * bpf_map_pop_elem + * + * Pop an element from *map*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; + +/* + * bpf_map_peek_elem + * + * Get an element from *map* without removing it. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; + +/* + * bpf_msg_push_data + * + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Returns + * 0 on success, or a negative error in case of failure. 
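+ *
+ *        A minimal sketch of an sk_msg program (the sockmap attachment
+ *        and the function name are assumed/illustrative) that reserves
+ *        8 bytes of metadata at the start of the message:
+ *
+ *        ::
+ *
+ *                SEC("sk_msg")
+ *                int msg_push_example(struct sk_msg_md *msg)
+ *                {
+ *                        // Insert 8 bytes at offset 0; they can then
+ *                        // be written through msg->data after a
+ *                        // bounds check against msg->data_end.
+ *                        if (bpf_msg_push_data(msg, 0, 8, 0))
+ *                                return SK_DROP;
+ *                        return SK_PASS;
+ *                }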
+ */ +static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; + +/* + * bpf_msg_pop_data + * + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; + +/* + * bpf_rc_pointer_rel + * + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Returns + * 0 + */ +static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; + +/* + * bpf_spin_lock + * + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. 
+ * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * + * Returns + * 0 + */ +static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; + +/* + * bpf_spin_unlock + * + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * + * Returns + * 0 + */ +static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; + +/* + * bpf_sk_fullsock + * + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; + +/* + * bpf_tcp_sock + * + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Returns + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; + +/* + * bpf_skb_ecn_set_ce + * + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * + * Returns + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + */ +static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; + +/* + * bpf_get_listener_sock + * + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * + * Returns + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + */ +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; + +/* + * bpf_skc_lookup_tcp + * + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * + * Returns + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + */ +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; + +/* + * bpf_tcp_check_syncookie + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. 
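+ *
+ *        A hedged fragment rather than a full program: it assumes *sk*
+ *        was obtained with **bpf_skc_lookup_tcp**\ () and that *iph*
+ *        and *th* point at verifier-checked IPv4 and TCP headers
+ *        inside the packet:
+ *
+ *        ::
+ *
+ *                if (bpf_tcp_check_syncookie(sk, iph,
+ *                                            sizeof(struct iphdr),
+ *                                            th,
+ *                                            sizeof(struct tcphdr)) == 0) {
+ *                        // the ACK completes a valid SYN cookie
+ *                        // handshake for the listener sk
+ *                }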
+ */ +static long (*bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; + +/* + * bpf_sysctl_get_name + * + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + */ +static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; + +/* + * bpf_sysctl_get_current_value + * + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + */ +static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; + +/* + * bpf_sysctl_get_new_value + * + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * Returns + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; + +/* + * bpf_sysctl_set_new_value + * + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * + * Returns + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + */ +static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; + +/* + * bpf_strtol + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). 
+ * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; + +/* + * bpf_strtoul + * + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * + * Returns + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + */ +static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; + +/* + * bpf_sk_storage_get + * + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * + * Returns + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + */ +static void *(*bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags) = (void *) 107; + +/* + * bpf_sk_storage_delete + * + * Delete a bpf-local-storage from a *sk*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + */ +static long (*bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; + +/* + * bpf_send_signal + * + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. 
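+ *
+ *        A minimal sketch of a tracing program that sends **SIGUSR1**
+ *        (signal number 10) to the process that hit the attach point
+ *        (the attach point and function name are illustrative):
+ *
+ *        ::
+ *
+ *                SEC("tracepoint/syscalls/sys_enter_openat")
+ *                int send_signal_example(void *ctx)
+ *                {
+ *                        bpf_send_signal(10); // SIGUSR1
+ *                        return 0;
+ *                }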
+ */ +static long (*bpf_send_signal)(__u32 sig) = (void *) 109; + +/* + * bpf_tcp_gen_syncookie + * + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + */ +static __s64 (*bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; + +/* + * bpf_skb_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; + +/* + * bpf_probe_read_user + * + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; + +/* + * bpf_probe_read_kernel + * + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; + +/* + * bpf_probe_read_user_str + * + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. 
See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * + * Returns + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; + +/* + * bpf_probe_read_kernel_str + * + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * + * Returns + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + */ +static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; + +/* + * bpf_tcp_send_ack + * + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; + +/* + * bpf_send_signal_thread + * + * Send signal *sig* to the thread corresponding to the current task. + * + * Returns + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + */ +static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; + +/* + * bpf_jiffies64 + * + * Obtain the 64bit jiffies + * + * Returns + * The 64 bit jiffies + */ +static __u64 (*bpf_jiffies64)(void) = (void *) 118; + +/* + * bpf_read_branch_records + * + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. 
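+ *
+ *        A minimal sketch for a perf_event program (the buffer size
+ *        and function name are illustrative); branch records are only
+ *        present if the perf event was opened with branch sampling
+ *        enabled:
+ *
+ *        ::
+ *
+ *                SEC("perf_event")
+ *                int branch_records_example(struct bpf_perf_event_data *ctx)
+ *                {
+ *                        struct perf_branch_entry entries[16];
+ *                        long bytes;
+ *
+ *                        bytes = bpf_read_branch_records(ctx, entries,
+ *                                                        sizeof(entries), 0);
+ *                        if (bytes < 0)
+ *                                return 0;
+ *                        // bytes / sizeof(entries[0]) entries are valid
+ *                        return 0;
+ *                }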
+ */ +static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; + +/* + * bpf_get_ns_current_pid_tgid + * + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * Returns + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + */ +static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; + +/* + * bpf_xdp_output + * + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; + +/* + * bpf_get_netns_cookie + * + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * + * Returns + * A 8-byte long opaque number. + */ +static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; + +/* + * bpf_get_current_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; + +/* + * bpf_sk_assign + * + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. 
When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * **-ESOCKTNOSUPPORT** if the socket type is not supported + * (reuseport). + */ +static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; + +/* + * bpf_ktime_get_boot_ns + * + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; + +/* + * bpf_seq_printf + * + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; + +/* + * bpf_seq_write + * + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * + * Returns + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + */ +static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; + +/* + * bpf_sk_cgroup_id + * + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. 
The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_cgroup_id)(void *sk) = (void *) 128; + +/* + * bpf_sk_ancestor_cgroup_id + * + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void *) 129; + +/* + * bpf_ringbuf_output + * + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; + +/* + * bpf_ringbuf_reserve + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * + * Returns + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + */ +static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; + +/* + * bpf_ringbuf_submit + * + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; + +/* + * bpf_ringbuf_discard + * + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. 
+ * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; + +/* + * bpf_ringbuf_query + * + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * + * Returns + * Requested value, or 0, if *flags* are not recognized. + */ +static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; + +/* + * bpf_csum_level + * + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * + * Returns + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + */ +static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; + +/* + * bpf_skc_to_tcp6_sock + * + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; + +/* + * bpf_skc_to_tcp_sock + * + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; + +/* + * bpf_skc_to_tcp_timewait_sock + * + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; + +/* + * bpf_skc_to_tcp_request_sock + * + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. 
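+ *
+ *        A hedged fragment, assuming a tracing program that already
+ *        holds a BTF-typed *sk* pointer (for example an fentry
+ *        argument) which may refer to a request socket:
+ *
+ *        ::
+ *
+ *                struct tcp_request_sock *treq;
+ *
+ *                treq = bpf_skc_to_tcp_request_sock(sk);
+ *                if (treq) {
+ *                        // request-socket fields such as treq->snt_isn
+ *                        // can now be read, e.g. via BPF_CORE_READ()
+ *                }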
+ */ +static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; + +/* + * bpf_skc_to_udp6_sock + * + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; + +/* + * bpf_get_task_stack + * + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * + * Returns + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + */ +static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; + +/* + * bpf_load_hdr_opt + * + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * + * Returns + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. 
+ * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, __u32 len, __u64 flags) = (void *) 142; + +/* + * bpf_store_hdr_opt + * + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, __u32 len, __u64 flags) = (void *) 143; + +/* + * bpf_reserve_hdr_opt + * + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * + * Returns + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + */ +static long (*bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 flags) = (void *) 144; + +/* + * bpf_inode_storage_get + * + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. 
+ * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 flags) = (void *) 145; + +/* + * bpf_inode_storage_delete + * + * Delete a bpf_local_storage from an *inode*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static int (*bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; + +/* + * bpf_d_path + * + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * + * Returns + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + */ +static long (*bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147; + +/* + * bpf_copy_from_user + * + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = (void *) 148; + +/* + * bpf_snprintf_btf + * + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * + * Returns + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + */ +static long (*bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, __u32 btf_ptr_size, __u64 flags) = (void *) 149; + +/* + * bpf_seq_printf_btf + * + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * + * Returns + * 0 on success or a negative error in case of failure. + */ +static long (*bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 ptr_size, __u64 flags) = (void *) 150; + +/* + * bpf_skb_cgroup_classid + * + * See **bpf_get_cgroup_classid**\ () for the main description. 
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * + * Returns + * The id is returned or 0 in case the id could not be retrieved. + */ +static __u64 (*bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; + +/* + * bpf_redirect_neigh + * + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + */ +static long (*bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, int plen, __u64 flags) = (void *) 152; + +/* + * bpf_per_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * + * Returns + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + */ +static void *(*bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 153; + +/* + * bpf_this_cpu_ptr + * + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * + * Returns + * A pointer pointing to the kernel percpu variable on this cpu. + */ +static void *(*bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; + +/* + * bpf_redirect_peer + * + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * + * Returns + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
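+ *
+ *        A minimal sketch for a tc ingress program; *target_ifindex*
+ *        is an illustrative global that the loader is assumed to set
+ *        to the host-side veth whose container-side peer should
+ *        receive the packet:
+ *
+ *        ::
+ *
+ *                const volatile __u32 target_ifindex = 0;
+ *
+ *                SEC("tc")
+ *                int redirect_peer_example(struct __sk_buff *skb)
+ *                {
+ *                        // deliver to the peer of target_ifindex in
+ *                        // the other netns, bypassing the backlog queue
+ *                        return bpf_redirect_peer(target_ifindex, 0);
+ *                }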
+ */ +static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; + +/* + * bpf_task_storage_get + * + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_task_storage_get)(void *map, struct task_struct *task, void *value, __u64 flags) = (void *) 156; + +/* + * bpf_task_storage_delete + * + * Delete a bpf_local_storage from a *task*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_task_storage_delete)(void *map, struct task_struct *task) = (void *) 157; + +/* + * bpf_get_current_task_btf + * + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * + * Returns + * Pointer to the current task. + */ +static struct task_struct *(*bpf_get_current_task_btf)(void) = (void *) 158; + +/* + * bpf_bprm_opts_set + * + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * + * Returns + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + */ +static long (*bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void *) 159; + +/* + * bpf_ktime_get_coarse_ns + * + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_coarse_ns)(void) = (void *) 160; + +/* + * bpf_ima_inode_hash + * + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + */ +static long (*bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161; + +/* + * bpf_sock_from_file + * + * If the given file represents a socket, returns the associated + * socket. + * + * Returns + * A pointer to a struct socket on success or NULL if the file is + * not a socket. 
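+ *
+ *        A minimal sketch, assuming an LSM program on the
+ *        **file_receive** hook (hook choice and function name are
+ *        illustrative) and the **BPF_PROG** macro from bpf_tracing.h:
+ *
+ *        ::
+ *
+ *                SEC("lsm/file_receive")
+ *                int BPF_PROG(sock_from_file_example, struct file *file)
+ *                {
+ *                        struct socket *sock = bpf_sock_from_file(file);
+ *
+ *                        if (!sock)
+ *                                return 0; // not a socket-backed file
+ *                        // sock (and sock->sk) can be inspected here
+ *                        return 0;
+ *                }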
+ */ +static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; + +/* + * bpf_check_mtu + * + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * + * Returns + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + */ +static long (*bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len_diff, __u64 flags) = (void *) 163; + +/* + * bpf_for_each_map_elem + * + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. 
+ * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * + * Returns + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + */ +static long (*bpf_for_each_map_elem)(void *map, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 164; + +/* + * bpf_snprintf + * + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * + * Returns + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + */ +static long (*bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *data, __u32 data_len) = (void *) 165; + +/* + * bpf_sys_bpf + * + * Execute bpf syscall with given arguments. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 166; + +/* + * bpf_btf_find_by_name_kind + * + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * + * Returns + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + */ +static long (*bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, int flags) = (void *) 167; + +/* + * bpf_sys_close + * + * Execute close syscall for given FD. + * + * Returns + * A syscall result. + */ +static long (*bpf_sys_close)(__u32 fd) = (void *) 168; + +/* + * bpf_timer_init + * + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * + * Returns + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. 
+ * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = (void *) 169; + +/* + * bpf_timer_set_callback + * + * Configure the timer to call *callback_fn* static function. + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + */ +static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn) = (void *) 170; + +/* + * bpf_timer_start + * + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * + * + * Returns + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + */ +static long (*bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags) = (void *) 171; + +/* + * bpf_timer_cancel + * + * Cancel the timer and wait for callback_fn to finish if it was running. + * + * Returns + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + */ +static long (*bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; + +/* + * bpf_get_func_ip + * + * Get address of the traced function (for tracing and kprobe programs). + * + * Returns + * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). + */ +static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; + +/* + * bpf_get_attach_cookie + * + * Get bpf_cookie value provided (optionally) during the program + * attachment. 
It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * + * Returns + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + */ +static __u64 (*bpf_get_attach_cookie)(void *ctx) = (void *) 174; + +/* + * bpf_task_pt_regs + * + * Get the struct pt_regs associated with **task**. + * + * Returns + * A pointer to struct pt_regs. + */ +static long (*bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; + +/* + * bpf_get_branch_snapshot + * + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * + * Returns + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + */ +static long (*bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = (void *) 176; + +/* + * bpf_trace_vprintk + * + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * + * Returns + * The number of bytes written to the buffer, or a negative error + * in case of failure. + */ +static long (*bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 177; + +/* + * bpf_skc_to_unix_sock + * + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct unix_sock *(*bpf_skc_to_unix_sock)(void *sk) = (void *) 178; + +/* + * bpf_kallsyms_lookup_name + * + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * + * Returns + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + */ +static long (*bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags, __u64 *res) = (void *) 179; + +/* + * bpf_find_vma + * + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * + * Returns + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. 
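+ *
+ * Illustrative sketch (editorial addition, not upstream documentation):
+ * *find_vma_cb* and *data* are hypothetical names, while *task* and
+ * *addr* are assumed to be already available in the program:
+ *
+ *	static long find_vma_cb(struct task_struct \*task,
+ *				struct vm_area_struct \*vma, void \*data)
+ *	{
+ *		return 0;
+ *	}
+ *
+ *	bpf_find_vma(task, addr, find_vma_cb, &data, 0);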
+ */ +static long (*bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 180; + +/* + * bpf_loop + * + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * + * Returns + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + */ +static long (*bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 181; + +/* + * bpf_strncmp + * + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * + * Returns + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + */ +static long (*bpf_strncmp)(const char *s1, __u32 s1_sz, const char *s2) = (void *) 182; + +/* + * bpf_get_func_arg + * + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * + * Returns + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + */ +static long (*bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183; + +/* + * bpf_get_func_ret + * + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * + * Returns + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + */ +static long (*bpf_get_func_ret)(void *ctx, __u64 *value) = (void *) 184; + +/* + * bpf_get_func_arg_cnt + * + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * + * Returns + * The number of argument registers of the traced function. + */ +static long (*bpf_get_func_arg_cnt)(void *ctx) = (void *) 185; + +/* + * bpf_get_retval + * + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Returns + * The BPF program's return value. + */ +static int (*bpf_get_retval)(void) = (void *) 186; + +/* + * bpf_set_retval + * + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. 
+ * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static int (*bpf_set_retval)(int retval) = (void *) 187; + +/* + * bpf_xdp_get_buff_len + * + * Get the total size of a given xdp buff (linear and paged area) + * + * Returns + * The total size of a given xdp buffer. + */ +static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; + +/* + * bpf_xdp_load_bytes + * + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; + +/* + * bpf_xdp_store_bytes + * + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; + +/* + * bpf_copy_from_user_task + * + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * + * Returns + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + */ +static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr, struct task_struct *tsk, __u64 flags) = (void *) 191; + +/* + * bpf_skb_set_tstamp + * + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * + * Returns + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + */ +static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) = (void *) 192; + +/* + * bpf_ima_file_hash + * + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * + * Returns + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. 
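+ *
+ * Illustrative sketch (editorial addition, not upstream documentation):
+ * the 64-byte buffer is an arbitrary example size and *file* is assumed
+ * to be available from the program context:
+ *
+ *	char digest[64];
+ *	long algo = bpf_ima_file_hash(file, digest, sizeof(digest));
+ *	if (algo < 0)
+ *		return 0;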
+ */ +static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193; + +/* + * bpf_kptr_xchg + * + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * + * Returns + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + */ +static void *(*bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194; + +/* + * bpf_map_lookup_percpu_elem + * + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * + * Returns + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + */ +static void *(*bpf_map_lookup_percpu_elem)(void *map, const void *key, __u32 cpu) = (void *) 195; + +/* + * bpf_skc_to_mptcp_sock + * + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct mptcp_sock *(*bpf_skc_to_mptcp_sock)(void *sk) = (void *) 196; + +/* + * bpf_dynptr_from_mem + * + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + */ +static long (*bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197; + +/* + * bpf_ringbuf_reserve_dynptr + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (*bpf_ringbuf_reserve_dynptr)(void *ringbuf, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 198; + +/* + * bpf_ringbuf_submit_dynptr + * + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 199; + +/* + * bpf_ringbuf_discard_dynptr + * + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (*bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 200; + +/* + * bpf_dynptr_read + * + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. 
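+ *
+ * Illustrative sketch (editorial addition, not upstream documentation):
+ * copies the first 16 bytes out of an already-initialized dynptr *dptr*:
+ *
+ *	char buf[16];
+ *	if (bpf_dynptr_read(buf, sizeof(buf), &dptr, 0, 0) != 0)
+ *		return 0;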
+ */ +static long (*bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201; + +/* + * bpf_dynptr_write + * + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + */ +static long (*bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202; + +/* + * bpf_dynptr_data + * + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * + * Returns + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + */ +static void *(*bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203; + +/* + * bpf_tcp_raw_gen_syncookie_ipv4 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 204; + +/* + * bpf_tcp_raw_gen_syncookie_ipv6 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static __s64 (*bpf_tcp_raw_gen_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 205; + +/* + * bpf_tcp_raw_check_syncookie_ipv4 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. 
+ * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th) = (void *) 206; + +/* + * bpf_tcp_raw_check_syncookie_ipv6 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static long (*bpf_tcp_raw_check_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th) = (void *) 207; + +/* + * bpf_ktime_get_tai_ns + * + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * + * Returns + * Current *ktime*. + */ +static __u64 (*bpf_ktime_get_tai_ns)(void) = (void *) 208; + +/* + * bpf_user_ringbuf_drain + * + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * + * Returns + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + */ +static long (*bpf_user_ringbuf_drain)(void *map, void *callback_fn, void *ctx, __u64 flags) = (void *) 209; + +/* + * bpf_cgrp_storage_get + * + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. 
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(*bpf_cgrp_storage_get)(void *map, struct cgroup *cgroup, void *value, __u64 flags) = (void *) 210; + +/* + * bpf_cgrp_storage_delete + * + * Delete a bpf_local_storage from a *cgroup*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (*bpf_cgrp_storage_delete)(void *map, struct cgroup *cgroup) = (void *) 211; + + diff --git a/third_party/libbpf/src/bpf_helpers.h b/third_party/libbpf/src/bpf_helpers.h new file mode 100644 index 0000000..bbab9ad --- /dev/null +++ b/third_party/libbpf/src/bpf_helpers.h @@ -0,0 +1,402 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_HELPERS__ +#define __BPF_HELPERS__ + +/* + * Note that bpf programs need to include either + * vmlinux.h (auto-generated from BTF) or linux/types.h + * in advance since bpf_helper_defs.h uses such types + * as __u64. + */ +#include "bpf_helper_defs.h" + +#define __uint(name, val) int (*name)[val] +#define __type(name, val) typeof(val) *name +#define __array(name, val) typeof(val) *name[] + +/* + * Helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by libbpf depending on the context (BPF programs, BPF maps, + * extern variables, etc). + * To allow use of SEC() with externs (e.g., for extern .maps declarations), + * make sure __attribute__((unused)) doesn't trigger compilation warning. + */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + +#define SEC(name) \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ + __attribute__((section(name), used)) \ + _Pragma("GCC diagnostic pop") \ + +#endif + +/* Avoid 'linux/stddef.h' definition of '__always_inline'. 
*/ +#undef __always_inline +#define __always_inline inline __attribute__((always_inline)) + +#ifndef __noinline +#define __noinline __attribute__((noinline)) +#endif +#ifndef __weak +#define __weak __attribute__((weak)) +#endif + +/* + * Use __hidden attribute to mark a non-static BPF subprogram effectively + * static for BPF verifier's verification algorithm purposes, allowing more + * extensive and permissive BPF verification process, taking into account + * subprogram's caller context. + */ +#define __hidden __attribute__((visibility("hidden"))) + +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c))) +#endif + +/* + * Helper macros to manipulate data structures + */ + +/* offsetof() definition that uses __builtin_offset() might not preserve field + * offset CO-RE relocation properly, so force-redefine offsetof() using + * old-school approach which works with CO-RE correctly + */ +#undef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) + +/* redefined container_of() to ensure we use the above offsetof() macro */ +#undef container_of +#define container_of(ptr, type, member) \ + ({ \ + void *__mptr = (void *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) + +/* + * Compiler (optimization) barrier. + */ +#ifndef barrier +#define barrier() asm volatile("" ::: "memory") +#endif + +/* Variable-specific compiler (optimization) barrier. It's a no-op which makes + * compiler believe that there is some black box modification of a given + * variable and thus prevents compiler from making extra assumption about its + * value and potential simplifications and optimizations on this variable. + * + * E.g., compiler might often delay or even omit 32-bit to 64-bit casting of + * a variable, making some code patterns unverifiable. Putting barrier_var() + * in place will ensure that cast is performed before the barrier_var() + * invocation, because compiler has to pessimistically assume that embedded + * asm section might perform some extra operations on that variable. + * + * This is a variable-specific variant of more global barrier(). + */ +#ifndef barrier_var +#define barrier_var(var) asm volatile("" : "+r"(var)) +#endif + +/* + * Helper macro to throw a compilation error if __bpf_unreachable() gets + * built into the resulting code. This works given BPF back end does not + * implement __builtin_trap(). This is useful to assert that certain paths + * of the program code are never used and hence eliminated by the compiler. + * + * For example, consider a switch statement that covers known cases used by + * the program. __bpf_unreachable() can then reside in the default case. If + * the program gets extended such that a case is not covered in the switch + * statement, then it will throw a build error due to the default case not + * being compiled out. + */ +#ifndef __bpf_unreachable +# define __bpf_unreachable() __builtin_trap() +#endif + +/* + * Helper function to perform a tail call with a constant/immediate map slot. 
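+ *
+ * Editorial note (not upstream text): the *slot* argument must be a
+ * compile-time constant, e.g.
+ *
+ *	bpf_tail_call_static(ctx, &jmp_table, 1);
+ *
+ * where jmp_table is a hypothetical BPF_MAP_TYPE_PROG_ARRAY map; a
+ * non-constant slot trips a build error via the __bpf_unreachable()
+ * check below.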
+ */ +#if __clang_major__ >= 8 && defined(__bpf__) +static __always_inline void +bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) +{ + if (!__builtin_constant_p(slot)) + __bpf_unreachable(); + + /* + * Provide a hard guarantee that LLVM won't optimize setting r2 (map + * pointer) and r3 (constant map index) from _different paths_ ending + * up at the _same_ call insn as otherwise we won't be able to use the + * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel + * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key + * tracking for prog array pokes") for details on verifier tracking. + * + * Note on clobber list: we need to stay in-line with BPF calling + * convention, so even if we don't end up using r0, r4, r5, we need + * to mark them as clobber so that LLVM doesn't end up using them + * before / after the call. + */ + asm volatile("r1 = %[ctx]\n\t" + "r2 = %[map]\n\t" + "r3 = %[slot]\n\t" + "call 12" + :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot) + : "r0", "r1", "r2", "r3", "r4", "r5"); +} +#endif + +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +#define __kconfig __attribute__((section(".kconfig"))) +#define __ksym __attribute__((section(".ksyms"))) +#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) +#define __kptr __attribute__((btf_type_tag("kptr"))) + +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), #sym " should be marked as __weak"); \ + !!sym; \ +}) + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) \ + ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_fill0(arr, p, x) do {} while (0) +#define ___bpf_fill1(arr, p, x) arr[p] = x +#define ___bpf_fill2(arr, p, x, args...) arr[p] = x; ___bpf_fill1(arr, p + 1, args) +#define ___bpf_fill3(arr, p, x, args...) arr[p] = x; ___bpf_fill2(arr, p + 1, args) +#define ___bpf_fill4(arr, p, x, args...) arr[p] = x; ___bpf_fill3(arr, p + 1, args) +#define ___bpf_fill5(arr, p, x, args...) arr[p] = x; ___bpf_fill4(arr, p + 1, args) +#define ___bpf_fill6(arr, p, x, args...) arr[p] = x; ___bpf_fill5(arr, p + 1, args) +#define ___bpf_fill7(arr, p, x, args...) arr[p] = x; ___bpf_fill6(arr, p + 1, args) +#define ___bpf_fill8(arr, p, x, args...) arr[p] = x; ___bpf_fill7(arr, p + 1, args) +#define ___bpf_fill9(arr, p, x, args...) arr[p] = x; ___bpf_fill8(arr, p + 1, args) +#define ___bpf_fill10(arr, p, x, args...) arr[p] = x; ___bpf_fill9(arr, p + 1, args) +#define ___bpf_fill11(arr, p, x, args...) arr[p] = x; ___bpf_fill10(arr, p + 1, args) +#define ___bpf_fill12(arr, p, x, args...) arr[p] = x; ___bpf_fill11(arr, p + 1, args) +#define ___bpf_fill(arr, args...) \ + ___bpf_apply(___bpf_fill, ___bpf_narg(args))(arr, 0, args) + +/* + * BPF_SEQ_PRINTF to wrap bpf_seq_printf to-be-printed values + * in a structure. + */ +#define BPF_SEQ_PRINTF(seq, fmt, args...) 
\ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_seq_printf(seq, ___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* + * BPF_SNPRINTF wraps the bpf_snprintf helper with variadic arguments instead of + * an array of u64. + */ +#define BPF_SNPRINTF(out, out_size, fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_snprintf(out, out_size, ___fmt, \ + ___param, sizeof(___param)); \ +}) + +#ifdef BPF_NO_GLOBAL_DATA +#define BPF_PRINTK_FMT_MOD +#else +#define BPF_PRINTK_FMT_MOD static const +#endif + +#define __bpf_printk(fmt, ...) \ +({ \ + BPF_PRINTK_FMT_MOD char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +/* + * __bpf_vprintk wraps the bpf_trace_vprintk helper with variadic arguments + * instead of an array of u64. + */ +#define __bpf_vprintk(fmt, args...) \ +({ \ + static const char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args)]; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + bpf_trace_vprintk(___fmt, sizeof(___fmt), \ + ___param, sizeof(___param)); \ +}) + +/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args + * Otherwise use __bpf_vprintk + */ +#define ___bpf_pick_printk(...) \ + ___bpf_nth(_, ##__VA_ARGS__, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, __bpf_vprintk, \ + __bpf_vprintk, __bpf_vprintk, __bpf_printk /*3*/, __bpf_printk /*2*/,\ + __bpf_printk /*1*/, __bpf_printk /*0*/) + +/* Helper macro to print out debug messages */ +#define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) + +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. 
+ * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup())), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. 
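+ *
+ * Editorial note (not upstream text): a minimal usage sketch, assuming a
+ * hypothetical attempt_work() subprogram:
+ *
+ *	bpf_repeat(16) {
+ *		if (attempt_work() == 0)
+ *			break;
+ *	}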
+ */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + +#endif diff --git a/third_party/libbpf/src/bpf_prog_linfo.c b/third_party/libbpf/src/bpf_prog_linfo.c new file mode 100644 index 0000000..5c50309 --- /dev/null +++ b/third_party/libbpf/src/bpf_prog_linfo.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include "libbpf.h" +#include "libbpf_internal.h" + +struct bpf_prog_linfo { + void *raw_linfo; + void *raw_jited_linfo; + __u32 *nr_jited_linfo_per_func; + __u32 *jited_linfo_func_idx; + __u32 nr_linfo; + __u32 nr_jited_func; + __u32 rec_size; + __u32 jited_rec_size; +}; + +static int dissect_jited_func(struct bpf_prog_linfo *prog_linfo, + const __u64 *ksym_func, const __u32 *ksym_len) +{ + __u32 nr_jited_func, nr_linfo; + const void *raw_jited_linfo; + const __u64 *jited_linfo; + __u64 last_jited_linfo; + /* + * Index to raw_jited_linfo: + * i: Index for searching the next ksym_func + * prev_i: Index to the last found ksym_func + */ + __u32 i, prev_i; + __u32 f; /* Index to ksym_func */ + + raw_jited_linfo = prog_linfo->raw_jited_linfo; + jited_linfo = raw_jited_linfo; + if (ksym_func[0] != *jited_linfo) + goto errout; + + prog_linfo->jited_linfo_func_idx[0] = 0; + nr_jited_func = prog_linfo->nr_jited_func; + nr_linfo = prog_linfo->nr_linfo; + + for (prev_i = 0, i = 1, f = 1; + i < nr_linfo && f < nr_jited_func; + i++) { + raw_jited_linfo += prog_linfo->jited_rec_size; + last_jited_linfo = *jited_linfo; + jited_linfo = raw_jited_linfo; + + if (ksym_func[f] == *jited_linfo) { + prog_linfo->jited_linfo_func_idx[f] = i; + + /* Sanity check */ + if (last_jited_linfo - ksym_func[f - 1] + 1 > + ksym_len[f - 1]) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[f - 1] = + i - prev_i; + prev_i = i; + + /* + * The ksym_func[f] is found in jited_linfo. + * Look for the next one. + */ + f++; + } else if (*jited_linfo <= last_jited_linfo) { + /* Ensure the addr is increasing _within_ a func */ + goto errout; + } + } + + if (f != nr_jited_func) + goto errout; + + prog_linfo->nr_jited_linfo_per_func[nr_jited_func - 1] = + nr_linfo - prev_i; + + return 0; + +errout: + return -EINVAL; +} + +void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo) +{ + if (!prog_linfo) + return; + + free(prog_linfo->raw_linfo); + free(prog_linfo->raw_jited_linfo); + free(prog_linfo->nr_jited_linfo_per_func); + free(prog_linfo->jited_linfo_func_idx); + free(prog_linfo); +} + +struct bpf_prog_linfo *bpf_prog_linfo__new(const struct bpf_prog_info *info) +{ + struct bpf_prog_linfo *prog_linfo; + __u32 nr_linfo, nr_jited_func; + __u64 data_sz; + + nr_linfo = info->nr_line_info; + + if (!nr_linfo) + return errno = EINVAL, NULL; + + /* + * The min size that bpf_prog_linfo has to access for + * searching purpose. 
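+	 *
+	 * (Editorial note: the lookup helpers below only dereference
+	 * insn_off, which sits before file_name_off in struct
+	 * bpf_line_info, so any record at least this large is usable.)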
+ */ + if (info->line_info_rec_size < + offsetof(struct bpf_line_info, file_name_off)) + return errno = EINVAL, NULL; + + prog_linfo = calloc(1, sizeof(*prog_linfo)); + if (!prog_linfo) + return errno = ENOMEM, NULL; + + /* Copy xlated line_info */ + prog_linfo->nr_linfo = nr_linfo; + prog_linfo->rec_size = info->line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->rec_size; + prog_linfo->raw_linfo = malloc(data_sz); + if (!prog_linfo->raw_linfo) + goto err_free; + memcpy(prog_linfo->raw_linfo, (void *)(long)info->line_info, data_sz); + + nr_jited_func = info->nr_jited_ksyms; + if (!nr_jited_func || + !info->jited_line_info || + info->nr_jited_line_info != nr_linfo || + info->jited_line_info_rec_size < sizeof(__u64) || + info->nr_jited_func_lens != nr_jited_func || + !info->jited_ksyms || + !info->jited_func_lens) + /* Not enough info to provide jited_line_info */ + return prog_linfo; + + /* Copy jited_line_info */ + prog_linfo->nr_jited_func = nr_jited_func; + prog_linfo->jited_rec_size = info->jited_line_info_rec_size; + data_sz = (__u64)nr_linfo * prog_linfo->jited_rec_size; + prog_linfo->raw_jited_linfo = malloc(data_sz); + if (!prog_linfo->raw_jited_linfo) + goto err_free; + memcpy(prog_linfo->raw_jited_linfo, + (void *)(long)info->jited_line_info, data_sz); + + /* Number of jited_line_info per jited func */ + prog_linfo->nr_jited_linfo_per_func = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->nr_jited_linfo_per_func) + goto err_free; + + /* + * For each jited func, + * the start idx to the "linfo" and "jited_linfo" array, + */ + prog_linfo->jited_linfo_func_idx = malloc(nr_jited_func * + sizeof(__u32)); + if (!prog_linfo->jited_linfo_func_idx) + goto err_free; + + if (dissect_jited_func(prog_linfo, + (__u64 *)(long)info->jited_ksyms, + (__u32 *)(long)info->jited_func_lens)) + goto err_free; + + return prog_linfo; + +err_free: + bpf_prog_linfo__free(prog_linfo); + return errno = EINVAL, NULL; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip) +{ + __u32 jited_rec_size, rec_size, nr_linfo, start, i; + const void *raw_jited_linfo, *raw_linfo; + const __u64 *jited_linfo; + + if (func_idx >= prog_linfo->nr_jited_func) + return errno = ENOENT, NULL; + + nr_linfo = prog_linfo->nr_jited_linfo_per_func[func_idx]; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + start = prog_linfo->jited_linfo_func_idx[func_idx] + nr_skip; + jited_rec_size = prog_linfo->jited_rec_size; + raw_jited_linfo = prog_linfo->raw_jited_linfo + + (start * jited_rec_size); + jited_linfo = raw_jited_linfo; + if (addr < *jited_linfo) + return errno = ENOENT, NULL; + + nr_linfo -= nr_skip; + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (start * rec_size); + for (i = 0; i < nr_linfo; i++) { + if (addr < *jited_linfo) + break; + + raw_linfo += rec_size; + raw_jited_linfo += jited_rec_size; + jited_linfo = raw_jited_linfo; + } + + return raw_linfo - rec_size; +} + +const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip) +{ + const struct bpf_line_info *linfo; + __u32 rec_size, nr_linfo, i; + const void *raw_linfo; + + nr_linfo = prog_linfo->nr_linfo; + if (nr_skip >= nr_linfo) + return errno = ENOENT, NULL; + + rec_size = prog_linfo->rec_size; + raw_linfo = prog_linfo->raw_linfo + (nr_skip * rec_size); + linfo = raw_linfo; + if (insn_off < linfo->insn_off) + return errno = ENOENT, NULL; + + 
nr_linfo -= nr_skip; + for (i = 0; i < nr_linfo; i++) { + if (insn_off < linfo->insn_off) + break; + + raw_linfo += rec_size; + linfo = raw_linfo; + } + + return raw_linfo - rec_size; +} diff --git a/third_party/libbpf/src/bpf_tracing.h b/third_party/libbpf/src/bpf_tracing.h new file mode 100644 index 0000000..be076a4 --- /dev/null +++ b/third_party/libbpf/src/bpf_tracing.h @@ -0,0 +1,924 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __BPF_TRACING_H__ +#define __BPF_TRACING_H__ + +#include + +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s390) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_riscv) + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arc) + #define bpf_target_arc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_loongarch) + #define bpf_target_loongarch + #define bpf_target_defined +#else + +/* Fall back to what the compiler says */ +#if defined(__x86_64__) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__s390__) + #define bpf_target_s390 + #define bpf_target_defined +#elif defined(__arm__) + #define bpf_target_arm + #define bpf_target_defined +#elif defined(__aarch64__) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__mips__) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__powerpc__) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__sparc__) + #define bpf_target_sparc + #define bpf_target_defined +#elif defined(__riscv) && __riscv_xlen == 64 + #define bpf_target_riscv + #define bpf_target_defined +#elif defined(__arc__) + #define bpf_target_arc + #define bpf_target_defined +#elif defined(__loongarch__) + #define bpf_target_loongarch + #define bpf_target_defined +#endif /* no compiler target */ + +#endif + +#ifndef __BPF_TARGET_MISSING +#define __BPF_TARGET_MISSING "GCC error \"Must specify a BPF target arch via __TARGET_ARCH_xxx\"" +#endif + +#if defined(bpf_target_x86) + +/* + * https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI + */ + +#if defined(__KERNEL__) || defined(__VMLINUX_H__) + +#define __PT_PARM1_REG di +#define __PT_PARM2_REG si +#define __PT_PARM3_REG dx +#define __PT_PARM4_REG cx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +/* + * Syscall uses r10 for PARM4. See arch/x86/entry/entry_64.S:entry_SYSCALL_64 + * comments in Linux sources. And refer to syscall(2) manpage. 
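+ *
+ * Editorial note (not upstream text): as a consequence, the
+ * PT_REGS_PARM4_SYSCALL() accessor built from these macros reads r10,
+ * while plain PT_REGS_PARM4() reads cx for ordinary function calls.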
+ */ +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG sp +#define __PT_FP_REG bp +#define __PT_RC_REG ax +#define __PT_SP_REG sp +#define __PT_IP_REG ip + +#else + +#ifdef __i386__ + +/* i386 kernel is built with -mregparm=3 */ +#define __PT_PARM1_REG eax +#define __PT_PARM2_REG edx +#define __PT_PARM3_REG ecx +/* i386 syscall ABI is very different, refer to syscall(2) manpage */ +#define __PT_PARM1_SYSCALL_REG ebx +#define __PT_PARM2_SYSCALL_REG ecx +#define __PT_PARM3_SYSCALL_REG edx +#define __PT_PARM4_SYSCALL_REG esi +#define __PT_PARM5_SYSCALL_REG edi +#define __PT_PARM6_SYSCALL_REG ebp + +#define __PT_RET_REG esp +#define __PT_FP_REG ebp +#define __PT_RC_REG eax +#define __PT_SP_REG esp +#define __PT_IP_REG eip + +#else /* __i386__ */ + +#define __PT_PARM1_REG rdi +#define __PT_PARM2_REG rsi +#define __PT_PARM3_REG rdx +#define __PT_PARM4_REG rcx +#define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG rsp +#define __PT_FP_REG rbp +#define __PT_RC_REG rax +#define __PT_SP_REG rsp +#define __PT_IP_REG rip + +#endif /* __i386__ */ + +#endif /* __KERNEL__ || __VMLINUX_H__ */ + +#elif defined(bpf_target_s390) + +/* + * https://github.com/IBM/s390x-abi/releases/download/v1.6/lzsabi_s390x.pdf + */ + +struct pt_regs___s390 { + unsigned long orig_gpr2; +}; + +/* s390 provides user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const user_pt_regs *)(x)) +#define __PT_PARM1_REG gprs[2] +#define __PT_PARM2_REG gprs[3] +#define __PT_PARM3_REG gprs[4] +#define __PT_PARM4_REG gprs[5] +#define __PT_PARM5_REG gprs[6] + +#define __PT_PARM1_SYSCALL_REG orig_gpr2 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG gprs[7] +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG gprs[14] +#define __PT_FP_REG gprs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG gprs[2] +#define __PT_SP_REG gprs[15] +#define __PT_IP_REG psw.addr + +#elif defined(bpf_target_arm) + +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst#machine-registers + */ + +#define __PT_PARM1_REG uregs[0] +#define __PT_PARM2_REG uregs[1] +#define __PT_PARM3_REG uregs[2] +#define __PT_PARM4_REG uregs[3] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + +#define __PT_RET_REG uregs[14] +#define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG uregs[0] +#define __PT_SP_REG uregs[13] +#define __PT_IP_REG uregs[12] + +#elif 
defined(bpf_target_arm64) + +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#machine-registers + */ + +struct pt_regs___arm64 { + unsigned long orig_x0; +}; + +/* arm64 provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[0] +#define __PT_PARM2_REG regs[1] +#define __PT_PARM3_REG regs[2] +#define __PT_PARM4_REG regs[3] +#define __PT_PARM5_REG regs[4] +#define __PT_PARM6_REG regs[5] +#define __PT_PARM7_REG regs[6] +#define __PT_PARM8_REG regs[7] + +#define __PT_PARM1_SYSCALL_REG orig_x0 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG regs[30] +#define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[0] +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#elif defined(bpf_target_mips) + +/* + * N64 ABI is assumed right now. + * https://en.wikipedia.org/wiki/MIPS_architecture#Calling_conventions + */ + +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG /* only N32/N64 */ +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG /* only N32/N64 */ + +#define __PT_RET_REG regs[31] +#define __PT_FP_REG regs[30] /* Works only with CONFIG_FRAME_POINTER */ +#define __PT_RC_REG regs[2] +#define __PT_SP_REG regs[29] +#define __PT_IP_REG cp0_epc + +#elif defined(bpf_target_powerpc) + +/* + * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf (page 3-14, + * section "Function Calling Sequence") + */ + +#define __PT_PARM1_REG gpr[3] +#define __PT_PARM2_REG gpr[4] +#define __PT_PARM3_REG gpr[5] +#define __PT_PARM4_REG gpr[6] +#define __PT_PARM5_REG gpr[7] +#define __PT_PARM6_REG gpr[8] +#define __PT_PARM7_REG gpr[9] +#define __PT_PARM8_REG gpr[10] + +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. 
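+ * Without the syscall wrapper, the syscall entry point receives the syscall
+ * arguments directly, so the pt_regs seen by a kprobe already holds them and
+ * PT_REGS_SYSCALL_REGS(ctx) below is simply ctx (there is no wrapper pt_regs
+ * pointer to dereference, unlike on x86-64 or arm64).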
*/ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG orig_gpr3 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#if !defined(__arch64__) +#define __PT_PARM7_SYSCALL_REG __PT_PARM7_REG /* only powerpc (not powerpc64) */ +#endif + +#define __PT_RET_REG regs[31] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG gpr[3] +#define __PT_SP_REG sp +#define __PT_IP_REG nip + +#elif defined(bpf_target_sparc) + +/* + * https://en.wikipedia.org/wiki/Calling_convention#SPARC + */ + +#define __PT_PARM1_REG u_regs[UREG_I0] +#define __PT_PARM2_REG u_regs[UREG_I1] +#define __PT_PARM3_REG u_regs[UREG_I2] +#define __PT_PARM4_REG u_regs[UREG_I3] +#define __PT_PARM5_REG u_regs[UREG_I4] +#define __PT_PARM6_REG u_regs[UREG_I5] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG u_regs[UREG_I7] +#define __PT_FP_REG __unsupported__ +#define __PT_RC_REG u_regs[UREG_I0] +#define __PT_SP_REG u_regs[UREG_FP] +/* Should this also be a bpf_target check for the sparc case? */ +#if defined(__arch64__) +#define __PT_IP_REG tpc +#else +#define __PT_IP_REG pc +#endif + +#elif defined(bpf_target_riscv) + +/* + * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions + */ + +/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG a0 +#define __PT_PARM2_REG a1 +#define __PT_PARM3_REG a2 +#define __PT_PARM4_REG a3 +#define __PT_PARM5_REG a4 +#define __PT_PARM6_REG a5 +#define __PT_PARM7_REG a6 +#define __PT_PARM8_REG a7 + +/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG ra +#define __PT_FP_REG s0 +#define __PT_RC_REG a0 +#define __PT_SP_REG sp +#define __PT_IP_REG pc + +#elif defined(bpf_target_arc) + +/* + * Section "Function Calling Sequence" (page 24): + * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf + */ + +/* arc provides struct user_regs_struct instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) +#define __PT_PARM1_REG scratch.r0 +#define __PT_PARM2_REG scratch.r1 +#define __PT_PARM3_REG scratch.r2 +#define __PT_PARM4_REG scratch.r3 +#define __PT_PARM5_REG scratch.r4 +#define __PT_PARM6_REG scratch.r5 +#define __PT_PARM7_REG scratch.r6 +#define __PT_PARM8_REG scratch.r7 + +/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. 
*/ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG scratch.blink +#define __PT_FP_REG scratch.fp +#define __PT_RC_REG scratch.r0 +#define __PT_SP_REG scratch.sp +#define __PT_IP_REG scratch.ret + +#elif defined(bpf_target_loongarch) + +/* + * https://docs.kernel.org/loongarch/introduction.html + * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html + */ + +/* loongarch provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG regs[1] +#define __PT_FP_REG regs[22] +#define __PT_RC_REG regs[4] +#define __PT_SP_REG regs[3] +#define __PT_IP_REG csr_era + +#endif + +#if defined(bpf_target_defined) + +struct pt_regs; + +/* allow some architectures to override `struct pt_regs` */ +#ifndef __PT_REGS_CAST +#define __PT_REGS_CAST(x) (x) +#endif + +/* + * Different architectures support different number of arguments passed + * through registers. i386 supports just 3, some arches support up to 8. + */ +#ifndef __PT_PARM4_REG +#define __PT_PARM4_REG __unsupported__ +#endif +#ifndef __PT_PARM5_REG +#define __PT_PARM5_REG __unsupported__ +#endif +#ifndef __PT_PARM6_REG +#define __PT_PARM6_REG __unsupported__ +#endif +#ifndef __PT_PARM7_REG +#define __PT_PARM7_REG __unsupported__ +#endif +#ifndef __PT_PARM8_REG +#define __PT_PARM8_REG __unsupported__ +#endif +/* + * Similarly, syscall-specific conventions might differ between function call + * conventions within each architecutre. All supported architectures pass + * either 6 or 7 syscall arguments in registers. + * + * See syscall(2) manpage for succinct table with information on each arch. 
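+ *
+ * A rough usage sketch of the accessors defined below (illustrative only):
+ * in a plain kprobe program, function-call arguments are read with
+ * PT_REGS_PARMn(), or PT_REGS_PARMn_CORE() when relying on BPF CO-RE, e.g.
+ * for do_unlinkat(int dfd, struct filename *name):
+ *
+ *    SEC("kprobe/do_unlinkat")
+ *    int trace_unlink(struct pt_regs *ctx)
+ *    {
+ *        int dfd = (int)PT_REGS_PARM1(ctx);
+ *        struct filename *name = (struct filename *)PT_REGS_PARM2_CORE(ctx);
+ *        return 0;
+ *    }
+ *
+ * Syscall arguments, in contrast, are read via the PT_REGS_PARMn_SYSCALL()
+ * and PT_REGS_PARMn_CORE_SYSCALL() variants applied to the pt_regs obtained
+ * from PT_REGS_SYSCALL_REGS(ctx).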
+ */ +#ifndef __PT_PARM7_SYSCALL_REG +#define __PT_PARM7_SYSCALL_REG __unsupported__ +#endif + +#define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) +#define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) +#define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) +#define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG) +#define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_PARM7(x) (__PT_REGS_CAST(x)->__PT_PARM7_REG) +#define PT_REGS_PARM8(x) (__PT_REGS_CAST(x)->__PT_PARM8_REG) +#define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) +#define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) +#define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) +#define PT_REGS_SP(x) (__PT_REGS_CAST(x)->__PT_SP_REG) +#define PT_REGS_IP(x) (__PT_REGS_CAST(x)->__PT_IP_REG) + +#define PT_REGS_PARM1_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_REG) +#define PT_REGS_PARM2_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_REG) +#define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG) +#define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG) +#define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG) +#define PT_REGS_PARM6_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_REG) +#define PT_REGS_PARM7_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_REG) +#define PT_REGS_PARM8_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM8_REG) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG) +#define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG) +#define PT_REGS_SP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_SP_REG) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_IP_REG) + +#if defined(bpf_target_powerpc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#elif defined(bpf_target_sparc) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP + +#else + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) \ + ({ bpf_probe_read_kernel(&(ip), sizeof(ip), (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) + +#endif + +#ifndef PT_REGS_PARM1_SYSCALL +#define PT_REGS_PARM1_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM2_SYSCALL +#define PT_REGS_PARM2_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM2_SYSCALL_REG) +#define PT_REGS_PARM2_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM3_SYSCALL +#define PT_REGS_PARM3_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM3_SYSCALL_REG) +#define PT_REGS_PARM3_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM4_SYSCALL +#define PT_REGS_PARM4_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM4_SYSCALL_REG) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM5_SYSCALL +#define PT_REGS_PARM5_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM5_SYSCALL_REG) +#define PT_REGS_PARM5_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_SYSCALL_REG) 
+#endif +#ifndef PT_REGS_PARM6_SYSCALL +#define PT_REGS_PARM6_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM6_SYSCALL_REG) +#define PT_REGS_PARM6_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM7_SYSCALL +#define PT_REGS_PARM7_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM7_SYSCALL_REG) +#define PT_REGS_PARM7_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_SYSCALL_REG) +#endif + +#else /* defined(bpf_target_defined) */ + +#define PT_REGS_PARM1(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_SP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_IP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) + +#endif /* 
defined(bpf_target_defined) */ + +/* + * When invoked from a syscall handler kprobe, returns a pointer to a + * struct pt_regs containing syscall arguments and suitable for passing to + * PT_REGS_PARMn_SYSCALL() and PT_REGS_PARMn_CORE_SYSCALL(). + */ +#ifndef PT_REGS_SYSCALL_REGS +/* By default, assume that the arch selects ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ((struct pt_regs *)PT_REGS_PARM1(ctx)) +#endif + +#ifndef ___bpf_concat +#define ___bpf_concat(a, b) a ## b +#endif +#ifndef ___bpf_apply +#define ___bpf_apply(fn, n) ___bpf_concat(fn, n) +#endif +#ifndef ___bpf_nth +#define ___bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _a, _b, _c, N, ...) N +#endif +#ifndef ___bpf_narg +#define ___bpf_narg(...) ___bpf_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define ___bpf_ctx_cast0() ctx +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) + +/* + * BPF_PROG is a convenience wrapper for generic tp_btf/fentry/fexit and + * similar kinds of BPF programs, that accept input arguments as a single + * pointer to untyped u64 array, where each u64 can actually be a typed + * pointer or integer of different size. Instead of requring user to write + * manual casts and work with array elements by index, BPF_PROG macro + * allows user to declare a list of named and typed input arguments in the + * same syntax as for normal C function. All the casting is hidden and + * performed transparently, while user code can just assume working with + * function arguments of specified type and name. + * + * Original raw context argument is preserved as well as 'ctx' argument. + * This is useful when using BPF helpers that expect original context + * as one of the parameters (e.g., for bpf_perf_event_output()). + */ +#define BPF_PROG(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_ctx_cast(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx, ##args) + +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) 
\ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))))) + +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) + +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) 
, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) + +/* + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) + +struct pt_regs; + +#define ___bpf_kprobe_args0() ctx +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (void *)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (void *)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (void *)PT_REGS_PARM8(ctx) +#define ___bpf_kprobe_args(args...) 
___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KPROBE serves the same purpose for kprobes as BPF_PROG for + * tp_btf/fentry/fexit BPF programs. It hides the underlying platform-specific + * low-level way of getting kprobe input arguments from struct pt_regs, and + * provides a familiar typed and named function arguments syntax and + * semantics of accessing kprobe input paremeters. + * + * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + */ +#define BPF_KPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define ___bpf_kretprobe_args0() ctx +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) + +/* + * BPF_KRETPROBE is similar to BPF_KPROBE, except, it only provides optional + * return value (in addition to `struct pt_regs *ctx`), but no input + * arguments, because they will be clobbered by the time probed function + * returns. + */ +#define BPF_KRETPROBE(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_kretprobe_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) + +/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ +#define ___bpf_syscall_args0() ctx +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (void *)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (void *)PT_REGS_PARM7_SYSCALL(regs) +#define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) + +/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ +#define ___bpf_syswrap_args0() ctx +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) 
___bpf_syswrap_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (void *)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (void *)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) + +/* + * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for + * tracing syscall functions, like __x64_sys_close. It hides the underlying + * platform-specific low-level way of getting syscall input arguments from + * struct pt_regs, and provides a familiar typed and named function arguments + * syntax and semantics of accessing syscall input parameters. + * + * Original struct pt_regs * context is preserved as 'ctx' argument. This might + * be necessary when using BPF helpers like bpf_perf_event_output(). + * + * At the moment BPF_KSYSCALL does not transparently handle all the calling + * convention quirks for the following syscalls: + * + * - mmap(): __ARCH_WANT_SYS_OLD_MMAP. + * - clone(): CONFIG_CLONE_BACKWARDS, CONFIG_CLONE_BACKWARDS2 and + * CONFIG_CLONE_BACKWARDS3. + * - socket-related syscalls: __ARCH_WANT_SYS_SOCKETCALL. + * - compat syscalls. + * + * This may or may not change in the future. User needs to take extra measures + * to handle such quirks explicitly, if necessary. + * + * This macro relies on BPF CO-RE support and virtual __kconfig externs. + */ +#define BPF_KSYSCALL(name, args...) \ +name(struct pt_regs *ctx); \ +extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER \ + ? (struct pt_regs *)PT_REGS_PARM1(ctx) \ + : ctx; \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + if (LINUX_HAS_SYSCALL_WRAPPER) \ + return ____##name(___bpf_syswrap_args(args)); \ + else \ + return ____##name(___bpf_syscall_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#define BPF_KPROBE_SYSCALL BPF_KSYSCALL + +/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE, + * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe") + * use cases. + */ +#define BPF_UPROBE(name, args...) BPF_KPROBE(name, ##args) +#define BPF_URETPROBE(name, args...) 
BPF_KRETPROBE(name, ##args) + +#endif diff --git a/third_party/libbpf/src/btf.c b/third_party/libbpf/src/btf.c new file mode 100644 index 0000000..8484b56 --- /dev/null +++ b/third_party/libbpf/src/btf.c @@ -0,0 +1,5025 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "strset.h" + +#define BTF_MAX_NR_TYPES 0x7fffffffU +#define BTF_MAX_STR_OFFSET 0x7fffffffU + +static struct btf_type btf_void; + +struct btf { + /* raw BTF data in native endianness */ + void *raw_data; + /* raw BTF data in non-native endianness */ + void *raw_data_swapped; + __u32 raw_size; + /* whether target endianness differs from the native one */ + bool swapped_endian; + + /* + * When BTF is loaded from an ELF or raw memory it is stored + * in a contiguous memory block. The hdr, type_data, and, strs_data + * point inside that memory region to their respective parts of BTF + * representation: + * + * +--------------------------------+ + * | Header | Types | Strings | + * +--------------------------------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data-+ | + * strs_data------------+ + * + * If BTF data is later modified, e.g., due to types added or + * removed, BTF deduplication performed, etc, this contiguous + * representation is broken up into three independently allocated + * memory regions to be able to modify them independently. + * raw_data is nulled out at that point, but can be later allocated + * and cached again if user calls btf__raw_data(), at which point + * raw_data will contain a contiguous copy of header, types, and + * strings: + * + * +----------+ +---------+ +-----------+ + * | Header | | Types | | Strings | + * +----------+ +---------+ +-----------+ + * ^ ^ ^ + * | | | + * hdr | | + * types_data----+ | + * strset__data(strs_set)-----+ + * + * +----------+---------+-----------+ + * | Header | Types | Strings | + * raw_data----->+----------+---------+-----------+ + */ + struct btf_header *hdr; + + void *types_data; + size_t types_data_cap; /* used size stored in hdr->type_len */ + + /* type ID to `struct btf_type *` lookup index + * type_offs[0] corresponds to the first non-VOID type: + * - for base BTF it's type [1]; + * - for split BTF it's the first non-base BTF type. + */ + __u32 *type_offs; + size_t type_offs_cap; + /* number of types in this BTF instance: + * - doesn't include special [0] void type; + * - for split BTF counts number of types added on top of base BTF. + */ + __u32 nr_types; + /* if not NULL, points to the base BTF on top of which the current + * split BTF is based + */ + struct btf *base_btf; + /* BTF type ID of the first type in this BTF instance: + * - for base BTF it's equal to 1; + * - for split BTF it's equal to biggest type ID of base BTF plus 1. + */ + int start_id; + /* logical string offset of this BTF instance: + * - for base BTF it's equal to 0; + * - for split BTF it's equal to total size of base BTF's string section size. 
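+ *
+ * String offsets used by this BTF instance are "logical": an offset
+ * smaller than start_str_off refers to a string owned by the base BTF,
+ * while an offset X >= start_str_off maps to position X - start_str_off
+ * inside this instance's own string section.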
+ */ + int start_str_off; + + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ + void *strs_data; + /* a set of unique strings */ + struct strset *strs_set; + /* whether strings are already deduplicated */ + bool strs_deduped; + + /* BTF object FD, if loaded into kernel */ + int fd; + + /* Pointer size (in bytes) for a target architecture of this BTF */ + int ptr_sz; +}; + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +/* Ensure given dynamically allocated memory region pointed to by *data* with + * capacity of *cap_cnt* elements each taking *elem_sz* bytes has enough + * memory to accommodate *add_cnt* new elements, assuming *cur_cnt* elements + * are already used. At most *max_cnt* elements can be ever allocated. + * If necessary, memory is reallocated and all existing data is copied over, + * new pointer to the memory region is stored at *data, new memory region + * capacity (in number of elements) is stored in *cap. + * On success, memory pointer to the beginning of unused memory is returned. + * On error, NULL is returned. + */ +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) +{ + size_t new_cnt; + void *new_data; + + if (cur_cnt + add_cnt <= *cap_cnt) + return *data + cur_cnt * elem_sz; + + /* requested more than the set limit */ + if (cur_cnt + add_cnt > max_cnt) + return NULL; + + new_cnt = *cap_cnt; + new_cnt += new_cnt / 4; /* expand by 25% */ + if (new_cnt < 16) /* but at least 16 elements */ + new_cnt = 16; + if (new_cnt > max_cnt) /* but not exceeding a set limit */ + new_cnt = max_cnt; + if (new_cnt < cur_cnt + add_cnt) /* also ensure we have enough memory */ + new_cnt = cur_cnt + add_cnt; + + new_data = libbpf_reallocarray(*data, new_cnt, elem_sz); + if (!new_data) + return NULL; + + /* zero out newly allocated portion of memory */ + memset(new_data + (*cap_cnt) * elem_sz, 0, (new_cnt - *cap_cnt) * elem_sz); + + *data = new_data; + *cap_cnt = new_cnt; + return new_data + cur_cnt * elem_sz; +} + +/* Ensure given dynamically allocated memory region has enough allocated space + * to accommodate *need_cnt* elements of size *elem_sz* bytes each + */ +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +{ + void *p; + + if (need_cnt <= *cap_cnt) + return 0; + + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + if (!p) + return -ENOMEM; + + return 0; +} + +static void *btf_add_type_offs_mem(struct btf *btf, size_t add_cnt) +{ + return libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, add_cnt); +} + +static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) +{ + __u32 *p; + + p = btf_add_type_offs_mem(btf, 1); + if (!p) + return -ENOMEM; + + *p = type_off; + return 0; +} + +static void btf_bswap_hdr(struct btf_header *h) +{ + h->magic = bswap_16(h->magic); + h->hdr_len = bswap_32(h->hdr_len); + h->type_off = bswap_32(h->type_off); + h->type_len = bswap_32(h->type_len); + h->str_off = bswap_32(h->str_off); + h->str_len = bswap_32(h->str_len); +} + +static int btf_parse_hdr(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + __u32 meta_left; + + if (btf->raw_size < sizeof(struct btf_header)) { + pr_debug("BTF header not found\n"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { 
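+ /* magic only matches after byte-swapping: this BTF blob was
+ * produced for the opposite endianness, so every multi-byte
+ * field has to be swapped while parsing
+ */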
+ btf->swapped_endian = true; + if (bswap_32(hdr->hdr_len) != sizeof(struct btf_header)) { + pr_warn("Can't load BTF with non-native endianness due to unsupported header length %u\n", + bswap_32(hdr->hdr_len)); + return -ENOTSUP; + } + btf_bswap_hdr(hdr); + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF magic: %x\n", hdr->magic); + return -EINVAL; + } + + if (btf->raw_size < hdr->hdr_len) { + pr_debug("BTF header len %u larger than data size %u\n", + hdr->hdr_len, btf->raw_size); + return -EINVAL; + } + + meta_left = btf->raw_size - hdr->hdr_len; + if (meta_left < (long long)hdr->str_off + hdr->str_len) { + pr_debug("Invalid BTF total size: %u\n", btf->raw_size); + return -EINVAL; + } + + if ((long long)hdr->type_off + hdr->type_len > hdr->str_off) { + pr_debug("Invalid BTF data sections layout: type data at %u + %u, strings data at %u + %u\n", + hdr->type_off, hdr->type_len, hdr->str_off, hdr->str_len); + return -EINVAL; + } + + if (hdr->type_off % 4) { + pr_debug("BTF type section is not aligned to 4 bytes\n"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_str_sec(struct btf *btf) +{ + const struct btf_header *hdr = btf->hdr; + const char *start = btf->strs_data; + const char *end = start + btf->hdr->str_len; + + if (btf->base_btf && hdr->str_len == 0) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || end[-1]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { + pr_debug("Invalid BTF string section\n"); + return -EINVAL; + } + return 0; +} + +static int btf_type_size(const struct btf_type *t) +{ + const int base_size = sizeof(struct btf_type); + __u16 vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return base_size; + case BTF_KIND_INT: + return base_size + sizeof(__u32); + case BTF_KIND_ENUM: + return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); + case BTF_KIND_ARRAY: + return base_size + sizeof(struct btf_array); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return base_size + vlen * sizeof(struct btf_member); + case BTF_KIND_FUNC_PROTO: + return base_size + vlen * sizeof(struct btf_param); + case BTF_KIND_VAR: + return base_size + sizeof(struct btf_var); + case BTF_KIND_DATASEC: + return base_size + vlen * sizeof(struct btf_var_secinfo); + case BTF_KIND_DECL_TAG: + return base_size + sizeof(struct btf_decl_tag); + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static void btf_bswap_type_base(struct btf_type *t) +{ + t->name_off = bswap_32(t->name_off); + t->info = bswap_32(t->info); + t->type = bswap_32(t->type); +} + +static int btf_bswap_type_rest(struct btf_type *t) +{ + struct btf_var_secinfo *v; + struct btf_enum64 *e64; + struct btf_member *m; + struct btf_array *a; + struct btf_param *p; + struct btf_enum *e; + __u16 vlen = btf_vlen(t); + int i; + + switch (btf_kind(t)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + return 0; + case BTF_KIND_INT: + *(__u32 *)(t + 1) = bswap_32(*(__u32 *)(t + 1)); + return 0; + case BTF_KIND_ENUM: + for (i = 0, e = btf_enum(t); i < vlen; 
i++, e++) { + e->name_off = bswap_32(e->name_off); + e->val = bswap_32(e->val); + } + return 0; + case BTF_KIND_ENUM64: + for (i = 0, e64 = btf_enum64(t); i < vlen; i++, e64++) { + e64->name_off = bswap_32(e64->name_off); + e64->val_lo32 = bswap_32(e64->val_lo32); + e64->val_hi32 = bswap_32(e64->val_hi32); + } + return 0; + case BTF_KIND_ARRAY: + a = btf_array(t); + a->type = bswap_32(a->type); + a->index_type = bswap_32(a->index_type); + a->nelems = bswap_32(a->nelems); + return 0; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + for (i = 0, m = btf_members(t); i < vlen; i++, m++) { + m->name_off = bswap_32(m->name_off); + m->type = bswap_32(m->type); + m->offset = bswap_32(m->offset); + } + return 0; + case BTF_KIND_FUNC_PROTO: + for (i = 0, p = btf_params(t); i < vlen; i++, p++) { + p->name_off = bswap_32(p->name_off); + p->type = bswap_32(p->type); + } + return 0; + case BTF_KIND_VAR: + btf_var(t)->linkage = bswap_32(btf_var(t)->linkage); + return 0; + case BTF_KIND_DATASEC: + for (i = 0, v = btf_var_secinfos(t); i < vlen; i++, v++) { + v->type = bswap_32(v->type); + v->offset = bswap_32(v->offset); + v->size = bswap_32(v->size); + } + return 0; + case BTF_KIND_DECL_TAG: + btf_decl_tag(t)->component_idx = bswap_32(btf_decl_tag(t)->component_idx); + return 0; + default: + pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t)); + return -EINVAL; + } +} + +static int btf_parse_type_sec(struct btf *btf) +{ + struct btf_header *hdr = btf->hdr; + void *next_type = btf->types_data; + void *end_type = next_type + hdr->type_len; + int err, type_size; + + while (next_type + sizeof(struct btf_type) <= end_type) { + if (btf->swapped_endian) + btf_bswap_type_base(next_type); + + type_size = btf_type_size(next_type); + if (type_size < 0) + return type_size; + if (next_type + type_size > end_type) { + pr_warn("BTF type [%d] is malformed\n", btf->start_id + btf->nr_types); + return -EINVAL; + } + + if (btf->swapped_endian && btf_bswap_type_rest(next_type)) + return -EINVAL; + + err = btf_add_type_idx_entry(btf, next_type - btf->types_data); + if (err) + return err; + + next_type += type_size; + btf->nr_types++; + } + + if (next_type != end_type) { + pr_warn("BTF types data is malformed\n"); + return -EINVAL; + } + + return 0; +} + +__u32 btf__type_cnt(const struct btf *btf) +{ + return btf->start_id + btf->nr_types; +} + +const struct btf *btf__base_btf(const struct btf *btf) +{ + return btf->base_btf; +} + +/* internal helper returning non-const pointer to a type */ +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id == 0) + return &btf_void; + if (type_id < btf->start_id) + return btf_type_by_id(btf->base_btf, type_id); + return btf->types_data + btf->type_offs[type_id - btf->start_id]; +} + +const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) +{ + if (type_id >= btf->start_id + btf->nr_types) + return errno = EINVAL, NULL; + return btf_type_by_id((struct btf *)btf, type_id); +} + +static int determine_ptr_size(const struct btf *btf) +{ + static const char * const long_aliases[] = { + "long", + "long int", + "int long", + "unsigned long", + "long unsigned", + "unsigned long int", + "unsigned int long", + "long unsigned int", + "long int unsigned", + "int unsigned long", + "int long unsigned", + }; + const struct btf_type *t; + const char *name; + int i, j, n; + + if (btf->base_btf && btf->base_btf->ptr_sz > 0) + return btf->base_btf->ptr_sz; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + if 
(!btf_is_int(t)) + continue; + + if (t->size != 4 && t->size != 8) + continue; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + continue; + + for (j = 0; j < ARRAY_SIZE(long_aliases); j++) { + if (strcmp(name, long_aliases[j]) == 0) + return t->size; + } + } + + return -1; +} + +static size_t btf_ptr_sz(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + return btf->ptr_sz < 0 ? sizeof(void *) : btf->ptr_sz; +} + +/* Return pointer size this BTF instance assumes. The size is heuristically + * determined by looking for 'long' or 'unsigned long' integer type and + * recording its size in bytes. If BTF type information doesn't have any such + * type, this function returns 0. In the latter case, native architecture's + * pointer size is assumed, so will be either 4 or 8, depending on + * architecture that libbpf was compiled for. It's possible to override + * guessed value by using btf__set_pointer_size() API. + */ +size_t btf__pointer_size(const struct btf *btf) +{ + if (!btf->ptr_sz) + ((struct btf *)btf)->ptr_sz = determine_ptr_size(btf); + + if (btf->ptr_sz < 0) + /* not enough BTF type info to guess */ + return 0; + + return btf->ptr_sz; +} + +/* Override or set pointer size in bytes. Only values of 4 and 8 are + * supported. + */ +int btf__set_pointer_size(struct btf *btf, size_t ptr_sz) +{ + if (ptr_sz != 4 && ptr_sz != 8) + return libbpf_err(-EINVAL); + btf->ptr_sz = ptr_sz; + return 0; +} + +static bool is_host_big_endian(void) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return false; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return true; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif +} + +enum btf_endianness btf__endianness(const struct btf *btf) +{ + if (is_host_big_endian()) + return btf->swapped_endian ? BTF_LITTLE_ENDIAN : BTF_BIG_ENDIAN; + else + return btf->swapped_endian ? 
BTF_BIG_ENDIAN : BTF_LITTLE_ENDIAN; +} + +int btf__set_endianness(struct btf *btf, enum btf_endianness endian) +{ + if (endian != BTF_LITTLE_ENDIAN && endian != BTF_BIG_ENDIAN) + return libbpf_err(-EINVAL); + + btf->swapped_endian = is_host_big_endian() != (endian == BTF_BIG_ENDIAN); + if (!btf->swapped_endian) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } + return 0; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + return t == &btf_void || btf_is_fwd(t); +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +#define MAX_RESOLVE_DEPTH 32 + +__s64 btf__resolve_size(const struct btf *btf, __u32 type_id) +{ + const struct btf_array *array; + const struct btf_type *t; + __u32 nelems = 1; + __s64 size = -1; + int i; + + t = btf__type_by_id(btf, type_id); + for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t); i++) { + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_DATASEC: + case BTF_KIND_FLOAT: + size = t->size; + goto done; + case BTF_KIND_PTR: + size = btf_ptr_sz(btf); + goto done; + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + type_id = t->type; + break; + case BTF_KIND_ARRAY: + array = btf_array(t); + if (nelems && array->nelems > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + nelems *= array->nelems; + type_id = array->type; + break; + default: + return libbpf_err(-EINVAL); + } + + t = btf__type_by_id(btf, type_id); + } + +done: + if (size < 0) + return libbpf_err(-EINVAL); + if (nelems && size > UINT32_MAX / nelems) + return libbpf_err(-E2BIG); + + return nelems * size; +} + +int btf__align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + __u16 kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FLOAT: + return min(btf_ptr_sz(btf), (size_t)t->size); + case BTF_KIND_PTR: + return btf_ptr_sz(btf); + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf__align_of(btf, t->type); + case BTF_KIND_ARRAY: + return btf__align_of(btf, btf_array(t)->type); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, max_align = 1, align; + + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + if (align <= 0) + return libbpf_err(align); + max_align = max(max_align, align); + + /* if field offset isn't aligned according to field + * type's alignment, then struct must be packed + */ + if (btf_member_bitfield_size(t, i) == 0 && + (m->offset % (8 * align)) != 0) + return 1; + } + + /* if struct/union size isn't a multiple of its alignment, + * then struct must be packed + */ + if ((t->size % max_align) != 0) + return 1; + + return max_align; + } + default: + pr_warn("unsupported BTF_KIND:%u\n", btf_kind(t)); + return errno = EINVAL, 0; + } +} + +int btf__resolve_type(const struct btf *btf, __u32 type_id) +{ + const struct btf_type *t; + int depth = 0; + + t = btf__type_by_id(btf, type_id); + while (depth < MAX_RESOLVE_DEPTH && + !btf_type_is_void_or_null(t) && + (btf_is_mod(t) || btf_is_typedef(t) || btf_is_var(t))) { + type_id = t->type; + t = btf__type_by_id(btf, 
type_id); + depth++; + } + + if (depth == MAX_RESOLVE_DEPTH || btf_type_is_void_or_null(t)) + return libbpf_err(-EINVAL); + + return type_id; +} + +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (!strcmp(type_name, "void")) + return 0; + + for (i = 1; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name = btf__name_by_offset(btf, t->name_off); + + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, + const char *type_name, __u32 kind) +{ + __u32 i, nr_types = btf__type_cnt(btf); + + if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) + return 0; + + for (i = start_id; i < nr_types; i++) { + const struct btf_type *t = btf__type_by_id(btf, i); + const char *name; + + if (btf_kind(t) != kind) + continue; + name = btf__name_by_offset(btf, t->name_off); + if (name && !strcmp(type_name, name)) + return i; + } + + return libbpf_err(-ENOENT); +} + +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, btf->start_id, type_name, kind); +} + +__s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, + __u32 kind) +{ + return btf_find_by_name_kind(btf, 1, type_name, kind); +} + +static bool btf_is_modifiable(const struct btf *btf) +{ + return (void *)btf->hdr != btf->raw_data; +} + +void btf__free(struct btf *btf) +{ + if (IS_ERR_OR_NULL(btf)) + return; + + if (btf->fd >= 0) + close(btf->fd); + + if (btf_is_modifiable(btf)) { + /* if BTF was modified after loading, it will have a split + * in-memory representation for header, types, and strings + * sections, so we need to free all of them individually. It + * might still have a cached contiguous raw data present, + * which will be unconditionally freed below. + */ + free(btf->hdr); + free(btf->types_data); + strset__free(btf->strs_set); + } + free(btf->raw_data); + free(btf->raw_data_swapped); + free(btf->type_offs); + free(btf); +} + +static struct btf *btf_new_empty(struct btf *base_btf) +{ + struct btf *btf; + + btf = calloc(1, sizeof(*btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + btf->ptr_sz = sizeof(void *); + btf->swapped_endian = false; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + /* +1 for empty string at offset 0 */ + btf->raw_size = sizeof(struct btf_header) + (base_btf ? 0 : 1); + btf->raw_data = calloc(1, btf->raw_size); + if (!btf->raw_data) { + free(btf); + return ERR_PTR(-ENOMEM); + } + + btf->hdr = btf->raw_data; + btf->hdr->hdr_len = sizeof(struct btf_header); + btf->hdr->magic = BTF_MAGIC; + btf->hdr->version = BTF_VERSION; + + btf->types_data = btf->raw_data + btf->hdr->hdr_len; + btf->strs_data = btf->raw_data + btf->hdr->hdr_len; + btf->hdr->str_len = base_btf ? 
0 : 1; /* empty string at offset 0 */ + + return btf; +} + +struct btf *btf__new_empty(void) +{ + return libbpf_ptr(btf_new_empty(NULL)); +} + +struct btf *btf__new_empty_split(struct btf *base_btf) +{ + return libbpf_ptr(btf_new_empty(base_btf)); +} + +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +{ + struct btf *btf; + int err; + + btf = calloc(1, sizeof(struct btf)); + if (!btf) + return ERR_PTR(-ENOMEM); + + btf->nr_types = 0; + btf->start_id = 1; + btf->start_str_off = 0; + btf->fd = -1; + + if (base_btf) { + btf->base_btf = base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; + } + + btf->raw_data = malloc(size); + if (!btf->raw_data) { + err = -ENOMEM; + goto done; + } + memcpy(btf->raw_data, data, size); + btf->raw_size = size; + + btf->hdr = btf->raw_data; + err = btf_parse_hdr(btf); + if (err) + goto done; + + btf->strs_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->str_off; + btf->types_data = btf->raw_data + btf->hdr->hdr_len + btf->hdr->type_off; + + err = btf_parse_str_sec(btf); + err = err ?: btf_parse_type_sec(btf); + if (err) + goto done; + +done: + if (err) { + btf__free(btf); + return ERR_PTR(err); + } + + return btf; +} + +struct btf *btf__new(const void *data, __u32 size) +{ + return libbpf_ptr(btf_new(data, size, NULL)); +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) +{ + Elf_Data *btf_data = NULL, *btf_ext_data = NULL; + int err = 0, fd = -1, idx = 0; + struct btf *btf = NULL; + Elf_Scn *scn = NULL; + Elf *elf = NULL; + GElf_Ehdr ehdr; + size_t shstrndx; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", path); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open %s: %s\n", path, strerror(errno)); + return ERR_PTR(err); + } + + err = -LIBBPF_ERRNO__FORMAT; + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_warn("failed to open %s as ELF file\n", path); + goto done; + } + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("failed to get EHDR from %s\n", path); + goto done; + } + + if (elf_getshdrstrndx(elf, &shstrndx)) { + pr_warn("failed to get section names section index for %s\n", + path); + goto done; + } + + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) { + pr_warn("failed to get e_shstrndx from %s\n", path); + goto done; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + char *name; + + idx++; + if (gelf_getshdr(scn, &sh) != &sh) { + pr_warn("failed to get section(%d) header from %s\n", + idx, path); + goto done; + } + name = elf_strptr(elf, shstrndx, sh.sh_name); + if (!name) { + pr_warn("failed to get section(%d) name from %s\n", + idx, path); + goto done; + } + if (strcmp(name, BTF_ELF_SEC) == 0) { + btf_data = elf_getdata(scn, 0); + if (!btf_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } else if (btf_ext && strcmp(name, BTF_EXT_ELF_SEC) == 0) { + btf_ext_data = elf_getdata(scn, 0); + if (!btf_ext_data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto done; + } + continue; + } + } + + if (!btf_data) { + pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path); + err = -ENODATA; + goto done; + } + btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); + err = libbpf_get_error(btf); + if (err) + goto done; + + switch 
(gelf_getclass(elf)) { + case ELFCLASS32: + btf__set_pointer_size(btf, 4); + break; + case ELFCLASS64: + btf__set_pointer_size(btf, 8); + break; + default: + pr_warn("failed to get ELF class (bitness) for %s\n", path); + break; + } + + if (btf_ext && btf_ext_data) { + *btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(*btf_ext); + if (err) + goto done; + } else if (btf_ext) { + *btf_ext = NULL; + } +done: + if (elf) + elf_end(elf); + close(fd); + + if (!err) + return btf; + + if (btf_ext) + btf_ext__free(*btf_ext); + btf__free(btf); + + return ERR_PTR(err); +} + +struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse_elf(path, NULL, btf_ext)); +} + +struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_elf(path, base_btf, NULL)); +} + +static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) +{ + struct btf *btf = NULL; + void *data = NULL; + FILE *f = NULL; + __u16 magic; + int err = 0; + long sz; + + f = fopen(path, "rbe"); + if (!f) { + err = -errno; + goto err_out; + } + + /* check BTF magic */ + if (fread(&magic, 1, sizeof(magic), f) < sizeof(magic)) { + err = -EIO; + goto err_out; + } + if (magic != BTF_MAGIC && magic != bswap_16(BTF_MAGIC)) { + /* definitely not a raw BTF */ + err = -EPROTO; + goto err_out; + } + + /* get file size */ + if (fseek(f, 0, SEEK_END)) { + err = -errno; + goto err_out; + } + sz = ftell(f); + if (sz < 0) { + err = -errno; + goto err_out; + } + /* rewind to the start */ + if (fseek(f, 0, SEEK_SET)) { + err = -errno; + goto err_out; + } + + /* pre-alloc memory and read all of BTF data */ + data = malloc(sz); + if (!data) { + err = -ENOMEM; + goto err_out; + } + if (fread(data, 1, sz, f) < sz) { + err = -EIO; + goto err_out; + } + + /* finally parse BTF data */ + btf = btf_new(data, sz, base_btf); + +err_out: + free(data); + if (f) + fclose(f); + return err ? 
ERR_PTR(err) : btf; +} + +struct btf *btf__parse_raw(const char *path) +{ + return libbpf_ptr(btf_parse_raw(path, NULL)); +} + +struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse_raw(path, base_btf)); +} + +static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) +{ + struct btf *btf; + int err; + + if (btf_ext) + *btf_ext = NULL; + + btf = btf_parse_raw(path, base_btf); + err = libbpf_get_error(btf); + if (!err) + return btf; + if (err != -EPROTO) + return ERR_PTR(err); + return btf_parse_elf(path, base_btf, btf_ext); +} + +struct btf *btf__parse(const char *path, struct btf_ext **btf_ext) +{ + return libbpf_ptr(btf_parse(path, NULL, btf_ext)); +} + +struct btf *btf__parse_split(const char *path, struct btf *base_btf) +{ + return libbpf_ptr(btf_parse(path, base_btf, NULL)); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); + +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) +{ + LIBBPF_OPTS(bpf_btf_load_opts, opts); + __u32 buf_sz = 0, raw_size; + char *buf = NULL, *tmp; + void *raw_data; + int err = 0; + + if (btf->fd >= 0) + return libbpf_err(-EEXIST); + if (log_sz && !log_buf) + return libbpf_err(-EINVAL); + + /* cache native raw data representation */ + raw_data = btf_get_raw_data(btf, &raw_size, false); + if (!raw_data) { + err = -ENOMEM; + goto done; + } + btf->raw_size = raw_size; + btf->raw_data = raw_data; + +retry_load: + /* if log_level is 0, we won't provide log_buf/log_size to the kernel, + * initially. Only if BTF loading fails, we bump log_level to 1 and + * retry, using either auto-allocated or custom log_buf. This way + * non-NULL custom log_buf provides a buffer just in case, but hopes + * for successful load and no need for log_buf. + */ + if (log_level) { + /* if caller didn't provide custom log_buf, we'll keep + * allocating our own progressively bigger buffers for BTF + * verification log + */ + if (!log_buf) { + buf_sz = max((__u32)BPF_LOG_BUF_SIZE, buf_sz * 2); + tmp = realloc(buf, buf_sz); + if (!tmp) { + err = -ENOMEM; + goto done; + } + buf = tmp; + buf[0] = '\0'; + } + + opts.log_buf = log_buf ? log_buf : buf; + opts.log_size = log_buf ? log_sz : buf_sz; + opts.log_level = log_level; + } + + btf->fd = bpf_btf_load(raw_data, raw_size, &opts); + if (btf->fd < 0) { + /* time to turn on verbose mode and try again */ + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* only retry if caller didn't provide custom log_buf, but + * make sure we can never overflow buf_sz + */ + if (!log_buf && errno == ENOSPC && buf_sz <= UINT_MAX / 2) + goto retry_load; + + err = -errno; + pr_warn("BTF loading error: %d\n", err); + /* don't print out contents of custom log_buf */ + if (!log_buf && buf[0]) + pr_warn("-- BEGIN BTF LOAD LOG ---\n%s\n-- END BTF LOAD LOG --\n", buf); + } + +done: + free(buf); + return libbpf_err(err); +} + +int btf__load_into_kernel(struct btf *btf) +{ + return btf_load_into_kernel(btf, NULL, 0, 0); +} + +int btf__fd(const struct btf *btf) +{ + return btf->fd; +} + +void btf__set_fd(struct btf *btf, int fd) +{ + btf->fd = fd; +} + +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? 
btf->strs_data : strset__data(btf->strs_set); +} + +static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) +{ + struct btf_header *hdr = btf->hdr; + struct btf_type *t; + void *data, *p; + __u32 data_sz; + int i; + + data = swap_endian ? btf->raw_data_swapped : btf->raw_data; + if (data) { + *size = btf->raw_size; + return data; + } + + data_sz = hdr->hdr_len + hdr->type_len + hdr->str_len; + data = calloc(1, data_sz); + if (!data) + return NULL; + p = data; + + memcpy(p, hdr, hdr->hdr_len); + if (swap_endian) + btf_bswap_hdr(p); + p += hdr->hdr_len; + + memcpy(p, btf->types_data, hdr->type_len); + if (swap_endian) { + for (i = 0; i < btf->nr_types; i++) { + t = p + btf->type_offs[i]; + /* btf_bswap_type_rest() relies on native t->info, so + * we swap base type info after we swapped all the + * additional information + */ + if (btf_bswap_type_rest(t)) + goto err_out; + btf_bswap_type_base(t); + } + } + p += hdr->type_len; + + memcpy(p, btf_strs_data(btf), hdr->str_len); + p += hdr->str_len; + + *size = data_sz; + return data; +err_out: + free(data); + return NULL; +} + +const void *btf__raw_data(const struct btf *btf_ro, __u32 *size) +{ + struct btf *btf = (struct btf *)btf_ro; + __u32 data_sz; + void *data; + + data = btf_get_raw_data(btf, &data_sz, btf->swapped_endian); + if (!data) + return errno = ENOMEM, NULL; + + btf->raw_size = data_sz; + if (btf->swapped_endian) + btf->raw_data_swapped = data; + else + btf->raw_data = data; + *size = data_sz; + return data; +} + +__attribute__((alias("btf__raw_data"))) +const void *btf__get_raw_data(const struct btf *btf, __u32 *size); + +const char *btf__str_by_offset(const struct btf *btf, __u32 offset) +{ + if (offset < btf->start_str_off) + return btf__str_by_offset(btf->base_btf, offset); + else if (offset - btf->start_str_off < btf->hdr->str_len) + return btf_strs_data(btf) + (offset - btf->start_str_off); + else + return errno = EINVAL, NULL; +} + +const char *btf__name_by_offset(const struct btf *btf, __u32 offset) +{ + return btf__str_by_offset(btf, offset); +} + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) +{ + struct bpf_btf_info btf_info; + __u32 len = sizeof(btf_info); + __u32 last_size; + struct btf *btf; + void *ptr; + int err; + + /* we won't know btf_size until we call bpf_btf_get_info_by_fd(). so + * let's start with a sane default - 4KiB here - and resize it only if + * bpf_btf_get_info_by_fd() needs a bigger buffer. + */ + last_size = 4096; + ptr = malloc(last_size); + if (!ptr) + return ERR_PTR(-ENOMEM); + + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); + + if (!err && btf_info.btf_size > last_size) { + void *temp_ptr; + + last_size = btf_info.btf_size; + temp_ptr = realloc(ptr, last_size); + if (!temp_ptr) { + btf = ERR_PTR(-ENOMEM); + goto exit_free; + } + ptr = temp_ptr; + + len = sizeof(btf_info); + memset(&btf_info, 0, sizeof(btf_info)); + btf_info.btf = ptr_to_u64(ptr); + btf_info.btf_size = last_size; + + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); + } + + if (err || btf_info.btf_size > last_size) { + btf = err ? 
ERR_PTR(-errno) : ERR_PTR(-E2BIG); + goto exit_free; + } + + btf = btf_new(ptr, btf_info.btf_size, base_btf); + +exit_free: + free(ptr); + return btf; +} + +struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf) +{ + struct btf *btf; + int btf_fd; + + btf_fd = bpf_btf_get_fd_by_id(id); + if (btf_fd < 0) + return libbpf_err_ptr(-errno); + + btf = btf_get_from_fd(btf_fd, base_btf); + close(btf_fd); + + return libbpf_ptr(btf); +} + +struct btf *btf__load_from_kernel_by_id(__u32 id) +{ + return btf__load_from_kernel_by_id_split(id, NULL); +} + +static void btf_invalidate_raw_data(struct btf *btf) +{ + if (btf->raw_data) { + free(btf->raw_data); + btf->raw_data = NULL; + } + if (btf->raw_data_swapped) { + free(btf->raw_data_swapped); + btf->raw_data_swapped = NULL; + } +} + +/* Ensure BTF is ready to be modified (by splitting into a three memory + * regions for header, types, and strings). Also invalidate cached + * raw_data, if any. + */ +static int btf_ensure_modifiable(struct btf *btf) +{ + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; + + if (btf_is_modifiable(btf)) { + /* any BTF modification invalidates raw_data */ + btf_invalidate_raw_data(btf); + return 0; + } + + /* split raw data into three memory regions */ + hdr = malloc(btf->hdr->hdr_len); + types = malloc(btf->hdr->type_len); + if (!hdr || !types) + goto err_out; + + memcpy(hdr, btf->hdr, btf->hdr->hdr_len); + memcpy(types, btf->types_data, btf->hdr->type_len); + + /* build lookup index for all strings */ + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); + goto err_out; + } + + /* only when everything was successful, update internal state */ + btf->hdr = hdr; + btf->types_data = types; + btf->types_data_cap = btf->hdr->type_len; + btf->strs_data = NULL; + btf->strs_set = set; + /* if BTF was created from scratch, all strings are guaranteed to be + * unique and deduplicated + */ + if (btf->hdr->str_len == 0) + btf->strs_deduped = true; + if (!btf->base_btf && btf->hdr->str_len == 1) + btf->strs_deduped = true; + + /* invalidate raw_data representation */ + btf_invalidate_raw_data(btf); + + return 0; + +err_out: + strset__free(set); + free(hdr); + free(types); + return err; +} + +/* Find an offset in BTF string section that corresponds to a given string *s*. + * Returns: + * - >0 offset into string section, if string is found; + * - -ENOENT, if string is not in the string section; + * - <0, on any other error. + */ +int btf__find_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + /* BTF needs to be in a modifiable state to build string lookup index */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + return btf->start_str_off + off; +} + +/* Add a string s to the BTF string section. + * Returns: + * - > 0 offset into string section, on success; + * - < 0, on error. 
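+ * + * Usage sketch (illustrative, not part of the upstream comment; assumes *btf* + * was created with btf__new_empty() or loaded with btf__parse()): + * + *     int off = btf__add_str(btf, "my_string"); + *     if (off < 0) + *             return off; + *     // *off* can now be stored in any name_off/str_off field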
+ */ +int btf__add_str(struct btf *btf, const char *s) +{ + int off; + + if (btf->base_btf) { + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; + } + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return libbpf_err(off); + + btf->hdr->str_len = strset__data_size(btf->strs_set); + + return btf->start_str_off + off; +} + +static void *btf_add_type_mem(struct btf *btf, size_t add_sz) +{ + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); +} + +static void btf_type_inc_vlen(struct btf_type *t) +{ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, btf_kflag(t)); +} + +static int btf_commit_type(struct btf *btf, int data_sz) +{ + int err; + + err = btf_add_type_idx_entry(btf, btf->hdr->type_len); + if (err) + return libbpf_err(err); + + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types++; + return btf->start_id + btf->nr_types - 1; +} + +struct btf_pipe { + const struct btf *src; + struct btf *dst; + struct hashmap *str_off_map; /* map string offsets from src to dst */ +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + long mapped_off; + int off, err; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + if (p->str_off_map && + hashmap__find(p->str_off_map, *str_off, &mapped_off)) { + *str_off = mapped_off; + return 0; + } + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + /* Remember string mapping from src to dst. It avoids + * performing expensive string comparisons. + */ + if (p->str_off_map) { + err = hashmap__append(p->str_off_map, *str_off, off); + if (err) + return err; + } + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return libbpf_err(sz); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return libbpf_err(err); + + return btf_commit_type(btf, sz); +} + +static int btf_rewrite_type_ids(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (!*type_id) /* nothing to do for VOID references */ + return 0; + + /* we haven't updated btf's type count yet, so + * btf->start_id + btf->nr_types - 1 is the type ID offset we should + * add to all newly added BTF types + */ + *type_id += btf->start_id + btf->nr_types - 1; + return 0; +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx); +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx); + +int btf__add_btf(struct btf *btf, const struct btf *src_btf) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + int data_sz, sz, cnt, i, err, old_strs_len; + __u32 *off; + void *t; + + /* appending split BTF isn't supported yet */ + if (src_btf->base_btf) + return libbpf_err(-ENOTSUP); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + /* remember original strings section size if we have to roll back + * partial strings section changes + */ + old_strs_len = 
btf->hdr->str_len; + + data_sz = src_btf->hdr->type_len; + cnt = btf__type_cnt(src_btf) - 1; + + /* pre-allocate enough memory for new types */ + t = btf_add_type_mem(btf, data_sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* pre-allocate enough memory for type offset index for new types */ + off = btf_add_type_offs_mem(btf, cnt); + if (!off) + return libbpf_err(-ENOMEM); + + /* Map the string offsets from src_btf to the offsets from btf to improve performance */ + p.str_off_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(p.str_off_map)) + return libbpf_err(-ENOMEM); + + /* bulk copy types data for all types from src_btf */ + memcpy(t, src_btf->types_data, data_sz); + + for (i = 0; i < cnt; i++) { + sz = btf_type_size(t); + if (sz < 0) { + /* unlikely, has to be corrupted src_btf */ + err = sz; + goto err_out; + } + + /* fill out type ID to type offset mapping for lookups by type ID */ + *off = t - btf->types_data; + + /* add, dedup, and remap strings referenced by this BTF type */ + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + goto err_out; + + /* remap all type IDs referenced from this BTF type */ + err = btf_type_visit_type_ids(t, btf_rewrite_type_ids, btf); + if (err) + goto err_out; + + /* go to next type data and type offset index entry */ + t += sz; + off++; + } + + /* Up until now any of the copied type data was effectively invisible, + * so if we exited early before this point due to error, BTF would be + * effectively unmodified. There would be extra internal memory + * pre-allocated, but it would not be available for querying. But now + * that we've copied and rewritten all the data successfully, we can + * update type count and various internal offsets and sizes to + * "commit" the changes and made them visible to the outside world. + */ + btf->hdr->type_len += data_sz; + btf->hdr->str_off += data_sz; + btf->nr_types += cnt; + + hashmap__free(p.str_off_map); + + /* return type ID of the first added BTF type */ + return btf->start_id + btf->nr_types - cnt; +err_out: + /* zero out preallocated memory as if it was just allocated with + * libbpf_add_mem() + */ + memset(btf->types_data + btf->hdr->type_len, 0, data_sz); + memset(btf->strs_data + old_strs_len, 0, btf->hdr->str_len - old_strs_len); + + /* and now restore original strings section size; types data size + * wasn't modified, so doesn't need restoring, see big comment above + */ + btf->hdr->str_len = old_strs_len; + + hashmap__free(p.str_off_map); + + return libbpf_err(err); +} + +/* + * Append new BTF_KIND_INT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - power-of-2 (1, 2, 4, ..) size of the type, in bytes; + * - encoding is a combination of BTF_INT_SIGNED, BTF_INT_CHAR, BTF_INT_BOOL. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
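+ * + * Usage sketch (illustrative, not part of the upstream comment): + * + *     int int_id = btf__add_int(btf, "int", 4, BTF_INT_SIGNED); + *     if (int_id < 0) + *             return int_id; + *     // int_id is the type ID of the new signed 4-byte integer type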
+ */ +int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) + return libbpf_err(-EINVAL); + if (encoding & ~(BTF_INT_SIGNED | BTF_INT_CHAR | BTF_INT_BOOL)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(int); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* if something goes wrong later, we might end up with an extra string, + * but that shouldn't be a problem, because BTF can't be constructed + * completely anyway and will most probably be just discarded + */ + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_INT, 0, 0); + t->size = byte_sz; + /* set INT info, we don't allow setting legacy bit offset/size */ + *(__u32 *)(t + 1) = (encoding << 24) | (byte_sz * 8); + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_FLOAT type with: + * - *name* - non-empty, non-NULL type name; + * - *sz* - size of the type, in bytes; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* byte_sz must be one of the explicitly allowed values */ + if (byte_sz != 2 && byte_sz != 4 && byte_sz != 8 && byte_sz != 12 && + byte_sz != 16) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_FLOAT, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* it's completely legal to append BTF types with type IDs pointing forward to + * types that haven't been appended yet, so we only make sure that id looks + * sane, we can't guarantee that ID will always be valid + */ +static int validate_type_id(int id) +{ + if (id < 0 || id > BTF_MAX_NR_TYPES) + return -EINVAL; + return 0; +} + +/* generic append function for PTR, TYPEDEF, CONST/VOLATILE/RESTRICT */ +static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref_type_id) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->type = ref_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_PTR type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
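+ * + * Usage sketch (illustrative; *int_id* is assumed to be a type ID returned + * earlier, e.g. by btf__add_int()): + * + *     int ptr_id = btf__add_ptr(btf, int_id);  // models 'int *' + *     if (ptr_id < 0) + *             return ptr_id;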
+ */ +int btf__add_ptr(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_PTR, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_ARRAY type with: + * - *index_type_id* - type ID of the type describing array index; + * - *elem_type_id* - type ID of the type describing array element; + * - *nr_elems* - the size of the array; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_array(struct btf *btf, int index_type_id, int elem_type_id, __u32 nr_elems) +{ + struct btf_type *t; + struct btf_array *a; + int sz; + + if (validate_type_id(index_type_id) || validate_type_id(elem_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_array); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_ARRAY, 0, 0); + t->size = 0; + + a = btf_array(t); + a->type = elem_type_id; + a->index_type = index_type_id; + a->nelems = nr_elems; + + return btf_commit_type(btf, sz); +} + +/* generic STRUCT/UNION append function */ +static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 bytes_sz) +{ + struct btf_type *t; + int sz, name_off = 0; + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0 and no kflag; this will be adjusted when + * adding each member + */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, 0); + t->size = bytes_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_STRUCT type with: + * - *name* - name of the struct, can be NULL or empty for anonymous structs; + * - *byte_sz* - size of the struct, in bytes; + * + * Struct initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_struct() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_struct(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_STRUCT, name, byte_sz); +} + +/* + * Append new BTF_KIND_UNION type with: + * - *name* - name of the union, can be NULL or empty for anonymous union; + * - *byte_sz* - size of the union, in bytes; + * + * Union initially has no fields in it. Fields can be added by + * btf__add_field() right after btf__add_union() succeeds. All fields + * should have *bit_offset* of 0. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_union(struct btf *btf, const char *name, __u32 byte_sz) +{ + return btf_add_composite(btf, BTF_KIND_UNION, name, byte_sz); +} + +static struct btf_type *btf_last_type(struct btf *btf) +{ + return btf_type_by_id(btf, btf__type_cnt(btf) - 1); +} + +/* + * Append new field for the current STRUCT/UNION type with: + * - *name* - name of the field, can be NULL or empty for anonymous field; + * - *type_id* - type ID for the type describing field type; + * - *bit_offset* - bit offset of the start of the field within struct/union; + * - *bit_size* - bit size of a bitfield, 0 for non-bitfield fields; + * Returns: + * - 0, on success; + * - <0, on error. 
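+ * + * Usage sketch (illustrative; *int_id* is assumed to be a previously added + * 4-byte integer type ID; error handling omitted for brevity): + * + *     int sid = btf__add_struct(btf, "point", 8);  // struct point { int x; int y; } + *     btf__add_field(btf, "x", int_id, 0, 0); + *     btf__add_field(btf, "y", int_id, 32, 0);     // bit_offset is in bits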
+ */ +int btf__add_field(struct btf *btf, const char *name, int type_id, + __u32 bit_offset, __u32 bit_size) +{ + struct btf_type *t; + struct btf_member *m; + bool is_bitfield; + int sz, name_off = 0; + + /* last type should be union/struct */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_composite(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + /* best-effort bit field offset/size enforcement */ + is_bitfield = bit_size || (bit_offset % 8 != 0); + if (is_bitfield && (bit_size == 0 || bit_size > 255 || bit_offset > 0xffffff)) + return libbpf_err(-EINVAL); + + /* only offset 0 is allowed for unions */ + if (btf_is_union(t) && bit_offset) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_member); + m = btf_add_type_mem(btf, sz); + if (!m) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + m->name_off = name_off; + m->type = type_id; + m->offset = bit_offset | (bit_size << 24); + + /* btf_add_type_mem can invalidate t pointer */ + t = btf_last_type(btf); + /* update parent type's vlen and kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t) + 1, is_bitfield || btf_kflag(t)); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed, __u8 kind) +{ + struct btf_type *t; + int sz, name_off = 0; + + /* byte_sz must be power of 2 */ + if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 8) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + /* start out with vlen=0; it will be adjusted when adding enum values */ + t->name_off = name_off; + t->info = btf_type_info(kind, 0, is_signed); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_ENUM type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum_value() + * immediately after btf__add_enum() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +{ + /* + * set the signedness to be unsigned, it will change to signed + * if any later enumerator is negative. + */ + return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); +} + +/* + * Append new enum value for the current ENUM type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. 
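+ * + * Usage sketch (illustrative; error handling omitted for brevity): + * + *     int eid = btf__add_enum(btf, "color", 4); + *     btf__add_enum_value(btf, "COLOR_RED", 0); + *     btf__add_enum_value(btf, "COLOR_BLUE", 1);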
+ */ +int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) +{ + struct btf_type *t; + struct btf_enum *v; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (value < INT_MIN || value > UINT_MAX) + return libbpf_err(-E2BIG); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val = value; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + /* if negative value, set signedness to signed */ + if (value < 0) + t->info = btf_type_info(btf_kind(t), btf_vlen(t), true); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_ENUM64 type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * - *is_signed* - whether the enum values are signed or not; + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum64_value() + * immediately after btf__add_enum64() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum64(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed) +{ + return btf_add_enum_common(btf, name, byte_sz, is_signed, + BTF_KIND_ENUM64); +} + +/* + * Append new enum value for the current ENUM64 type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) +{ + struct btf_enum64 *v; + struct btf_type *t; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM64 */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum64(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum64); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val_lo32 = (__u32)value; + v->val_hi32 = value >> 32; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_FWD type with: + * - *name*, non-empty/non-NULL name; + * - *fwd_kind*, kind of forward declaration, one of BTF_FWD_STRUCT, + * BTF_FWD_UNION, or BTF_FWD_ENUM; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
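+ * + * Usage sketch (illustrative): + * + *     int fwd_id = btf__add_fwd(btf, "task_struct", BTF_FWD_STRUCT); + *     if (fwd_id < 0) + *             return fwd_id; + *     // fwd_id can be referenced like any other type ID, e.g. by btf__add_ptr()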
+ */ +int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + switch (fwd_kind) { + case BTF_FWD_STRUCT: + case BTF_FWD_UNION: { + struct btf_type *t; + int id; + + id = btf_add_ref_kind(btf, BTF_KIND_FWD, name, 0); + if (id <= 0) + return id; + t = btf_type_by_id(btf, id); + t->info = btf_type_info(BTF_KIND_FWD, 0, fwd_kind == BTF_FWD_UNION); + return id; + } + case BTF_FWD_ENUM: + /* enum forward in BTF currently is just an enum with no enum + * values; we also assume a standard 4-byte size for it + */ + return btf__add_enum(btf, name, sizeof(int)); + default: + return libbpf_err(-EINVAL); + } +} + +/* + * Append new BTF_KING_TYPEDEF type with: + * - *name*, non-empty/non-NULL name; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) +{ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id); +} + +/* + * Append new BTF_KIND_VOLATILE type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_volatile(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_VOLATILE, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_CONST type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_const(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_CONST, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_RESTRICT type with: + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_restrict(struct btf *btf, int ref_type_id) +{ + return btf_add_ref_kind(btf, BTF_KIND_RESTRICT, NULL, ref_type_id); +} + +/* + * Append new BTF_KIND_TYPE_TAG type with: + * - *value*, non-empty/non-NULL tag value; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) +{ + if (!value || !value[0]) + return libbpf_err(-EINVAL); + + return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id); +} + +/* + * Append new BTF_KIND_FUNC type with: + * - *name*, non-empty/non-NULL name; + * - *proto_type_id* - FUNC_PROTO's type ID, it might not exist yet; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id) +{ + int id; + + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && + linkage != BTF_FUNC_EXTERN) + return libbpf_err(-EINVAL); + + id = btf_add_ref_kind(btf, BTF_KIND_FUNC, name, proto_type_id); + if (id > 0) { + struct btf_type *t = btf_type_by_id(btf, id); + + t->info = btf_type_info(BTF_KIND_FUNC, linkage, 0); + } + return libbpf_err(id); +} + +/* + * Append new BTF_KIND_FUNC_PROTO with: + * - *ret_type_id* - type ID for return result of a function. 
+ * + * Function prototype initially has no arguments, but they can be added by + * btf__add_func_param() one by one, immediately after + * btf__add_func_proto() succeeded. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_func_proto(struct btf *btf, int ret_type_id) +{ + struct btf_type *t; + int sz; + + if (validate_type_id(ret_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + /* start out with vlen=0; this will be adjusted when adding enum + * values, if necessary + */ + t->name_off = 0; + t->info = btf_type_info(BTF_KIND_FUNC_PROTO, 0, 0); + t->type = ret_type_id; + + return btf_commit_type(btf, sz); +} + +/* + * Append new function parameter for current FUNC_PROTO type with: + * - *name* - parameter name, can be NULL or empty; + * - *type_id* - type ID describing the type of the parameter. + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_func_param(struct btf *btf, const char *name, int type_id) +{ + struct btf_type *t; + struct btf_param *p; + int sz, name_off = 0; + + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* last type should be BTF_KIND_FUNC_PROTO */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_func_proto(t)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_param); + p = btf_add_type_mem(btf, sz); + if (!p) + return libbpf_err(-ENOMEM); + + if (name && name[0]) { + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + } + + p->name_off = name_off; + p->type = type_id; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_VAR type with: + * - *name* - non-empty/non-NULL name; + * - *linkage* - variable linkage, one of BTF_VAR_STATIC, + * BTF_VAR_GLOBAL_ALLOCATED, or BTF_VAR_GLOBAL_EXTERN; + * - *type_id* - type ID of the type describing the type of the variable. + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) +{ + struct btf_type *t; + struct btf_var *v; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && + linkage != BTF_VAR_GLOBAL_EXTERN) + return libbpf_err(-EINVAL); + if (validate_type_id(type_id)) + return libbpf_err(-EINVAL); + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_var); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_VAR, 0, 0); + t->type = type_id; + + v = btf_var(t); + v->linkage = linkage; + + return btf_commit_type(btf, sz); +} + +/* + * Append new BTF_KIND_DATASEC type with: + * - *name* - non-empty/non-NULL name; + * - *byte_sz* - data section size, in bytes. + * + * Data section is initially empty. 
Variables info can be added with + * btf__add_datasec_var_info() calls, after btf__add_datasec() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) +{ + struct btf_type *t; + int sz, name_off; + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + /* start with vlen=0, which will be update as var_secinfos are added */ + t->name_off = name_off; + t->info = btf_type_info(BTF_KIND_DATASEC, 0, 0); + t->size = byte_sz; + + return btf_commit_type(btf, sz); +} + +/* + * Append new data section variable information entry for current DATASEC type: + * - *var_type_id* - type ID, describing type of the variable; + * - *offset* - variable offset within data section, in bytes; + * - *byte_sz* - variable size, in bytes. + * + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_datasec_var_info(struct btf *btf, int var_type_id, __u32 offset, __u32 byte_sz) +{ + struct btf_type *t; + struct btf_var_secinfo *v; + int sz; + + /* last type should be BTF_KIND_DATASEC */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_datasec(t)) + return libbpf_err(-EINVAL); + + if (validate_type_id(var_type_id)) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_var_secinfo); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + v->type = var_type_id; + v->offset = offset; + v->size = byte_sz; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_DECL_TAG type with: + * - *value* - non-empty/non-NULL string; + * - *ref_type_id* - referenced type ID, it might not exist yet; + * - *component_idx* - -1 for tagging reference type, otherwise struct/union + * member or function argument index; + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. 
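+ * + * Usage sketch (illustrative; *var_id* is assumed to be a previously added + * BTF_KIND_VAR type ID): + * + *     int tag_id = btf__add_decl_tag(btf, "my_tag", var_id, -1); + *     // component_idx of -1 tags the referenced type itself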
+ */ +int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx) +{ + struct btf_type *t; + int sz, value_off; + + if (!value || !value[0] || component_idx < -1) + return libbpf_err(-EINVAL); + + if (validate_type_id(ref_type_id)) + return libbpf_err(-EINVAL); + + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_type) + sizeof(struct btf_decl_tag); + t = btf_add_type_mem(btf, sz); + if (!t) + return libbpf_err(-ENOMEM); + + value_off = btf__add_str(btf, value); + if (value_off < 0) + return value_off; + + t->name_off = value_off; + t->info = btf_type_info(BTF_KIND_DECL_TAG, 0, false); + t->type = ref_type_id; + btf_decl_tag(t)->component_idx = component_idx; + + return btf_commit_type(btf, sz); +} + +struct btf_ext_sec_setup_param { + __u32 off; + __u32 len; + __u32 min_rec_size; + struct btf_ext_info *ext_info; + const char *desc; +}; + +static int btf_ext_setup_info(struct btf_ext *btf_ext, + struct btf_ext_sec_setup_param *ext_sec) +{ + const struct btf_ext_info_sec *sinfo; + struct btf_ext_info *ext_info; + __u32 info_left, record_size; + size_t sec_cnt = 0; + /* The start of the info sec (including the __u32 record_size). */ + void *info; + + if (ext_sec->len == 0) + return 0; + + if (ext_sec->off & 0x03) { + pr_debug(".BTF.ext %s section is not aligned to 4 bytes\n", + ext_sec->desc); + return -EINVAL; + } + + info = btf_ext->data + btf_ext->hdr->hdr_len + ext_sec->off; + info_left = ext_sec->len; + + if (btf_ext->data + btf_ext->data_size < info + ext_sec->len) { + pr_debug("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", + ext_sec->desc, ext_sec->off, ext_sec->len); + return -EINVAL; + } + + /* At least a record size */ + if (info_left < sizeof(__u32)) { + pr_debug(".BTF.ext %s record size not found\n", ext_sec->desc); + return -EINVAL; + } + + /* The record size needs to meet the minimum standard */ + record_size = *(__u32 *)info; + if (record_size < ext_sec->min_rec_size || + record_size & 0x03) { + pr_debug("%s section in .BTF.ext has invalid record size %u\n", + ext_sec->desc, record_size); + return -EINVAL; + } + + sinfo = info + sizeof(__u32); + info_left -= sizeof(__u32); + + /* If no records, return failure now so .BTF.ext won't be used. 
*/ + if (!info_left) { + pr_debug("%s section in .BTF.ext has no records", ext_sec->desc); + return -EINVAL; + } + + while (info_left) { + unsigned int sec_hdrlen = sizeof(struct btf_ext_info_sec); + __u64 total_record_size; + __u32 num_records; + + if (info_left < sec_hdrlen) { + pr_debug("%s section header is not found in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + num_records = sinfo->num_info; + if (num_records == 0) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + total_record_size = sec_hdrlen + (__u64)num_records * record_size; + if (info_left < total_record_size) { + pr_debug("%s section has incorrect num_records in .BTF.ext\n", + ext_sec->desc); + return -EINVAL; + } + + info_left -= total_record_size; + sinfo = (void *)sinfo + total_record_size; + sec_cnt++; + } + + ext_info = ext_sec->ext_info; + ext_info->len = ext_sec->len - sizeof(__u32); + ext_info->rec_size = record_size; + ext_info->info = info + sizeof(__u32); + ext_info->sec_cnt = sec_cnt; + + return 0; +} + +static int btf_ext_setup_func_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->func_info_off, + .len = btf_ext->hdr->func_info_len, + .min_rec_size = sizeof(struct bpf_func_info_min), + .ext_info = &btf_ext->func_info, + .desc = "func_info" + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_line_info(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->line_info_off, + .len = btf_ext->hdr->line_info_len, + .min_rec_size = sizeof(struct bpf_line_info_min), + .ext_info = &btf_ext->line_info, + .desc = "line_info", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_setup_core_relos(struct btf_ext *btf_ext) +{ + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->core_relo_off, + .len = btf_ext->hdr->core_relo_len, + .min_rec_size = sizeof(struct bpf_core_relo), + .ext_info = &btf_ext->core_relo_info, + .desc = "core_relo", + }; + + return btf_ext_setup_info(btf_ext, ¶m); +} + +static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) +{ + const struct btf_ext_header *hdr = (struct btf_ext_header *)data; + + if (data_size < offsetofend(struct btf_ext_header, hdr_len) || + data_size < hdr->hdr_len) { + pr_debug("BTF.ext header not found"); + return -EINVAL; + } + + if (hdr->magic == bswap_16(BTF_MAGIC)) { + pr_warn("BTF.ext in non-native endianness is not supported\n"); + return -ENOTSUP; + } else if (hdr->magic != BTF_MAGIC) { + pr_debug("Invalid BTF.ext magic:%x\n", hdr->magic); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + pr_debug("Unsupported BTF.ext version:%u\n", hdr->version); + return -ENOTSUP; + } + + if (hdr->flags) { + pr_debug("Unsupported BTF.ext flags:%x\n", hdr->flags); + return -ENOTSUP; + } + + if (data_size == hdr->hdr_len) { + pr_debug("BTF.ext has no data\n"); + return -EINVAL; + } + + return 0; +} + +void btf_ext__free(struct btf_ext *btf_ext) +{ + if (IS_ERR_OR_NULL(btf_ext)) + return; + free(btf_ext->func_info.sec_idxs); + free(btf_ext->line_info.sec_idxs); + free(btf_ext->core_relo_info.sec_idxs); + free(btf_ext->data); + free(btf_ext); +} + +struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) +{ + struct btf_ext *btf_ext; + int err; + + btf_ext = calloc(1, sizeof(struct btf_ext)); + if (!btf_ext) + return libbpf_err_ptr(-ENOMEM); + + btf_ext->data_size = size; + btf_ext->data = malloc(size); + if (!btf_ext->data) { + err = -ENOMEM; + goto done; + 
} + memcpy(btf_ext->data, data, size); + + err = btf_ext_parse_hdr(btf_ext->data, size); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, line_info_len)) { + err = -EINVAL; + goto done; + } + + err = btf_ext_setup_func_info(btf_ext); + if (err) + goto done; + + err = btf_ext_setup_line_info(btf_ext); + if (err) + goto done; + + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) + goto done; /* skip core relos parsing */ + + err = btf_ext_setup_core_relos(btf_ext); + if (err) + goto done; + +done: + if (err) { + btf_ext__free(btf_ext); + return libbpf_err_ptr(err); + } + + return btf_ext; +} + +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +{ + *size = btf_ext->data_size; + return btf_ext->data; +} + +struct btf_dedup; + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts); +static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_prep(struct btf_dedup *d); +static int btf_dedup_strings(struct btf_dedup *d); +static int btf_dedup_prim_types(struct btf_dedup *d); +static int btf_dedup_struct_types(struct btf_dedup *d); +static int btf_dedup_ref_types(struct btf_dedup *d); +static int btf_dedup_resolve_fwds(struct btf_dedup *d); +static int btf_dedup_compact_types(struct btf_dedup *d); +static int btf_dedup_remap_types(struct btf_dedup *d); + +/* + * Deduplicate BTF types and strings. + * + * BTF dedup algorithm takes as an input `struct btf` representing `.BTF` ELF + * section with all BTF type descriptors and string data. It overwrites that + * memory in-place with deduplicated types and strings without any loss of + * information. If optional `struct btf_ext` representing '.BTF.ext' ELF section + * is provided, all the strings referenced from .BTF.ext section are honored + * and updated to point to the right offsets after deduplication. + * + * If function returns with error, type/string data might be garbled and should + * be discarded. + * + * More verbose and detailed description of both problem btf_dedup is solving, + * as well as solution could be found at: + * https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html + * + * Problem description and justification + * ===================================== + * + * BTF type information is typically emitted either as a result of conversion + * from DWARF to BTF or directly by compiler. In both cases, each compilation + * unit contains information about a subset of all the types that are used + * in an application. These subsets are frequently overlapping and contain a lot + * of duplicated information when later concatenated together into a single + * binary. This algorithm ensures that each unique type is represented by single + * BTF type descriptor, greatly reducing resulting size of BTF data. + * + * Compilation unit isolation and subsequent duplication of data is not the only + * problem. The same type hierarchy (e.g., struct and all the type that struct + * references) in different compilation units can be represented in BTF to + * various degrees of completeness (or, rather, incompleteness) due to + * struct/union forward declarations. + * + * Let's take a look at an example, that we'll use to better understand the + * problem (and solution). 
Suppose we have two compilation units, each using the + * same `struct S`, but each of them having incomplete type information about + * struct's fields: + * + * // CU #1: + * struct S; + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * // CU #2: + * struct S; + * struct A; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + *
+ * In case of CU #1, BTF data will know only that `struct B` exists (but no + * more), but will know the complete type information about `struct A`. While + * for CU #2, it will know full type information about `struct B`, but will + * only know about forward declaration of `struct A` (in BTF terms, it will + * have `BTF_KIND_FWD` type descriptor with name `A`). + * + * This compilation unit isolation means that it's possible that there is no + * single CU with complete type information describing structs `S`, `A`, and + * `B`. Also, we might get tons of duplicated and redundant type information. + *
+ * An additional complication we need to keep in mind comes from the fact that + * types, in general, can form graphs containing cycles, not just DAGs. + * + * While the algorithm does deduplication, it also merges and resolves type + * information (unless disabled through `struct btf_dedup_opts`), whenever possible. + * E.g., in the example above with two compilation units having partial type + * information for structs `A` and `B`, the output of the algorithm will emit + * a single copy of each BTF type that describes structs `A`, `B`, and `S` + * (as well as type information for `int` and pointers), as if they were defined + * in a single compilation unit as: + *
+ * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + *
+ * Algorithm summary + * ================= + * + * The algorithm completes its work in 7 separate passes: + * + * 1. Strings deduplication. + * 2. Primitive types deduplication (int, enum, fwd). + * 3. Struct/union types deduplication. + * 4. Resolve unambiguous forward declarations. + * 5. Reference types deduplication (pointers, typedefs, arrays, funcs, func + * protos, and const/volatile/restrict modifiers). + * 6. Types compaction. + * 7. Types remapping. + *
+ * The algorithm determines a canonical type descriptor, which is a single + * representative type for each truly unique type. This canonical type is the + * one that will go into final deduplicated BTF type information. For + * struct/unions, it is also the type that the algorithm will merge additional type + * information into (while resolving FWDs), as it discovers it from data in + * other CUs. Each input BTF type eventually gets either mapped to itself, if + * that type is canonical, or to some other type, if that type is equivalent + * and was chosen as canonical representative. This mapping is stored in + * the `btf_dedup->map` array. This map is also used to record the STRUCT/UNION that a + * FWD type got resolved to. + *
+ * To facilitate fast discovery of canonical types, we also maintain a canonical + * index (`btf_dedup->dedup_table`), which maps a type descriptor's signature hash + * (i.e., hashed kind, name, size, fields, etc) into a list of canonical types + * that match that signature. 
With sufficiently good choice of type signature + * hashing function, we can limit number of canonical types for each unique type + * signature to a very small number, allowing to find canonical type for any + * duplicated type very quickly. + * + * Struct/union deduplication is the most critical part and algorithm for + * deduplicating structs/unions is described in greater details in comments for + * `btf_dedup_is_equiv` function. + */ +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d; + int err; + + if (!OPTS_VALID(opts, btf_dedup_opts)) + return libbpf_err(-EINVAL); + + d = btf_dedup_new(btf, opts); + if (IS_ERR(d)) { + pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d)); + return libbpf_err(-EINVAL); + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + err = btf_dedup_prep(d); + if (err) { + pr_debug("btf_dedup_prep failed:%d\n", err); + goto done; + } + err = btf_dedup_strings(d); + if (err < 0) { + pr_debug("btf_dedup_strings failed:%d\n", err); + goto done; + } + err = btf_dedup_prim_types(d); + if (err < 0) { + pr_debug("btf_dedup_prim_types failed:%d\n", err); + goto done; + } + err = btf_dedup_struct_types(d); + if (err < 0) { + pr_debug("btf_dedup_struct_types failed:%d\n", err); + goto done; + } + err = btf_dedup_resolve_fwds(d); + if (err < 0) { + pr_debug("btf_dedup_resolve_fwds failed:%d\n", err); + goto done; + } + err = btf_dedup_ref_types(d); + if (err < 0) { + pr_debug("btf_dedup_ref_types failed:%d\n", err); + goto done; + } + err = btf_dedup_compact_types(d); + if (err < 0) { + pr_debug("btf_dedup_compact_types failed:%d\n", err); + goto done; + } + err = btf_dedup_remap_types(d); + if (err < 0) { + pr_debug("btf_dedup_remap_types failed:%d\n", err); + goto done; + } + +done: + btf_dedup_free(d); + return libbpf_err(err); +} + +#define BTF_UNPROCESSED_ID ((__u32)-1) +#define BTF_IN_PROGRESS_ID ((__u32)-2) + +struct btf_dedup { + /* .BTF section to be deduped in-place */ + struct btf *btf; + /* + * Optional .BTF.ext section. When provided, any strings referenced + * from it will be taken into account when deduping strings + */ + struct btf_ext *btf_ext; + /* + * This is a map from any type's signature hash to a list of possible + * canonical representative type candidates. Hash collisions are + * ignored, so even types of various kinds can share same list of + * candidates, which is fine because we rely on subsequent + * btf_xxx_equal() checks to authoritatively verify type equality. + */ + struct hashmap *dedup_table; + /* Canonical types map */ + __u32 *map; + /* Hypothetical mapping, used during type graph equivalence checks */ + __u32 *hypot_map; + __u32 *hypot_list; + size_t hypot_cnt; + size_t hypot_cap; + /* Whether hypothetical mapping, if successful, would need to adjust + * already canonicalized types (due to a new forward declaration to + * concrete type resolution). In such case, during split BTF dedup + * candidate type would still be considered as different, because base + * BTF is considered to be immutable. 
+ */ + bool hypot_adjust_canon; + /* Various option modifying behavior of algorithm */ + struct btf_dedup_opts opts; + /* temporary strings deduplication state */ + struct strset *strs_set; +}; + +static long hash_combine(long h, long value) +{ + return h * 31 + value; +} + +#define for_each_dedup_cand(d, node, hash) \ + hashmap__for_each_key_entry(d->dedup_table, node, hash) + +static int btf_dedup_table_add(struct btf_dedup *d, long hash, __u32 type_id) +{ + return hashmap__append(d->dedup_table, hash, type_id); +} + +static int btf_dedup_hypot_map_add(struct btf_dedup *d, + __u32 from_id, __u32 to_id) +{ + if (d->hypot_cnt == d->hypot_cap) { + __u32 *new_list; + + d->hypot_cap += max((size_t)16, d->hypot_cap / 2); + new_list = libbpf_reallocarray(d->hypot_list, d->hypot_cap, sizeof(__u32)); + if (!new_list) + return -ENOMEM; + d->hypot_list = new_list; + } + d->hypot_list[d->hypot_cnt++] = from_id; + d->hypot_map[from_id] = to_id; + return 0; +} + +static void btf_dedup_clear_hypot_map(struct btf_dedup *d) +{ + int i; + + for (i = 0; i < d->hypot_cnt; i++) + d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; + d->hypot_cnt = 0; + d->hypot_adjust_canon = false; +} + +static void btf_dedup_free(struct btf_dedup *d) +{ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + + free(d->map); + d->map = NULL; + + free(d->hypot_map); + d->hypot_map = NULL; + + free(d->hypot_list); + d->hypot_list = NULL; + + free(d); +} + +static size_t btf_dedup_identity_hash_fn(long key, void *ctx) +{ + return key; +} + +static size_t btf_dedup_collision_hash_fn(long key, void *ctx) +{ + return 0; +} + +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx) +{ + return k1 == k2; +} + +static struct btf_dedup *btf_dedup_new(struct btf *btf, const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); + hashmap_hash_fn hash_fn = btf_dedup_identity_hash_fn; + int i, err = 0, type_cnt; + + if (!d) + return ERR_PTR(-ENOMEM); + + if (OPTS_GET(opts, force_collisions, false)) + hash_fn = btf_dedup_collision_hash_fn; + + d->btf = btf; + d->btf_ext = OPTS_GET(opts, btf_ext, NULL); + + d->dedup_table = hashmap__new(hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(d->dedup_table)) { + err = PTR_ERR(d->dedup_table); + d->dedup_table = NULL; + goto done; + } + + type_cnt = btf__type_cnt(btf); + d->map = malloc(sizeof(__u32) * type_cnt); + if (!d->map) { + err = -ENOMEM; + goto done; + } + /* special BTF "void" type is made canonical immediately */ + d->map[0] = 0; + for (i = 1; i < type_cnt; i++) { + struct btf_type *t = btf_type_by_id(d->btf, i); + + /* VAR and DATASEC are never deduped and are self-canonical */ + if (btf_is_var(t) || btf_is_datasec(t)) + d->map[i] = i; + else + d->map[i] = BTF_UNPROCESSED_ID; + } + + d->hypot_map = malloc(sizeof(__u32) * type_cnt); + if (!d->hypot_map) { + err = -ENOMEM; + goto done; + } + for (i = 0; i < type_cnt; i++) + d->hypot_map[i] = BTF_UNPROCESSED_ID; + +done: + if (err) { + btf_dedup_free(d); + return ERR_PTR(err); + } + + return d; +} + +/* + * Iterate over all possible places in .BTF and .BTF.ext that can reference + * string and pass pointer to it to a provided callback `fn`. 
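+ * + * For example, btf_dedup_strings() below passes strs_dedup_remap_str_off() + * as *fn* so that every referenced string offset gets remapped into the + * deduplicated string set: + * + *     err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d);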
+ */ +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; + + return 0; +} + +static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 str_off = *str_off_ptr; + const char *s; + int off, err; + + /* don't touch empty string or string in main BTF */ + if (str_off == 0 || str_off < d->btf->start_str_off) + return 0; + + s = btf__str_by_offset(d->btf, str_off); + if (d->btf->base_btf) { + err = btf__find_str(d->btf->base_btf, s); + if (err >= 0) { + *str_off_ptr = err; + return 0; + } + if (err != -ENOENT) + return err; + } + + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; + + *str_off_ptr = d->btf->start_str_off + off; + return 0; +} + +/* + * Dedup string and filter out those that are not referenced from either .BTF + * or .BTF.ext (if provided) sections. + * + * This is done by building index of all strings in BTF's string section, + * then iterating over all entities that can reference strings (e.g., type + * names, struct field names, .BTF.ext line info, etc) and marking corresponding + * strings as used. After that all used strings are deduped and compacted into + * sequential blob of memory and new offsets are calculated. Then all the string + * references are iterated again and rewritten using new offsets. + */ +static int btf_dedup_strings(struct btf_dedup *d) +{ + int err; + + if (d->btf->strs_deduped) + return 0; + + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); + goto err_out; + } + + if (!d->btf->base_btf) { + /* insert empty string; we won't be looking it up during strings + * dedup, but it's good to have it for generic BTF string lookups + */ + err = strset__add_str(d->strs_set, ""); + if (err < 0) + goto err_out; + } + + /* remap string offsets */ + err = btf_for_each_str_off(d, strs_dedup_remap_str_off, d); + if (err) + goto err_out; + + /* replace BTF string data and hash with deduped ones */ + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; + d->btf->strs_deduped = true; + return 0; + +err_out: + strset__free(d->strs_set); + d->strs_set = NULL; + + return err; +} + +static long btf_hash_common(struct btf_type *t) +{ + long h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + h = hash_combine(h, t->size); + return h; +} + +static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info && + t1->size == t2->size; +} + +/* Calculate type signature hash of INT or TAG. */ +static long btf_hash_int_decl_tag(struct btf_type *t) +{ + __u32 info = *(__u32 *)(t + 1); + long h; + + h = btf_hash_common(t); + h = hash_combine(h, info); + return h; +} + +/* Check structural equality of two INTs or TAGs. */ +static bool btf_equal_int_tag(struct btf_type *t1, struct btf_type *t2) +{ + __u32 info1, info2; + + if (!btf_equal_common(t1, t2)) + return false; + info1 = *(__u32 *)(t1 + 1); + info2 = *(__u32 *)(t2 + 1); + return info1 == info2; +} + +/* Calculate type signature hash of ENUM/ENUM64. 
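+ * Only the name is hashed here, so an enum forward declaration (vlen == 0)
+ * and its full definition, e.g. 'enum E { A = 1 };', land in the same
+ * candidate bucket and btf_compat_enum() can later match them up.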
*/ +static long btf_hash_enum(struct btf_type *t) +{ + long h; + + /* don't hash vlen, enum members and size to support enum fwd resolving */ + h = hash_combine(0, t->name_off); + return h; +} + +static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum(t1); + m2 = btf_enum(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val != m2->val) + return false; + m1++; + m2++; + } + return true; +} + +static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum64 *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum64(t1); + m2 = btf_enum64(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val_lo32 != m2->val_lo32 || + m1->val_hi32 != m2->val_hi32) + return false; + m1++; + m2++; + } + return true; +} + +/* Check structural equality of two ENUMs or ENUM64s. */ +static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + /* t1 & t2 kinds are identical because of btf_equal_common */ + if (btf_kind(t1) == BTF_KIND_ENUM) + return btf_equal_enum_members(t1, t2); + else + return btf_equal_enum64_members(t1, t2); +} + +static inline bool btf_is_enum_fwd(struct btf_type *t) +{ + return btf_is_any_enum(t) && btf_vlen(t) == 0; +} + +static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_is_enum_fwd(t1) && !btf_is_enum_fwd(t2)) + return btf_equal_enum(t1, t2); + /* At this point either t1 or t2 or both are forward declarations, thus: + * - skip comparing vlen because it is zero for forward declarations; + * - skip comparing size to allow enum forward declarations + * to be compatible with enum64 full declarations; + * - skip comparing kind for the same reason. + */ + return t1->name_off == t2->name_off && + btf_is_any_enum(t1) && btf_is_any_enum(t2); +} + +/* + * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. + */ +static long btf_hash_struct(struct btf_type *t) +{ + const struct btf_member *member = btf_members(t); + __u32 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->offset); + /* no hashing of referenced type ID, it can be unresolved yet */ + member++; + } + return h; +} + +/* + * Check structural compatibility of two STRUCTs/UNIONs, ignoring referenced + * type IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_member *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->offset != m2->offset) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Calculate type signature hash of ARRAY, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. 
+ */ +static long btf_hash_array(struct btf_type *t) +{ + const struct btf_array *info = btf_array(t); + long h = btf_hash_common(t); + + h = hash_combine(h, info->type); + h = hash_combine(h, info->index_type); + h = hash_combine(h, info->nelems); + return h; +} + +/* + * Check exact equality of two ARRAYs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * ARRAY to potential canonical representative. + */ +static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_array *info1, *info2; + + if (!btf_equal_common(t1, t2)) + return false; + + info1 = btf_array(t1); + info2 = btf_array(t2); + return info1->type == info2->type && + info1->index_type == info2->index_type && + info1->nelems == info2->nelems; +} + +/* + * Check structural compatibility of two ARRAYs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + return btf_array(t1)->nelems == btf_array(t2)->nelems; +} + +/* + * Calculate type signature hash of FUNC_PROTO, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static long btf_hash_fnproto(struct btf_type *t) +{ + const struct btf_param *member = btf_params(t); + __u16 vlen = btf_vlen(t); + long h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->type); + member++; + } + return h; +} + +/* + * Check exact equality of two FUNC_PROTOs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * FUNC_PROTO to potential canonical representative. + */ +static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->type != m2->type) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_param *m1, *m2; + __u16 vlen; + int i; + + /* skip return type ID */ + if (t1->name_off != t2->name_off || t1->info != t2->info) + return false; + + vlen = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off) + return false; + m1++; + m2++; + } + return true; +} + +/* Prepare split BTF for deduplication by calculating hashes of base BTF's + * types and initializing the rest of the state (canonical type mapping) for + * the fixed base BTF part. 
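+ * A typical split setup is kernel module BTF on top of vmlinux BTF (see
+ * btf__load_module_btf()): base types are hashed so they can serve as
+ * canonical candidates, but they are never remapped or renumbered
+ * themselves.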
+ */ +static int btf_dedup_prep(struct btf_dedup *d) +{ + struct btf_type *t; + int type_id; + long h; + + if (!d->btf->base_btf) + return 0; + + for (type_id = 1; type_id < d->btf->start_id; type_id++) { + t = btf_type_by_id(d->btf, type_id); + + /* all base BTF types are self-canonical by definition */ + d->map[type_id] = type_id; + + switch (btf_kind(t)) { + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + /* VAR and DATASEC are never hash/deduplicated */ + continue; + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_FWD: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_FLOAT: + case BTF_KIND_TYPE_TAG: + h = btf_hash_common(t); + break; + case BTF_KIND_INT: + case BTF_KIND_DECL_TAG: + h = btf_hash_int_decl_tag(t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + h = btf_hash_struct(t); + break; + case BTF_KIND_ARRAY: + h = btf_hash_array(t); + break; + case BTF_KIND_FUNC_PROTO: + h = btf_hash_fnproto(t); + break; + default: + pr_debug("unknown kind %d for type [%d]\n", btf_kind(t), type_id); + return -EINVAL; + } + if (btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + } + + return 0; +} + +/* + * Deduplicate primitive types, that can't reference other types, by calculating + * their type signature hash and comparing them with any possible canonical + * candidate. If no canonical candidate matches, type itself is marked as + * canonical and is added into `btf_dedup->dedup_table` as another candidate. + */ +static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + struct hashmap_entry *hash_entry; + struct btf_type *cand; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u32 cand_id; + long h; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_ARRAY: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return 0; + + case BTF_KIND_INT: + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + h = btf_hash_enum(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_enum(t, cand)) { + new_id = cand_id; + break; + } + if (btf_compat_enum(t, cand)) { + if (btf_is_enum_fwd(t)) { + /* resolve fwd to full enum */ + new_id = cand_id; + break; + } + /* resolve canonical enum fwd to full enum */ + d->map[cand_id] = type_id; + } + } + break; + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_prim_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) 
{ + err = btf_dedup_prim_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Check whether type is already mapped into canonical one (could be to itself). + */ +static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id) +{ + return d->map[type_id] <= BTF_MAX_NR_TYPES; +} + +/* + * Resolve type ID into its canonical type ID, if any; otherwise return original + * type ID. If type is FWD and is resolved into STRUCT/UNION already, follow + * STRUCT/UNION link and resolve it into canonical type ID as well. + */ +static inline __u32 resolve_type_id(struct btf_dedup *d, __u32 type_id) +{ + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + return type_id; +} + +/* + * Resolve FWD to underlying STRUCT/UNION, if any; otherwise return original + * type ID. + */ +static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id) +{ + __u32 orig_type_id = type_id; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + + if (!btf_is_fwd(btf__type_by_id(d->btf, type_id))) + return type_id; + + return orig_type_id; +} + + +static inline __u16 btf_fwd_kind(struct btf_type *t) +{ + return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT; +} + +/* Check if given two types are identical ARRAY definitions */ +static bool btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + struct btf_type *t1, *t2; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + if (!btf_is_array(t1) || !btf_is_array(t2)) + return false; + + return btf_equal_array(t1, t2); +} + +/* Check if given two types are identical STRUCT/UNION definitions */ +static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id2) +{ + const struct btf_member *m1, *m2; + struct btf_type *t1, *t2; + int n, i; + + t1 = btf_type_by_id(d->btf, id1); + t2 = btf_type_by_id(d->btf, id2); + + if (!btf_is_composite(t1) || btf_kind(t1) != btf_kind(t2)) + return false; + + if (!btf_shallow_equal_struct(t1, t2)) + return false; + + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { + if (m1->type != m2->type && + !btf_dedup_identical_arrays(d, m1->type, m2->type) && + !btf_dedup_identical_structs(d, m1->type, m2->type)) + return false; + } + return true; +} + +/* + * Check equivalence of BTF type graph formed by candidate struct/union (we'll + * call it "candidate graph" in this description for brevity) to a type graph + * formed by (potential) canonical struct/union ("canonical graph" for brevity + * here, though keep in mind that not all types in canonical graph are + * necessarily canonical representatives themselves, some of them might be + * duplicates or its uniqueness might not have been established yet). + * Returns: + * - >0, if type graphs are equivalent; + * - 0, if not equivalent; + * - <0, on error. + * + * Algorithm performs side-by-side DFS traversal of both type graphs and checks + * equivalence of BTF types at each step. If at any point BTF types in candidate + * and canonical graphs are not compatible structurally, whole graphs are + * incompatible. If types are structurally equivalent (i.e., all information + * except referenced type IDs is exactly the same), a mapping from `canon_id` to + * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). 
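+ * (A self-referential type such as 'struct list_head { struct list_head
+ * *next; };' is why this hypothetical mapping is needed: the traversal
+ * will reach 'list_head' again through its own pointer and must treat the
+ * already-recorded pairing as equivalent rather than recurse forever.)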
+ * If a type references other types, then those referenced types are checked + * for equivalence recursively. + * + * During DFS traversal, if we find that for current `canon_id` type we + * already have some mapping in hypothetical map, we check for two possible + * situations: + * - `canon_id` is mapped to exactly the same type as `cand_id`. This will + * happen when type graphs have cycles. In this case we assume those two + * types are equivalent. + * - `canon_id` is mapped to different type. This is contradiction in our + * hypothetical mapping, because same graph in canonical graph corresponds + * to two different types in candidate graph, which for equivalent type + * graphs shouldn't happen. This condition terminates equivalence check + * with negative result. + * + * If type graphs traversal exhausts types to check and find no contradiction, + * then type graphs are equivalent. + * + * When checking types for equivalence, there is one special case: FWD types. + * If FWD type resolution is allowed and one of the types (either from canonical + * or candidate graph) is FWD and other is STRUCT/UNION (depending on FWD's kind + * flag) and their names match, hypothetical mapping is updated to point from + * FWD to STRUCT/UNION. If graphs will be determined as equivalent successfully, + * this mapping will be used to record FWD -> STRUCT/UNION mapping permanently. + * + * Technically, this could lead to incorrect FWD to STRUCT/UNION resolution, + * if there are two exactly named (or anonymous) structs/unions that are + * compatible structurally, one of which has FWD field, while other is concrete + * STRUCT/UNION, but according to C sources they are different structs/unions + * that are referencing different types with the same name. This is extremely + * unlikely to happen, but btf_dedup API allows to disable FWD resolution if + * this logic is causing problems. + * + * Doing FWD resolution means that both candidate and/or canonical graphs can + * consists of portions of the graph that come from multiple compilation units. + * This is due to the fact that types within single compilation unit are always + * deduplicated and FWDs are already resolved, if referenced struct/union + * definiton is available. So, if we had unresolved FWD and found corresponding + * STRUCT/UNION, they will be from different compilation units. This + * consequently means that when we "link" FWD to corresponding STRUCT/UNION, + * type graph will likely have at least two different BTF types that describe + * same type (e.g., most probably there will be two different BTF types for the + * same 'int' primitive type) and could even have "overlapping" parts of type + * graph that describe same subset of types. + * + * This in turn means that our assumption that each type in canonical graph + * must correspond to exactly one type in candidate graph might not hold + * anymore and will make it harder to detect contradictions using hypothetical + * map. To handle this problem, we allow to follow FWD -> STRUCT/UNION + * resolution only in canonical graph. FWDs in candidate graphs are never + * resolved. To see why it's OK, let's check all possible situations w.r.t. FWDs + * that can occur: + * - Both types in canonical and candidate graphs are FWDs. If they are + * structurally equivalent, then they can either be both resolved to the + * same STRUCT/UNION or not resolved at all. In both cases they are + * equivalent and there is no need to resolve FWD on candidate side. 
+ * - Both types in canonical and candidate graphs are concrete STRUCT/UNION, + * so nothing to resolve as well, algorithm will check equivalence anyway. + * - Type in canonical graph is FWD, while type in candidate is concrete + * STRUCT/UNION. In this case candidate graph comes from single compilation + * unit, so there is exactly one BTF type for each unique C type. After + * resolving FWD into STRUCT/UNION, there might be more than one BTF type + * in canonical graph mapping to single BTF type in candidate graph, but + * because hypothetical mapping maps from canonical to candidate types, it's + * alright, and we still maintain the property of having single `canon_id` + * mapping to single `cand_id` (there could be two different `canon_id` + * mapped to the same `cand_id`, but it's not contradictory). + * - Type in canonical graph is concrete STRUCT/UNION, while type in candidate + * graph is FWD. In this case we are just going to check compatibility of + * STRUCT/UNION and corresponding FWD, and if they are compatible, we'll + * assume that whatever STRUCT/UNION FWD resolves to must be equivalent to + * a concrete STRUCT/UNION from canonical graph. If the rest of type graphs + * turn out equivalent, we'll re-resolve FWD to concrete STRUCT/UNION from + * canonical graph. + */ +static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, + __u32 canon_id) +{ + struct btf_type *cand_type; + struct btf_type *canon_type; + __u32 hypot_type_id; + __u16 cand_kind; + __u16 canon_kind; + int i, eq; + + /* if both resolve to the same canonical, they must be equivalent */ + if (resolve_type_id(d, cand_id) == resolve_type_id(d, canon_id)) + return 1; + + canon_id = resolve_fwd_id(d, canon_id); + + hypot_type_id = d->hypot_map[canon_id]; + if (hypot_type_id <= BTF_MAX_NR_TYPES) { + if (hypot_type_id == cand_id) + return 1; + /* In some cases compiler will generate different DWARF types + * for *identical* array type definitions and use them for + * different fields within the *same* struct. This breaks type + * equivalence check, which makes an assumption that candidate + * types sub-graph has a consistent and deduped-by-compiler + * types within a single CU. So work around that by explicitly + * allowing identical array types here. + */ + if (btf_dedup_identical_arrays(d, hypot_type_id, cand_id)) + return 1; + /* It turns out that similar situation can happen with + * struct/union sometimes, sigh... Handle the case where + * structs/unions are exactly the same, down to the referenced + * type IDs. Anything more complicated (e.g., if referenced + * types are different, but equivalent) is *way more* + * complicated and requires a many-to-many equivalence mapping. 
+ */ + if (btf_dedup_identical_structs(d, hypot_type_id, cand_id)) + return 1; + return 0; + } + + if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) + return -ENOMEM; + + cand_type = btf_type_by_id(d->btf, cand_id); + canon_type = btf_type_by_id(d->btf, canon_id); + cand_kind = btf_kind(cand_type); + canon_kind = btf_kind(canon_type); + + if (cand_type->name_off != canon_type->name_off) + return 0; + + /* FWD <--> STRUCT/UNION equivalence check, if enabled */ + if ((cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD) + && cand_kind != canon_kind) { + __u16 real_kind; + __u16 fwd_kind; + + if (cand_kind == BTF_KIND_FWD) { + real_kind = canon_kind; + fwd_kind = btf_fwd_kind(cand_type); + } else { + real_kind = cand_kind; + fwd_kind = btf_fwd_kind(canon_type); + /* we'd need to resolve base FWD to STRUCT/UNION */ + if (fwd_kind == real_kind && canon_id < d->btf->start_id) + d->hypot_adjust_canon = true; + } + return fwd_kind == real_kind; + } + + if (cand_kind != canon_kind) + return 0; + + switch (cand_kind) { + case BTF_KIND_INT: + return btf_equal_int_tag(cand_type, canon_type); + + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return btf_compat_enum(cand_type, canon_type); + + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + return btf_equal_common(cand_type, canon_type); + + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + if (cand_type->info != canon_type->info) + return 0; + return btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + + case BTF_KIND_ARRAY: { + const struct btf_array *cand_arr, *canon_arr; + + if (!btf_compat_array(cand_type, canon_type)) + return 0; + cand_arr = btf_array(cand_type); + canon_arr = btf_array(canon_type); + eq = btf_dedup_is_equiv(d, cand_arr->index_type, canon_arr->index_type); + if (eq <= 0) + return eq; + return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *cand_m, *canon_m; + __u16 vlen; + + if (!btf_shallow_equal_struct(cand_type, canon_type)) + return 0; + vlen = btf_vlen(cand_type); + cand_m = btf_members(cand_type); + canon_m = btf_members(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); + if (eq <= 0) + return eq; + cand_m++; + canon_m++; + } + + return 1; + } + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *cand_p, *canon_p; + __u16 vlen; + + if (!btf_compat_fnproto(cand_type, canon_type)) + return 0; + eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + if (eq <= 0) + return eq; + vlen = btf_vlen(cand_type); + cand_p = btf_params(cand_type); + canon_p = btf_params(canon_type); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type); + if (eq <= 0) + return eq; + cand_p++; + canon_p++; + } + return 1; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Use hypothetical mapping, produced by successful type graph equivalence + * check, to augment existing struct/union canonical mapping, where possible. + * + * If BTF_KIND_FWD resolution is allowed, this mapping is also used to record + * FWD -> STRUCT/UNION correspondence as well. FWD resolution is bidirectional: + * it doesn't matter if FWD type was part of canonical graph or candidate one, + * we are recording the mapping anyway. 
As opposed to carefulness required + * for struct/union correspondence mapping (described below), for FWD resolution + * it's not important, as by the time that FWD type (reference type) will be + * deduplicated all structs/unions will be deduped already anyway. + * + * Recording STRUCT/UNION mapping is purely a performance optimization and is + * not required for correctness. It needs to be done carefully to ensure that + * struct/union from candidate's type graph is not mapped into corresponding + * struct/union from canonical type graph that itself hasn't been resolved into + * canonical representative. The only guarantee we have is that canonical + * struct/union was determined as canonical and that won't change. But any + * types referenced through that struct/union fields could have been not yet + * resolved, so in case like that it's too early to establish any kind of + * correspondence between structs/unions. + * + * No canonical correspondence is derived for primitive types (they are already + * deduplicated completely already anyway) or reference types (they rely on + * stability of struct/union canonical relationship for equivalence checks). + */ +static void btf_dedup_merge_hypot_map(struct btf_dedup *d) +{ + __u32 canon_type_id, targ_type_id; + __u16 t_kind, c_kind; + __u32 t_id, c_id; + int i; + + for (i = 0; i < d->hypot_cnt; i++) { + canon_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[canon_type_id]; + t_id = resolve_type_id(d, targ_type_id); + c_id = resolve_type_id(d, canon_type_id); + t_kind = btf_kind(btf__type_by_id(d->btf, t_id)); + c_kind = btf_kind(btf__type_by_id(d->btf, c_id)); + /* + * Resolve FWD into STRUCT/UNION. + * It's ok to resolve FWD into STRUCT/UNION that's not yet + * mapped to canonical representative (as opposed to + * STRUCT/UNION <--> STRUCT/UNION mapping logic below), because + * eventually that struct is going to be mapped and all resolved + * FWDs will automatically resolve to correct canonical + * representative. This will happen before ref type deduping, + * which critically depends on stability of these mapping. This + * stability is not a requirement for STRUCT/UNION equivalence + * checks, though. + */ + + /* if it's the split BTF case, we still need to point base FWD + * to STRUCT/UNION in a split BTF, because FWDs from split BTF + * will be resolved against base FWD. If we don't point base + * canonical FWD to the resolved STRUCT/UNION, then all the + * FWDs in split BTF won't be correctly resolved to a proper + * STRUCT/UNION. + */ + if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) + d->map[c_id] = t_id; + + /* if graph equivalence determined that we'd need to adjust + * base canonical types, then we need to only point base FWDs + * to STRUCTs/UNIONs and do no more modifications. For all + * other purposes the type graphs were not equivalent. + */ + if (d->hypot_adjust_canon) + continue; + + if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + d->map[t_id] = c_id; + + if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && + c_kind != BTF_KIND_FWD && + is_type_mapped(d, c_id) && + !is_type_mapped(d, t_id)) { + /* + * as a perf optimization, we can map struct/union + * that's part of type graph we just verified for + * equivalence. We can do that for struct/union that has + * canonical representative only, though. + */ + d->map[t_id] = c_id; + } + } +} + +/* + * Deduplicate struct/union types. 
+ * + * For each struct/union type its type signature hash is calculated, taking + * into account type's name, size, number, order and names of fields, but + * ignoring type ID's referenced from fields, because they might not be deduped + * completely until after reference types deduplication phase. This type hash + * is used to iterate over all potential canonical types, sharing same hash. + * For each canonical candidate we check whether type graphs that they form + * (through referenced types in fields and so on) are equivalent using algorithm + * implemented in `btf_dedup_is_equiv`. If such equivalence is found and + * BTF_KIND_FWD resolution is allowed, then hypothetical mapping + * (btf_dedup->hypot_map) produced by aforementioned type graph equivalence + * algorithm is used to record FWD -> STRUCT/UNION mapping. It's also used to + * potentially map other structs/unions to their canonical representatives, + * if such relationship hasn't yet been established. This speeds up algorithm + * by eliminating some of the duplicate work. + * + * If no matching canonical representative was found, struct/union is marked + * as canonical for itself and is added into btf_dedup->dedup_table hash map + * for further look ups. + */ +static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *cand_type, *t; + struct hashmap_entry *hash_entry; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u16 kind; + long h; + + /* already deduped or is in process of deduping (loop detected) */ + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return 0; + + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + return 0; + + h = btf_hash_struct(t); + for_each_dedup_cand(d, hash_entry, h) { + __u32 cand_id = hash_entry->value; + int eq; + + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = btf_type_by_id(d->btf, cand_id); + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + + btf_dedup_clear_hypot_map(d); + eq = btf_dedup_is_equiv(d, type_id, cand_id); + if (eq < 0) + return eq; + if (!eq) + continue; + btf_dedup_merge_hypot_map(d); + if (d->hypot_adjust_canon) /* not really equivalent */ + continue; + new_id = cand_id; + break; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_struct_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, d->btf->start_id + i); + if (err) + return err; + } + return 0; +} + +/* + * Deduplicate reference type. + * + * Once all primitive and struct/union types got deduplicated, we can easily + * deduplicate all other (reference) BTF types. This is done in two steps: + * + * 1. Resolve all referenced type IDs into their canonical type IDs. This + * resolution can be done either immediately for primitive or struct/union types + * (because they were deduped in previous two phases) or recursively for + * reference types. 
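+ * For instance, for a 'const int *' the chain is PTR -> CONST -> INT: INT
+ * is already canonical after the primitive types pass, so CONST is hashed
+ * using the canonical INT ID and deduped, and then PTR is hashed using
+ * the canonical CONST ID.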
Recursion will always terminate at either primitive or + * struct/union type, at which point we can "unwind" chain of reference types + * one by one. There is no danger of encountering cycles because in C type + * system the only way to form type cycle is through struct/union, so any chain + * of reference types, even those taking part in a type cycle, will inevitably + * reach struct/union at some point. + * + * 2. Once all referenced type IDs are resolved into canonical ones, BTF type + * becomes "stable", in the sense that no further deduplication will cause + * any changes to it. With that, it's now possible to calculate type's signature + * hash (this time taking into account referenced type IDs) and loop over all + * potential canonical representatives. If no match was found, current type + * will become canonical representative of itself and will be added into + * btf_dedup->dedup_table as another possible canonical representative. + */ +static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) +{ + struct hashmap_entry *hash_entry; + __u32 new_id = type_id, cand_id; + struct btf_type *t, *cand; + /* if we don't find equivalent type, then we are representative type */ + int ref_type_id; + long h; + + if (d->map[type_id] == BTF_IN_PROGRESS_ID) + return -ELOOP; + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return resolve_type_id(d, type_id); + + t = btf_type_by_id(d->btf, type_id); + d->map[type_id] = BTF_IN_PROGRESS_ID; + + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_TYPE_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_common(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_common(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_DECL_TAG: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_int_decl_tag(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_int_tag(t, cand)) { + new_id = cand_id; + break; + } + } + break; + + case BTF_KIND_ARRAY: { + struct btf_array *info = btf_array(t); + + ref_type_id = btf_dedup_ref_type(d, info->type); + if (ref_type_id < 0) + return ref_type_id; + info->type = ref_type_id; + + ref_type_id = btf_dedup_ref_type(d, info->index_type); + if (ref_type_id < 0) + return ref_type_id; + info->index_type = ref_type_id; + + h = btf_hash_array(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_array(t, cand)) { + new_id = cand_id; + break; + } + } + break; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *param; + __u16 vlen; + int i; + + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + vlen = btf_vlen(t); + param = btf_params(t); + for (i = 0; i < vlen; i++) { + ref_type_id = btf_dedup_ref_type(d, param->type); + if (ref_type_id < 0) + return ref_type_id; + param->type = ref_type_id; + param++; + } + + h = btf_hash_fnproto(t); + for_each_dedup_cand(d, hash_entry, h) { + cand_id = hash_entry->value; + cand = btf_type_by_id(d->btf, cand_id); + if (btf_equal_fnproto(t, cand)) { + new_id = cand_id; + 
break; + } + } + break; + } + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return new_id; +} + +static int btf_dedup_ref_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, d->btf->start_id + i); + if (err < 0) + return err; + } + /* we won't need d->dedup_table anymore */ + hashmap__free(d->dedup_table); + d->dedup_table = NULL; + return 0; +} + +/* + * Collect a map from type names to type ids for all canonical structs + * and unions. If the same name is shared by several canonical types + * use a special value 0 to indicate this fact. + */ +static int btf_dedup_fill_unique_names_map(struct btf_dedup *d, struct hashmap *names_map) +{ + __u32 nr_types = btf__type_cnt(d->btf); + struct btf_type *t; + __u32 type_id; + __u16 kind; + int err; + + /* + * Iterate over base and split module ids in order to get all + * available structs in the map. + */ + for (type_id = 1; type_id < nr_types; ++type_id) { + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + continue; + + /* Skip non-canonical types */ + if (type_id != d->map[type_id]) + continue; + + err = hashmap__add(names_map, t->name_off, type_id); + if (err == -EEXIST) + err = hashmap__set(names_map, t->name_off, 0, NULL, NULL); + + if (err) + return err; + } + + return 0; +} + +static int btf_dedup_resolve_fwd(struct btf_dedup *d, struct hashmap *names_map, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + enum btf_fwd_kind fwd_kind = btf_kflag(t); + __u16 cand_kind, kind = btf_kind(t); + struct btf_type *cand_t; + uintptr_t cand_id; + + if (kind != BTF_KIND_FWD) + return 0; + + /* Skip if this FWD already has a mapping */ + if (type_id != d->map[type_id]) + return 0; + + if (!hashmap__find(names_map, t->name_off, &cand_id)) + return 0; + + /* Zero is a special value indicating that name is not unique */ + if (!cand_id) + return 0; + + cand_t = btf_type_by_id(d->btf, cand_id); + cand_kind = btf_kind(cand_t); + if ((cand_kind == BTF_KIND_STRUCT && fwd_kind != BTF_FWD_STRUCT) || + (cand_kind == BTF_KIND_UNION && fwd_kind != BTF_FWD_UNION)) + return 0; + + d->map[type_id] = cand_id; + + return 0; +} + +/* + * Resolve unambiguous forward declarations. + * + * The lion's share of all FWD declarations is resolved during + * `btf_dedup_struct_types` phase when different type graphs are + * compared against each other. However, if in some compilation unit a + * FWD declaration is not a part of a type graph compared against + * another type graph that declaration's canonical type would not be + * changed. Example: + * + * CU #1: + * + * struct foo; + * struct foo *some_global; + * + * CU #2: + * + * struct foo { int u; }; + * struct foo *another_global; + * + * After `btf_dedup_struct_types` the BTF looks as follows: + * + * [1] STRUCT 'foo' size=4 vlen=1 ... + * [2] INT 'int' size=4 ... + * [3] PTR '(anon)' type_id=1 + * [4] FWD 'foo' fwd_kind=struct + * [5] PTR '(anon)' type_id=4 + * + * This pass assumes that such FWD declarations should be mapped to + * structs or unions with identical name in case if the name is not + * ambiguous. 
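+ *
+ * In the example above this pass maps [4] FWD 'foo' to [1] STRUCT 'foo',
+ * so after compaction and type ID remapping both [3] and [5] end up
+ * pointing to the same struct.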
+ */ +static int btf_dedup_resolve_fwds(struct btf_dedup *d) +{ + int i, err; + struct hashmap *names_map; + + names_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(names_map)) + return PTR_ERR(names_map); + + err = btf_dedup_fill_unique_names_map(d, names_map); + if (err < 0) + goto exit; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_resolve_fwd(d, names_map, d->btf->start_id + i); + if (err < 0) + break; + } + +exit: + hashmap__free(names_map); + return err; +} + +/* + * Compact types. + * + * After we established for each type its corresponding canonical representative + * type, we now can eliminate types that are not canonical and leave only + * canonical ones layed out sequentially in memory by copying them over + * duplicates. During compaction btf_dedup->hypot_map array is reused to store + * a map from original type ID to a new compacted type ID, which will be used + * during next phase to "fix up" type IDs, referenced from struct/union and + * reference types. + */ +static int btf_dedup_compact_types(struct btf_dedup *d) +{ + __u32 *new_offs; + __u32 next_type_id = d->btf->start_id; + const struct btf_type *t; + void *p; + int i, id, len; + + /* we are going to reuse hypot_map to store compaction remapping */ + d->hypot_map[0] = 0; + /* base BTF types are not renumbered */ + for (id = 1; id < d->btf->start_id; id++) + d->hypot_map[id] = id; + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) + d->hypot_map[id] = BTF_UNPROCESSED_ID; + + p = d->btf->types_data; + + for (i = 0, id = d->btf->start_id; i < d->btf->nr_types; i++, id++) { + if (d->map[id] != id) + continue; + + t = btf__type_by_id(d->btf, id); + len = btf_type_size(t); + if (len < 0) + return len; + + memmove(p, t, len); + d->hypot_map[id] = next_type_id; + d->btf->type_offs[next_type_id - d->btf->start_id] = p - d->btf->types_data; + p += len; + next_type_id++; + } + + /* shrink struct btf's internal types index and update btf_header */ + d->btf->nr_types = next_type_id - d->btf->start_id; + d->btf->type_offs_cap = d->btf->nr_types; + d->btf->hdr->type_len = p - d->btf->types_data; + new_offs = libbpf_reallocarray(d->btf->type_offs, d->btf->type_offs_cap, + sizeof(*new_offs)); + if (d->btf->type_offs_cap && !new_offs) + return -ENOMEM; + d->btf->type_offs = new_offs; + d->btf->hdr->str_off = d->btf->hdr->type_len; + d->btf->raw_size = d->btf->hdr->hdr_len + d->btf->hdr->type_len + d->btf->hdr->str_len; + return 0; +} + +/* + * Figure out final (deduplicated and compacted) type ID for provided original + * `type_id` by first resolving it into corresponding canonical type ID and + * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, + * which is populated during compaction phase. + */ +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_dedup *d = ctx; + __u32 resolved_type_id, new_type_id; + + resolved_type_id = resolve_type_id(d, *type_id); + new_type_id = d->hypot_map[resolved_type_id]; + if (new_type_id > BTF_MAX_NR_TYPES) + return -EINVAL; + + *type_id = new_type_id; + return 0; +} + +/* + * Remap referenced type IDs into deduped type IDs. + * + * After BTF types are deduplicated and compacted, their final type IDs may + * differ from original ones. The map from original to a corresponding + * deduped type ID is stored in btf_dedup->hypot_map and is populated during + * compaction phase. 
During remapping phase we are rewriting all type IDs + * referenced from any BTF type (e.g., struct fields, func proto args, etc) to + * their final deduped type IDs. + */ +static int btf_dedup_remap_types(struct btf_dedup *d) +{ + int i, r; + + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) + return r; + } + + if (!d->btf_ext) + return 0; + + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; + + return 0; +} + +/* + * Probe few well-known locations for vmlinux kernel image and try to load BTF + * data out of it to use for target BTF. + */ +struct btf *btf__load_vmlinux_btf(void) +{ + const char *locations[] = { + /* try canonical vmlinux BTF through sysfs first */ + "/sys/kernel/btf/vmlinux", + /* fall back to trying to find vmlinux on disk otherwise */ + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", + }; + char path[PATH_MAX + 1]; + struct utsname buf; + struct btf *btf; + int i, err; + + uname(&buf); + + for (i = 0; i < ARRAY_SIZE(locations); i++) { + snprintf(path, PATH_MAX, locations[i], buf.release); + + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) + continue; + + btf = btf__parse(path, NULL); + err = libbpf_get_error(btf); + pr_debug("loading kernel BTF '%s': %d\n", path, err); + if (err) + continue; + + return btf; + } + + pr_warn("failed to find valid kernel BTF\n"); + return libbpf_err_ptr(-ESRCH); +} + +struct btf *libbpf_find_kernel_btf(void) __attribute__((alias("btf__load_vmlinux_btf"))); + +struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf) +{ + char path[80]; + + snprintf(path, sizeof(path), "/sys/kernel/btf/%s", module_name); + return btf__parse_split(path, vmlinux_btf); +} + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = 
visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM64: { + struct btf_enum64 *m = btf_enum64(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/third_party/libbpf/src/btf.h b/third_party/libbpf/src/btf.h new file mode 100644 index 0000000..8e6880d --- /dev/null +++ b/third_party/libbpf/src/btf.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2018 Facebook */ +/*! 
\file */ + +#ifndef __LIBBPF_BTF_H +#define __LIBBPF_BTF_H + +#include +#include +#include +#include + +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BTF_ELF_SEC ".BTF" +#define BTF_EXT_ELF_SEC ".BTF.ext" +#define MAPS_ELF_SEC ".maps" + +struct btf; +struct btf_ext; +struct btf_type; + +struct bpf_object; + +enum btf_endianness { + BTF_LITTLE_ENDIAN = 0, + BTF_BIG_ENDIAN = 1, +}; + +/** + * @brief **btf__free()** frees all data of a BTF object + * @param btf BTF object to free + */ +LIBBPF_API void btf__free(struct btf *btf); + +/** + * @brief **btf__new()** creates a new instance of a BTF object from the raw + * bytes of an ELF's BTF section + * @param data raw bytes + * @param size number of bytes passed in `data` + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new(const void *data, __u32 size); + +/** + * @brief **btf__new_split()** create a new instance of a BTF object from the + * provided raw data bytes. It takes another BTF instance, **base_btf**, which + * serves as a base BTF, which is extended by types in a newly created BTF + * instance + * @param data raw bytes + * @param size length of raw bytes + * @param base_btf the base BTF object + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_split()` is equivalent to `btf__new()` and + * creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf); + +/** + * @brief **btf__new_empty()** creates an empty BTF object. Use + * `btf__add_*()` to populate such BTF object. + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty(void); + +/** + * @brief **btf__new_empty_split()** creates an unpopulated BTF object from an + * ELF BTF section except with a base BTF on top of which split BTF should be + * based + * @return new BTF object instance which has to be eventually freed with + * **btf__free()** + * + * If *base_btf* is NULL, `btf__new_empty_split()` is equivalent to + * `btf__new_empty()` and creates non-split BTF. + * + * On error, error-code-encoded-as-pointer is returned, not a NULL. To extract + * error code from such a pointer `libbpf_get_error()` should be used. 
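+ * For example (sketch):
+ *
+ *   struct btf *btf = btf__new_empty_split(base_btf);
+ *   long err = libbpf_get_error(btf);
+ *
+ *   if (err)
+ *           ... btf is not usable, err is a negative errno ...
+ *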
If + * `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` is enabled, NULL is + * returned on error instead. In both cases thread-local `errno` variable is + * always set to error code as well. + */ +LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); + +LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); +LIBBPF_API struct btf *btf__parse_elf_split(const char *path, struct btf *base_btf); +LIBBPF_API struct btf *btf__parse_raw(const char *path); +LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf); + +LIBBPF_API struct btf *btf__load_vmlinux_btf(void); +LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); + +LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); +LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); + +LIBBPF_API int btf__load_into_kernel(struct btf *btf); +LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, + const char *type_name); +LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, + const char *type_name, __u32 kind); +LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); +LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); +LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, + __u32 id); +LIBBPF_API size_t btf__pointer_size(const struct btf *btf); +LIBBPF_API int btf__set_pointer_size(struct btf *btf, size_t ptr_sz); +LIBBPF_API enum btf_endianness btf__endianness(const struct btf *btf); +LIBBPF_API int btf__set_endianness(struct btf *btf, enum btf_endianness endian); +LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); +LIBBPF_API int btf__align_of(const struct btf *btf, __u32 id); +LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API void btf__set_fd(struct btf *btf, int fd); +LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); +LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); + +LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); +LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); +LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); + +LIBBPF_API int btf__find_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); +/** + * @brief **btf__add_btf()** appends all the BTF types from *src_btf* into *btf* + * @param btf BTF object which all the BTF types and strings are added to + * @param src_btf BTF object which all BTF types and referenced strings are copied from + * @return BTF type ID of the first appended BTF type, or negative error code + * + * **btf__add_btf()** can be used to simply and efficiently append the entire + * contents of one BTF object to another one. All the BTF type data is copied + * over, all referenced type IDs are adjusted by adding a necessary ID offset. 
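+ * E.g., a reference to type [5] inside *src_btf* is rewritten to refer to
+ * type [R + 5 - 1], where R is the returned ID of the first appended type.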
+ * Only strings referenced from BTF types are copied over and deduplicated, so + * if there were some unused strings in *src_btf*, those won't be copied over, + * which is consistent with the general string deduplication semantics of BTF + * writing APIs. + * + * If any error is encountered during this process, the contents of *btf* is + * left intact, which means that **btf__add_btf()** follows the transactional + * semantics and the operation as a whole is all-or-nothing. + * + * *src_btf* has to be non-split BTF, as of now copying types from split BTF + * is not supported and will result in -ENOTSUP error code returned. + */ +LIBBPF_API int btf__add_btf(struct btf *btf, const struct btf *src_btf); + +LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); +LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); +LIBBPF_API int btf__add_ptr(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_array(struct btf *btf, + int index_type_id, int elem_type_id, __u32 nr_elems); +/* struct/union construction APIs */ +LIBBPF_API int btf__add_struct(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_union(struct btf *btf, const char *name, __u32 sz); +LIBBPF_API int btf__add_field(struct btf *btf, const char *name, int field_type_id, + __u32 bit_offset, __u32 bit_size); + +/* enum construction APIs */ +LIBBPF_API int btf__add_enum(struct btf *btf, const char *name, __u32 bytes_sz); +LIBBPF_API int btf__add_enum_value(struct btf *btf, const char *name, __s64 value); +LIBBPF_API int btf__add_enum64(struct btf *btf, const char *name, __u32 bytes_sz, bool is_signed); +LIBBPF_API int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value); + +enum btf_fwd_kind { + BTF_FWD_STRUCT = 0, + BTF_FWD_UNION = 1, + BTF_FWD_ENUM = 2, +}; + +LIBBPF_API int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind); +LIBBPF_API int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id); +LIBBPF_API int btf__add_volatile(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_const(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_restrict(struct btf *btf, int ref_type_id); +LIBBPF_API int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id); + +/* func and func_proto construction APIs */ +LIBBPF_API int btf__add_func(struct btf *btf, const char *name, + enum btf_func_linkage linkage, int proto_type_id); +LIBBPF_API int btf__add_func_proto(struct btf *btf, int ret_type_id); +LIBBPF_API int btf__add_func_param(struct btf *btf, const char *name, int type_id); + +/* var & datasec construction APIs */ +LIBBPF_API int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id); +LIBBPF_API int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz); +LIBBPF_API int btf__add_datasec_var_info(struct btf *btf, int var_type_id, + __u32 offset, __u32 byte_sz); + +/* tag construction API */ +LIBBPF_API int btf__add_decl_tag(struct btf *btf, const char *value, int ref_type_id, + int component_idx); + +struct btf_dedup_opts { + size_t sz; + /* optional .BTF.ext info to dedup along the main BTF info */ + struct btf_ext *btf_ext; + /* force hash collisions (used for testing) */ + bool force_collisions; + size_t :0; +}; +#define btf_dedup_opts__last_field force_collisions + +LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); + +struct btf_dump; + +struct btf_dump_opts { + size_t sz; +}; +#define 
btf_dump_opts__last_field sz + +typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); + +LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts); + +LIBBPF_API void btf_dump__free(struct btf_dump *d); + +LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); + +struct btf_dump_emit_type_decl_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; + /* optional field name for type declaration, e.g.: + * - struct my_struct + * - void (*)(int) + * - char (*)[123] + */ + const char *field_name; + /* extra indentation level (in number of tabs) to emit for multi-line + * type declarations (e.g., anonymous struct); applies for lines + * starting from the second one (first line is assumed to have + * necessary indentation already + */ + int indent_level; + /* strip all the const/volatile/restrict mods */ + bool strip_mods; + size_t :0; +}; +#define btf_dump_emit_type_decl_opts__last_field strip_mods + +LIBBPF_API int +btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts); + + +struct btf_dump_type_data_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + const char *indent_str; + int indent_level; + /* below match "show" flags for bpf_show_snprintf() */ + bool compact; /* no newlines/indentation */ + bool skip_names; /* skip member/type names */ + bool emit_zeroes; /* show 0-valued fields */ + size_t :0; +}; +#define btf_dump_type_data_opts__last_field emit_zeroes + +LIBBPF_API int +btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts); + +/* + * A set of helpers for easier BTF types handling. + * + * The inline functions below rely on constants from the kernel headers which + * may not be available for applications including this header file. To avoid + * compilation errors, we define all the constants here that were added after + * the initial introduction of the BTF_KIND* constants. 
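Editorial sketch, not part of the header: the inline helpers defined below combine with the lookup APIs above to walk BTF types. The function name and the printed format are made up, and the struct name is assumed to exist in the given BTF object.

#include <stdio.h>
#include <bpf/btf.h>

/* Print member names and bit offsets of a named struct, if present. */
static void print_struct_layout(const struct btf *btf, const char *name)
{
	__s32 id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT);
	const struct btf_type *t;
	const struct btf_member *m;
	int i;

	if (id < 0)
		return;
	t = btf__type_by_id(btf, id);
	m = btf_members(t);
	for (i = 0; i < btf_vlen(t); i++, m++)
		printf("  %s: bit offset %u\n",
		       btf__name_by_offset(btf, m->name_off),
		       btf_member_bit_offset(t, i));
}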
+ */ +#ifndef BTF_KIND_FUNC +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +#endif +#ifndef BTF_KIND_VAR +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#endif +#ifndef BTF_KIND_FLOAT +#define BTF_KIND_FLOAT 16 /* Floating point */ +#endif +/* The kernel header switched to enums, so the following were never #defined */ +#define BTF_KIND_DECL_TAG 17 /* Decl Tag */ +#define BTF_KIND_TYPE_TAG 18 /* Type Tag */ +#define BTF_KIND_ENUM64 19 /* Enum for up-to 64bit values */ + +static inline __u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline __u16 btf_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static inline bool btf_kflag(const struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info); +} + +static inline bool btf_is_void(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNKN; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_struct(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_STRUCT; +} + +static inline bool btf_is_union(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_UNION; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + +static inline bool btf_is_fwd(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FWD; +} + +static inline bool btf_is_typedef(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPEDEF; +} + +static inline bool btf_is_volatile(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VOLATILE; +} + +static inline bool btf_is_const(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_CONST; +} + +static inline bool btf_is_restrict(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_RESTRICT; +} + +static inline bool btf_is_mod(const struct btf_type *t) +{ + __u16 kind = btf_kind(t); + + return kind == BTF_KIND_VOLATILE || + kind == BTF_KIND_CONST || + kind == BTF_KIND_RESTRICT || + kind == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_func(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC; +} + +static inline bool btf_is_func_proto(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FUNC_PROTO; +} + +static inline bool btf_is_var(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_VAR; +} + +static inline bool btf_is_datasec(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DATASEC; +} + +static inline bool btf_is_float(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_FLOAT; +} + +static inline bool btf_is_decl_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_DECL_TAG; +} + +static inline bool btf_is_type_tag(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_TYPE_TAG; +} + +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return btf_is_enum(t) || btf_is_enum64(t); +} + +static inline bool 
btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return btf_kind(t1) == btf_kind(t2) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + +static inline __u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(__u32 *)(t + 1)); +} + +static inline __u8 btf_int_bits(const struct btf_type *t) +{ + return BTF_INT_BITS(*(__u32 *)(t + 1)); +} + +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + +struct btf_enum64; + +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + +static inline __u64 btf_enum64_value(const struct btf_enum64 *e) +{ + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct). So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; +} + +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +/* Get bit offset of a member with specified index. */ +static inline __u32 btf_member_bit_offset(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BIT_OFFSET(m->offset) : m->offset; +} +/* + * Get bitfield size of a member, assuming t is BTF_KIND_STRUCT or + * BTF_KIND_UNION. If member is not a bitfield, zero is returned. + */ +static inline __u32 btf_member_bitfield_size(const struct btf_type *t, + __u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + bool kflag = btf_kflag(t); + + return kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0; +} + +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + +static inline struct btf_var *btf_var(const struct btf_type *t) +{ + return (struct btf_var *)(t + 1); +} + +static inline struct btf_var_secinfo * +btf_var_secinfos(const struct btf_type *t) +{ + return (struct btf_var_secinfo *)(t + 1); +} + +struct btf_decl_tag; +static inline struct btf_decl_tag *btf_decl_tag(const struct btf_type *t) +{ + return (struct btf_decl_tag *)(t + 1); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_BTF_H */ diff --git a/third_party/libbpf/src/btf_dump.c b/third_party/libbpf/src/btf_dump.c new file mode 100644 index 0000000..4d9f30b --- /dev/null +++ b/third_party/libbpf/src/btf_dump.c @@ -0,0 +1,2542 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * BTF-to-C type converter. 
+ * + * Copyright (c) 2019 Facebook + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "hashmap.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; +static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; + +static const char *pfx(int lvl) +{ + return lvl >= PREFIX_CNT ? PREFIXES : &PREFIXES[PREFIX_CNT - lvl]; +} + +enum btf_dump_type_order_state { + NOT_ORDERED, + ORDERING, + ORDERED, +}; + +enum btf_dump_type_emit_state { + NOT_EMITTED, + EMITTING, + EMITTED, +}; + +/* per-type auxiliary state */ +struct btf_dump_type_aux_state { + /* topological sorting state */ + enum btf_dump_type_order_state order_state: 2; + /* emitting state used to determine the need for forward declaration */ + enum btf_dump_type_emit_state emit_state: 2; + /* whether forward declaration was already emitted */ + __u8 fwd_emitted: 1; + /* whether unique non-duplicate name was already assigned */ + __u8 name_resolved: 1; + /* whether type is referenced from any other type */ + __u8 referenced: 1; +}; + +/* indent string length; one indent string is added for each indent level */ +#define BTF_DATA_INDENT_STR_LEN 32 + +/* + * Common internal data for BTF type data dump operations. + */ +struct btf_dump_data { + const void *data_end; /* end of valid data to show */ + bool compact; + bool skip_names; + bool emit_zeroes; + __u8 indent_lvl; /* base indent level */ + char indent_str[BTF_DATA_INDENT_STR_LEN]; + /* below are used during iteration */ + int depth; + bool is_array_member; + bool is_array_terminated; + bool is_array_char; +}; + +struct btf_dump { + const struct btf *btf; + btf_dump_printf_fn_t printf_fn; + void *cb_ctx; + int ptr_sz; + bool strip_mods; + bool skip_anon_defs; + int last_id; + + /* per-type auxiliary state */ + struct btf_dump_type_aux_state *type_states; + size_t type_states_cap; + /* per-type optional cached unique name, must be freed, if present */ + const char **cached_names; + size_t cached_names_cap; + + /* topo-sorted list of dependent type definitions */ + __u32 *emit_queue; + int emit_queue_cap; + int emit_queue_cnt; + + /* + * stack of type declarations (e.g., chain of modifiers, arrays, + * funcs, etc) + */ + __u32 *decl_stack; + int decl_stack_cap; + int decl_stack_cnt; + + /* maps struct/union/enum name to a number of name occurrences */ + struct hashmap *type_names; + /* + * maps typedef identifiers and enum value names to a number of such + * name occurrences + */ + struct hashmap *ident_names; + /* + * data for typed display; allocated if needed. + */ + struct btf_dump_data *typed_dump; +}; + +static size_t str_hash_fn(long key, void *ctx) +{ + return str_hash((void *)key); +} + +static bool str_equal_fn(long a, long b, void *ctx) +{ + return strcmp((void *)a, (void *)b) == 0; +} + +static const char *btf_name_of(const struct btf_dump *d, __u32 name_off) +{ + return btf__name_by_offset(d->btf, name_off); +} + +static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + d->printf_fn(d->cb_ctx, fmt, args); + va_end(args); +} + +static int btf_dump_mark_referenced(struct btf_dump *d); +static int btf_dump_resize(struct btf_dump *d); + +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) +{ + struct btf_dump *d; + int err; + + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + + if (!printf_fn) + return libbpf_err_ptr(-EINVAL); + + d = calloc(1, sizeof(struct btf_dump)); + if (!d) + return libbpf_err_ptr(-ENOMEM); + + d->btf = btf; + d->printf_fn = printf_fn; + d->cb_ctx = ctx; + d->ptr_sz = btf__pointer_size(btf) ? : sizeof(void *); + + d->type_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->type_names)) { + err = PTR_ERR(d->type_names); + d->type_names = NULL; + goto err; + } + d->ident_names = hashmap__new(str_hash_fn, str_equal_fn, NULL); + if (IS_ERR(d->ident_names)) { + err = PTR_ERR(d->ident_names); + d->ident_names = NULL; + goto err; + } + + err = btf_dump_resize(d); + if (err) + goto err; + + return d; +err: + btf_dump__free(d); + return libbpf_err_ptr(err); +} + +static int btf_dump_resize(struct btf_dump *d) +{ + int err, last_id = btf__type_cnt(d->btf) - 1; + + if (last_id <= d->last_id) + return 0; + + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) + return -ENOMEM; + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) + return -ENOMEM; + + if (d->last_id == 0) { + /* VOID is special */ + d->type_states[0].order_state = ORDERED; + d->type_states[0].emit_state = EMITTED; + } + + /* eagerly determine referenced types for anon enums */ + err = btf_dump_mark_referenced(d); + if (err) + return err; + + d->last_id = last_id; + return 0; +} + +static void btf_dump_free_names(struct hashmap *map) +{ + size_t bkt; + struct hashmap_entry *cur; + + hashmap__for_each_entry(map, cur, bkt) + free((void *)cur->pkey); + + hashmap__free(map); +} + +void btf_dump__free(struct btf_dump *d) +{ + int i; + + if (IS_ERR_OR_NULL(d)) + return; + + free(d->type_states); + if (d->cached_names) { + /* any set cached name is owned by us and should be freed */ + for (i = 0; i <= d->last_id; i++) { + if (d->cached_names[i]) + free((void *)d->cached_names[i]); + } + } + free(d->cached_names); + free(d->emit_queue); + free(d->decl_stack); + btf_dump_free_names(d->type_names); + btf_dump_free_names(d->ident_names); + + free(d); +} + +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr); +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id); + +/* + * Dump BTF type in a compilable C syntax, including all the necessary + * dependent types, necessary for compilation. If some of the dependent types + * were already emitted as part of previous btf_dump__dump_type() invocation + * for another type, they won't be emitted again. This API allows callers to + * filter out BTF types according to user-defined criterias and emitted only + * minimal subset of types, necessary to compile everything. Full struct/union + * definitions will still be emitted, even if the only usage is through + * pointer and could be satisfied with just a forward declaration. + * + * Dumping is done in two high-level passes: + * 1. Topologically sort type definitions to satisfy C rules of compilation. + * 2. Emit type definitions in C syntax. 
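Editorial usage sketch, not part of the patch: driving btf_dump__dump_type() over every type ID is roughly how vmlinux.h-style headers are produced. The callback and function names here are made up, and error handling is kept minimal.

#include <stdarg.h>
#include <stdio.h>
#include <bpf/btf.h>
#include <bpf/libbpf.h>

static void print_to_stdout(void *ctx, const char *fmt, va_list args)
{
	vprintf(fmt, args);
}

/* Emit C definitions for all types in the running kernel's BTF. */
static int dump_kernel_types(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	struct btf_dump *d;
	__u32 id, n;
	int err;

	err = libbpf_get_error(btf);
	if (err)
		return err;

	d = btf_dump__new(btf, print_to_stdout, NULL, NULL);
	err = libbpf_get_error(d);
	if (err) {
		btf__free(btf);
		return err;
	}

	n = btf__type_cnt(btf);
	for (id = 1; id < n && !err; id++)
		err = btf_dump__dump_type(d, id);

	btf_dump__free(d);
	btf__free(btf);
	return err;
}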
+ * + * Returns 0 on success; <0, otherwise. + */ +int btf_dump__dump_type(struct btf_dump *d, __u32 id) +{ + int err, i; + + if (id >= btf__type_cnt(d->btf)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + d->emit_queue_cnt = 0; + err = btf_dump_order_type(d, id, false); + if (err < 0) + return libbpf_err(err); + + for (i = 0; i < d->emit_queue_cnt; i++) + btf_dump_emit_type(d, d->emit_queue[i], 0 /*top-level*/); + + return 0; +} + +/* + * Mark all types that are referenced from any other type. This is used to + * determine top-level anonymous enums that need to be emitted as an + * independent type declarations. + * Anonymous enums come in two flavors: either embedded in a struct's field + * definition, in which case they have to be declared inline as part of field + * type declaration; or as a top-level anonymous enum, typically used for + * declaring global constants. It's impossible to distinguish between two + * without knowning whether given enum type was referenced from other type: + * top-level anonymous enum won't be referenced by anything, while embedded + * one will. + */ +static int btf_dump_mark_referenced(struct btf_dump *d) +{ + int i, j, n = btf__type_cnt(d->btf); + const struct btf_type *t; + __u16 vlen; + + for (i = d->last_id + 1; i < n; i++) { + t = btf__type_by_id(d->btf, i); + vlen = btf_vlen(t); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + break; + + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + d->type_states[t->type].referenced = 1; + break; + + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + d->type_states[a->index_type].referenced = 1; + d->type_states[a->type].referenced = 1; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + for (j = 0; j < vlen; j++, m++) + d->type_states[m->type].referenced = 1; + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + + for (j = 0; j < vlen; j++, p++) + d->type_states[p->type].referenced = 1; + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *v = btf_var_secinfos(t); + + for (j = 0; j < vlen; j++, v++) + d->type_states[v->type].referenced = 1; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + +static int btf_dump_add_emit_queue_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_queue; + size_t new_cap; + + if (d->emit_queue_cnt >= d->emit_queue_cap) { + new_cap = max(16, d->emit_queue_cap * 3 / 2); + new_queue = libbpf_reallocarray(d->emit_queue, new_cap, sizeof(new_queue[0])); + if (!new_queue) + return -ENOMEM; + d->emit_queue = new_queue; + d->emit_queue_cap = new_cap; + } + + d->emit_queue[d->emit_queue_cnt++] = id; + return 0; +} + +/* + * Determine order of emitting dependent types and specified type to satisfy + * C compilation rules. This is done through topological sorting with an + * additional complication which comes from C rules. The main idea for C is + * that if some type is "embedded" into a struct/union, it's size needs to be + * known at the time of definition of containing type. 
E.g., for: + * + * struct A {}; + * struct B { struct A x; } + * + * struct A *HAS* to be defined before struct B, because it's "embedded", + * i.e., it is part of struct B layout. But in the following case: + * + * struct A; + * struct B { struct A *x; } + * struct A {}; + * + * it's enough to just have a forward declaration of struct A at the time of + * struct B definition, as struct B has a pointer to struct A, so the size of + * field x is known without knowing struct A size: it's sizeof(void *). + * + * Unfortunately, there are some trickier cases we need to handle, e.g.: + * + * struct A {}; // if this was forward-declaration: compilation error + * struct B { + * struct { // anonymous struct + * struct A y; + * } *x; + * }; + * + * In this case, struct B's field x is a pointer, so it's size is known + * regardless of the size of (anonymous) struct it points to. But because this + * struct is anonymous and thus defined inline inside struct B, *and* it + * embeds struct A, compiler requires full definition of struct A to be known + * before struct B can be defined. This creates a transitive dependency + * between struct A and struct B. If struct A was forward-declared before + * struct B definition and fully defined after struct B definition, that would + * trigger compilation error. + * + * All this means that while we are doing topological sorting on BTF type + * graph, we need to determine relationships between different types (graph + * nodes): + * - weak link (relationship) between X and Y, if Y *CAN* be + * forward-declared at the point of X definition; + * - strong link, if Y *HAS* to be fully-defined before X can be defined. + * + * The rule is as follows. Given a chain of BTF types from X to Y, if there is + * BTF_KIND_PTR type in the chain and at least one non-anonymous type + * Z (excluding X, including Y), then link is weak. Otherwise, it's strong. + * Weak/strong relationship is determined recursively during DFS traversal and + * is returned as a result from btf_dump_order_type(). + * + * btf_dump_order_type() is trying to avoid unnecessary forward declarations, + * but it is not guaranteeing that no extraneous forward declarations will be + * emitted. + * + * To avoid extra work, algorithm marks some of BTF types as ORDERED, when + * it's done with them, but not for all (e.g., VOLATILE, CONST, RESTRICT, + * ARRAY, FUNC_PROTO), as weak/strong semantics for those depends on the + * entire graph path, so depending where from one came to that BTF type, it + * might cause weak or strong ordering. For types like STRUCT/UNION/INT/ENUM, + * once they are processed, there is no need to do it again, so they are + * marked as ORDERED. We can mark PTR as ORDERED as well, as it semi-forces + * weak link, unless subsequent referenced STRUCT/UNION/ENUM is anonymous. But + * in any case, once those are processed, no need to do it again, as the + * result won't change. + * + * Returns: + * - 1, if type is part of strong link (so there is strong topological + * ordering requirements); + * - 0, if type is part of weak link (so can be satisfied through forward + * declaration); + * - <0, on error (e.g., unsatisfiable type loop detected). + */ +static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) +{ + /* + * Order state is used to detect strong link cycles, but only for BTF + * kinds that are or could be an independent definition (i.e., + * stand-alone fwd decl, enum, typedef, struct, union). 
Ptrs, arrays, + * func_protos, modifiers are just means to get to these definitions. + * Int/void don't need definitions, they are assumed to be always + * properly defined. We also ignore datasec, var, and funcs for now. + * So for all non-defining kinds, we never even set ordering state, + * for defining kinds we set ORDERING and subsequently ORDERED if it + * forms a strong link. + */ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + const struct btf_type *t; + __u16 vlen; + int err, i; + + /* return true, letting typedefs know that it's ok to be emitted */ + if (tstate->order_state == ORDERED) + return 1; + + t = btf__type_by_id(d->btf, id); + + if (tstate->order_state == ORDERING) { + /* type loop, but resolvable through fwd declaration */ + if (btf_is_composite(t) && through_ptr && t->name_off != 0) + return 0; + pr_warn("unsatisfiable type cycle, id:[%u]\n", id); + return -ELOOP; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + tstate->order_state = ORDERED; + return 0; + + case BTF_KIND_PTR: + err = btf_dump_order_type(d, t->type, true); + tstate->order_state = ORDERED; + return err; + + case BTF_KIND_ARRAY: + return btf_dump_order_type(d, btf_array(t)->type, false); + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + /* + * struct/union is part of strong link, only if it's embedded + * (so no ptr in a path) or it's anonymous (so has to be + * defined inline, even if declared through ptr) + */ + if (through_ptr && t->name_off != 0) + return 0; + + tstate->order_state = ORDERING; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, m++) { + err = btf_dump_order_type(d, m->type, false); + if (err < 0) + return err; + } + + if (t->name_off != 0) { + err = btf_dump_add_emit_queue_id(d, id); + if (err < 0) + return err; + } + + tstate->order_state = ORDERED; + return 1; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + /* + * non-anonymous or non-referenced enums are top-level + * declarations and should be emitted. Same logic can be + * applied to FWDs, it won't hurt anyways. + */ + if (t->name_off != 0 || !tstate->referenced) { + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + } + tstate->order_state = ORDERED; + return 1; + + case BTF_KIND_TYPEDEF: { + int is_strong; + + is_strong = btf_dump_order_type(d, t->type, through_ptr); + if (is_strong < 0) + return is_strong; + + /* typedef is similar to struct/union w.r.t. 
fwd-decls */ + if (through_ptr && !is_strong) + return 0; + + /* typedef is always a named definition */ + err = btf_dump_add_emit_queue_id(d, id); + if (err) + return err; + + d->type_states[id].order_state = ORDERED; + return 1; + } + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + return btf_dump_order_type(d, t->type, through_ptr); + + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + bool is_strong; + + err = btf_dump_order_type(d, t->type, through_ptr); + if (err < 0) + return err; + is_strong = err > 0; + + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, p++) { + err = btf_dump_order_type(d, p->type, through_ptr); + if (err < 0) + return err; + if (err > 0) + is_strong = true; + } + return is_strong; + } + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DATASEC: + case BTF_KIND_DECL_TAG: + d->type_states[id].order_state = ORDERED; + return 0; + + default: + return -EINVAL; + } +} + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_struct_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t); +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t); + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl); + +/* a local view into a shared stack */ +struct id_stack { + const __u32 *ids; + int cnt; +}; + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl); +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decl_stack, + const char *fname, int lvl); + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id); +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id); +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name); + +static bool btf_dump_is_blacklisted(struct btf_dump *d, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(d->btf, id); + + /* __builtin_va_list is a compiler built-in, which causes compilation + * errors, when compiling w/ different compiler, then used to compile + * original code (e.g., GCC to compile kernel, Clang to use generated + * C header from BTF). As it is built-in, it should be already defined + * properly internally in compiler. + */ + if (t->name_off == 0) + return false; + return strcmp(btf_name_of(d, t->name_off), "__builtin_va_list") == 0; +} + +/* + * Emit C-syntax definitions of types from chains of BTF types. + * + * High-level handling of determining necessary forward declarations are handled + * by btf_dump_emit_type() itself, but all nitty-gritty details of emitting type + * declarations/definitions in C syntax are handled by a combo of + * btf_dump_emit_type_decl()/btf_dump_emit_type_chain() w/ delegation to + * corresponding btf_dump_emit_*_{def,fwd}() functions. + * + * We also keep track of "containing struct/union type ID" to determine when + * we reference it from inside and thus can avoid emitting unnecessary forward + * declaration. 
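Editorial illustration in plain C (made-up type names, not actual dumper output): the forward-declaration bookkeeping above exists for cases like mutual reference through pointers, where one struct's forward declaration has to appear before the other struct's full definition.

struct list_head;			/* forward declaration emitted first */

struct entry {
	struct list_head *head;		/* pointer use is satisfied by the fwd decl */
	int value;
};

struct list_head {
	struct entry *first;		/* full definition can come after its user */
};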
+ * + * This algorithm is designed in such a way, that even if some error occurs + * (either technical, e.g., out of memory, or logical, i.e., malformed BTF + * that doesn't comply to C rules completely), algorithm will try to proceed + * and produce as much meaningful output as possible. + */ +static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) +{ + struct btf_dump_type_aux_state *tstate = &d->type_states[id]; + bool top_level_def = cont_id == 0; + const struct btf_type *t; + __u16 kind; + + if (tstate->emit_state == EMITTED) + return; + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + if (tstate->emit_state == EMITTING) { + if (tstate->fwd_emitted) + return; + + switch (kind) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* + * if we are referencing a struct/union that we are + * part of - then no need for fwd declaration + */ + if (id == cont_id) + return; + if (t->name_off == 0) { + pr_warn("anonymous struct/union loop, id:[%u]\n", + id); + return; + } + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + break; + case BTF_KIND_TYPEDEF: + /* + * for typedef fwd_emitted means typedef definition + * was emitted, but it can be used only for "weak" + * references through pointer only, not for embedding + */ + if (!btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->fwd_emitted = 1; + break; + default: + break; + } + + return; + } + + switch (kind) { + case BTF_KIND_INT: + /* Emit type alias definitions if necessary */ + btf_dump_emit_missing_aliases(d, id, t); + + tstate->emit_state = EMITTED; + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (top_level_def) { + btf_dump_emit_enum_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: + btf_dump_emit_type(d, t->type, cont_id); + break; + case BTF_KIND_ARRAY: + btf_dump_emit_type(d, btf_array(t)->type, cont_id); + break; + case BTF_KIND_FWD: + btf_dump_emit_fwd_def(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + break; + case BTF_KIND_TYPEDEF: + tstate->emit_state = EMITTING; + btf_dump_emit_type(d, t->type, id); + /* + * typedef can server as both definition and forward + * declaration; at this stage someone depends on + * typedef as a forward declaration (refers to it + * through pointer), so unless we already did it, + * emit typedef as a forward declaration + */ + if (!tstate->fwd_emitted && !btf_dump_is_blacklisted(d, id)) { + btf_dump_emit_typedef_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + } + tstate->emit_state = EMITTED; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + tstate->emit_state = EMITTING; + /* if it's a top-level struct/union definition or struct/union + * is anonymous, then in C we'll be emitting all fields and + * their types (as opposed to just `struct X`), so we need to + * make sure that all types, referenced from struct/union + * members have necessary forward-declarations, where + * applicable + */ + if (top_level_def || t->name_off == 0) { + const struct btf_member *m = btf_members(t); + __u16 vlen = btf_vlen(t); + int i, new_cont_id; + + new_cont_id = t->name_off == 0 ? 
cont_id : id; + for (i = 0; i < vlen; i++, m++) + btf_dump_emit_type(d, m->type, new_cont_id); + } else if (!tstate->fwd_emitted && id != cont_id) { + btf_dump_emit_struct_fwd(d, id, t); + btf_dump_printf(d, ";\n\n"); + tstate->fwd_emitted = 1; + } + + if (top_level_def) { + btf_dump_emit_struct_def(d, id, t, 0); + btf_dump_printf(d, ";\n\n"); + tstate->emit_state = EMITTED; + } else { + tstate->emit_state = NOT_EMITTED; + } + break; + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 n = btf_vlen(t); + int i; + + btf_dump_emit_type(d, t->type, cont_id); + for (i = 0; i < n; i++, p++) + btf_dump_emit_type(d, p->type, cont_id); + + break; + } + default: + break; + } +} + +static bool btf_is_struct_packed(const struct btf *btf, __u32 id, + const struct btf_type *t) +{ + const struct btf_member *m; + int max_align = 1, align, i, bit_sz; + __u16 vlen; + + m = btf_members(t); + vlen = btf_vlen(t); + /* all non-bitfield fields have to be naturally aligned */ + for (i = 0; i < vlen; i++, m++) { + align = btf__align_of(btf, m->type); + bit_sz = btf_member_bitfield_size(t, i); + if (align && bit_sz == 0 && m->offset % (8 * align) != 0) + return true; + max_align = max(align, max_align); + } + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; + /* + * if original struct was marked as packed, but its layout is + * naturally aligned, we'll detect that it's not packed + */ + return false; +} + +static void btf_dump_emit_bit_padding(const struct btf_dump *d, + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) +{ + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; + + if (cur_off >= next_off) + return; /* no gap */ + + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). Note + * that even if struct itself has, let's say 4-byte alignment + * (i.e., it only uses up to int-aligned types), using `long: + * X;` explicit padding doesn't actually change struct's + * overall alignment requirements, but compiler does take into + * account that type's (long, in this example) natural + * alignment requirements when adding implicit padding. We use + * this fact heavily and don't worry about ruining correct + * struct alignment requirement. + */ + for (i = 0; i < ARRAY_SIZE(pads); i++) { + pad_bits = pads[i].bits; + pad_type = pads[i].name; + + new_off = roundup(cur_off, pad_bits); + if (new_off <= next_off) + break; + } + + if (new_off > cur_off && new_off <= next_off) { + /* We need explicit `: 0` aligning mark if next + * field is right on alignment offset and its + * alignment requirement is less strict than 's + * alignment (so compiler won't naturally align to the + * offset we expect), or if subsequent `: X`, + * will actually completely fit in the remaining hole, + * making compiler basically ignore `: X` + * completely. 
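Editorial illustration in plain C (made-up struct, not actual btf_dump output): the explicit padding markers discussed above take the form of unnamed bit-fields, where a sized one consumes a gap and a zero-width one forces alignment.

struct padded_example {
	int a: 3;
	char: 5;	/* fill the remaining bits of the first byte */
	char b;
	long: 0;	/* force the next member onto a long boundary */
	long c;
};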
+ */ + if (in_bitfield || + (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) || + (new_off != next_off && next_off - new_off <= new_off - cur_off)) + /* but for bitfields we'll emit explicit bit count */ + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, + in_bitfield ? new_off - cur_off : 0); + cur_off = new_off; + } + + /* Now we know we start at naturally aligned offset for a chosen + * padding type (long, int, short, or char), and so the rest is just + * a straightforward filling of remaining padding gap with full + * `: sizeof();` markers, except for the last one, which + * might need smaller than sizeof() padding. + */ + while (cur_off != next_off) { + bits = min(next_off - cur_off, pad_bits); + if (bits == pad_bits) { + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); + cur_off += bits; + continue; + } + /* For the remainder padding that doesn't cover entire + * pad_type bit length, we pick the smallest necessary type. + * This is pure aesthetics, we could have just used `long`, + * but having smallest necessary one communicates better the + * scale of the padding gap. + */ + for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) { + pad_type = pads[i].name; + pad_bits = pads[i].bits; + if (pad_bits < bits) + continue; + + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits); + cur_off += bits; + break; + } + } +} + +static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "%s%s%s", + btf_is_struct(t) ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_struct_def(struct btf_dump *d, + __u32 id, + const struct btf_type *t, + int lvl) +{ + const struct btf_member *m = btf_members(t); + bool is_struct = btf_is_struct(t); + bool packed, prev_bitfield = false; + int align, i, off = 0; + __u16 vlen = btf_vlen(t); + + align = btf__align_of(d->btf, id); + packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; + + btf_dump_printf(d, "%s%s%s {", + is_struct ? "struct" : "union", + t->name_off ? " " : "", + btf_dump_type_name(d, id)); + + for (i = 0; i < vlen; i++, m++) { + const char *fname; + int m_off, m_sz, m_align; + bool in_bitfield; + + fname = btf_name_of(d, m->name_off); + m_sz = btf_member_bitfield_size(t, i); + m_off = btf_member_bit_offset(t, i); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); + + in_bitfield = prev_bitfield && m_sz != 0; + + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); + btf_dump_printf(d, "\n%s", pfx(lvl + 1)); + btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); + + if (m_sz) { + btf_dump_printf(d, ": %d", m_sz); + off = m_off + m_sz; + prev_bitfield = true; + } else { + m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); + off = m_off + m_sz * 8; + prev_bitfield = false; + } + + btf_dump_printf(d, ";"); + } + + /* pad at the end, if necessary */ + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); + + /* + * Keep `struct empty {}` on a single line, + * only print newline when there are regular or padding fields. + */ + if (vlen || t->size) { + btf_dump_printf(d, "\n"); + btf_dump_printf(d, "%s}", pfx(lvl)); + } else { + btf_dump_printf(d, "}"); + } + if (packed) + btf_dump_printf(d, " __attribute__((packed))"); +} + +static const char *missing_base_types[][2] = { + /* + * GCC emits typedefs to its internal __PolyX_t types when compiling Arm + * SIMD intrinsics. Alias them to standard base types. 
+ */ + { "__Poly8_t", "unsigned char" }, + { "__Poly16_t", "unsigned short" }, + { "__Poly64_t", "unsigned long long" }, + { "__Poly128_t", "unsigned __int128" }, +}; + +static void btf_dump_emit_missing_aliases(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + int i; + + for (i = 0; i < ARRAY_SIZE(missing_base_types); i++) { + if (strcmp(name, missing_base_types[i][0]) == 0) { + btf_dump_printf(d, "typedef %s %s;\n\n", + missing_base_types[i][1], name); + break; + } + } +} + +static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); +} + +static void btf_dump_emit_enum32_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum *v = btf_enum(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + /* enumerators share namespace with typedef idents */ + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %d," : "\n%s%s___%zd = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, dup_cnt, v->val); + } else { + fmt_str = is_signed ? "\n%s%s = %d," : "\n%s%s = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, v->val); + } + } +} + +static void btf_dump_emit_enum64_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum64 *v = btf_enum64(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + __u64 val; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + val = btf_enum64_value(v); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %lldLL," + : "\n%s%s___%zd = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, dup_cnt, + (unsigned long long)val); + } else { + fmt_str = is_signed ? "\n%s%s = %lldLL," + : "\n%s%s = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, + (unsigned long long)val); + } + } +} +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, + int lvl) +{ + __u16 vlen = btf_vlen(t); + + btf_dump_printf(d, "enum%s%s", + t->name_off ? 
" " : "", + btf_dump_type_name(d, id)); + + if (!vlen) + return; + + btf_dump_printf(d, " {"); + if (btf_is_enum(t)) + btf_dump_emit_enum32_val(d, t, lvl, vlen); + else + btf_dump_emit_enum64_val(d, t, lvl, vlen); + btf_dump_printf(d, "\n%s}", pfx(lvl)); + + /* special case enums with special sizes */ + if (t->size == 1) { + /* one-byte enums can be forced with mode(byte) attribute */ + btf_dump_printf(d, " __attribute__((mode(byte)))"); + } else if (t->size == 8 && d->ptr_sz == 8) { + /* enum can be 8-byte sized if one of the enumerator values + * doesn't fit in 32-bit integer, or by adding mode(word) + * attribute (but probably only on 64-bit architectures); do + * our best here to try to satisfy the contract without adding + * unnecessary attributes + */ + bool needs_word_mode; + + if (btf_is_enum(t)) { + /* enum can't represent 64-bit values, so we need word mode */ + needs_word_mode = true; + } else { + /* enum64 needs mode(word) if none of its values has + * non-zero upper 32-bits (which means that all values + * fit in 32-bit integers and won't cause compiler to + * bump enum to be 64-bit naturally + */ + int i; + + needs_word_mode = true; + for (i = 0; i < vlen; i++) { + if (btf_enum64(t)[i].val_hi32 != 0) { + needs_word_mode = false; + break; + } + } + } + if (needs_word_mode) + btf_dump_printf(d, " __attribute__((mode(word)))"); + } + +} + +static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, + const struct btf_type *t) +{ + const char *name = btf_dump_type_name(d, id); + + if (btf_kflag(t)) + btf_dump_printf(d, "union %s", name); + else + btf_dump_printf(d, "struct %s", name); +} + +static void btf_dump_emit_typedef_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, int lvl) +{ + const char *name = btf_dump_ident_name(d, id); + + /* + * Old GCC versions are emitting invalid typedef for __gnuc_va_list + * pointing to VOID. This generates warnings from btf_dump() and + * results in uncompilable header file, so we are fixing it up here + * with valid typedef into __builtin_va_list. + */ + if (t->type == 0 && strcmp(name, "__gnuc_va_list") == 0) { + btf_dump_printf(d, "typedef __builtin_va_list __gnuc_va_list"); + return; + } + + btf_dump_printf(d, "typedef "); + btf_dump_emit_type_decl(d, t->type, name, lvl); +} + +static int btf_dump_push_decl_stack_id(struct btf_dump *d, __u32 id) +{ + __u32 *new_stack; + size_t new_cap; + + if (d->decl_stack_cnt >= d->decl_stack_cap) { + new_cap = max(16, d->decl_stack_cap * 3 / 2); + new_stack = libbpf_reallocarray(d->decl_stack, new_cap, sizeof(new_stack[0])); + if (!new_stack) + return -ENOMEM; + d->decl_stack = new_stack; + d->decl_stack_cap = new_cap; + } + + d->decl_stack[d->decl_stack_cnt++] = id; + + return 0; +} + +/* + * Emit type declaration (e.g., field type declaration in a struct or argument + * declaration in function prototype) in correct C syntax. + * + * For most types it's trivial, but there are few quirky type declaration + * cases worth mentioning: + * - function prototypes (especially nesting of function prototypes); + * - arrays; + * - const/volatile/restrict for pointers vs other types. + * + * For a good discussion of *PARSING* C syntax (as a human), see + * Peter van der Linden's "Expert C Programming: Deep C Secrets", + * Ch.3 "Unscrambling Declarations in C". + * + * It won't help with BTF to C conversion much, though, as it's an opposite + * problem. So we came up with this algorithm in reverse to van der Linden's + * parsing algorithm. 
It goes from structured BTF representation of type + * declaration to a valid compilable C syntax. + * + * For instance, consider this C typedef: + * typedef const int * const * arr[10] arr_t; + * It will be represented in BTF with this chain of BTF types: + * [typedef] -> [array] -> [ptr] -> [const] -> [ptr] -> [const] -> [int] + * + * Notice how [const] modifier always goes before type it modifies in BTF type + * graph, but in C syntax, const/volatile/restrict modifiers are written to + * the right of pointers, but to the left of other types. There are also other + * quirks, like function pointers, arrays of them, functions returning other + * functions, etc. + * + * We handle that by pushing all the types to a stack, until we hit "terminal" + * type (int/enum/struct/union/fwd). Then depending on the kind of a type on + * top of a stack, modifiers are handled differently. Array/function pointers + * have also wildly different syntax and how nesting of them are done. See + * code for authoritative definition. + * + * To avoid allocating new stack for each independent chain of BTF types, we + * share one bigger stack, with each chain working only on its own local view + * of a stack frame. Some care is required to "pop" stack frames after + * processing type declaration chain. + */ +int btf_dump__emit_type_decl(struct btf_dump *d, __u32 id, + const struct btf_dump_emit_type_decl_opts *opts) +{ + const char *fname; + int lvl, err; + + if (!OPTS_VALID(opts, btf_dump_emit_type_decl_opts)) + return libbpf_err(-EINVAL); + + err = btf_dump_resize(d); + if (err) + return libbpf_err(err); + + fname = OPTS_GET(opts, field_name, ""); + lvl = OPTS_GET(opts, indent_level, 0); + d->strip_mods = OPTS_GET(opts, strip_mods, false); + btf_dump_emit_type_decl(d, id, fname, lvl); + d->strip_mods = false; + return 0; +} + +static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, + const char *fname, int lvl) +{ + struct id_stack decl_stack; + const struct btf_type *t; + int err, stack_start; + + stack_start = d->decl_stack_cnt; + for (;;) { + t = btf__type_by_id(d->btf, id); + if (d->strip_mods && btf_is_mod(t)) + goto skip_mod; + + err = btf_dump_push_decl_stack_id(d, id); + if (err < 0) { + /* + * if we don't have enough memory for entire type decl + * chain, restore stack, emit warning, and try to + * proceed nevertheless + */ + pr_warn("not enough memory for decl stack:%d", err); + d->decl_stack_cnt = stack_start; + return; + } +skip_mod: + /* VOID */ + if (id == 0) + break; + + switch (btf_kind(t)) { + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_TYPE_TAG: + id = t->type; + break; + case BTF_KIND_ARRAY: + id = btf_array(t)->type; + break; + case BTF_KIND_INT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FLOAT: + goto done; + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + btf_kind(t), id); + goto done; + } + } +done: + /* + * We might be inside a chain of declarations (e.g., array of function + * pointers returning anonymous (so inlined) structs, having another + * array field). Each of those needs its own "stack frame" to handle + * emitting of declarations. Those stack frames are non-overlapping + * portions of shared btf_dump->decl_stack. 
To make it a bit nicer to + * handle this set of nested stacks, we create a view corresponding to + * our own "stack frame" and work with it as an independent stack. + * We'll need to clean up after emit_type_chain() returns, though. + */ + decl_stack.ids = d->decl_stack + stack_start; + decl_stack.cnt = d->decl_stack_cnt - stack_start; + btf_dump_emit_type_chain(d, &decl_stack, fname, lvl); + /* + * emit_type_chain() guarantees that it will pop its entire decl_stack + * frame before returning. But it works with a read-only view into + * decl_stack, so it doesn't actually pop anything from the + * perspective of shared btf_dump->decl_stack, per se. We need to + * reset decl_stack state to how it was before us to avoid it growing + * all the time. + */ + d->decl_stack_cnt = stack_start; +} + +static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + + switch (btf_kind(t)) { + case BTF_KIND_VOLATILE: + btf_dump_printf(d, "volatile "); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, "const "); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, "restrict "); + break; + default: + return; + } + decl_stack->cnt--; + } +} + +static void btf_dump_drop_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + if (!btf_is_mod(t)) + return; + decl_stack->cnt--; + } +} + +static void btf_dump_emit_name(const struct btf_dump *d, + const char *name, bool last_was_ptr) +{ + bool separate = name[0] && !last_was_ptr; + + btf_dump_printf(d, "%s%s", separate ? " " : "", name); +} + +static void btf_dump_emit_type_chain(struct btf_dump *d, + struct id_stack *decls, + const char *fname, int lvl) +{ + /* + * last_was_ptr is used to determine if we need to separate pointer + * asterisk (*) from previous part of type signature with space, so + * that we get `int ***`, instead of `int * * *`. We default to true + * for cases where we have single pointer in a chain. E.g., in ptr -> + * func_proto case. func_proto will start a new emit_type_chain call + * with just ptr, which should be emitted as (*) or (*), so we + * don't want to prepend space for that last pointer. 
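Editorial sketch, not part of libbpf: the public entry point into this chain logic is btf_dump__emit_type_decl(); a typical call names the field and sets indentation via LIBBPF_OPTS. The field and helper names are made up, and d is assumed to be an existing dumper.

#include <bpf/btf.h>

/* Emit "TYPE my_field" for the given type ID through an existing dumper. */
static int emit_field_decl(struct btf_dump *d, __u32 type_id)
{
	LIBBPF_OPTS(btf_dump_emit_type_decl_opts, opts,
		    .field_name = "my_field",	/* made-up field name */
		    .indent_level = 1);

	return btf_dump__emit_type_decl(d, type_id, &opts);
}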
+ */ + bool last_was_ptr = true; + const struct btf_type *t; + const char *name; + __u16 kind; + __u32 id; + + while (decls->cnt) { + id = decls->ids[--decls->cnt]; + if (id == 0) { + /* VOID is a special snowflake */ + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "void"); + last_was_ptr = false; + continue; + } + + t = btf__type_by_id(d->btf, id); + kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, "%s", name); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + btf_dump_emit_mods(d, decls); + /* inline anonymous struct/union */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_struct_def(d, id, t, lvl); + else + btf_dump_emit_struct_fwd(d, id, t); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + btf_dump_emit_mods(d, decls); + /* inline anonymous enum */ + if (t->name_off == 0 && !d->skip_anon_defs) + btf_dump_emit_enum_def(d, id, t, lvl); + else + btf_dump_emit_enum_fwd(d, id, t); + break; + case BTF_KIND_FWD: + btf_dump_emit_mods(d, decls); + btf_dump_emit_fwd_def(d, id, t); + break; + case BTF_KIND_TYPEDEF: + btf_dump_emit_mods(d, decls); + btf_dump_printf(d, "%s", btf_dump_ident_name(d, id)); + break; + case BTF_KIND_PTR: + btf_dump_printf(d, "%s", last_was_ptr ? "*" : " *"); + break; + case BTF_KIND_VOLATILE: + btf_dump_printf(d, " volatile"); + break; + case BTF_KIND_CONST: + btf_dump_printf(d, " const"); + break; + case BTF_KIND_RESTRICT: + btf_dump_printf(d, " restrict"); + break; + case BTF_KIND_TYPE_TAG: + btf_dump_emit_mods(d, decls); + name = btf_name_of(d, t->name_off); + btf_dump_printf(d, " __attribute__((btf_type_tag(\"%s\")))", name); + break; + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + const struct btf_type *next_t; + __u32 next_id; + bool multidim; + /* + * GCC has a bug + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=8354) + * which causes it to emit extra const/volatile + * modifiers for an array, if array's element type has + * const/volatile modifiers. Clang doesn't do that. + * In general, it doesn't seem very meaningful to have + * a const/volatile modifier for array, so we are + * going to silently skip them here. + */ + btf_dump_drop_mods(d, decls); + + if (decls->cnt == 0) { + btf_dump_emit_name(d, fname, last_was_ptr); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + + next_id = decls->ids[decls->cnt - 1]; + next_t = btf__type_by_id(d->btf, next_id); + multidim = btf_is_array(next_t); + /* we need space if we have named non-pointer */ + if (fname[0] && !last_was_ptr) + btf_dump_printf(d, " "); + /* no parentheses for multi-dimensional array */ + if (!multidim) + btf_dump_printf(d, "("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + if (!multidim) + btf_dump_printf(d, ")"); + btf_dump_printf(d, "[%u]", a->nelems); + return; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *p = btf_params(t); + __u16 vlen = btf_vlen(t); + int i; + + /* + * GCC emits extra volatile qualifier for + * __attribute__((noreturn)) function pointers. Clang + * doesn't do it. It's a GCC quirk for backwards + * compatibility with code written for GCC <2.5. So, + * similarly to extra qualifiers for array, just drop + * them, instead of handling them. 
+ */ + btf_dump_drop_mods(d, decls); + if (decls->cnt) { + btf_dump_printf(d, " ("); + btf_dump_emit_type_chain(d, decls, fname, lvl); + btf_dump_printf(d, ")"); + } else { + btf_dump_emit_name(d, fname, last_was_ptr); + } + btf_dump_printf(d, "("); + /* + * Clang for BPF target generates func_proto with no + * args as a func_proto with a single void arg (e.g., + * `int (*f)(void)` vs just `int (*f)()`). We are + * going to pretend there are no args for such case. + */ + if (vlen == 1 && p->type == 0) { + btf_dump_printf(d, ")"); + return; + } + + for (i = 0; i < vlen; i++, p++) { + if (i > 0) + btf_dump_printf(d, ", "); + + /* last arg of type void is vararg */ + if (i == vlen - 1 && p->type == 0) { + btf_dump_printf(d, "..."); + break; + } + + name = btf_name_of(d, p->name_off); + btf_dump_emit_type_decl(d, p->type, name, lvl); + } + + btf_dump_printf(d, ")"); + return; + } + default: + pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n", + kind, id); + return; + } + + last_was_ptr = kind == BTF_KIND_PTR; + } + + btf_dump_emit_name(d, fname, last_was_ptr); +} + +/* show type name as (type_name) */ +static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, + bool top_level) +{ + const struct btf_type *t; + + /* for array members, we don't bother emitting type name for each + * member to avoid the redundancy of + * .name = (char[4])[(char)'f',(char)'o',(char)'o',] + */ + if (d->typed_dump->is_array_member) + return; + + /* avoid type name specification for variable/section; it will be done + * for the associated variable value(s). + */ + t = btf__type_by_id(d->btf, id); + if (btf_is_var(t) || btf_is_datasec(t)) + return; + + if (top_level) + btf_dump_printf(d, "("); + + d->skip_anon_defs = true; + d->strip_mods = true; + btf_dump_emit_type_decl(d, id, "", 0); + d->strip_mods = false; + d->skip_anon_defs = false; + + if (top_level) + btf_dump_printf(d, ")"); +} + +/* return number of duplicates (occurrences) of a given name */ +static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, + const char *orig_name) +{ + char *old_name, *new_name; + size_t dup_cnt = 0; + int err; + + new_name = strdup(orig_name); + if (!new_name) + return 1; + + (void)hashmap__find(name_map, orig_name, &dup_cnt); + dup_cnt++; + + err = hashmap__set(name_map, new_name, dup_cnt, &old_name, NULL); + if (err) + free(new_name); + + free(old_name); + + return dup_cnt; +} + +static const char *btf_dump_resolve_name(struct btf_dump *d, __u32 id, + struct hashmap *name_map) +{ + struct btf_dump_type_aux_state *s = &d->type_states[id]; + const struct btf_type *t = btf__type_by_id(d->btf, id); + const char *orig_name = btf_name_of(d, t->name_off); + const char **cached_name = &d->cached_names[id]; + size_t dup_cnt; + + if (t->name_off == 0) + return ""; + + if (s->name_resolved) + return *cached_name ? *cached_name : orig_name; + + if (btf_is_fwd(t) || (btf_is_enum(t) && btf_vlen(t) == 0)) { + s->name_resolved = 1; + return orig_name; + } + + dup_cnt = btf_dump_name_dups(d, name_map, orig_name); + if (dup_cnt > 1) { + const size_t max_len = 256; + char new_name[max_len]; + + snprintf(new_name, max_len, "%s___%zu", orig_name, dup_cnt); + *cached_name = strdup(new_name); + } + + s->name_resolved = 1; + return *cached_name ? 
*cached_name : orig_name; +} + +static const char *btf_dump_type_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->type_names); +} + +static const char *btf_dump_ident_name(struct btf_dump *d, __u32 id) +{ + return btf_dump_resolve_name(d, id, d->ident_names); +} + +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz); + +static const char *btf_dump_data_newline(struct btf_dump *d) +{ + return d->typed_dump->compact || d->typed_dump->depth == 0 ? "" : "\n"; +} + +static const char *btf_dump_data_delim(struct btf_dump *d) +{ + return d->typed_dump->depth == 0 ? "" : ","; +} + +static void btf_dump_data_pfx(struct btf_dump *d) +{ + int i, lvl = d->typed_dump->indent_lvl + d->typed_dump->depth; + + if (d->typed_dump->compact) + return; + + for (i = 0; i < lvl; i++) + btf_dump_printf(d, "%s", d->typed_dump->indent_str); +} + +/* A macro is used here as btf_type_value[s]() appends format specifiers + * to the format specifier passed in; these do the work of appending + * delimiters etc while the caller simply has to specify the type values + * in the format specifier + value(s). + */ +#define btf_dump_type_values(d, fmt, ...) \ + btf_dump_printf(d, fmt "%s%s", \ + ##__VA_ARGS__, \ + btf_dump_data_delim(d), \ + btf_dump_data_newline(d)) + +static int btf_dump_unsupported_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id) +{ + btf_dump_printf(d, "", btf_kind(t)); + return -ENOTSUP; +} + +static int btf_dump_get_bitfield_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz, + __u64 *value) +{ + __u16 left_shift_bits, right_shift_bits; + const __u8 *bytes = data; + __u8 nr_copy_bits; + __u64 num = 0; + int i; + + /* Maximum supported bitfield size is 64 bits */ + if (t->size > 8) { + pr_warn("unexpected bitfield size %d\n", t->size); + return -EINVAL; + } + + /* Bitfield value retrieval is done in two steps; first relevant bytes are + * stored in num, then we left/right shift num to eliminate irrelevant bits. 
+ */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + for (i = t->size - 1; i >= 0; i--) + num = num * 256 + bytes[i]; + nr_copy_bits = bit_sz + bits_offset; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (i = 0; i < t->size; i++) + num = num * 256 + bytes[i]; + nr_copy_bits = t->size * 8 - bits_offset; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + left_shift_bits = 64 - nr_copy_bits; + right_shift_bits = 64 - bit_sz; + + *value = (num << left_shift_bits) >> right_shift_bits; + + return 0; +} + +static int btf_dump_bitfield_check_zero(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 check_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &check_num); + if (err) + return err; + if (check_num == 0) + return -ENODATA; + return 0; +} + +static int btf_dump_bitfield_data(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __u64 print_num; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, &print_num); + if (err) + return err; + + btf_dump_type_values(d, "0x%llx", (unsigned long long)print_num); + + return 0; +} + +/* ints, floats and ptrs */ +static int btf_dump_base_type_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + static __u8 bytecmp[16] = {}; + int nr_bytes; + + /* For pointer types, pointer size is not defined on a per-type basis. + * On dump creation however, we store the pointer size. + */ + if (btf_kind(t) == BTF_KIND_PTR) + nr_bytes = d->ptr_sz; + else + nr_bytes = t->size; + + if (nr_bytes < 1 || nr_bytes > 16) { + pr_warn("unexpected size %d for id [%u]\n", nr_bytes, id); + return -EINVAL; + } + + if (memcmp(data, bytecmp, nr_bytes) == 0) + return -ENODATA; + return 0; +} + +static bool ptr_is_aligned(const struct btf *btf, __u32 type_id, + const void *data) +{ + int alignment = btf__align_of(btf, type_id); + + if (alignment == 0) + return false; + + return ((uintptr_t)data) % alignment == 0; +} + +static int btf_dump_int_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data, + __u8 bits_offset) +{ + __u8 encoding = btf_int_encoding(t); + bool sign = encoding & BTF_INT_SIGNED; + char buf[16] __attribute__((aligned(16))); + int sz = t->size; + + if (sz == 0 || sz > sizeof(buf)) { + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + + /* handle packed int data - accesses of integers not aligned on + * int boundaries can cause problems on some platforms. + */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(buf, data, sz); + data = buf; + } + + switch (sz) { + case 16: { + const __u64 *ints = data; + __u64 lsi, msi; + + /* avoid use of __int128 as some 32-bit platforms do not + * support it. 
+ */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + lsi = ints[0]; + msi = ints[1]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + lsi = ints[1]; + msi = ints[0]; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (msi == 0) + btf_dump_type_values(d, "0x%llx", (unsigned long long)lsi); + else + btf_dump_type_values(d, "0x%llx%016llx", (unsigned long long)msi, + (unsigned long long)lsi); + break; + } + case 8: + if (sign) + btf_dump_type_values(d, "%lld", *(long long *)data); + else + btf_dump_type_values(d, "%llu", *(unsigned long long *)data); + break; + case 4: + if (sign) + btf_dump_type_values(d, "%d", *(__s32 *)data); + else + btf_dump_type_values(d, "%u", *(__u32 *)data); + break; + case 2: + if (sign) + btf_dump_type_values(d, "%d", *(__s16 *)data); + else + btf_dump_type_values(d, "%u", *(__u16 *)data); + break; + case 1: + if (d->typed_dump->is_array_char) { + /* check for null terminator */ + if (d->typed_dump->is_array_terminated) + break; + if (*(char *)data == '\0') { + d->typed_dump->is_array_terminated = true; + break; + } + if (isprint(*(char *)data)) { + btf_dump_type_values(d, "'%c'", *(char *)data); + break; + } + } + if (sign) + btf_dump_type_values(d, "%d", *(__s8 *)data); + else + btf_dump_type_values(d, "%u", *(__u8 *)data); + break; + default: + pr_warn("unexpected sz %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +union float_data { + long double ld; + double d; + float f; +}; + +static int btf_dump_float_data(struct btf_dump *d, + const struct btf_type *t, + __u32 type_id, + const void *data) +{ + const union float_data *flp = data; + union float_data fl; + int sz = t->size; + + /* handle unaligned data; copy to local union */ + if (!ptr_is_aligned(d->btf, type_id, data)) { + memcpy(&fl, data, sz); + flp = &fl; + } + + switch (sz) { + case 16: + btf_dump_type_values(d, "%Lf", flp->ld); + break; + case 8: + btf_dump_type_values(d, "%lf", flp->d); + break; + case 4: + btf_dump_type_values(d, "%f", flp->f); + break; + default: + pr_warn("unexpected size %d for id [%u]\n", sz, type_id); + return -EINVAL; + } + return 0; +} + +static int btf_dump_var_data(struct btf_dump *d, + const struct btf_type *v, + __u32 id, + const void *data) +{ + enum btf_func_linkage linkage = btf_var(v)->linkage; + const struct btf_type *t; + const char *l; + __u32 type_id; + + switch (linkage) { + case BTF_FUNC_STATIC: + l = "static "; + break; + case BTF_FUNC_EXTERN: + l = "extern "; + break; + case BTF_FUNC_GLOBAL: + default: + l = ""; + break; + } + + /* format of output here is [linkage] [type] [varname] = (type)value, + * for example "static int cpu_profile_flip = (int)1" + */ + btf_dump_printf(d, "%s", l); + type_id = v->type; + t = btf__type_by_id(d->btf, type_id); + btf_dump_emit_type_cast(d, type_id, false); + btf_dump_printf(d, " %s = ", btf_name_of(d, v->name_off)); + return btf_dump_dump_type_data(d, NULL, t, type_id, data, 0, 0); +} + +static int btf_dump_array_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 i, elem_type_id; + __s64 elem_size; + bool is_array_member; + + elem_type_id = array->type; + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + elem_size = btf__resolve_size(d->btf, elem_type_id); + if (elem_size <= 0) { + pr_warn("unexpected elem size %zd for array type [%u]\n", + (ssize_t)elem_size, id); + return -EINVAL; + } + + if (btf_is_int(elem_type)) { + /* + * BTF_INT_CHAR encoding 
never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + d->typed_dump->is_array_char = true; + } + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "[%s", btf_dump_data_newline(d)); + + /* may be a multidimensional array, so store current "is array member" + * status so we can restore it correctly later. + */ + is_array_member = d->typed_dump->is_array_member; + d->typed_dump->is_array_member = true; + for (i = 0; i < array->nelems; i++, data += elem_size) { + if (d->typed_dump->is_array_terminated) + break; + btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); + } + d->typed_dump->is_array_member = is_array_member; + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "]"); + + return 0; +} + +static int btf_dump_struct_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + int i, err = 0; + + /* note that we increment depth before calling btf_dump_print() below; + * this is intentional. btf_dump_data_newline() will not print a + * newline for depth 0 (since this leaves us with trailing newlines + * at the end of typed display), so depth is incremented first. + * For similar reasons, we decrement depth before showing the closing + * parenthesis. + */ + d->typed_dump->depth++; + btf_dump_printf(d, "{%s", btf_dump_data_newline(d)); + + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + const char *mname; + __u32 moffset; + __u8 bit_sz; + + mtype = btf__type_by_id(d->btf, m->type); + mname = btf_name_of(d, m->name_off); + moffset = btf_member_bit_offset(t, i); + + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_dump_type_data(d, mname, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err < 0) + return err; + } + d->typed_dump->depth--; + btf_dump_data_pfx(d); + btf_dump_type_values(d, "}"); + return err; +} + +union ptr_data { + unsigned int p; + unsigned long long lp; +}; + +static int btf_dump_ptr_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + if (ptr_is_aligned(d->btf, id, data) && d->ptr_sz == sizeof(void *)) { + btf_dump_type_values(d, "%p", *(void **)data); + } else { + union ptr_data pt; + + memcpy(&pt, data, d->ptr_sz); + if (d->ptr_sz == 4) + btf_dump_type_values(d, "0x%x", pt.p); + else + btf_dump_type_values(d, "0x%llx", pt.lp); + } + return 0; +} + +static int btf_dump_get_enum_value(struct btf_dump *d, + const struct btf_type *t, + const void *data, + __u32 id, + __s64 *value) +{ + bool is_signed = btf_kflag(t); + + if (!ptr_is_aligned(d->btf, id, data)) { + __u64 val; + int err; + + err = btf_dump_get_bitfield_value(d, t, data, 0, 0, &val); + if (err) + return err; + *value = (__s64)val; + return 0; + } + + switch (t->size) { + case 8: + *value = *(__s64 *)data; + return 0; + case 4: + *value = is_signed ? (__s64)*(__s32 *)data : *(__u32 *)data; + return 0; + case 2: + *value = is_signed ? *(__s16 *)data : *(__u16 *)data; + return 0; + case 1: + *value = is_signed ? 
*(__s8 *)data : *(__u8 *)data; + return 0; + default: + pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); + return -EINVAL; + } +} + +static int btf_dump_enum_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + bool is_signed; + __s64 value; + int i, err; + + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + + is_signed = btf_kflag(t); + if (btf_is_enum(t)) { + const struct btf_enum *e; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%d" : "%u", value); + } else { + const struct btf_enum64 *e; + + for (i = 0, e = btf_enum64(t); i < btf_vlen(t); i++, e++) { + if (value != btf_enum64_value(e)) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%lldLL" : "%lluULL", + (unsigned long long)value); + } + return 0; +} + +static int btf_dump_datasec_data(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + __u32 i; + int err; + + btf_dump_type_values(d, "SEC(\"%s\") ", btf_name_of(d, t->name_off)); + + for (i = 0, vsi = btf_var_secinfos(t); i < btf_vlen(t); i++, vsi++) { + var = btf__type_by_id(d->btf, vsi->type); + err = btf_dump_dump_type_data(d, NULL, var, vsi->type, data + vsi->offset, 0, 0); + if (err < 0) + return err; + btf_dump_printf(d, ";"); + } + return 0; +} + +/* return size of type, or if base type overflows, return -E2BIG. */ +static int btf_dump_type_data_check_overflow(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 size; + + if (bit_sz) { + /* bits_offset is at most 7. bit_sz is at most 128. */ + __u8 nr_bytes = (bits_offset + bit_sz + 7) / 8; + + /* When bit_sz is non zero, it is called from + * btf_dump_struct_data() where it only cares about + * negative error value. + * Return nr_bytes in success case to make it + * consistent as the regular integer case below. + */ + return data + nr_bytes > d->typed_dump->data_end ? -E2BIG : nr_bytes; + } + + size = btf__resolve_size(d->btf, id); + + if (size < 0 || size >= INT_MAX) { + pr_warn("unexpected size [%zu] for id [%u]\n", + (size_t)size, id); + return -EINVAL; + } + + /* Only do overflow checking for base types; we do not want to + * avoid showing part of a struct, union or array, even if we + * do not have enough data to show the full object. By + * restricting overflow checking to base types we can ensure + * that partial display succeeds, while avoiding overflowing + * and using bogus data for display. 
+ */ + t = skip_mods_and_typedefs(d->btf, id, NULL); + if (!t) { + pr_warn("unexpected error skipping mods/typedefs for id [%u]\n", + id); + return -EINVAL; + } + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (data + bits_offset / 8 + size > d->typed_dump->data_end) + return -E2BIG; + break; + default: + break; + } + return (int)size; +} + +static int btf_dump_type_data_check_zero(struct btf_dump *d, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + __s64 value; + int i, err; + + /* toplevel exceptions; we show zero values if + * - we ask for them (emit_zeros) + * - if we are at top-level so we see "struct empty { }" + * - or if we are an array member and the array is non-empty and + * not a char array; we don't want to be in a situation where we + * have an integer array 0, 1, 0, 1 and only show non-zero values. + * If the array contains zeroes only, or is a char array starting + * with a '\0', the array-level check_zero() will prevent showing it; + * we are concerned with determining zero value at the array member + * level here. + */ + if (d->typed_dump->emit_zeroes || d->typed_dump->depth == 0 || + (d->typed_dump->is_array_member && + !d->typed_dump->is_array_char)) + return 0; + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_INT: + if (bit_sz) + return btf_dump_bitfield_check_zero(d, t, data, bits_offset, bit_sz); + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_FLOAT: + case BTF_KIND_PTR: + return btf_dump_base_type_check_zero(d, t, id, data); + case BTF_KIND_ARRAY: { + const struct btf_array *array = btf_array(t); + const struct btf_type *elem_type; + __u32 elem_type_id, elem_size; + bool ischar; + + elem_type_id = array->type; + elem_size = btf__resolve_size(d->btf, elem_type_id); + elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); + + ischar = btf_is_int(elem_type) && elem_size == 1; + + /* check all elements; if _any_ element is nonzero, all + * of array is displayed. We make an exception however + * for char arrays where the first element is 0; these + * are considered zeroed also, even if later elements are + * non-zero because the string is terminated. + */ + for (i = 0; i < array->nelems; i++) { + if (i == 0 && ischar && *(char *)data == 0) + return -ENODATA; + err = btf_dump_type_data_check_zero(d, elem_type, + elem_type_id, + data + + (i * elem_size), + bits_offset, 0); + if (err != -ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + __u16 n = btf_vlen(t); + + /* if any struct/union member is non-zero, the struct/union + * is considered non-zero and dumped. + */ + for (i = 0; i < n; i++, m++) { + const struct btf_type *mtype; + __u32 moffset; + + mtype = btf__type_by_id(d->btf, m->type); + moffset = btf_member_bit_offset(t, i); + + /* btf_int_bits() does not store member bitfield size; + * bitfield size needs to be stored here so int display + * of member can retrieve it. 
+ */ + bit_sz = btf_member_bitfield_size(t, i); + err = btf_dump_type_data_check_zero(d, mtype, m->type, data + moffset / 8, + moffset % 8, bit_sz); + if (err != ENODATA) + return err; + } + return -ENODATA; + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + err = btf_dump_get_enum_value(d, t, data, id, &value); + if (err) + return err; + if (value == 0) + return -ENODATA; + return 0; + default: + return 0; + } +} + +/* returns size of data dumped, or error. */ +static int btf_dump_dump_type_data(struct btf_dump *d, + const char *fname, + const struct btf_type *t, + __u32 id, + const void *data, + __u8 bits_offset, + __u8 bit_sz) +{ + int size, err = 0; + + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset, bit_sz); + if (size < 0) + return size; + err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); + if (err) { + /* zeroed data is expected and not an error, so simply skip + * dumping such data. Record other errors however. + */ + if (err == -ENODATA) + return size; + return err; + } + btf_dump_data_pfx(d); + + if (!d->typed_dump->skip_names) { + if (fname && strlen(fname) > 0) + btf_dump_printf(d, ".%s = ", fname); + btf_dump_emit_type_cast(d, id, true); + } + + t = skip_mods_and_typedefs(d->btf, id, NULL); + + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_DECL_TAG: + err = btf_dump_unsupported_data(d, t, id); + break; + case BTF_KIND_INT: + if (bit_sz) + err = btf_dump_bitfield_data(d, t, data, bits_offset, bit_sz); + else + err = btf_dump_int_data(d, t, id, data, bits_offset); + break; + case BTF_KIND_FLOAT: + err = btf_dump_float_data(d, t, id, data); + break; + case BTF_KIND_PTR: + err = btf_dump_ptr_data(d, t, id, data); + break; + case BTF_KIND_ARRAY: + err = btf_dump_array_data(d, t, id, data); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + err = btf_dump_struct_data(d, t, id, data); + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* handle bitfield and int enum values */ + if (bit_sz) { + __u64 print_num; + __s64 enum_val; + + err = btf_dump_get_bitfield_value(d, t, data, bits_offset, bit_sz, + &print_num); + if (err) + break; + enum_val = (__s64)print_num; + err = btf_dump_enum_data(d, t, id, &enum_val); + } else + err = btf_dump_enum_data(d, t, id, data); + break; + case BTF_KIND_VAR: + err = btf_dump_var_data(d, t, id, data); + break; + case BTF_KIND_DATASEC: + err = btf_dump_datasec_data(d, t, id, data); + break; + default: + pr_warn("unexpected kind [%u] for id [%u]\n", + BTF_INFO_KIND(t->info), id); + return -EINVAL; + } + if (err < 0) + return err; + return size; +} + +int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, + const void *data, size_t data_sz, + const struct btf_dump_type_data_opts *opts) +{ + struct btf_dump_data typed_dump = {}; + const struct btf_type *t; + int ret; + + if (!OPTS_VALID(opts, btf_dump_type_data_opts)) + return libbpf_err(-EINVAL); + + t = btf__type_by_id(d->btf, id); + if (!t) + return libbpf_err(-ENOENT); + + d->typed_dump = &typed_dump; + d->typed_dump->data_end = data + data_sz; + d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); + + /* default indent string is a tab */ + if (!OPTS_GET(opts, indent_str, NULL)) + d->typed_dump->indent_str[0] = '\t'; + else + libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str, + sizeof(d->typed_dump->indent_str)); + + d->typed_dump->compact = OPTS_GET(opts, compact, false); + d->typed_dump->skip_names = OPTS_GET(opts, skip_names, false); 
+ d->typed_dump->emit_zeroes = OPTS_GET(opts, emit_zeroes, false); + + ret = btf_dump_dump_type_data(d, NULL, t, id, data, 0, 0); + + d->typed_dump = NULL; + + return libbpf_err(ret); +} diff --git a/third_party/libbpf/src/gen_loader.c b/third_party/libbpf/src/gen_loader.c new file mode 100644 index 0000000..cf3323f --- /dev/null +++ b/third_party/libbpf/src/gen_loader.c @@ -0,0 +1,1123 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" +#include "skel_internal.h" +#include + +#define MAX_USED_MAPS 64 +#define MAX_USED_PROGS 32 +#define MAX_KFUNC_DESCS 256 +#define MAX_FD_ARRAY_SZ (MAX_USED_MAPS + MAX_KFUNC_DESCS) + +/* The following structure describes the stack layout of the loader program. + * In addition R6 contains the pointer to context. + * R7 contains the result of the last sys_bpf command (typically error or FD). + * R9 contains the result of the last sys_close command. + * + * Naming convention: + * ctx - bpf program context + * stack - bpf program stack + * blob - bpf_attr-s, strings, insns, map data. + * All the bytes that loader prog will use for read/write. + */ +struct loader_stack { + __u32 btf_fd; + __u32 inner_map_fd; + __u32 prog_fd[MAX_USED_PROGS]; +}; + +#define stack_off(field) \ + (__s16)(-sizeof(struct loader_stack) + offsetof(struct loader_stack, field)) + +#define attr_field(attr, field) (attr + offsetof(union bpf_attr, field)) + +static int blob_fd_array_off(struct bpf_gen *gen, int index) +{ + return gen->fd_array + index * sizeof(int); +} + +static int realloc_insn_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->insn_cur - gen->insn_start; + void *insn_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + insn_start = realloc(gen->insn_start, off + size); + if (!insn_start) { + gen->error = -ENOMEM; + free(gen->insn_start); + gen->insn_start = NULL; + return -ENOMEM; + } + gen->insn_start = insn_start; + gen->insn_cur = insn_start + off; + return 0; +} + +static int realloc_data_buf(struct bpf_gen *gen, __u32 size) +{ + size_t off = gen->data_cur - gen->data_start; + void *data_start; + + if (gen->error) + return gen->error; + if (size > INT32_MAX || off + size > INT32_MAX) { + gen->error = -ERANGE; + return -ERANGE; + } + data_start = realloc(gen->data_start, off + size); + if (!data_start) { + gen->error = -ENOMEM; + free(gen->data_start); + gen->data_start = NULL; + return -ENOMEM; + } + gen->data_start = data_start; + gen->data_cur = data_start + off; + return 0; +} + +static void emit(struct bpf_gen *gen, struct bpf_insn insn) +{ + if (realloc_insn_buf(gen, sizeof(insn))) + return; + memcpy(gen->insn_cur, &insn, sizeof(insn)); + gen->insn_cur += sizeof(insn); +} + +static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn insn2) +{ + emit(gen, insn1); + emit(gen, insn2); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size); +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) +{ + size_t stack_sz = sizeof(struct loader_stack), nr_progs_sz; + int i; + + gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); + gen->log_level = log_level; + /* save ctx pointer 
into R6 */ + emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); + + /* bzero stack */ + emit(gen, BPF_MOV64_REG(BPF_REG_1, BPF_REG_10)); + emit(gen, BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, stack_sz)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + /* amount of stack actually used, only used to calculate iterations, not stack offset */ + nr_progs_sz = offsetof(struct loader_stack, prog_fd[nr_progs]); + /* jump over cleanup code */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, + /* size of cleanup code below (including map fd cleanup) */ + (nr_progs_sz / 4) * 3 + 2 + + /* 6 insns for emit_sys_close_blob, + * 6 insns for debug_regs in emit_sys_close_blob + */ + nr_maps * (6 + (gen->log_level ? 6 : 0)))); + + /* remember the label where all error branches will jump to */ + gen->cleanup_label = gen->insn_cur - gen->insn_start; + /* emit cleanup code: close all temp FDs */ + for (i = 0; i < nr_progs_sz; i += 4) { + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -stack_sz + i)); + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + } + for (i = 0; i < nr_maps; i++) + emit_sys_close_blob(gen, blob_fd_array_off(gen, i)); + /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. */ + emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); + emit(gen, BPF_EXIT_INSN()); +} + +static int add_data(struct bpf_gen *gen, const void *data, __u32 size) +{ + __u32 size8 = roundup(size, 8); + __u64 zero = 0; + void *prev; + + if (realloc_data_buf(gen, size8)) + return 0; + prev = gen->data_cur; + if (data) { + memcpy(gen->data_cur, data, size); + memcpy(gen->data_cur + size, &zero, size8 - size); + } else { + memset(gen->data_cur, 0, size8); + } + gen->data_cur += size8; + return prev - gen->data_start; +} + +/* Get index for map_fd/btf_fd slot in reserved fd_array, or in data relative + * to start of fd_array. Caller can decide if it is usable or not. 
+ */ +static int add_map_fd(struct bpf_gen *gen) +{ + if (gen->nr_maps == MAX_USED_MAPS) { + pr_warn("Total maps exceeds %d\n", MAX_USED_MAPS); + gen->error = -E2BIG; + return 0; + } + return gen->nr_maps++; +} + +static int add_kfunc_btf_fd(struct bpf_gen *gen) +{ + int cur; + + if (gen->nr_fd_array == MAX_KFUNC_DESCS) { + cur = add_data(gen, NULL, sizeof(int)); + return (cur - gen->fd_array) / sizeof(int); + } + return MAX_USED_MAPS + gen->nr_fd_array++; +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* *(u64 *)(blob + off) = (u64)(void *)(blob + data) */ +static void emit_rel_store(struct bpf_gen *gen, int off, int data) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, data)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2blob(struct bpf_gen *gen, int off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_2, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_blob2ctx(struct bpf_gen *gen, int ctx_off, int size, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_1, 0)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void move_ctx2blob(struct bpf_gen *gen, int off, int size, int ctx_off, + bool check_non_zero) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_6, ctx_off)); + if (check_non_zero) + /* If value in ctx is zero don't update the blob. 
+ * For example: when ctx->map.max_entries == 0, keep default max_entries from bpf.c + */ + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2blob(struct bpf_gen *gen, int off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_1, BPF_REG_0, 0)); +} + +static void move_stack2ctx(struct bpf_gen *gen, int ctx_off, int size, int stack_off) +{ + emit(gen, BPF_LDX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_0, BPF_REG_10, stack_off)); + emit(gen, BPF_STX_MEM(insn_bytes_to_bpf_size(size), BPF_REG_6, BPF_REG_0, ctx_off)); +} + +static void emit_sys_bpf(struct bpf_gen *gen, int cmd, int attr, int attr_size) +{ + emit(gen, BPF_MOV64_IMM(BPF_REG_1, cmd)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, attr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, attr_size)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_bpf)); + /* remember the result in R7 */ + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); +} + +static bool is_simm16(__s64 value) +{ + return value == (__s64)(__s16)value; +} + +static void emit_check_err(struct bpf_gen *gen) +{ + __s64 off = -(gen->insn_cur - gen->insn_start - gen->cleanup_label) / 8 - 1; + + /* R7 contains result of last sys_bpf command. + * if (R7 < 0) goto cleanup; + */ + if (is_simm16(off)) { + emit(gen, BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0, off)); + } else { + gen->error = -ERANGE; + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, -1)); + } +} + +/* reg1 and reg2 should not be R1 - R5. They can be R0, R6 - R10 */ +static void emit_debug(struct bpf_gen *gen, int reg1, int reg2, + const char *fmt, va_list args) +{ + char buf[1024]; + int addr, len, ret; + + if (!gen->log_level) + return; + ret = vsnprintf(buf, sizeof(buf), fmt, args); + if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0) + /* The special case to accommodate common debug_ret(): + * to avoid specifying BPF_REG_7 and adding " r=%%d" to + * prints explicitly. + */ + strcat(buf, " r=%d"); + len = strlen(buf) + 1; + addr = add_data(gen, buf, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, addr)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + if (reg1 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1)); + if (reg2 >= 0) + emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk)); +} + +static void debug_regs(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, reg1, reg2, fmt, args); + va_end(args); +} + +static void debug_ret(struct bpf_gen *gen, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + emit_debug(gen, BPF_REG_7, -1, fmt, args); + va_end(args); +} + +static void __emit_sys_close(struct bpf_gen *gen) +{ + emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, + /* 2 is the number of the following insns + * * 6 is additional insns in debug_regs + */ + 2 + (gen->log_level ? 
6 : 0))); + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); + debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d"); +} + +static void emit_sys_close_stack(struct bpf_gen *gen, int stack_off) +{ + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off)); + __emit_sys_close(gen); +} + +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) +{ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_off)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0)); + __emit_sys_close(gen); +} + +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) +{ + int i; + + if (nr_progs < gen->nr_progs || nr_maps != gen->nr_maps) { + pr_warn("nr_progs %d/%d nr_maps %d/%d mismatch\n", + nr_progs, gen->nr_progs, nr_maps, gen->nr_maps); + gen->error = -EFAULT; + return gen->error; + } + emit_sys_close_stack(gen, stack_off(btf_fd)); + for (i = 0; i < gen->nr_progs; i++) + move_stack2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * gen->nr_maps + + sizeof(struct bpf_prog_desc) * i + + offsetof(struct bpf_prog_desc, prog_fd), 4, + stack_off(prog_fd[i])); + for (i = 0; i < gen->nr_maps; i++) + move_blob2ctx(gen, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * i + + offsetof(struct bpf_map_desc, map_fd), 4, + blob_fd_array_off(gen, i)); + emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0)); + emit(gen, BPF_EXIT_INSN()); + pr_debug("gen: finish %d\n", gen->error); + if (!gen->error) { + struct gen_loader_opts *opts = gen->opts; + + opts->insns = gen->insn_start; + opts->insns_sz = gen->insn_cur - gen->insn_start; + opts->data = gen->data_start; + opts->data_sz = gen->data_cur - gen->data_start; + } + return gen->error; +} + +void bpf_gen__free(struct bpf_gen *gen) +{ + if (!gen) + return; + free(gen->data_start); + free(gen->insn_start); + free(gen); +} + +void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, + __u32 btf_raw_size) +{ + int attr_size = offsetofend(union bpf_attr, btf_log_level); + int btf_data, btf_load_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: load_btf: size %d\n", btf_raw_size); + btf_data = add_data(gen, btf_raw_data, btf_raw_size); + + attr.btf_size = btf_raw_size; + btf_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(btf_load_attr, btf_log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with a pointer to the BTF data */ + emit_rel_store(gen, attr_field(btf_load_attr, btf), btf_data); + /* emit BTF_LOAD command */ + emit_sys_bpf(gen, BPF_BTF_LOAD, btf_load_attr, attr_size); + debug_ret(gen, "btf_load size %d", btf_raw_size); + emit_check_err(gen); + /* remember btf_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(btf_fd))); +} + +void bpf_gen__map_create(struct bpf_gen *gen, + enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, __u32 value_size, __u32 max_entries, + struct bpf_map_create_opts *map_attr, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_extra); + bool close_inner_map_fd = false; + int map_create_attr, idx; + union bpf_attr attr; + + 
memset(&attr, 0, attr_size); + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.map_flags = map_attr->map_flags; + attr.map_extra = map_attr->map_extra; + if (map_name) + libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.numa_node = map_attr->numa_node; + attr.map_ifindex = map_attr->map_ifindex; + attr.max_entries = max_entries; + attr.btf_key_type_id = map_attr->btf_key_type_id; + attr.btf_value_type_id = map_attr->btf_value_type_id; + + pr_debug("gen: map_create: %s idx %d type %d value_type_id %d\n", + attr.map_name, map_idx, map_type, attr.btf_value_type_id); + + map_create_attr = add_data(gen, &attr, attr_size); + if (attr.btf_value_type_id) + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(map_create_attr, btf_fd), 4, + stack_off(btf_fd)); + switch (attr.map_type) { + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + move_stack2blob(gen, attr_field(map_create_attr, inner_map_fd), 4, + stack_off(inner_map_fd)); + close_inner_map_fd = true; + break; + default: + break; + } + /* conditionally update max_entries */ + if (map_idx >= 0) + move_ctx2blob(gen, attr_field(map_create_attr, max_entries), 4, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, max_entries), + true /* check that max_entries != 0 */); + /* emit MAP_CREATE command */ + emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size); + debug_ret(gen, "map_create %s idx %d type %d value_size %d value_btf_id %d", + attr.map_name, map_idx, map_type, value_size, + attr.btf_value_type_id); + emit_check_err(gen); + /* remember map_fd in the stack, if successful */ + if (map_idx < 0) { + /* This bpf_gen__map_create() function is called with map_idx >= 0 + * for all maps that libbpf loading logic tracks. + * It's called with -1 to create an inner map. 
+ */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(inner_map_fd))); + } else if (map_idx != gen->nr_maps) { + gen->error = -EDOM; /* internal bug */ + return; + } else { + /* add_map_fd does gen->nr_maps++ */ + idx = add_map_fd(gen); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, idx))); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_7, 0)); + } + if (close_inner_map_fd) + emit_sys_close_stack(gen, stack_off(inner_map_fd)); +} + +void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, + enum bpf_attach_type type) +{ + const char *prefix; + int kind, ret; + + btf_get_kernel_prefix_kind(type, &prefix, &kind); + gen->attach_kind = kind; + ret = snprintf(gen->attach_target, sizeof(gen->attach_target), "%s%s", + prefix, attach_name); + if (ret >= sizeof(gen->attach_target)) + gen->error = -ENOSPC; +} + +static void emit_find_attach_target(struct bpf_gen *gen) +{ + int name, len = strlen(gen->attach_target) + 1; + + pr_debug("gen: find_attach_tgt %s %d\n", gen->attach_target, gen->attach_kind); + name = add_data(gen, gen->attach_target, len); + + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, gen->attach_kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", + gen->attach_target, gen->attach_kind); + emit_check_err(gen); + /* if successful, btf_id is in lower 32-bit of R7 and + * btf_obj_fd is in upper 32-bit + */ +} + +void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, + bool is_typeless, bool is_ld64, int kind, int insn_idx) +{ + struct ksym_relo_desc *relo; + + relo = libbpf_reallocarray(gen->relos, gen->relo_cnt + 1, sizeof(*relo)); + if (!relo) { + gen->error = -ENOMEM; + return; + } + gen->relos = relo; + relo += gen->relo_cnt; + relo->name = name; + relo->is_weak = is_weak; + relo->is_typeless = is_typeless; + relo->is_ld64 = is_ld64; + relo->kind = kind; + relo->insn_idx = insn_idx; + gen->relo_cnt++; +} + +/* returns existing ksym_desc with ref incremented, or inserts a new one */ +static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + struct ksym_desc *kdesc; + int i; + + for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; + if (kdesc->kind == relo->kind && kdesc->is_ld64 == relo->is_ld64 && + !strcmp(kdesc->name, relo->name)) { + kdesc->ref++; + return kdesc; + } + } + kdesc = libbpf_reallocarray(gen->ksyms, gen->nr_ksyms + 1, sizeof(*kdesc)); + if (!kdesc) { + gen->error = -ENOMEM; + return NULL; + } + gen->ksyms = kdesc; + kdesc = &gen->ksyms[gen->nr_ksyms++]; + kdesc->name = relo->name; + kdesc->kind = relo->kind; + kdesc->ref = 1; + kdesc->off = 0; + kdesc->insn = 0; + kdesc->is_ld64 = relo->is_ld64; + return kdesc; +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + */ +static void emit_bpf_find_by_name_kind(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1; + + name_off = add_data(gen, relo->name, len); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, relo->kind)); + emit(gen, BPF_MOV64_IMM(BPF_REG_4, 0)); + emit(gen, 
BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "find_by_name_kind(%s,%d)", relo->name, relo->kind); +} + +/* Overwrites BPF_REG_{0, 1, 2, 3, 4, 7} + * Returns result in BPF_REG_7 + * Returns u64 symbol addr in BPF_REG_9 + */ +static void emit_bpf_kallsyms_lookup_name(struct bpf_gen *gen, struct ksym_relo_desc *relo) +{ + int name_off, len = strlen(relo->name) + 1, res_off; + + name_off = add_data(gen, relo->name, len); + res_off = add_data(gen, NULL, 8); /* res is u64 */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, name_off)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, len)); + emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_4, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, res_off)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_4)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_kallsyms_lookup_name)); + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0)); + emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0)); + debug_ret(gen, "kallsyms_lookup_name(%s,%d)", relo->name, relo->kind); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + * + * We need to reuse BTF fd for same symbol otherwise each relocation takes a new + * index, while kernel limits total kfunc BTFs to 256. For duplicate symbols, + * this would mean a new BTF fd index for each entry. By pairing symbol name + * with index, we get the insn->imm, insn->off pairing that kernel uses for + * kfunc_tab, which becomes the effective limit even though all of them may + * share same index in fd_array (such that kfunc_btf_tab has 1 element). + */ +static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + int btf_fd_idx; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing bpf_insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + offsetof(struct bpf_insn, off), 2, + kdesc->insn + offsetof(struct bpf_insn, off)); + goto log; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* get index in fd_array to store BTF FD at */ + btf_fd_idx = add_kfunc_btf_fd(gen); + if (btf_fd_idx > INT16_MAX) { + pr_warn("BTF fd off %d for kfunc %s exceeds INT16_MAX, cannot process relocation\n", + btf_fd_idx, relo->name); + gen->error = -E2BIG; + return; + } + kdesc->off = btf_fd_idx; + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set value for imm, off as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 10)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* obtain fd in BPF_REG_9 */ + emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + /* load fd_array slot pointer */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); + /* store BTF fd in slot, 0 for vmlinux */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + /* jump to insn[insn_idx].off store if fd 
denotes module BTF */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); + /* set the default value for off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip BTF fd store for vmlinux BTF */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + /* store index into insn[insn_idx].off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); +log: + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, + offsetof(struct bpf_insn, off))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " func (%s:count=%d): imm: %%d, off: %%d", + relo->name, kdesc->ref); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, blob_fd_array_off(gen, kdesc->off))); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_0, 0)); + debug_regs(gen, BPF_REG_9, -1, " func (%s:count=%d): btf_fd", + relo->name, kdesc->ref); +} + +static void emit_ksym_relo_log(struct bpf_gen *gen, struct ksym_relo_desc *relo, + int ref) +{ + if (!gen->log_level) + return; + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_8, + offsetof(struct bpf_insn, imm))); + emit(gen, BPF_LDX_MEM(BPF_H, BPF_REG_9, BPF_REG_8, sizeof(struct bpf_insn) + + offsetof(struct bpf_insn, imm))); + debug_regs(gen, BPF_REG_7, BPF_REG_9, " var t=%d w=%d (%s:count=%d): imm[0]: %%d, imm[1]: %%d", + relo->is_typeless, relo->is_weak, relo->name, ref); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + debug_regs(gen, BPF_REG_9, -1, " var t=%d w=%d (%s:count=%d): insn.reg", + relo->is_typeless, relo->is_weak, relo->name, ref); +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_typeless(struct bpf_gen *gen, + struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + goto log; + } + /* remember insn offset, so we can copy ksym addr later */ + kdesc->insn = insn; + /* skip typeless ksym_desc in fd closing loop in cleanup_relos */ + kdesc->typeless = true; + emit_bpf_kallsyms_lookup_name(gen, relo); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_7, -ENOENT, 1)); + emit_check_err(gen); + /* store lower half of addr into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, offsetof(struct bpf_insn, imm))); + /* store upper half of addr into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_9, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); +log: + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +static __u32 src_reg_mask(void) +{ +#if defined(__LITTLE_ENDIAN_BITFIELD) + return 0x0f; /* src_reg,dst_reg,... */ +#elif defined(__BIG_ENDIAN_BITFIELD) + return 0xf0; /* dst_reg,src_reg,... 
*/ +#else +#error "Unsupported bit endianness, cannot proceed" +#endif +} + +/* Expects: + * BPF_REG_8 - pointer to instruction + */ +static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn) +{ + struct ksym_desc *kdesc; + __u32 reg_mask; + + kdesc = get_ksym_desc(gen, relo); + if (!kdesc) + return; + /* try to copy from existing ldimm64 insn */ + if (kdesc->ref > 1) { + move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + /* jump over src_reg adjustment if imm (btf_id) is not 0, reuse BPF_REG_0 from move_blob2blob + * If btf_id is zero, clear BPF_PSEUDO_BTF_ID flag in src_reg of ld_imm64 insn + */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); + goto clear_src_reg; + } + /* remember insn offset, so we can copy BTF ID and FD later */ + kdesc->insn = insn; + emit_bpf_find_by_name_kind(gen, relo); + if (!relo->is_weak) + emit_check_err(gen); + /* jump to success case */ + emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + /* set values for insn[insn_idx].imm, insn[insn_idx + 1].imm as 0 */ + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, offsetof(struct bpf_insn, imm), 0)); + emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_8, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 0)); + /* skip success case for ret < 0 */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); + /* store btf_id into insn[insn_idx].imm */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, offsetof(struct bpf_insn, imm))); + /* store btf_obj_fd into insn[insn_idx + 1].imm */ + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); + /* skip src_reg adjustment */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3)); +clear_src_reg: + /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ + reg_mask = src_reg_mask(); + emit(gen, BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_8, offsetofend(struct bpf_insn, code))); + emit(gen, BPF_ALU32_IMM(BPF_AND, BPF_REG_9, reg_mask)); + emit(gen, BPF_STX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, offsetofend(struct bpf_insn, code))); + + emit_ksym_relo_log(gen, relo, kdesc->ref); +} + +void bpf_gen__record_relo_core(struct bpf_gen *gen, + const struct bpf_core_relo *core_relo) +{ + struct bpf_core_relo *relos; + + relos = libbpf_reallocarray(gen->core_relos, gen->core_relo_cnt + 1, sizeof(*relos)); + if (!relos) { + gen->error = -ENOMEM; + return; + } + gen->core_relos = relos; + relos += gen->core_relo_cnt; + memcpy(relos, core_relo, sizeof(*relos)); + gen->core_relo_cnt++; +} + +static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insns) +{ + int insn; + + pr_debug("gen: emit_relo (%d): %s at %d %s\n", + relo->kind, relo->name, relo->insn_idx, relo->is_ld64 ? 
"ld64" : "call"); + insn = insns + sizeof(struct bpf_insn) * relo->insn_idx; + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_8, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn)); + if (relo->is_ld64) { + if (relo->is_typeless) + emit_relo_ksym_typeless(gen, relo, insn); + else + emit_relo_ksym_btf(gen, relo, insn); + } else { + emit_relo_kfunc_btf(gen, relo, insn); + } +} + +static void emit_relos(struct bpf_gen *gen, int insns) +{ + int i; + + for (i = 0; i < gen->relo_cnt; i++) + emit_relo(gen, gen->relos + i, insns); +} + +static void cleanup_core_relo(struct bpf_gen *gen) +{ + if (!gen->core_relo_cnt) + return; + free(gen->core_relos); + gen->core_relo_cnt = 0; + gen->core_relos = NULL; +} + +static void cleanup_relos(struct bpf_gen *gen, int insns) +{ + struct ksym_desc *kdesc; + int i, insn; + + for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; + /* only close fds for typed ksyms and kfuncs */ + if (kdesc->is_ld64 && !kdesc->typeless) { + /* close fd recorded in insn[insn_idx + 1].imm */ + insn = kdesc->insn; + insn += sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm); + emit_sys_close_blob(gen, insn); + } else if (!kdesc->is_ld64) { + emit_sys_close_blob(gen, blob_fd_array_off(gen, kdesc->off)); + if (kdesc->off < MAX_FD_ARRAY_SZ) + gen->nr_fd_array--; + } + } + if (gen->nr_ksyms) { + free(gen->ksyms); + gen->nr_ksyms = 0; + gen->ksyms = NULL; + } + if (gen->relo_cnt) { + free(gen->relos); + gen->relo_cnt = 0; + gen->relos = NULL; + } + cleanup_core_relo(gen); +} + +void bpf_gen__prog_load(struct bpf_gen *gen, + enum bpf_prog_type prog_type, const char *prog_name, + const char *license, struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *load_attr, int prog_idx) +{ + int prog_load_attr, license_off, insns_off, func_info, line_info, core_relos; + int attr_size = offsetofend(union bpf_attr, core_relo_rec_size); + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: prog_load: type %d insns_cnt %zd progi_idx %d\n", + prog_type, insn_cnt, prog_idx); + /* add license string to blob of bytes */ + license_off = add_data(gen, license, strlen(license) + 1); + /* add insns to blob of bytes */ + insns_off = add_data(gen, insns, insn_cnt * sizeof(struct bpf_insn)); + + attr.prog_type = prog_type; + attr.expected_attach_type = load_attr->expected_attach_type; + attr.attach_btf_id = load_attr->attach_btf_id; + attr.prog_ifindex = load_attr->prog_ifindex; + attr.kern_version = 0; + attr.insn_cnt = (__u32)insn_cnt; + attr.prog_flags = load_attr->prog_flags; + + attr.func_info_rec_size = load_attr->func_info_rec_size; + attr.func_info_cnt = load_attr->func_info_cnt; + func_info = add_data(gen, load_attr->func_info, + attr.func_info_cnt * attr.func_info_rec_size); + + attr.line_info_rec_size = load_attr->line_info_rec_size; + attr.line_info_cnt = load_attr->line_info_cnt; + line_info = add_data(gen, load_attr->line_info, + attr.line_info_cnt * attr.line_info_rec_size); + + attr.core_relo_rec_size = sizeof(struct bpf_core_relo); + attr.core_relo_cnt = gen->core_relo_cnt; + core_relos = add_data(gen, gen->core_relos, + attr.core_relo_cnt * attr.core_relo_rec_size); + + libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); + prog_load_attr = add_data(gen, &attr, attr_size); + + /* populate union bpf_attr with a pointer to license */ + emit_rel_store(gen, attr_field(prog_load_attr, license), license_off); + + /* populate union bpf_attr with a pointer to instructions */ + emit_rel_store(gen, attr_field(prog_load_attr, insns), insns_off); + + /* 
populate union bpf_attr with a pointer to func_info */ + emit_rel_store(gen, attr_field(prog_load_attr, func_info), func_info); + + /* populate union bpf_attr with a pointer to line_info */ + emit_rel_store(gen, attr_field(prog_load_attr, line_info), line_info); + + /* populate union bpf_attr with a pointer to core_relos */ + emit_rel_store(gen, attr_field(prog_load_attr, core_relos), core_relos); + + /* populate union bpf_attr fd_array with a pointer to data where map_fds are saved */ + emit_rel_store(gen, attr_field(prog_load_attr, fd_array), gen->fd_array); + + /* populate union bpf_attr with user provided log details */ + move_ctx2blob(gen, attr_field(prog_load_attr, log_level), 4, + offsetof(struct bpf_loader_ctx, log_level), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_size), 4, + offsetof(struct bpf_loader_ctx, log_size), false); + move_ctx2blob(gen, attr_field(prog_load_attr, log_buf), 8, + offsetof(struct bpf_loader_ctx, log_buf), false); + /* populate union bpf_attr with btf_fd saved in the stack earlier */ + move_stack2blob(gen, attr_field(prog_load_attr, prog_btf_fd), 4, + stack_off(btf_fd)); + if (gen->attach_kind) { + emit_find_attach_target(gen); + /* populate union bpf_attr with btf_id and btf_obj_fd found by helper */ + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, prog_load_attr)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_id))); + emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 32)); + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, + offsetof(union bpf_attr, attach_btf_obj_fd))); + } + emit_relos(gen, insns_off); + /* emit PROG_LOAD command */ + emit_sys_bpf(gen, BPF_PROG_LOAD, prog_load_attr, attr_size); + debug_ret(gen, "prog_load %s insn_cnt %d", attr.prog_name, attr.insn_cnt); + /* successful or not, close btf module FDs used in extern ksyms and attach_btf_obj_fd */ + cleanup_relos(gen, insns_off); + if (gen->attach_kind) { + emit_sys_close_blob(gen, + attr_field(prog_load_attr, attach_btf_obj_fd)); + gen->attach_kind = 0; + } + emit_check_err(gen); + /* remember prog_fd in the stack, if successful */ + emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, + stack_off(prog_fd[gen->nr_progs]))); + gen->nr_progs++; +} + +void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *pvalue, + __u32 value_size) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, value, key; + union bpf_attr attr; + int zero = 0; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_update_elem: idx %d\n", map_idx); + + value = add_data(gen, pvalue, value_size); + key = add_data(gen, &zero, sizeof(zero)); + + /* if (map_desc[map_idx].initial_value) { + * if (ctx->flags & BPF_SKEL_KERNEL) + * bpf_probe_read_kernel(value, value_size, initial_value); + * else + * bpf_copy_from_user(value, value_size, initial_value); + * } + */ + emit(gen, BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, + sizeof(struct bpf_loader_ctx) + + sizeof(struct bpf_map_desc) * map_idx + + offsetof(struct bpf_map_desc, initial_value))); + emit(gen, BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 8)); + emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, + 0, 0, 0, value)); + emit(gen, BPF_MOV64_IMM(BPF_REG_2, value_size)); + emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, + offsetof(struct bpf_loader_ctx, flags))); + emit(gen, BPF_JMP_IMM(BPF_JSET, BPF_REG_0, BPF_SKEL_KERNEL, 2)); + emit(gen, BPF_EMIT_CALL(BPF_FUNC_copy_from_user)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); + emit(gen, 
BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), value); + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "update_elem idx %d value_size %d", map_idx, value_size); + emit_check_err(gen); +} + +void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int slot, + int inner_map_idx) +{ + int attr_size = offsetofend(union bpf_attr, flags); + int map_update_attr, key; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: populate_outer_map: outer %d key %d inner %d\n", + outer_map_idx, slot, inner_map_idx); + + key = add_data(gen, &slot, sizeof(slot)); + + map_update_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_update_attr, map_fd), 4, + blob_fd_array_off(gen, outer_map_idx)); + emit_rel_store(gen, attr_field(map_update_attr, key), key); + emit_rel_store(gen, attr_field(map_update_attr, value), + blob_fd_array_off(gen, inner_map_idx)); + + /* emit MAP_UPDATE_ELEM command */ + emit_sys_bpf(gen, BPF_MAP_UPDATE_ELEM, map_update_attr, attr_size); + debug_ret(gen, "populate_outer_map outer %d key %d inner %d", + outer_map_idx, slot, inner_map_idx); + emit_check_err(gen); +} + +void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx) +{ + int attr_size = offsetofend(union bpf_attr, map_fd); + int map_freeze_attr; + union bpf_attr attr; + + memset(&attr, 0, attr_size); + pr_debug("gen: map_freeze: idx %d\n", map_idx); + map_freeze_attr = add_data(gen, &attr, attr_size); + move_blob2blob(gen, attr_field(map_freeze_attr, map_fd), 4, + blob_fd_array_off(gen, map_idx)); + /* emit MAP_FREEZE command */ + emit_sys_bpf(gen, BPF_MAP_FREEZE, map_freeze_attr, attr_size); + debug_ret(gen, "map_freeze"); + emit_check_err(gen); +} diff --git a/third_party/libbpf/src/hashmap.c b/third_party/libbpf/src/hashmap.c new file mode 100644 index 0000000..140ee40 --- /dev/null +++ b/third_party/libbpf/src/hashmap.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Generic non-thread safe hash map implementation. 
+ * + * Copyright (c) 2019 Facebook + */ +#include +#include +#include +#include +#include +#include "hashmap.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +/* start with 4 buckets */ +#define HASHMAP_MIN_CAP_BITS 2 + +static void hashmap_add_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + entry->next = *pprev; + *pprev = entry; +} + +static void hashmap_del_entry(struct hashmap_entry **pprev, + struct hashmap_entry *entry) +{ + *pprev = entry->next; + entry->next = NULL; +} + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx) +{ + map->hash_fn = hash_fn; + map->equal_fn = equal_fn; + map->ctx = ctx; + + map->buckets = NULL; + map->cap = 0; + map->cap_bits = 0; + map->sz = 0; +} + +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx) +{ + struct hashmap *map = malloc(sizeof(struct hashmap)); + + if (!map) + return ERR_PTR(-ENOMEM); + hashmap__init(map, hash_fn, equal_fn, ctx); + return map; +} + +void hashmap__clear(struct hashmap *map) +{ + struct hashmap_entry *cur, *tmp; + size_t bkt; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + free(cur); + } + free(map->buckets); + map->buckets = NULL; + map->cap = map->cap_bits = map->sz = 0; +} + +void hashmap__free(struct hashmap *map) +{ + if (IS_ERR_OR_NULL(map)) + return; + + hashmap__clear(map); + free(map); +} + +size_t hashmap__size(const struct hashmap *map) +{ + return map->sz; +} + +size_t hashmap__capacity(const struct hashmap *map) +{ + return map->cap; +} + +static bool hashmap_needs_to_grow(struct hashmap *map) +{ + /* grow if empty or more than 75% filled */ + return (map->cap == 0) || ((map->sz + 1) * 4 / 3 > map->cap); +} + +static int hashmap_grow(struct hashmap *map) +{ + struct hashmap_entry **new_buckets; + struct hashmap_entry *cur, *tmp; + size_t new_cap_bits, new_cap; + size_t h, bkt; + + new_cap_bits = map->cap_bits + 1; + if (new_cap_bits < HASHMAP_MIN_CAP_BITS) + new_cap_bits = HASHMAP_MIN_CAP_BITS; + + new_cap = 1UL << new_cap_bits; + new_buckets = calloc(new_cap, sizeof(new_buckets[0])); + if (!new_buckets) + return -ENOMEM; + + hashmap__for_each_entry_safe(map, cur, tmp, bkt) { + h = hash_bits(map->hash_fn(cur->key, map->ctx), new_cap_bits); + hashmap_add_entry(&new_buckets[h], cur); + } + + map->cap = new_cap; + map->cap_bits = new_cap_bits; + free(map->buckets); + map->buckets = new_buckets; + + return 0; +} + +static bool hashmap_find_entry(const struct hashmap *map, + const long key, size_t hash, + struct hashmap_entry ***pprev, + struct hashmap_entry **entry) +{ + struct hashmap_entry *cur, **prev_ptr; + + if (!map->buckets) + return false; + + for (prev_ptr = &map->buckets[hash], cur = *prev_ptr; + cur; + prev_ptr = &cur->next, cur = cur->next) { + if (map->equal_fn(cur->key, key, map->ctx)) { + if (pprev) + *pprev = prev_ptr; + *entry = cur; + return true; + } + } + + return false; +} + +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value) +{ + struct hashmap_entry *entry; + size_t h; + int err; + + if (old_key) + *old_key = 0; + if (old_value) + *old_value = 0; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (strategy != HASHMAP_APPEND && + hashmap_find_entry(map, key, h, NULL, &entry)) { + if (old_key) + *old_key 
= entry->key; + if (old_value) + *old_value = entry->value; + + if (strategy == HASHMAP_SET || strategy == HASHMAP_UPDATE) { + entry->key = key; + entry->value = value; + return 0; + } else if (strategy == HASHMAP_ADD) { + return -EEXIST; + } + } + + if (strategy == HASHMAP_UPDATE) + return -ENOENT; + + if (hashmap_needs_to_grow(map)) { + err = hashmap_grow(map); + if (err) + return err; + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + } + + entry = malloc(sizeof(struct hashmap_entry)); + if (!entry) + return -ENOMEM; + + entry->key = key; + entry->value = value; + hashmap_add_entry(&map->buckets[h], entry); + map->sz++; + + return 0; +} + +bool hashmap_find(const struct hashmap *map, long key, long *value) +{ + struct hashmap_entry *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, NULL, &entry)) + return false; + + if (value) + *value = entry->value; + return true; +} + +bool hashmap_delete(struct hashmap *map, long key, + long *old_key, long *old_value) +{ + struct hashmap_entry **pprev, *entry; + size_t h; + + h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); + if (!hashmap_find_entry(map, key, h, &pprev, &entry)) + return false; + + if (old_key) + *old_key = entry->key; + if (old_value) + *old_value = entry->value; + + hashmap_del_entry(pprev, entry); + free(entry); + map->sz--; + + return true; +} diff --git a/third_party/libbpf/src/hashmap.h b/third_party/libbpf/src/hashmap.h new file mode 100644 index 0000000..c12f832 --- /dev/null +++ b/third_party/libbpf/src/hashmap.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Generic non-thread safe hash map implementation. + * + * Copyright (c) 2019 Facebook + */ +#ifndef __LIBBPF_HASHMAP_H +#define __LIBBPF_HASHMAP_H + +#include +#include +#include + +static inline size_t hash_bits(size_t h, int bits) +{ + /* shuffle bits and return requested number of upper bits */ + if (bits == 0) + return 0; + +#if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) + /* LP64 case */ + return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); +#elif (__SIZEOF_SIZE_T__ <= __SIZEOF_LONG__) + return (h * 2654435769lu) >> (__SIZEOF_LONG__ * 8 - bits); +#else +# error "Unsupported size_t size" +#endif +} + +/* generic C-string hashing function */ +static inline size_t str_hash(const char *s) +{ + size_t h = 0; + + while (*s) { + h = h * 31 + *s; + s++; + } + return h; +} + +typedef size_t (*hashmap_hash_fn)(long key, void *ctx); +typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx); + +/* + * Hashmap interface is polymorphic, keys and values could be either + * long-sized integers or pointers, this is achieved as follows: + * - interface functions that operate on keys and values are hidden + * behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert; + * - these auxiliary macros cast the key and value parameters as + * long or long *, so the user does not have to specify the casts explicitly; + * - for pointer parameters (e.g. old_key) the size of the pointed + * type is verified by hashmap_cast_ptr using _Static_assert; + * - when iterating using hashmap__for_each_* forms + * hasmap_entry->key should be used for integer keys and + * hasmap_entry->pkey should be used for pointer keys, + * same goes for values. 
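+ *
+ * Illustrative usage sketch (editor's addition, not part of upstream libbpf;
+ * my_hash/my_equal are hypothetical callbacks, error handling abbreviated):
+ * a string-keyed map built on top of the long-sized interface via the
+ * casting macros:
+ *
+ *   static size_t my_hash(long key, void *ctx)
+ *   {
+ *       return str_hash((const char *)key);
+ *   }
+ *   static bool my_equal(long k1, long k2, void *ctx)
+ *   {
+ *       return strcmp((const char *)k1, (const char *)k2) == 0;
+ *   }
+ *   ...
+ *   struct hashmap *m = hashmap__new(my_hash, my_equal, NULL);
+ *   long val;
+ *
+ *   if (IS_ERR(m))
+ *       return PTR_ERR(m);            /* hashmap__new() reports errors via ERR_PTR() */
+ *   hashmap__add(m, "answer", 42);    /* key and value are cast to long by the macro */
+ *   if (hashmap__find(m, "answer", &val))
+ *       printf("%ld\n", val);         /* value comes back through a long pointer */
+ *   hashmap__free(m);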
+ */ +struct hashmap_entry { + union { + long key; + const void *pkey; + }; + union { + long value; + void *pvalue; + }; + struct hashmap_entry *next; +}; + +struct hashmap { + hashmap_hash_fn hash_fn; + hashmap_equal_fn equal_fn; + void *ctx; + + struct hashmap_entry **buckets; + size_t cap; + size_t cap_bits; + size_t sz; +}; + +void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, void *ctx); +struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, + hashmap_equal_fn equal_fn, + void *ctx); +void hashmap__clear(struct hashmap *map); +void hashmap__free(struct hashmap *map); + +size_t hashmap__size(const struct hashmap *map); +size_t hashmap__capacity(const struct hashmap *map); + +/* + * Hashmap insertion strategy: + * - HASHMAP_ADD - only add key/value if key doesn't exist yet; + * - HASHMAP_SET - add key/value pair if key doesn't exist yet; otherwise, + * update value; + * - HASHMAP_UPDATE - update value, if key already exists; otherwise, do + * nothing and return -ENOENT; + * - HASHMAP_APPEND - always add key/value pair, even if key already exists. + * This turns hashmap into a multimap by allowing multiple values to be + * associated with the same key. Most useful read API for such hashmap is + * hashmap__for_each_key_entry() iteration. If hashmap__find() is still + * used, it will return last inserted key/value entry (first in a bucket + * chain). + */ +enum hashmap_insert_strategy { + HASHMAP_ADD, + HASHMAP_SET, + HASHMAP_UPDATE, + HASHMAP_APPEND, +}; + +#define hashmap_cast_ptr(p) ({ \ + _Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) || \ + sizeof(*(p)) == sizeof(long), \ + #p " pointee should be a long-sized integer or a pointer"); \ + (long *)(p); \ +}) + +/* + * hashmap__insert() adds key/value entry w/ various semantics, depending on + * provided strategy value. If a given key/value pair replaced already + * existing key/value pair, both old key and old value will be returned + * through old_key and old_value to allow calling code do proper memory + * management. 
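+ *
+ * For example (editor's illustrative sketch, not upstream documentation;
+ * 'm', 'key' and 'new_val' are assumed to exist), a map that owns
+ * heap-allocated values can rely on HASHMAP_SET and free whatever entry it
+ * displaced:
+ *
+ *   char *old_val = NULL;
+ *   int err;
+ *
+ *   err = hashmap__set(m, key, strdup(new_val), NULL, &old_val);
+ *   if (err)
+ *       return err;
+ *   free(old_val);   /* NULL if the key was new, the displaced value otherwise */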
+ */ +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value); + +#define hashmap__insert(map, key, value, strategy, old_key, old_value) \ + hashmap_insert((map), (long)(key), (long)(value), (strategy), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +#define hashmap__add(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL) + +#define hashmap__set(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value)) + +#define hashmap__update(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value)) + +#define hashmap__append(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL) + +bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value); + +#define hashmap__delete(map, key, old_key, old_value) \ + hashmap_delete((map), (long)(key), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) + +bool hashmap_find(const struct hashmap *map, long key, long *value); + +#define hashmap__find(map, key, value) \ + hashmap_find((map), (long)(key), hashmap_cast_ptr(value)) + +/* + * hashmap__for_each_entry - iterate over all entries in hashmap + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry(map, cur, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; cur; cur = cur->next) + +/* + * hashmap__for_each_entry_safe - iterate over all entries in hashmap, safe + * against removals + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @tmp: struct hashmap_entry * used as a temporary next cursor storage + * @bkt: integer used as a bucket loop cursor + */ +#define hashmap__for_each_entry_safe(map, cur, tmp, bkt) \ + for (bkt = 0; bkt < map->cap; bkt++) \ + for (cur = map->buckets[bkt]; \ + cur && ({tmp = cur->next; true; }); \ + cur = tmp) + +/* + * hashmap__for_each_key_entry - iterate over entries associated with given key + * @map: hashmap to iterate + * @cur: struct hashmap_entry * used as a loop cursor + * @key: key to iterate entries for + */ +#define hashmap__for_each_key_entry(map, cur, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur; \ + cur = cur->next) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ + cur && ({ tmp = cur->next; true; }); \ + cur = tmp) \ + if (map->equal_fn(cur->key, (_key), map->ctx)) + +#endif /* __LIBBPF_HASHMAP_H */ diff --git a/third_party/libbpf/src/libbpf.c b/third_party/libbpf/src/libbpf.c new file mode 100644 index 0000000..63311a7 --- /dev/null +++ b/third_party/libbpf/src/libbpf.c @@ -0,0 +1,13129 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. + * Copyright (C) 2019 Isovalent, Inc. 
+ */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#include "hashmap.h" +#include "bpf_gen_internal.h" +#include "zip.h" + +#ifndef BPF_FS_MAGIC +#define BPF_FS_MAGIC 0xcafe4a11 +#endif + +#define BPF_INSN_SZ (sizeof(struct bpf_insn)) + +/* vsprintf() in __base_pr() uses nonliteral format string. It may break + * compilation if user enables corresponding warning. Disable it explicitly. + */ +#pragma GCC diagnostic ignored "-Wformat-nonliteral" + +#define __printf(a, b) __attribute__((format(printf, a, b))) + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); + +static const char * const attach_type_name[] = { + [BPF_CGROUP_INET_INGRESS] = "cgroup_inet_ingress", + [BPF_CGROUP_INET_EGRESS] = "cgroup_inet_egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "cgroup_inet_sock_create", + [BPF_CGROUP_INET_SOCK_RELEASE] = "cgroup_inet_sock_release", + [BPF_CGROUP_SOCK_OPS] = "cgroup_sock_ops", + [BPF_CGROUP_DEVICE] = "cgroup_device", + [BPF_CGROUP_INET4_BIND] = "cgroup_inet4_bind", + [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", + [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", + [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", + [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", + [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", + [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", + [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", + [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", + [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "trace_raw_tp", + [BPF_TRACE_FENTRY] = "trace_fentry", + [BPF_TRACE_FEXIT] = "trace_fexit", + [BPF_MODIFY_RETURN] = "modify_return", + [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", + [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", + [BPF_PERF_EVENT] = "perf_event", + [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", + [BPF_STRUCT_OPS] = "struct_ops", + [BPF_NETFILTER] = "netfilter", +}; + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] 
= "cgroup", + [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_LINK_TYPE_NETFILTER] = "netfilter", +}; + +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", + [BPF_MAP_TYPE_QUEUE] = "queue", + [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", + [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", + [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", +}; + +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", + [BPF_PROG_TYPE_SK_MSG] = "sk_msg", + [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", + [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", + [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", + [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", + [BPF_PROG_TYPE_NETFILTER] = "netfilter", +}; + +static int __base_pr(enum libbpf_print_level level, const char *format, + va_list args) +{ + if (level == LIBBPF_DEBUG) + return 0; + + return vfprintf(stderr, format, args); +} + +static libbpf_print_fn_t 
__libbpf_pr = __base_pr; + +libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn) +{ + libbpf_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__libbpf_pr, fn, __ATOMIC_RELAXED); + + return old_print_fn; +} + +__printf(2, 3) +void libbpf_print(enum libbpf_print_level level, const char *format, ...) +{ + va_list args; + int old_errno; + libbpf_print_fn_t print_fn; + + print_fn = __atomic_load_n(&__libbpf_pr, __ATOMIC_RELAXED); + if (!print_fn) + return; + + old_errno = errno; + + va_start(args, format); + __libbpf_pr(level, format, args); + va_end(args); + + errno = old_errno; +} + +static void pr_perm_msg(int err) +{ + struct rlimit limit; + char buf[100]; + + if (err != -EPERM || geteuid() != 0) + return; + + err = getrlimit(RLIMIT_MEMLOCK, &limit); + if (err) + return; + + if (limit.rlim_cur == RLIM_INFINITY) + return; + + if (limit.rlim_cur < 1024) + snprintf(buf, sizeof(buf), "%zu bytes", (size_t)limit.rlim_cur); + else if (limit.rlim_cur < 1024*1024) + snprintf(buf, sizeof(buf), "%.1f KiB", (double)limit.rlim_cur / 1024); + else + snprintf(buf, sizeof(buf), "%.1f MiB", (double)limit.rlim_cur / (1024*1024)); + + pr_warn("permission error while running as root; try raising 'ulimit -l'? current value: %s\n", + buf); +} + +#define STRERR_BUFSIZE 128 + +/* Copied from tools/perf/util/util.h */ +#ifndef zfree +# define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) +#endif + +#ifndef zclose +# define zclose(fd) ({ \ + int ___err = 0; \ + if ((fd) >= 0) \ + ___err = close((fd)); \ + fd = -1; \ + ___err; }) +#endif + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int libbpf_set_strict_mode(enum libbpf_strict_mode mode) +{ + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ + return 0; +} + +__u32 libbpf_major_version(void) +{ + return LIBBPF_MAJOR_VERSION; +} + +__u32 libbpf_minor_version(void) +{ + return LIBBPF_MINOR_VERSION; +} + +const char *libbpf_version_string(void) +{ +#define __S(X) #X +#define _S(X) __S(X) + return "v" _S(LIBBPF_MAJOR_VERSION) "." _S(LIBBPF_MINOR_VERSION); +#undef _S +#undef __S +} + +enum reloc_type { + RELO_LD64, + RELO_CALL, + RELO_DATA, + RELO_EXTERN_LD64, + RELO_EXTERN_CALL, + RELO_SUBPROG_ADDR, + RELO_CORE, +}; + +struct reloc_desc { + enum reloc_type type; + int insn_idx; + union { + const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ + struct { + int map_idx; + int sym_off; + int ext_idx; + }; + }; +}; + +/* stored as sec_def->cookie for all libbpf-supported SEC()s */ +enum sec_def_flags { + SEC_NONE = 0, + /* expected_attach_type is optional, if kernel doesn't support that */ + SEC_EXP_ATTACH_OPT = 1, + /* legacy, only used by libbpf_get_type_names() and + * libbpf_attach_type_by_name(), not used by libbpf itself at all. + * This used to be associated with cgroup (and few other) BPF programs + * that were attachable through BPF_PROG_ATTACH command. Pretty + * meaningless nowadays, though. 
+ */ + SEC_ATTACHABLE = 2, + SEC_ATTACHABLE_OPT = SEC_ATTACHABLE | SEC_EXP_ATTACH_OPT, + /* attachment target is specified through BTF ID in either kernel or + * other BPF program's BTF object + */ + SEC_ATTACH_BTF = 4, + /* BPF program type allows sleeping/blocking in kernel */ + SEC_SLEEPABLE = 8, + /* BPF program support non-linear XDP buffer */ + SEC_XDP_FRAGS = 16, +}; + +struct bpf_sec_def { + char *sec; + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + long cookie; + int handler_id; + + libbpf_prog_setup_fn_t prog_setup_fn; + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + libbpf_prog_attach_fn_t prog_attach_fn; +}; + +/* + * bpf_prog should be a better name but it has been used in + * linux/filter.h. + */ +struct bpf_program { + char *name; + char *sec_name; + size_t sec_idx; + const struct bpf_sec_def *sec_def; + /* this program's instruction offset (in number of instructions) + * within its containing ELF section + */ + size_t sec_insn_off; + /* number of original instructions in ELF section belonging to this + * program, not taking into account subprogram instructions possible + * appended later during relocation + */ + size_t sec_insn_cnt; + /* Offset (in number of instructions) of the start of instruction + * belonging to this BPF program within its containing main BPF + * program. For the entry-point (main) BPF program, this is always + * zero. For a sub-program, this gets reset before each of main BPF + * programs are processed and relocated and is used to determined + * whether sub-program was already appended to the main program, and + * if yes, at which instruction offset. + */ + size_t sub_insn_off; + + /* instructions that belong to BPF program; insns[0] is located at + * sec_insn_off instruction within its ELF section in ELF file, so + * when mapping ELF file instruction index to the local instruction, + * one needs to subtract sec_insn_off; and vice versa. + */ + struct bpf_insn *insns; + /* actual number of instruction in this BPF program's image; for + * entry-point BPF programs this includes the size of main program + * itself plus all the used sub-programs, appended at the end + */ + size_t insns_cnt; + + struct reloc_desc *reloc_desc; + int nr_reloc; + + /* BPF verifier log settings */ + char *log_buf; + size_t log_size; + __u32 log_level; + + struct bpf_object *obj; + + int fd; + bool autoload; + bool autoattach; + bool mark_btf_static; + enum bpf_prog_type type; + enum bpf_attach_type expected_attach_type; + + int prog_ifindex; + __u32 attach_btf_obj_fd; + __u32 attach_btf_id; + __u32 attach_prog_fd; + + void *func_info; + __u32 func_info_rec_size; + __u32 func_info_cnt; + + void *line_info; + __u32 line_info_rec_size; + __u32 line_info_cnt; + __u32 prog_flags; +}; + +struct bpf_struct_ops { + const char *tname; + const struct btf_type *type; + struct bpf_program **progs; + __u32 *kern_func_off; + /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */ + void *data; + /* e.g. struct bpf_struct_ops_tcp_congestion_ops in + * btf_vmlinux's format. + * struct bpf_struct_ops_tcp_congestion_ops { + * [... some other kernel fields ...] + * struct tcp_congestion_ops data; + * } + * kern_vdata-size == sizeof(struct bpf_struct_ops_tcp_congestion_ops) + * bpf_map__init_kern_struct_ops() will populate the "kern_vdata" + * from "data". 
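+ *
+ * As a concrete illustration (editor's addition; the program and variable
+ * names below are made up, loosely modeled on the BPF selftests), "data"
+ * typically originates from a user declaration like:
+ *
+ *   SEC(".struct_ops")
+ *   struct tcp_congestion_ops dctcp = {
+ *       .init     = (void *)dctcp_init,      /* pointers to BPF programs */
+ *       .ssthresh = (void *)dctcp_ssthresh,
+ *       .name     = "bpf_dctcp",
+ *   };
+ *
+ * libbpf copies 'dctcp' into "data" while parsing the .struct_ops DATASEC,
+ * and bpf_map__init_kern_struct_ops() later rewrites it into "kern_vdata"
+ * following the kernel's struct bpf_struct_ops_tcp_congestion_ops layout.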
+ */ + void *kern_vdata; + __u32 type_id; +}; + +#define DATA_SEC ".data" +#define BSS_SEC ".bss" +#define RODATA_SEC ".rodata" +#define KCONFIG_SEC ".kconfig" +#define KSYMS_SEC ".ksyms" +#define STRUCT_OPS_SEC ".struct_ops" +#define STRUCT_OPS_LINK_SEC ".struct_ops.link" + +enum libbpf_map_type { + LIBBPF_MAP_UNSPEC, + LIBBPF_MAP_DATA, + LIBBPF_MAP_BSS, + LIBBPF_MAP_RODATA, + LIBBPF_MAP_KCONFIG, +}; + +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + +struct bpf_map { + struct bpf_object *obj; + char *name; + /* real_name is defined for special internal maps (.rodata*, + * .data*, .bss, .kconfig) and preserves their original ELF section + * name. This is important to be able to find corresponding BTF + * DATASEC information. + */ + char *real_name; + int fd; + int sec_idx; + size_t sec_offset; + int map_ifindex; + int inner_map_fd; + struct bpf_map_def def; + __u32 numa_node; + __u32 btf_var_idx; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_value_type_id; + enum libbpf_map_type libbpf_type; + void *mmaped; + struct bpf_struct_ops *st_ops; + struct bpf_map *inner_map; + void **init_slots; + int init_slots_sz; + char *pin_path; + bool pinned; + bool reused; + bool autocreate; + __u64 map_extra; +}; + +enum extern_type { + EXT_UNKNOWN, + EXT_KCFG, + EXT_KSYM, +}; + +enum kcfg_type { + KCFG_UNKNOWN, + KCFG_CHAR, + KCFG_BOOL, + KCFG_INT, + KCFG_TRISTATE, + KCFG_CHAR_ARR, +}; + +struct extern_desc { + enum extern_type type; + int sym_idx; + int btf_id; + int sec_btf_id; + const char *name; + bool is_set; + bool is_weak; + union { + struct { + enum kcfg_type type; + int sz; + int align; + int data_off; + bool is_signed; + } kcfg; + struct { + unsigned long long addr; + + /* target btf_id of the corresponding kernel var. */ + int kernel_btf_obj_fd; + int kernel_btf_id; + + /* local btf_id of the ksym extern's type. */ + __u32 type_id; + /* BTF fd index to be patched in for insn->off, this is + * 0 for vmlinux BTF, index in obj->fd_array for module + * BTF + */ + __s16 btf_fd_idx; + } ksym; + }; +}; + +struct module_btf { + struct btf *btf; + char *name; + __u32 id; + int fd; + int fd_array_idx; +}; + +enum sec_type { + SEC_UNUSED = 0, + SEC_RELO, + SEC_BSS, + SEC_DATA, + SEC_RODATA, +}; + +struct elf_sec_desc { + enum sec_type sec_type; + Elf64_Shdr *shdr; + Elf_Data *data; +}; + +struct elf_state { + int fd; + const void *obj_buf; + size_t obj_buf_sz; + Elf *elf; + Elf64_Ehdr *ehdr; + Elf_Data *symbols; + Elf_Data *st_ops_data; + Elf_Data *st_ops_link_data; + size_t shstrndx; /* section index for section name strings */ + size_t strtabidx; + struct elf_sec_desc *secs; + size_t sec_cnt; + int btf_maps_shndx; + __u32 btf_maps_sec_btf_id; + int text_shndx; + int symbols_shndx; + int st_ops_shndx; + int st_ops_link_shndx; +}; + +struct usdt_manager; + +struct bpf_object { + char name[BPF_OBJ_NAME_LEN]; + char license[64]; + __u32 kern_version; + + struct bpf_program *programs; + size_t nr_programs; + struct bpf_map *maps; + size_t nr_maps; + size_t maps_cap; + + char *kconfig; + struct extern_desc *externs; + int nr_extern; + int kconfig_map_idx; + + bool loaded; + bool has_subcalls; + bool has_rodata; + + struct bpf_gen *gen_loader; + + /* Information when doing ELF related work. 
Only valid if efile.elf is not NULL */ + struct elf_state efile; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* Parse and load BTF vmlinux if any of the programs in the object need + * it at load time. + */ + struct btf *btf_vmlinux; + /* Path to the custom BTF to be used for BPF CO-RE relocations as an + * override for vmlinux BTF. + */ + char *btf_custom_path; + /* vmlinux BTF override for CO-RE relocations */ + struct btf *btf_vmlinux_override; + /* Lazily initialized kernel module BTFs */ + struct module_btf *btf_modules; + bool btf_modules_loaded; + size_t btf_module_cnt; + size_t btf_module_cap; + + /* optional log settings passed to BPF_BTF_LOAD and BPF_PROG_LOAD commands */ + char *log_buf; + size_t log_size; + __u32 log_level; + + int *fd_array; + size_t fd_array_cap; + size_t fd_array_cnt; + + struct usdt_manager *usdt_man; + + char path[]; +}; + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off); +static const char *elf_sec_str(const struct bpf_object *obj, size_t off); +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx); +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name); +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn); +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn); +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn); +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx); +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); + +void bpf_program__unload(struct bpf_program *prog) +{ + if (!prog) + return; + + zclose(prog->fd); + + zfree(&prog->func_info); + zfree(&prog->line_info); +} + +static void bpf_program__exit(struct bpf_program *prog) +{ + if (!prog) + return; + + bpf_program__unload(prog); + zfree(&prog->name); + zfree(&prog->sec_name); + zfree(&prog->insns); + zfree(&prog->reloc_desc); + + prog->nr_reloc = 0; + prog->insns_cnt = 0; + prog->sec_idx = -1; +} + +static bool insn_is_subprog_call(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == BPF_PSEUDO_CALL && + insn->dst_reg == 0 && + insn->off == 0; +} + +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + +static bool insn_is_pseudo_func(struct bpf_insn *insn) +{ + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; +} + +static int +bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, + const char *name, size_t sec_idx, const char *sec_name, + size_t sec_off, void *insn_data, size_t insn_data_sz) +{ + if (insn_data_sz == 0 || insn_data_sz % BPF_INSN_SZ || sec_off % BPF_INSN_SZ) { + pr_warn("sec '%s': corrupted program '%s', offset %zu, size %zu\n", + sec_name, name, sec_off, insn_data_sz); + return -EINVAL; + } + + memset(prog, 0, sizeof(*prog)); + prog->obj = obj; + + prog->sec_idx = sec_idx; + prog->sec_insn_off = sec_off / BPF_INSN_SZ; + prog->sec_insn_cnt = insn_data_sz / BPF_INSN_SZ; + /* insns_cnt can later be increased by appending used subprograms */ + prog->insns_cnt = prog->sec_insn_cnt; + + prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; + + /* libbpf's convention for SEC("?abc...") is that it's just like + * SEC("abc...") but the corresponding bpf_program starts out with + * autoload set to false. + */ + if (sec_name[0] == '?') { + prog->autoload = false; + /* from now on forget there was ? 
in section name */ + sec_name++; + } else { + prog->autoload = true; + } + + prog->autoattach = true; + + /* inherit object's log_level */ + prog->log_level = obj->log_level; + + prog->sec_name = strdup(sec_name); + if (!prog->sec_name) + goto errout; + + prog->name = strdup(name); + if (!prog->name) + goto errout; + + prog->insns = malloc(insn_data_sz); + if (!prog->insns) + goto errout; + memcpy(prog->insns, insn_data, insn_data_sz); + + return 0; +errout: + pr_warn("sec '%s': failed to allocate memory for prog '%s'\n", sec_name, name); + bpf_program__exit(prog); + return -ENOMEM; +} + +static int +bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, + const char *sec_name, int sec_idx) +{ + Elf_Data *symbols = obj->efile.symbols; + struct bpf_program *prog, *progs; + void *data = sec_data->d_buf; + size_t sec_sz = sec_data->d_size, sec_off, prog_sz, nr_syms; + int nr_progs, err, i; + const char *name; + Elf64_Sym *sym; + + progs = obj->programs; + nr_progs = obj->nr_programs; + nr_syms = symbols->d_size / sizeof(Elf64_Sym); + + for (i = 0; i < nr_syms; i++) { + sym = elf_sym_by_idx(obj, i); + + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) + continue; + + prog_sz = sym->st_size; + sec_off = sym->st_value; + + name = elf_sym_str(obj, sym->st_name); + if (!name) { + pr_warn("sec '%s': failed to get symbol name for offset %zu\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_off + prog_sz > sec_sz) { + pr_warn("sec '%s': program at offset %zu crosses section boundary\n", + sec_name, sec_off); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sec_idx != obj->efile.text_shndx && ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { + pr_warn("sec '%s': program '%s' is static and not supported\n", sec_name, name); + return -ENOTSUP; + } + + pr_debug("sec '%s': found program '%s' at insn offset %zu (%zu bytes), code size %zu insns (%zu bytes)\n", + sec_name, name, sec_off / BPF_INSN_SZ, sec_off, prog_sz / BPF_INSN_SZ, prog_sz); + + progs = libbpf_reallocarray(progs, nr_progs + 1, sizeof(*progs)); + if (!progs) { + /* + * In this case the original obj->programs + * is still valid, so don't need special treat for + * bpf_close_object(). 
+ */ + pr_warn("sec '%s': failed to alloc memory for new program '%s'\n", + sec_name, name); + return -ENOMEM; + } + obj->programs = progs; + + prog = &progs[nr_progs]; + + err = bpf_object__init_prog(obj, prog, name, sec_idx, sec_name, + sec_off, data + sec_off, prog_sz); + if (err) + return err; + + /* if function is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF FUNC + * as static to enable more permissive BPF verification mode + * with more outside context available to BPF verifier + */ + if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL + && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) + prog->mark_btf_static = true; + + nr_progs++; + obj->nr_programs = nr_progs; + } + + return 0; +} + +static const struct btf_member * +find_member_by_offset(const struct btf_type *t, __u32 bit_offset) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (btf_member_bit_offset(t, i) == bit_offset) + return m; + } + + return NULL; +} + +static const struct btf_member * +find_member_by_name(const struct btf *btf, const struct btf_type *t, + const char *name) +{ + struct btf_member *m; + int i; + + for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) { + if (!strcmp(btf__name_by_offset(btf, m->name_off), name)) + return m; + } + + return NULL; +} + +#define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind); + +static int +find_struct_ops_kern_types(const struct btf *btf, const char *tname, + const struct btf_type **type, __u32 *type_id, + const struct btf_type **vtype, __u32 *vtype_id, + const struct btf_member **data_member) +{ + const struct btf_type *kern_type, *kern_vtype; + const struct btf_member *kern_data_member; + __s32 kern_vtype_id, kern_type_id; + __u32 i; + + kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + if (kern_type_id < 0) { + pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", + tname); + return kern_type_id; + } + kern_type = btf__type_by_id(btf, kern_type_id); + + /* Find the corresponding "map_value" type that will be used + * in map_update(BPF_MAP_TYPE_STRUCT_OPS). For example, + * find "struct bpf_struct_ops_tcp_congestion_ops" from the + * btf_vmlinux. + */ + kern_vtype_id = find_btf_by_prefix_kind(btf, STRUCT_OPS_VALUE_PREFIX, + tname, BTF_KIND_STRUCT); + if (kern_vtype_id < 0) { + pr_warn("struct_ops init_kern: struct %s%s is not found in kernel BTF\n", + STRUCT_OPS_VALUE_PREFIX, tname); + return kern_vtype_id; + } + kern_vtype = btf__type_by_id(btf, kern_vtype_id); + + /* Find "struct tcp_congestion_ops" from + * struct bpf_struct_ops_tcp_congestion_ops { + * [ ... 
] + * struct tcp_congestion_ops data; + * } + */ + kern_data_member = btf_members(kern_vtype); + for (i = 0; i < btf_vlen(kern_vtype); i++, kern_data_member++) { + if (kern_data_member->type == kern_type_id) + break; + } + if (i == btf_vlen(kern_vtype)) { + pr_warn("struct_ops init_kern: struct %s data is not found in struct %s%s\n", + tname, STRUCT_OPS_VALUE_PREFIX, tname); + return -EINVAL; + } + + *type = kern_type; + *type_id = kern_type_id; + *vtype = kern_vtype; + *vtype_id = kern_vtype_id; + *data_member = kern_data_member; + + return 0; +} + +static bool bpf_map__is_struct_ops(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_STRUCT_OPS; +} + +/* Init the map's fields that depend on kern_btf */ +static int bpf_map__init_kern_struct_ops(struct bpf_map *map, + const struct btf *btf, + const struct btf *kern_btf) +{ + const struct btf_member *member, *kern_member, *kern_data_member; + const struct btf_type *type, *kern_type, *kern_vtype; + __u32 i, kern_type_id, kern_vtype_id, kern_data_off; + struct bpf_struct_ops *st_ops; + void *data, *kern_data; + const char *tname; + int err; + + st_ops = map->st_ops; + type = st_ops->type; + tname = st_ops->tname; + err = find_struct_ops_kern_types(kern_btf, tname, + &kern_type, &kern_type_id, + &kern_vtype, &kern_vtype_id, + &kern_data_member); + if (err) + return err; + + pr_debug("struct_ops init_kern %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n", + map->name, st_ops->type_id, kern_type_id, kern_vtype_id); + + map->def.value_size = kern_vtype->size; + map->btf_vmlinux_value_type_id = kern_vtype_id; + + st_ops->kern_vdata = calloc(1, kern_vtype->size); + if (!st_ops->kern_vdata) + return -ENOMEM; + + data = st_ops->data; + kern_data_off = kern_data_member->offset / 8; + kern_data = st_ops->kern_vdata + kern_data_off; + + member = btf_members(type); + for (i = 0; i < btf_vlen(type); i++, member++) { + const struct btf_type *mtype, *kern_mtype; + __u32 mtype_id, kern_mtype_id; + void *mdata, *kern_mdata; + __s64 msize, kern_msize; + __u32 moff, kern_moff; + __u32 kern_member_idx; + const char *mname; + + mname = btf__name_by_offset(btf, member->name_off); + kern_member = find_member_by_name(kern_btf, kern_type, mname); + if (!kern_member) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + kern_member_idx = kern_member - btf_members(kern_type); + if (btf_member_bitfield_size(type, i) || + btf_member_bitfield_size(kern_type, kern_member_idx)) { + pr_warn("struct_ops init_kern %s: bitfield %s is not supported\n", + map->name, mname); + return -ENOTSUP; + } + + moff = member->offset / 8; + kern_moff = kern_member->offset / 8; + + mdata = data + moff; + kern_mdata = kern_data + kern_moff; + + mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_member->type, + &kern_mtype_id); + if (BTF_INFO_KIND(mtype->info) != + BTF_INFO_KIND(kern_mtype->info)) { + pr_warn("struct_ops init_kern %s: Unmatched member type %s %u != %u(kernel)\n", + map->name, mname, BTF_INFO_KIND(mtype->info), + BTF_INFO_KIND(kern_mtype->info)); + return -ENOTSUP; + } + + if (btf_is_ptr(mtype)) { + struct bpf_program *prog; + + prog = st_ops->progs[i]; + if (!prog) + continue; + + kern_mtype = skip_mods_and_typedefs(kern_btf, + kern_mtype->type, + &kern_mtype_id); + + /* mtype->type must be a func_proto which was + * guaranteed in bpf_object__collect_st_ops_relos(), + * so only check kern_mtype for func_proto here. 
+ */ + if (!btf_is_func_proto(kern_mtype)) { + pr_warn("struct_ops init_kern %s: kernel member %s is not a func ptr\n", + map->name, mname); + return -ENOTSUP; + } + + prog->attach_btf_id = kern_type_id; + prog->expected_attach_type = kern_member_idx; + + st_ops->kern_func_off[i] = kern_data_off + kern_moff; + + pr_debug("struct_ops init_kern %s: func ptr %s is set to prog %s from data(+%u) to kern_data(+%u)\n", + map->name, mname, prog->name, moff, + kern_moff); + + continue; + } + + msize = btf__resolve_size(btf, mtype_id); + kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); + if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", + map->name, mname, (ssize_t)msize, + (ssize_t)kern_msize); + return -ENOTSUP; + } + + pr_debug("struct_ops init_kern %s: copy %s %u bytes from data(+%u) to kern_data(+%u)\n", + map->name, mname, (unsigned int)msize, + moff, kern_moff); + memcpy(kern_mdata, mdata, msize); + } + + return 0; +} + +static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + size_t i; + int err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!bpf_map__is_struct_ops(map)) + continue; + + err = bpf_map__init_kern_struct_ops(map, obj->btf, + obj->btf_vmlinux); + if (err) + return err; + } + + return 0; +} + +static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, + int shndx, Elf_Data *data, __u32 map_flags) +{ + const struct btf_type *type, *datasec; + const struct btf_var_secinfo *vsi; + struct bpf_struct_ops *st_ops; + const char *tname, *var_name; + __s32 type_id, datasec_id; + const struct btf *btf; + struct bpf_map *map; + __u32 i; + + if (shndx == -1) + return 0; + + btf = obj->btf; + datasec_id = btf__find_by_name_kind(btf, sec_name, + BTF_KIND_DATASEC); + if (datasec_id < 0) { + pr_warn("struct_ops init: DATASEC %s not found\n", + sec_name); + return -EINVAL; + } + + datasec = btf__type_by_id(btf, datasec_id); + vsi = btf_var_secinfos(datasec); + for (i = 0; i < btf_vlen(datasec); i++, vsi++) { + type = btf__type_by_id(obj->btf, vsi->type); + var_name = btf__name_by_offset(obj->btf, type->name_off); + + type_id = btf__resolve_type(obj->btf, vsi->type); + if (type_id < 0) { + pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", + vsi->type, sec_name); + return -EINVAL; + } + + type = btf__type_by_id(obj->btf, type_id); + tname = btf__name_by_offset(obj->btf, type->name_off); + if (!tname[0]) { + pr_warn("struct_ops init: anonymous type is not supported\n"); + return -ENOTSUP; + } + if (!btf_is_struct(type)) { + pr_warn("struct_ops init: %s is not a struct\n", tname); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->sec_idx = shndx; + map->sec_offset = vsi->offset; + map->name = strdup(var_name); + if (!map->name) + return -ENOMEM; + + map->def.type = BPF_MAP_TYPE_STRUCT_OPS; + map->def.key_size = sizeof(int); + map->def.value_size = type->size; + map->def.max_entries = 1; + map->def.map_flags = map_flags; + + map->st_ops = calloc(1, sizeof(*map->st_ops)); + if (!map->st_ops) + return -ENOMEM; + st_ops = map->st_ops; + st_ops->data = malloc(type->size); + st_ops->progs = calloc(btf_vlen(type), sizeof(*st_ops->progs)); + st_ops->kern_func_off = malloc(btf_vlen(type) * + sizeof(*st_ops->kern_func_off)); + if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) + return -ENOMEM; + + if (vsi->offset + type->size > 
data->d_size) { + pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", + var_name, sec_name); + return -EINVAL; + } + + memcpy(st_ops->data, + data->d_buf + vsi->offset, + type->size); + st_ops->tname = tname; + st_ops->type = type; + st_ops->type_id = type_id; + + pr_debug("struct_ops init: struct %s(type_id=%u) %s found at offset %u\n", + tname, type_id, var_name, vsi->offset); + } + + return 0; +} + +static int bpf_object_init_struct_ops(struct bpf_object *obj) +{ + int err; + + err = init_struct_ops_maps(obj, STRUCT_OPS_SEC, obj->efile.st_ops_shndx, + obj->efile.st_ops_data, 0); + err = err ?: init_struct_ops_maps(obj, STRUCT_OPS_LINK_SEC, + obj->efile.st_ops_link_shndx, + obj->efile.st_ops_link_data, + BPF_F_LINK); + return err; +} + +static struct bpf_object *bpf_object__new(const char *path, + const void *obj_buf, + size_t obj_buf_sz, + const char *obj_name) +{ + struct bpf_object *obj; + char *end; + + obj = calloc(1, sizeof(struct bpf_object) + strlen(path) + 1); + if (!obj) { + pr_warn("alloc memory failed for %s\n", path); + return ERR_PTR(-ENOMEM); + } + + strcpy(obj->path, path); + if (obj_name) { + libbpf_strlcpy(obj->name, obj_name, sizeof(obj->name)); + } else { + /* Using basename() GNU version which doesn't modify arg. */ + libbpf_strlcpy(obj->name, basename((void *)path), sizeof(obj->name)); + end = strchr(obj->name, '.'); + if (end) + *end = 0; + } + + obj->efile.fd = -1; + /* + * Caller of this function should also call + * bpf_object__elf_finish() after data collection to return + * obj_buf to user. If not, we should duplicate the buffer to + * avoid user freeing them before elf finish. + */ + obj->efile.obj_buf = obj_buf; + obj->efile.obj_buf_sz = obj_buf_sz; + obj->efile.btf_maps_shndx = -1; + obj->efile.st_ops_shndx = -1; + obj->efile.st_ops_link_shndx = -1; + obj->kconfig_map_idx = -1; + + obj->kern_version = get_kernel_version(); + obj->loaded = false; + + return obj; +} + +static void bpf_object__elf_finish(struct bpf_object *obj) +{ + if (!obj->efile.elf) + return; + + elf_end(obj->efile.elf); + obj->efile.elf = NULL; + obj->efile.symbols = NULL; + obj->efile.st_ops_data = NULL; + obj->efile.st_ops_link_data = NULL; + + zfree(&obj->efile.secs); + obj->efile.sec_cnt = 0; + zclose(obj->efile.fd); + obj->efile.obj_buf = NULL; + obj->efile.obj_buf_sz = 0; +} + +static int bpf_object__elf_init(struct bpf_object *obj) +{ + Elf64_Ehdr *ehdr; + int err = 0; + Elf *elf; + + if (obj->efile.elf) { + pr_warn("elf: init internal error\n"); + return -LIBBPF_ERRNO__LIBELF; + } + + if (obj->efile.obj_buf_sz > 0) { + /* obj_buf should have been validated by bpf_object__open_mem(). 
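+ *
+ * For reference (editor's illustrative sketch; 'buf', 'buf_sz' and 'err' are
+ * placeholders, error handling abbreviated), the in-memory path that ends up
+ * here looks roughly like:
+ *
+ *   struct bpf_object *obj = bpf_object__open_mem(buf, buf_sz, NULL);
+ *
+ *   if (!obj)                 /* libbpf 1.x convention: NULL on error, errno set */
+ *       return -errno;
+ *   err = bpf_object__load(obj);
+ *
+ * where 'buf' holds the raw BPF ELF bytes and must stay valid while the ELF
+ * is parsed, since only the pointer is stored in efile.obj_buf above.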
*/ + elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); + } else { + obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); + if (obj->efile.fd < 0) { + char errmsg[STRERR_BUFSIZE], *cp; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("elf: failed to open %s: %s\n", obj->path, cp); + return err; + } + + elf = elf_begin(obj->efile.fd, ELF_C_READ_MMAP, NULL); + } + + if (!elf) { + pr_warn("elf: failed to open %s as ELF file: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__LIBELF; + goto errout; + } + + obj->efile.elf = elf; + + if (elf_kind(elf) != ELF_K_ELF) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a proper ELF object\n", obj->path); + goto errout; + } + + if (gelf_getclass(elf) != ELFCLASS64) { + err = -LIBBPF_ERRNO__FORMAT; + pr_warn("elf: '%s' is not a 64-bit ELF object\n", obj->path); + goto errout; + } + + obj->efile.ehdr = ehdr = elf64_getehdr(elf); + if (!obj->efile.ehdr) { + pr_warn("elf: failed to get ELF header from %s: %s\n", obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + if (elf_getshdrstrndx(elf, &obj->efile.shstrndx)) { + pr_warn("elf: failed to get section names section index for %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* ELF is corrupted/truncated, avoid calling elf_strptr. */ + if (!elf_rawdata(elf_getscn(elf, obj->efile.shstrndx), NULL)) { + pr_warn("elf: failed to get section names strings from %s: %s\n", + obj->path, elf_errmsg(-1)); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + /* Old LLVM set e_machine to EM_NONE */ + if (ehdr->e_type != ET_REL || (ehdr->e_machine && ehdr->e_machine != EM_BPF)) { + pr_warn("elf: %s is not a valid eBPF object file\n", obj->path); + err = -LIBBPF_ERRNO__FORMAT; + goto errout; + } + + return 0; +errout: + bpf_object__elf_finish(obj); + return err; +} + +static int bpf_object__check_endianness(struct bpf_object *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2LSB) + return 0; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + if (obj->efile.ehdr->e_ident[EI_DATA] == ELFDATA2MSB) + return 0; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + pr_warn("elf: endianness mismatch in %s.\n", obj->path); + return -LIBBPF_ERRNO__ENDIAN; +} + +static int +bpf_object__init_license(struct bpf_object *obj, void *data, size_t size) +{ + if (!data) { + pr_warn("invalid license section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + /* libbpf_strlcpy() only copies first N - 1 bytes, so size + 1 won't + * go over allowed ELF data section buffer + */ + libbpf_strlcpy(obj->license, data, min(size + 1, sizeof(obj->license))); + pr_debug("license of %s is %s\n", obj->path, obj->license); + return 0; +} + +static int +bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) +{ + __u32 kver; + + if (!data || size != sizeof(kver)) { + pr_warn("invalid kver section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + memcpy(&kver, data, sizeof(kver)); + obj->kern_version = kver; + pr_debug("kernel version of %s is %x\n", obj->path, obj->kern_version); + return 0; +} + +static bool bpf_map_type__is_map_in_map(enum bpf_map_type type) +{ + if (type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + type == BPF_MAP_TYPE_HASH_OF_MAPS) + return true; + return false; +} + +static int find_elf_sec_sz(const struct bpf_object *obj, const char *name, __u32 *size) +{ + Elf_Data *data; + Elf_Scn *scn; + + if (!name) + 
return -EINVAL; + + scn = elf_sec_by_name(obj, name); + data = elf_sec_data(obj, scn); + if (data) { + *size = data->d_size; + return 0; /* found it */ + } + + return -ENOENT; +} + +static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *name) +{ + Elf_Data *symbols = obj->efile.symbols; + const char *sname; + size_t si; + + for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, si); + + if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT) + continue; + + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && + ELF64_ST_BIND(sym->st_info) != STB_WEAK) + continue; + + sname = elf_sym_str(obj, sym->st_name); + if (!sname) { + pr_warn("failed to get sym name string for var %s\n", name); + return ERR_PTR(-EIO); + } + if (strcmp(name, sname) == 0) + return sym; + } + + return ERR_PTR(-ENOENT); +} + +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int err; + + err = libbpf_ensure_mem((void **)&obj->maps, &obj->maps_cap, + sizeof(*obj->maps), obj->nr_maps + 1); + if (err) + return ERR_PTR(err); + + map = &obj->maps[obj->nr_maps++]; + map->obj = obj; + map->fd = -1; + map->inner_map_fd = -1; + map->autocreate = true; + + return map; +} + +static size_t bpf_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + size_t map_sz; + + map_sz = (size_t)roundup(value_sz, 8) * max_entries; + map_sz = roundup(map_sz, page_sz); + return map_sz; +} + +static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) +{ + void *mmaped; + + if (!map->mmaped) + return -EINVAL; + + if (old_sz == new_sz) + return 0; + + mmaped = mmap(NULL, new_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmaped == MAP_FAILED) + return -errno; + + memcpy(mmaped, map->mmaped, min(old_sz, new_sz)); + munmap(map->mmaped, old_sz); + map->mmaped = mmaped; + return 0; +} + +static char *internal_map_name(struct bpf_object *obj, const char *real_name) +{ + char map_name[BPF_OBJ_NAME_LEN], *p; + int pfx_len, sfx_len = max((size_t)7, strlen(real_name)); + + /* This is one of the more confusing parts of libbpf for various + * reasons, some of which are historical. The original idea for naming + * internal names was to include as much of BPF object name prefix as + * possible, so that it can be distinguished from similar internal + * maps of a different BPF object. + * As an example, let's say we have bpf_object named 'my_object_name' + * and internal map corresponding to '.rodata' ELF section. The final + * map name advertised to user and to the kernel will be + * 'my_objec.rodata', taking first 8 characters of object name and + * entire 7 characters of '.rodata'. + * Somewhat confusingly, if internal map ELF section name is shorter + * than 7 characters, e.g., '.bss', we still reserve 7 characters + * for the suffix, even though we only have 4 actual characters, and + * resulting map will be called 'my_objec.bss', not even using all 15 + * characters allowed by the kernel. Oh well, at least the truncated + * object name is somewhat consistent in this case. But if the map + * name is '.kconfig', we'll still have entirety of '.kconfig' added + * (8 chars) and thus will be left with only first 7 characters of the + * object name ('my_obje'). Happy guessing, user, that the final map + * name will be "my_obje.kconfig". 
+ * Now, with libbpf starting to support arbitrarily named .rodata.* + * and .data.* data sections, it's possible that ELF section name is + * longer than allowed 15 chars, so we now need to be careful to take + * only up to 15 first characters of ELF name, taking no BPF object + * name characters at all. So '.rodata.abracadabra' will result in + * '.rodata.abracad' kernel and user-visible name. + * We need to keep this convoluted logic intact for .data, .bss and + * .rodata maps, but for new custom .data.custom and .rodata.custom + * maps we use their ELF names as is, not prepending bpf_object name + * in front. We still need to truncate them to 15 characters for the + * kernel. Full name can be recovered for such maps by using DATASEC + * BTF type associated with such map's value type, though. + */ + if (sfx_len >= BPF_OBJ_NAME_LEN) + sfx_len = BPF_OBJ_NAME_LEN - 1; + + /* if there are two or more dots in map name, it's a custom dot map */ + if (strchr(real_name + 1, '.') != NULL) + pfx_len = 0; + else + pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, strlen(obj->name)); + + snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, + sfx_len, real_name); + + /* sanitise map name to characters allowed by kernel */ + for (p = map_name; *p && p < map_name + sizeof(map_name); p++) + if (!isalnum(*p) && *p != '_' && *p != '.') + *p = '_'; + + return strdup(map_name); +} + +static int +map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map); + +/* Internal BPF map is mmap()'able only if at least one of corresponding + * DATASEC's VARs are to be exposed through BPF skeleton. I.e., it's a GLOBAL + * variable and it's not marked as __hidden (which turns it into, effectively, + * a STATIC variable). + */ +static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map) +{ + const struct btf_type *t, *vt; + struct btf_var_secinfo *vsi; + int i, n; + + if (!map->btf_value_type_id) + return false; + + t = btf__type_by_id(obj->btf, map->btf_value_type_id); + if (!btf_is_datasec(t)) + return false; + + vsi = btf_var_secinfos(t); + for (i = 0, n = btf_vlen(t); i < n; i++, vsi++) { + vt = btf__type_by_id(obj->btf, vsi->type); + if (!btf_is_var(vt)) + continue; + + if (btf_var(vt)->linkage != BTF_VAR_STATIC) + return true; + } + + return false; +} + +static int +bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, + const char *real_name, int sec_idx, void *data, size_t data_sz) +{ + struct bpf_map_def *def; + struct bpf_map *map; + size_t mmap_sz; + int err; + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + + map->libbpf_type = type; + map->sec_idx = sec_idx; + map->sec_offset = 0; + map->real_name = strdup(real_name); + map->name = internal_map_name(obj, real_name); + if (!map->real_name || !map->name) { + zfree(&map->real_name); + zfree(&map->name); + return -ENOMEM; + } + + def = &map->def; + def->type = BPF_MAP_TYPE_ARRAY; + def->key_size = sizeof(int); + def->value_size = data_sz; + def->max_entries = 1; + def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG + ? 
BPF_F_RDONLY_PROG : 0; + + /* failures are fine because of maps like .rodata.str1.1 */ + (void) map_fill_btf_type_info(obj, map); + + if (map_is_mmapable(obj, map)) + def->map_flags |= BPF_F_MMAPABLE; + + pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", + map->name, map->sec_idx, map->sec_offset, def->map_flags); + + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("failed to alloc map '%s' content buffer: %d\n", + map->name, err); + zfree(&map->real_name); + zfree(&map->name); + return err; + } + + if (data) + memcpy(map->mmaped, data, data_sz); + + pr_debug("map %td is \"%s\"\n", map - obj->maps, map->name); + return 0; +} + +static int bpf_object__init_global_data_maps(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + const char *sec_name; + int err = 0, sec_idx; + + /* + * Populate obj->maps with libbpf internal maps. + */ + for (sec_idx = 1; sec_idx < obj->efile.sec_cnt; sec_idx++) { + sec_desc = &obj->efile.secs[sec_idx]; + + /* Skip recognized sections with size 0. */ + if (!sec_desc->data || sec_desc->data->d_size == 0) + continue; + + switch (sec_desc->sec_type) { + case SEC_DATA: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_DATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_RODATA: + obj->has_rodata = true; + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_RODATA, + sec_name, sec_idx, + sec_desc->data->d_buf, + sec_desc->data->d_size); + break; + case SEC_BSS: + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_BSS, + sec_name, sec_idx, + NULL, + sec_desc->data->d_size); + break; + default: + /* skip */ + break; + } + if (err) + return err; + } + return 0; +} + + +static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, + const void *name) +{ + int i; + + for (i = 0; i < obj->nr_extern; i++) { + if (strcmp(obj->externs[i].name, name) == 0) + return &obj->externs[i]; + } + return NULL; +} + +static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, + char value) +{ + switch (ext->kcfg.type) { + case KCFG_BOOL: + if (value == 'm') { + pr_warn("extern (kcfg) '%s': value '%c' implies tristate or char type\n", + ext->name, value); + return -EINVAL; + } + *(bool *)ext_val = value == 'y' ? 
true : false; + break; + case KCFG_TRISTATE: + if (value == 'y') + *(enum libbpf_tristate *)ext_val = TRI_YES; + else if (value == 'm') + *(enum libbpf_tristate *)ext_val = TRI_MODULE; + else /* value == 'n' */ + *(enum libbpf_tristate *)ext_val = TRI_NO; + break; + case KCFG_CHAR: + *(char *)ext_val = value; + break; + case KCFG_UNKNOWN: + case KCFG_INT: + case KCFG_CHAR_ARR: + default: + pr_warn("extern (kcfg) '%s': value '%c' implies bool, tristate, or char type\n", + ext->name, value); + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, + const char *value) +{ + size_t len; + + if (ext->kcfg.type != KCFG_CHAR_ARR) { + pr_warn("extern (kcfg) '%s': value '%s' implies char array type\n", + ext->name, value); + return -EINVAL; + } + + len = strlen(value); + if (value[len - 1] != '"') { + pr_warn("extern (kcfg) '%s': invalid string config '%s'\n", + ext->name, value); + return -EINVAL; + } + + /* strip quotes */ + len -= 2; + if (len >= ext->kcfg.sz) { + pr_warn("extern (kcfg) '%s': long string '%s' of (%zu bytes) truncated to %d bytes\n", + ext->name, value, len, ext->kcfg.sz - 1); + len = ext->kcfg.sz - 1; + } + memcpy(ext_val, value + 1, len); + ext_val[len] = '\0'; + ext->is_set = true; + return 0; +} + +static int parse_u64(const char *value, __u64 *res) +{ + char *value_end; + int err; + + errno = 0; + *res = strtoull(value, &value_end, 0); + if (errno) { + err = -errno; + pr_warn("failed to parse '%s' as integer: %d\n", value, err); + return err; + } + if (*value_end) { + pr_warn("failed to parse '%s' as integer completely\n", value); + return -EINVAL; + } + return 0; +} + +static bool is_kcfg_value_in_range(const struct extern_desc *ext, __u64 v) +{ + int bit_sz = ext->kcfg.sz * 8; + + if (ext->kcfg.sz == 8) + return true; + + /* Validate that value stored in u64 fits in integer of `ext->sz` + * bytes size without any loss of information. If the target integer + * is signed, we rely on the following limits of integer type of + * Y bits and subsequent transformation: + * + * -2^(Y-1) <= X <= 2^(Y-1) - 1 + * 0 <= X + 2^(Y-1) <= 2^Y - 1 + * 0 <= X + 2^(Y-1) < 2^Y + * + * For unsigned target integer, check that all the (64 - Y) bits are + * zero. 
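+	 * A worked example of the check above (derived from the code, not
+	 * upstream text): for a 2-byte extern (Y = 16) and X = 40000,
+	 * X + 2^15 = 72768, which is not below 2^16, so a signed target
+	 * rejects the value; an unsigned target accepts it, since bits
+	 * 16..63 of X are all zero.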
+ */ + if (ext->kcfg.is_signed) + return v + (1ULL << (bit_sz - 1)) < (1ULL << bit_sz); + else + return (v >> bit_sz) == 0; +} + +static int set_kcfg_value_num(struct extern_desc *ext, void *ext_val, + __u64 value) +{ + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR && + ext->kcfg.type != KCFG_BOOL) { + pr_warn("extern (kcfg) '%s': value '%llu' implies integer, char, or boolean type\n", + ext->name, (unsigned long long)value); + return -EINVAL; + } + if (ext->kcfg.type == KCFG_BOOL && value > 1) { + pr_warn("extern (kcfg) '%s': value '%llu' isn't boolean compatible\n", + ext->name, (unsigned long long)value); + return -EINVAL; + + } + if (!is_kcfg_value_in_range(ext, value)) { + pr_warn("extern (kcfg) '%s': value '%llu' doesn't fit in %d bytes\n", + ext->name, (unsigned long long)value, ext->kcfg.sz); + return -ERANGE; + } + switch (ext->kcfg.sz) { + case 1: + *(__u8 *)ext_val = value; + break; + case 2: + *(__u16 *)ext_val = value; + break; + case 4: + *(__u32 *)ext_val = value; + break; + case 8: + *(__u64 *)ext_val = value; + break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; +} + +static int bpf_object__process_kconfig_line(struct bpf_object *obj, + char *buf, void *data) +{ + struct extern_desc *ext; + char *sep, *value; + int len, err = 0; + void *ext_val; + __u64 num; + + if (!str_has_pfx(buf, "CONFIG_")) + return 0; + + sep = strchr(buf, '='); + if (!sep) { + pr_warn("failed to parse '%s': no separator\n", buf); + return -EINVAL; + } + + /* Trim ending '\n' */ + len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + /* Split on '=' and ensure that a value is present. */ + *sep = '\0'; + if (!sep[1]) { + *sep = '='; + pr_warn("failed to parse '%s': no value\n", buf); + return -EINVAL; + } + + ext = find_extern_by_name(obj, buf); + if (!ext || ext->is_set) + return 0; + + ext_val = data + ext->kcfg.data_off; + value = sep + 1; + + switch (*value) { + case 'y': case 'n': case 'm': + err = set_kcfg_value_tri(ext, ext_val, *value); + break; + case '"': + err = set_kcfg_value_str(ext, ext_val, value); + break; + default: + /* assume integer */ + err = parse_u64(value, &num); + if (err) { + pr_warn("extern (kcfg) '%s': value '%s' isn't a valid integer\n", ext->name, value); + return err; + } + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { + pr_warn("extern (kcfg) '%s': value '%s' implies integer type\n", ext->name, value); + return -EINVAL; + } + err = set_kcfg_value_num(ext, ext_val, num); + break; + } + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to %s\n", ext->name, value); + return 0; +} + +static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) +{ + char buf[PATH_MAX]; + struct utsname uts; + int len, err = 0; + gzFile file; + + uname(&uts); + len = snprintf(buf, PATH_MAX, "/boot/config-%s", uts.release); + if (len < 0) + return -EINVAL; + else if (len >= PATH_MAX) + return -ENAMETOOLONG; + + /* gzopen also accepts uncompressed files. 
*/ + file = gzopen(buf, "r"); + if (!file) + file = gzopen("/proc/config.gz", "r"); + + if (!file) { + pr_warn("failed to open system Kconfig\n"); + return -ENOENT; + } + + while (gzgets(file, buf, sizeof(buf))) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing system Kconfig line '%s': %d\n", + buf, err); + goto out; + } + } + +out: + gzclose(file); + return err; +} + +static int bpf_object__read_kconfig_mem(struct bpf_object *obj, + const char *config, void *data) +{ + char buf[PATH_MAX]; + int err = 0; + FILE *file; + + file = fmemopen((void *)config, strlen(config), "r"); + if (!file) { + err = -errno; + pr_warn("failed to open in-memory Kconfig: %d\n", err); + return err; + } + + while (fgets(buf, sizeof(buf), file)) { + err = bpf_object__process_kconfig_line(obj, buf, data); + if (err) { + pr_warn("error parsing in-memory Kconfig line '%s': %d\n", + buf, err); + break; + } + } + + fclose(file); + return err; +} + +static int bpf_object__init_kconfig_map(struct bpf_object *obj) +{ + struct extern_desc *last_ext = NULL, *ext; + size_t map_sz; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG) + last_ext = ext; + } + + if (!last_ext) + return 0; + + map_sz = last_ext->kcfg.data_off + last_ext->kcfg.sz; + err = bpf_object__init_internal_map(obj, LIBBPF_MAP_KCONFIG, + ".kconfig", obj->efile.symbols_shndx, + NULL, map_sz); + if (err) + return err; + + obj->kconfig_map_idx = obj->nr_maps - 1; + + return 0; +} + +const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + + if (res_id) + *res_id = id; + + while (btf_is_mod(t) || btf_is_typedef(t)) { + if (res_id) + *res_id = t->type; + t = btf__type_by_id(btf, t->type); + } + + return t; +} + +static const struct btf_type * +resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) +{ + const struct btf_type *t; + + t = skip_mods_and_typedefs(btf, id, NULL); + if (!btf_is_ptr(t)) + return NULL; + + t = skip_mods_and_typedefs(btf, t->type, res_id); + + return btf_is_func_proto(t) ? t : NULL; +} + +static const char *__btf_kind_str(__u16 kind) +{ + switch (kind) { + case BTF_KIND_UNKN: return "void"; + case BTF_KIND_INT: return "int"; + case BTF_KIND_PTR: return "ptr"; + case BTF_KIND_ARRAY: return "array"; + case BTF_KIND_STRUCT: return "struct"; + case BTF_KIND_UNION: return "union"; + case BTF_KIND_ENUM: return "enum"; + case BTF_KIND_FWD: return "fwd"; + case BTF_KIND_TYPEDEF: return "typedef"; + case BTF_KIND_VOLATILE: return "volatile"; + case BTF_KIND_CONST: return "const"; + case BTF_KIND_RESTRICT: return "restrict"; + case BTF_KIND_FUNC: return "func"; + case BTF_KIND_FUNC_PROTO: return "func_proto"; + case BTF_KIND_VAR: return "var"; + case BTF_KIND_DATASEC: return "datasec"; + case BTF_KIND_FLOAT: return "float"; + case BTF_KIND_DECL_TAG: return "decl_tag"; + case BTF_KIND_TYPE_TAG: return "type_tag"; + case BTF_KIND_ENUM64: return "enum64"; + default: return "unknown"; + } +} + +const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + +/* + * Fetch integer attribute of BTF map definition. Such attributes are + * represented using a pointer to an array, in which dimensionality of array + * encodes specified integer value. 
E.g., int (*type)[BPF_MAP_TYPE_ARRAY]; + * encodes `type => BPF_MAP_TYPE_ARRAY` key/value pair completely using BTF + * type definition, while using only sizeof(void *) space in ELF data section. + */ +static bool get_map_field_int(const char *map_name, const struct btf *btf, + const struct btf_member *m, __u32 *res) +{ + const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); + const char *name = btf__name_by_offset(btf, m->name_off); + const struct btf_array *arr_info; + const struct btf_type *arr_t; + + if (!btf_is_ptr(t)) { + pr_warn("map '%s': attr '%s': expected PTR, got %s.\n", + map_name, name, btf_kind_str(t)); + return false; + } + + arr_t = btf__type_by_id(btf, t->type); + if (!arr_t) { + pr_warn("map '%s': attr '%s': type [%u] not found.\n", + map_name, name, t->type); + return false; + } + if (!btf_is_array(arr_t)) { + pr_warn("map '%s': attr '%s': expected ARRAY, got %s.\n", + map_name, name, btf_kind_str(arr_t)); + return false; + } + arr_info = btf_array(arr_t); + *res = arr_info->nelems; + return true; +} + +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); + if (len < 0) + return -EINVAL; + if (len >= buf_sz) + return -ENAMETOOLONG; + + return 0; +} + +static int build_map_pin_path(struct bpf_map *map, const char *path) +{ + char buf[PATH_MAX]; + int err; + + if (!path) + path = "/sys/fs/bpf"; + + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; + + return bpf_map__set_pin_path(map, buf); +} + +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def) +{ + const struct btf_type *t; + const struct btf_member *m; + bool is_inner = inner_def == NULL; + int vlen, i; + + vlen = btf_vlen(def_t); + m = btf_members(def_t); + for (i = 0; i < vlen; i++, m++) { + const char *name = btf__name_by_offset(btf, m->name_off); + + if (!name) { + pr_warn("map '%s': invalid field #%d.\n", map_name, i); + return -EINVAL; + } + if (strcmp(name, "type") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_type)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_TYPE; + } else if (strcmp(name, "max_entries") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->max_entries)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAX_ENTRIES; + } else if (strcmp(name, "map_flags") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->map_flags)) + return -EINVAL; + map_def->parts |= MAP_DEF_MAP_FLAGS; + } else if (strcmp(name, "numa_node") == 0) { + if (!get_map_field_int(map_name, btf, m, &map_def->numa_node)) + return -EINVAL; + map_def->parts |= MAP_DEF_NUMA_NODE; + } else if (strcmp(name, "key_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %u.\n", + map_name, map_def->key_size, sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->parts |= MAP_DEF_KEY_SIZE; + } else if (strcmp(name, "key") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': key type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if 
(!btf_is_ptr(t)) { + pr_warn("map '%s': key spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine key size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->key_size && map_def->key_size != sz) { + pr_warn("map '%s': conflicting key size %u != %zd.\n", + map_name, map_def->key_size, (ssize_t)sz); + return -EINVAL; + } + map_def->key_size = sz; + map_def->key_type_id = t->type; + map_def->parts |= MAP_DEF_KEY_SIZE | MAP_DEF_KEY_TYPE; + } else if (strcmp(name, "value_size") == 0) { + __u32 sz; + + if (!get_map_field_int(map_name, btf, m, &sz)) + return -EINVAL; + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %u.\n", + map_name, map_def->value_size, sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->parts |= MAP_DEF_VALUE_SIZE; + } else if (strcmp(name, "value") == 0) { + __s64 sz; + + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': value type [%d] not found.\n", + map_name, m->type); + return -EINVAL; + } + if (!btf_is_ptr(t)) { + pr_warn("map '%s': value spec is not PTR: %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + sz = btf__resolve_size(btf, t->type); + if (sz < 0) { + pr_warn("map '%s': can't determine value size for type [%u]: %zd.\n", + map_name, t->type, (ssize_t)sz); + return sz; + } + if (map_def->value_size && map_def->value_size != sz) { + pr_warn("map '%s': conflicting value size %u != %zd.\n", + map_name, map_def->value_size, (ssize_t)sz); + return -EINVAL; + } + map_def->value_size = sz; + map_def->value_type_id = t->type; + map_def->parts |= MAP_DEF_VALUE_SIZE | MAP_DEF_VALUE_TYPE; + } + else if (strcmp(name, "values") == 0) { + bool is_map_in_map = bpf_map_type__is_map_in_map(map_def->map_type); + bool is_prog_array = map_def->map_type == BPF_MAP_TYPE_PROG_ARRAY; + const char *desc = is_map_in_map ? 
"map-in-map inner" : "prog-array value"; + char inner_map_name[128]; + int err; + + if (is_inner) { + pr_warn("map '%s': multi-level inner maps not supported.\n", + map_name); + return -ENOTSUP; + } + if (i != vlen - 1) { + pr_warn("map '%s': '%s' member should be last.\n", + map_name, name); + return -EINVAL; + } + if (!is_map_in_map && !is_prog_array) { + pr_warn("map '%s': should be map-in-map or prog-array.\n", + map_name); + return -ENOTSUP; + } + if (map_def->value_size && map_def->value_size != 4) { + pr_warn("map '%s': conflicting value size %u != 4.\n", + map_name, map_def->value_size); + return -EINVAL; + } + map_def->value_size = 4; + t = btf__type_by_id(btf, m->type); + if (!t) { + pr_warn("map '%s': %s type [%d] not found.\n", + map_name, desc, m->type); + return -EINVAL; + } + if (!btf_is_array(t) || btf_array(t)->nelems) { + pr_warn("map '%s': %s spec is not a zero-sized array.\n", + map_name, desc); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, btf_array(t)->type, NULL); + if (!btf_is_ptr(t)) { + pr_warn("map '%s': %s def is of unexpected kind %s.\n", + map_name, desc, btf_kind_str(t)); + return -EINVAL; + } + t = skip_mods_and_typedefs(btf, t->type, NULL); + if (is_prog_array) { + if (!btf_is_func_proto(t)) { + pr_warn("map '%s': prog-array value def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + continue; + } + if (!btf_is_struct(t)) { + pr_warn("map '%s': map-in-map inner def is of unexpected kind %s.\n", + map_name, btf_kind_str(t)); + return -EINVAL; + } + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", map_name); + err = parse_btf_map_def(inner_map_name, btf, t, strict, inner_def, NULL); + if (err) + return err; + + map_def->parts |= MAP_DEF_INNER_MAP; + } else if (strcmp(name, "pinning") == 0) { + __u32 val; + + if (is_inner) { + pr_warn("map '%s': inner def can't be pinned.\n", map_name); + return -EINVAL; + } + if (!get_map_field_int(map_name, btf, m, &val)) + return -EINVAL; + if (val != LIBBPF_PIN_NONE && val != LIBBPF_PIN_BY_NAME) { + pr_warn("map '%s': invalid pinning value %u.\n", + map_name, val); + return -EINVAL; + } + map_def->pinning = val; + map_def->parts |= MAP_DEF_PINNING; + } else if (strcmp(name, "map_extra") == 0) { + __u32 map_extra; + + if (!get_map_field_int(map_name, btf, m, &map_extra)) + return -EINVAL; + map_def->map_extra = map_extra; + map_def->parts |= MAP_DEF_MAP_EXTRA; + } else { + if (strict) { + pr_warn("map '%s': unknown field '%s'.\n", map_name, name); + return -ENOTSUP; + } + pr_debug("map '%s': ignoring unknown field '%s'.\n", map_name, name); + } + } + + if (map_def->map_type == BPF_MAP_TYPE_UNSPEC) { + pr_warn("map '%s': map type isn't specified.\n", map_name); + return -EINVAL; + } + + return 0; +} + +static size_t adjust_ringbuf_sz(size_t sz) +{ + __u32 page_sz = sysconf(_SC_PAGE_SIZE); + __u32 mul; + + /* if user forgot to set any size, make sure they see error */ + if (sz == 0) + return 0; + /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be + * a power-of-2 multiple of kernel's page size. If user diligently + * satisified these conditions, pass the size through. + */ + if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) + return sz; + + /* Otherwise find closest (page_sz * power_of_2) product bigger than + * user-set size to satisfy both user size request and kernel + * requirements and substitute correct max_entries for map creation. 
+ */ + for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { + if (mul * page_sz > sz) + return mul * page_sz; + } + + /* if it's impossible to satisfy the conditions (i.e., user size is + * very close to UINT_MAX but is not a power-of-2 multiple of + * page_size) then just return original size and let kernel reject it + */ + return sz; +} + +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + +static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) +{ + map->def.type = def->map_type; + map->def.key_size = def->key_size; + map->def.value_size = def->value_size; + map->def.max_entries = def->max_entries; + map->def.map_flags = def->map_flags; + map->map_extra = def->map_extra; + + map->numa_node = def->numa_node; + map->btf_key_type_id = def->key_type_id; + map->btf_value_type_id = def->value_type_id; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + if (def->parts & MAP_DEF_MAP_TYPE) + pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); + + if (def->parts & MAP_DEF_KEY_TYPE) + pr_debug("map '%s': found key [%u], sz = %u.\n", + map->name, def->key_type_id, def->key_size); + else if (def->parts & MAP_DEF_KEY_SIZE) + pr_debug("map '%s': found key_size = %u.\n", map->name, def->key_size); + + if (def->parts & MAP_DEF_VALUE_TYPE) + pr_debug("map '%s': found value [%u], sz = %u.\n", + map->name, def->value_type_id, def->value_size); + else if (def->parts & MAP_DEF_VALUE_SIZE) + pr_debug("map '%s': found value_size = %u.\n", map->name, def->value_size); + + if (def->parts & MAP_DEF_MAX_ENTRIES) + pr_debug("map '%s': found max_entries = %u.\n", map->name, def->max_entries); + if (def->parts & MAP_DEF_MAP_FLAGS) + pr_debug("map '%s': found map_flags = 0x%x.\n", map->name, def->map_flags); + if (def->parts & MAP_DEF_MAP_EXTRA) + pr_debug("map '%s': found map_extra = 0x%llx.\n", map->name, + (unsigned long long)def->map_extra); + if (def->parts & MAP_DEF_PINNING) + pr_debug("map '%s': found pinning = %u.\n", map->name, def->pinning); + if (def->parts & MAP_DEF_NUMA_NODE) + pr_debug("map '%s': found numa_node = %u.\n", map->name, def->numa_node); + + if (def->parts & MAP_DEF_INNER_MAP) + pr_debug("map '%s': found inner map definition.\n", map->name); +} + +static const char *btf_var_linkage_str(__u32 linkage) +{ + switch (linkage) { + case BTF_VAR_STATIC: return "static"; + case BTF_VAR_GLOBAL_ALLOCATED: return "global"; + case BTF_VAR_GLOBAL_EXTERN: return "extern"; + default: return "unknown"; + } +} + +static int bpf_object__init_user_btf_map(struct bpf_object *obj, + const struct btf_type *sec, + int var_idx, int sec_idx, + const Elf_Data *data, bool strict, + const char *pin_root_path) +{ + struct btf_map_def map_def = {}, inner_def = {}; + const struct btf_type *var, *def; + const struct btf_var_secinfo *vi; + const struct btf_var *var_extra; + const char *map_name; + struct bpf_map *map; + int err; + + vi = btf_var_secinfos(sec) + var_idx; + var = btf__type_by_id(obj->btf, vi->type); + var_extra = btf_var(var); + map_name = btf__name_by_offset(obj->btf, var->name_off); + + if (map_name == NULL || map_name[0] == '\0') { + pr_warn("map #%d: empty name.\n", var_idx); + return -EINVAL; + } + if ((__u64)vi->offset + vi->size > data->d_size) { + pr_warn("map '%s' BTF data is corrupted.\n", map_name); + return -EINVAL; + } + if 
(!btf_is_var(var)) { + pr_warn("map '%s': unexpected var kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + pr_warn("map '%s': unsupported map linkage %s.\n", + map_name, btf_var_linkage_str(var_extra->linkage)); + return -EOPNOTSUPP; + } + + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (!btf_is_struct(def)) { + pr_warn("map '%s': unexpected def kind %s.\n", + map_name, btf_kind_str(var)); + return -EINVAL; + } + if (def->size > vi->size) { + pr_warn("map '%s': invalid def size.\n", map_name); + return -EINVAL; + } + + map = bpf_object__add_map(obj); + if (IS_ERR(map)) + return PTR_ERR(map); + map->name = strdup(map_name); + if (!map->name) { + pr_warn("map '%s': failed to alloc map name.\n", map_name); + return -ENOMEM; + } + map->libbpf_type = LIBBPF_MAP_UNSPEC; + map->def.type = BPF_MAP_TYPE_UNSPEC; + map->sec_idx = sec_idx; + map->sec_offset = vi->offset; + map->btf_var_idx = var_idx; + pr_debug("map '%s': at sec_idx %d, offset %zu.\n", + map_name, map->sec_idx, map->sec_offset); + + err = parse_btf_map_def(map->name, obj->btf, def, strict, &map_def, &inner_def); + if (err) + return err; + + fill_map_from_def(map, &map_def); + + if (map_def.pinning == LIBBPF_PIN_BY_NAME) { + err = build_map_pin_path(map, pin_root_path); + if (err) { + pr_warn("map '%s': couldn't build pin path.\n", map->name); + return err; + } + } + + if (map_def.parts & MAP_DEF_INNER_MAP) { + map->inner_map = calloc(1, sizeof(*map->inner_map)); + if (!map->inner_map) + return -ENOMEM; + map->inner_map->fd = -1; + map->inner_map->sec_idx = sec_idx; + map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); + if (!map->inner_map->name) + return -ENOMEM; + sprintf(map->inner_map->name, "%s.inner", map_name); + + fill_map_from_def(map->inner_map, &inner_def); + } + + err = map_fill_btf_type_info(obj, map); + if (err) + return err; + + return 0; +} + +static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, + const char *pin_root_path) +{ + const struct btf_type *sec = NULL; + int nr_types, i, vlen, err; + const struct btf_type *t; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + + if (obj->efile.btf_maps_shndx < 0) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.btf_maps_shndx); + data = elf_sec_data(obj, scn); + if (!scn || !data) { + pr_warn("elf: failed to get %s map definitions for %s\n", + MAPS_ELF_SEC, obj->path); + return -EINVAL; + } + + nr_types = btf__type_cnt(obj->btf); + for (i = 1; i < nr_types; i++) { + t = btf__type_by_id(obj->btf, i); + if (!btf_is_datasec(t)) + continue; + name = btf__name_by_offset(obj->btf, t->name_off); + if (strcmp(name, MAPS_ELF_SEC) == 0) { + sec = t; + obj->efile.btf_maps_sec_btf_id = i; + break; + } + } + + if (!sec) { + pr_warn("DATASEC '%s' not found.\n", MAPS_ELF_SEC); + return -ENOENT; + } + + vlen = btf_vlen(sec); + for (i = 0; i < vlen; i++) { + err = bpf_object__init_user_btf_map(obj, sec, i, + obj->efile.btf_maps_shndx, + data, strict, + pin_root_path); + if (err) + return err; + } + + return 0; +} + +static int bpf_object__init_maps(struct bpf_object *obj, + const struct bpf_object_open_opts *opts) +{ + const char *pin_root_path; + bool strict; + int err = 0; + + strict = !OPTS_GET(opts, relaxed_maps, false); + pin_root_path = OPTS_GET(opts, pin_root_path, NULL); + + err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = err ?: bpf_object__init_global_data_maps(obj); + err = err ?: 
bpf_object__init_kconfig_map(obj); + err = err ?: bpf_object_init_struct_ops(obj); + + return err; +} + +static bool section_have_execinstr(struct bpf_object *obj, int idx) +{ + Elf64_Shdr *sh; + + sh = elf_sec_hdr(obj, elf_sec_by_idx(obj, idx)); + if (!sh) + return false; + + return sh->sh_flags & SHF_EXECINSTR; +} + +static bool btf_needs_sanitization(struct bpf_object *obj) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + + return !has_func || !has_datasec || !has_func_global || !has_float || + !has_decl_tag || !has_type_tag || !has_enum64; +} + +static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) +{ + bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); + bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); + bool has_float = kernel_supports(obj, FEAT_BTF_FLOAT); + bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); + bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); + bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + int enum64_placeholder_id = 0; + struct btf_type *t; + int i, j, vlen; + + for (i = 1; i < btf__type_cnt(btf); i++) { + t = (struct btf_type *)btf__type_by_id(btf, i); + + if ((!has_datasec && btf_is_var(t)) || (!has_decl_tag && btf_is_decl_tag(t))) { + /* replace VAR/DECL_TAG with INT */ + t->info = BTF_INFO_ENC(BTF_KIND_INT, 0, 0); + /* + * using size = 1 is the safest choice, 4 will be too + * big and cause kernel BTF validation failure if + * original variable took less than 4 bytes + */ + t->size = 1; + *(int *)(t + 1) = BTF_INT_ENC(0, 0, 8); + } else if (!has_datasec && btf_is_datasec(t)) { + /* replace DATASEC with STRUCT */ + const struct btf_var_secinfo *v = btf_var_secinfos(t); + struct btf_member *m = btf_members(t); + struct btf_type *vt; + char *name; + + name = (char *)btf__name_by_offset(btf, t->name_off); + while (*name) { + if (*name == '.') + *name = '_'; + name++; + } + + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, vlen); + for (j = 0; j < vlen; j++, v++, m++) { + /* order of field assignments is important */ + m->offset = v->offset * 8; + m->type = v->type; + /* preserve variable name as member name */ + vt = (void *)btf__type_by_id(btf, v->type); + m->name_off = vt->name_off; + } + } else if (!has_func && btf_is_func_proto(t)) { + /* replace FUNC_PROTO with ENUM */ + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_ENUM, 0, vlen); + t->size = sizeof(__u32); /* kernel enforced */ + } else if (!has_func && btf_is_func(t)) { + /* replace FUNC with TYPEDEF */ + t->info = BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0); + } else if (!has_func_global && btf_is_func(t)) { + /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */ + t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0); + } else if (!has_float && btf_is_float(t)) { + /* replace FLOAT with an equally-sized empty STRUCT; + * since C compilers do not accept e.g. 
"float" as a + * valid struct name, make it anonymous + */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0); + } else if (!has_type_tag && btf_is_type_tag(t)) { + /* replace TYPE_TAG with a CONST */ + t->name_off = 0; + t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0); + } else if (!has_enum64 && btf_is_enum(t)) { + /* clear the kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t), false); + } else if (!has_enum64 && btf_is_enum64(t)) { + /* replace ENUM64 with a union */ + struct btf_member *m; + + if (enum64_placeholder_id == 0) { + enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); + if (enum64_placeholder_id < 0) + return enum64_placeholder_id; + + t = (struct btf_type *)btf__type_by_id(btf, i); + } + + m = btf_members(t); + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_UNION, 0, vlen); + for (j = 0; j < vlen; j++, m++) { + m->type = enum64_placeholder_id; + m->offset = 0; + } + } + } + + return 0; +} + +static bool libbpf_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.btf_maps_shndx >= 0 || + obj->efile.st_ops_shndx >= 0 || + obj->efile.st_ops_link_shndx >= 0 || + obj->nr_extern > 0; +} + +static bool kernel_needs_btf(const struct bpf_object *obj) +{ + return obj->efile.st_ops_shndx >= 0 || obj->efile.st_ops_link_shndx >= 0; +} + +static int bpf_object__init_btf(struct bpf_object *obj, + Elf_Data *btf_data, + Elf_Data *btf_ext_data) +{ + int err = -ENOENT; + + if (btf_data) { + obj->btf = btf__new(btf_data->d_buf, btf_data->d_size); + err = libbpf_get_error(obj->btf); + if (err) { + obj->btf = NULL; + pr_warn("Error loading ELF section %s: %d.\n", BTF_ELF_SEC, err); + goto out; + } + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + } + if (btf_ext_data) { + struct btf_ext_info *ext_segs[3]; + int seg_num, sec_num; + + if (!obj->btf) { + pr_debug("Ignore ELF section %s because its depending ELF section %s is not found.\n", + BTF_EXT_ELF_SEC, BTF_ELF_SEC); + goto out; + } + obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("Error loading ELF section %s: %d. 
Ignored and continue.\n", + BTF_EXT_ELF_SEC, err); + obj->btf_ext = NULL; + goto out; + } + + /* setup .BTF.ext to ELF section mapping */ + ext_segs[0] = &obj->btf_ext->func_info; + ext_segs[1] = &obj->btf_ext->line_info; + ext_segs[2] = &obj->btf_ext->core_relo_info; + for (seg_num = 0; seg_num < ARRAY_SIZE(ext_segs); seg_num++) { + struct btf_ext_info *seg = ext_segs[seg_num]; + const struct btf_ext_info_sec *sec; + const char *sec_name; + Elf_Scn *scn; + + if (seg->sec_cnt == 0) + continue; + + seg->sec_idxs = calloc(seg->sec_cnt, sizeof(*seg->sec_idxs)); + if (!seg->sec_idxs) { + err = -ENOMEM; + goto out; + } + + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + /* preventively increment index to avoid doing + * this before every continue below + */ + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) + continue; + scn = elf_sec_by_name(obj, sec_name); + if (!scn) + continue; + + seg->sec_idxs[sec_num - 1] = elf_ndxscn(scn); + } + } + } +out: + if (err && libbpf_needs_btf(obj)) { + pr_warn("BTF is required, but is missing or corrupted.\n"); + return err; + } + return 0; +} + +static int compare_vsi_off(const void *_a, const void *_b) +{ + const struct btf_var_secinfo *a = _a; + const struct btf_var_secinfo *b = _b; + + return a->offset - b->offset; +} + +static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, + struct btf_type *t) +{ + __u32 size = 0, i, vars = btf_vlen(t); + const char *sec_name = btf__name_by_offset(btf, t->name_off); + struct btf_var_secinfo *vsi; + bool fixup_offsets = false; + int err; + + if (!sec_name) { + pr_debug("No name found in string section for DATASEC kind.\n"); + return -ENOENT; + } + + /* Extern-backing datasecs (.ksyms, .kconfig) have their size and + * variable offsets set at the previous step. Further, not every + * extern BTF VAR has corresponding ELF symbol preserved, so we skip + * all fixups altogether for such sections and go straight to sorting + * VARs within their DATASEC. + */ + if (strcmp(sec_name, KCONFIG_SEC) == 0 || strcmp(sec_name, KSYMS_SEC) == 0) + goto sort_vars; + + /* Clang leaves DATASEC size and VAR offsets as zeroes, so we need to + * fix this up. But BPF static linker already fixes this up and fills + * all the sizes and offsets during static linking. So this step has + * to be optional. But the STV_HIDDEN handling is non-optional for any + * non-extern DATASEC, so the variable fixup loop below handles both + * functions at the same time, paying the cost of BTF VAR <-> ELF + * symbol matching just once. 
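+	 * As an illustration: a global variable in .bss may be emitted by
+	 * Clang with DATASEC size 0 and VAR offset 0; the code below
+	 * recovers the real size from the corresponding ELF section and
+	 * each variable's offset from its ELF symbol's st_value.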
+ */ + if (t->size == 0) { + err = find_elf_sec_sz(obj, sec_name, &size); + if (err || !size) { + pr_debug("sec '%s': failed to determine size from ELF: size %u, err %d\n", + sec_name, size, err); + return -ENOENT; + } + + t->size = size; + fixup_offsets = true; + } + + for (i = 0, vsi = btf_var_secinfos(t); i < vars; i++, vsi++) { + const struct btf_type *t_var; + struct btf_var *var; + const char *var_name; + Elf64_Sym *sym; + + t_var = btf__type_by_id(btf, vsi->type); + if (!t_var || !btf_is_var(t_var)) { + pr_debug("sec '%s': unexpected non-VAR type found\n", sec_name); + return -EINVAL; + } + + var = btf_var(t_var); + if (var->linkage == BTF_VAR_STATIC || var->linkage == BTF_VAR_GLOBAL_EXTERN) + continue; + + var_name = btf__name_by_offset(btf, t_var->name_off); + if (!var_name) { + pr_debug("sec '%s': failed to find name of DATASEC's member #%d\n", + sec_name, i); + return -ENOENT; + } + + sym = find_elf_var_sym(obj, var_name); + if (IS_ERR(sym)) { + pr_debug("sec '%s': failed to find ELF symbol for VAR '%s'\n", + sec_name, var_name); + return -ENOENT; + } + + if (fixup_offsets) + vsi->offset = sym->st_value; + + /* if variable is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF VAR + * as static. This follows similar logic for functions (BPF + * subprogs) and influences libbpf's further decisions about + * whether to make global data BPF array maps as + * BPF_F_MMAPABLE. + */ + if (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL) + var->linkage = BTF_VAR_STATIC; + } + +sort_vars: + qsort(btf_var_secinfos(t), vars, sizeof(*vsi), compare_vsi_off); + return 0; +} + +static int bpf_object_fixup_btf(struct bpf_object *obj) +{ + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_type *t = btf_type_by_id(obj->btf, i); + + /* Loader needs to fix up some of the things compiler + * couldn't get its hands on while emitting BTF. This + * is section size and global variable offset. We use + * the info from the ELF itself for this purpose. 
+ */ + if (btf_is_datasec(t)) { + err = btf_fixup_datasec(obj, obj->btf, t); + if (err) + return err; + } + } + + return 0; +} + +static bool prog_needs_vmlinux_btf(struct bpf_program *prog) +{ + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_LSM) + return true; + + /* BPF_PROG_TYPE_TRACING programs which do not attach to other programs + * also need vmlinux BTF + */ + if (prog->type == BPF_PROG_TYPE_TRACING && !prog->attach_prog_fd) + return true; + + return false; +} + +static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* CO-RE relocations need kernel BTF, only when btf_custom_path + * is not specified + */ + if (obj->btf_ext && obj->btf_ext->core_relo_info.len && !obj->btf_custom_path) + return true; + + /* Support for typed ksyms needs kernel BTF */ + for (i = 0; i < obj->nr_extern; i++) { + const struct extern_desc *ext; + + ext = &obj->externs[i]; + if (ext->type == EXT_KSYM && ext->ksym.type_id) + return true; + } + + bpf_object__for_each_program(prog, obj) { + if (!prog->autoload) + continue; + if (prog_needs_vmlinux_btf(prog)) + return true; + } + + return false; +} + +static int bpf_object__load_vmlinux_btf(struct bpf_object *obj, bool force) +{ + int err; + + /* btf_vmlinux could be loaded earlier */ + if (obj->btf_vmlinux || obj->gen_loader) + return 0; + + if (!force && !obj_needs_vmlinux_btf(obj)) + return 0; + + obj->btf_vmlinux = btf__load_vmlinux_btf(); + err = libbpf_get_error(obj->btf_vmlinux); + if (err) { + pr_warn("Error loading vmlinux BTF: %d\n", err); + obj->btf_vmlinux = NULL; + return err; + } + return 0; +} + +static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) +{ + struct btf *kern_btf = obj->btf; + bool btf_mandatory, sanitize; + int i, err = 0; + + if (!obj->btf) + return 0; + + if (!kernel_supports(obj, FEAT_BTF)) { + if (kernel_needs_btf(obj)) { + err = -EOPNOTSUPP; + goto report; + } + pr_debug("Kernel doesn't support BTF, skipping uploading it.\n"); + return 0; + } + + /* Even though some subprogs are global/weak, user might prefer more + * permissive BPF verification process that BPF verifier performs for + * static functions, taking into account more context from the caller + * functions. In such case, they need to mark such subprogs with + * __attribute__((visibility("hidden"))) and libbpf will adjust + * corresponding FUNC BTF type to be marked as static and trigger more + * involved BPF verification process. 
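+	 * For example, a global subprog annotated with the __hidden
+	 * convenience macro from bpf_helpers.h (which expands to
+	 * __attribute__((visibility("hidden")))) has its FUNC linkage
+	 * rewritten from BTF_FUNC_GLOBAL to BTF_FUNC_STATIC in the loop
+	 * below.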
+ */ + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *prog = &obj->programs[i]; + struct btf_type *t; + const char *name; + int j, n; + + if (!prog->mark_btf_static || !prog_is_subprog(obj, prog)) + continue; + + n = btf__type_cnt(obj->btf); + for (j = 1; j < n; j++) { + t = btf_type_by_id(obj->btf, j); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, prog->name) != 0) + continue; + + t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_STATIC, 0); + break; + } + } + + sanitize = btf_needs_sanitization(obj); + if (sanitize) { + const void *raw_data; + __u32 sz; + + /* clone BTF to sanitize a copy and leave the original intact */ + raw_data = btf__raw_data(obj->btf, &sz); + kern_btf = btf__new(raw_data, sz); + err = libbpf_get_error(kern_btf); + if (err) + return err; + + /* enforce 8-byte pointers for BPF-targeted BTFs */ + btf__set_pointer_size(obj->btf, 8); + err = bpf_object__sanitize_btf(obj, kern_btf); + if (err) + return err; + } + + if (obj->gen_loader) { + __u32 raw_size = 0; + const void *raw_data = btf__raw_data(kern_btf, &raw_size); + + if (!raw_data) + return -ENOMEM; + bpf_gen__load_btf(obj->gen_loader, raw_data, raw_size); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + btf__set_fd(kern_btf, 0); + } else { + /* currently BPF_BTF_LOAD only supports log_level 1 */ + err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, + obj->log_level ? 1 : 0); + } + if (sanitize) { + if (!err) { + /* move fd to libbpf's BTF */ + btf__set_fd(obj->btf, btf__fd(kern_btf)); + btf__set_fd(kern_btf, -1); + } + btf__free(kern_btf); + } +report: + if (err) { + btf_mandatory = kernel_needs_btf(obj); + pr_warn("Error loading .BTF into kernel: %d. %s\n", err, + btf_mandatory ? "BTF is mandatory, can't proceed." 
+ : "BTF is optional, ignoring."); + if (!btf_mandatory) + err = 0; + } + return err; +} + +static const char *elf_sym_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.strtabidx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static const char *elf_sec_str(const struct bpf_object *obj, size_t off) +{ + const char *name; + + name = elf_strptr(obj->efile.elf, obj->efile.shstrndx, off); + if (!name) { + pr_warn("elf: failed to get section name string at offset %zu from %s: %s\n", + off, obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Scn *elf_sec_by_idx(const struct bpf_object *obj, size_t idx) +{ + Elf_Scn *scn; + + scn = elf_getscn(obj->efile.elf, idx); + if (!scn) { + pr_warn("elf: failed to get section(%zu) from %s: %s\n", + idx, obj->path, elf_errmsg(-1)); + return NULL; + } + return scn; +} + +static Elf_Scn *elf_sec_by_name(const struct bpf_object *obj, const char *name) +{ + Elf_Scn *scn = NULL; + Elf *elf = obj->efile.elf; + const char *sec_name; + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sec_name = elf_sec_name(obj, scn); + if (!sec_name) + return NULL; + + if (strcmp(sec_name, name) != 0) + continue; + + return scn; + } + return NULL; +} + +static Elf64_Shdr *elf_sec_hdr(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf64_Shdr *shdr; + + if (!scn) + return NULL; + + shdr = elf64_getshdr(scn); + if (!shdr) { + pr_warn("elf: failed to get section(%zu) header from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return shdr; +} + +static const char *elf_sec_name(const struct bpf_object *obj, Elf_Scn *scn) +{ + const char *name; + Elf64_Shdr *sh; + + if (!scn) + return NULL; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return NULL; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) { + pr_warn("elf: failed to get section(%zu) name from %s: %s\n", + elf_ndxscn(scn), obj->path, elf_errmsg(-1)); + return NULL; + } + + return name; +} + +static Elf_Data *elf_sec_data(const struct bpf_object *obj, Elf_Scn *scn) +{ + Elf_Data *data; + + if (!scn) + return NULL; + + data = elf_getdata(scn, 0); + if (!data) { + pr_warn("elf: failed to get section(%zu) %s data from %s: %s\n", + elf_ndxscn(scn), elf_sec_name(obj, scn) ?: "", + obj->path, elf_errmsg(-1)); + return NULL; + } + + return data; +} + +static Elf64_Sym *elf_sym_by_idx(const struct bpf_object *obj, size_t idx) +{ + if (idx >= obj->efile.symbols->d_size / sizeof(Elf64_Sym)) + return NULL; + + return (Elf64_Sym *)obj->efile.symbols->d_buf + idx; +} + +static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx) +{ + if (idx >= data->d_size / sizeof(Elf64_Rel)) + return NULL; + + return (Elf64_Rel *)data->d_buf + idx; +} + +static bool is_sec_name_dwarf(const char *name) +{ + /* approximation, but the actual list is too long */ + return str_has_pfx(name, ".debug_"); +} + +static bool ignore_elf_section(Elf64_Shdr *hdr, const char *name) +{ + /* no special handling of .strtab */ + if (hdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (hdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (hdr->sh_type == SHT_PROGBITS && hdr->sh_size == 0 && + strcmp(name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_sec_name_dwarf(name)) + return 
true; + + if (str_has_pfx(name, ".rel")) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_sec_name_dwarf(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static int cmp_progs(const void *_a, const void *_b) +{ + const struct bpf_program *a = _a; + const struct bpf_program *b = _b; + + if (a->sec_idx != b->sec_idx) + return a->sec_idx < b->sec_idx ? -1 : 1; + + /* sec_insn_off can't be the same within the section */ + return a->sec_insn_off < b->sec_insn_off ? -1 : 1; +} + +static int bpf_object__elf_collect(struct bpf_object *obj) +{ + struct elf_sec_desc *sec_desc; + Elf *elf = obj->efile.elf; + Elf_Data *btf_ext_data = NULL; + Elf_Data *btf_data = NULL; + int idx = 0, err = 0; + const char *name; + Elf_Data *data; + Elf_Scn *scn; + Elf64_Shdr *sh; + + /* ELF section indices are 0-based, but sec #0 is special "invalid" + * section. Since section count retrieved by elf_getshdrnum() does + * include sec #0, it is already the necessary size of an array to keep + * all the sections. + */ + if (elf_getshdrnum(obj->efile.elf, &obj->efile.sec_cnt)) { + pr_warn("elf: failed to get the number of sections for %s: %s\n", + obj->path, elf_errmsg(-1)); + return -LIBBPF_ERRNO__FORMAT; + } + obj->efile.secs = calloc(obj->efile.sec_cnt, sizeof(*obj->efile.secs)); + if (!obj->efile.secs) + return -ENOMEM; + + /* a bunch of ELF parsing functionality depends on processing symbols, + * so do the first pass and find the symbol table + */ + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + if (sh->sh_type == SHT_SYMTAB) { + if (obj->efile.symbols) { + pr_warn("elf: multiple symbol tables in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + idx = elf_ndxscn(scn); + + obj->efile.symbols = data; + obj->efile.symbols_shndx = idx; + obj->efile.strtabidx = sh->sh_link; + } + } + + if (!obj->efile.symbols) { + pr_warn("elf: couldn't find symbol table in %s, stripped object file?\n", + obj->path); + return -ENOENT; + } + + scn = NULL; + while ((scn = elf_nextscn(elf, scn)) != NULL) { + idx = elf_ndxscn(scn); + sec_desc = &obj->efile.secs[idx]; + + sh = elf_sec_hdr(obj, scn); + if (!sh) + return -LIBBPF_ERRNO__FORMAT; + + name = elf_sec_str(obj, sh->sh_name); + if (!name) + return -LIBBPF_ERRNO__FORMAT; + + if (ignore_elf_section(sh, name)) + continue; + + data = elf_sec_data(obj, scn); + if (!data) + return -LIBBPF_ERRNO__FORMAT; + + pr_debug("elf: section(%d) %s, size %ld, link %d, flags %lx, type=%d\n", + idx, name, (unsigned long)data->d_size, + (int)sh->sh_link, (unsigned long)sh->sh_flags, + (int)sh->sh_type); + + if (strcmp(name, "license") == 0) { + err = bpf_object__init_license(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "version") == 0) { + err = bpf_object__init_kversion(obj, data->d_buf, data->d_size); + if (err) + return err; + } else if (strcmp(name, "maps") == 0) { + pr_warn("elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+\n"); + return -ENOTSUP; + } else if (strcmp(name, MAPS_ELF_SEC) == 0) { + obj->efile.btf_maps_shndx = idx; + } else if (strcmp(name, BTF_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_data = data; + } else if (strcmp(name, 
BTF_EXT_ELF_SEC) == 0) { + if (sh->sh_type != SHT_PROGBITS) + return -LIBBPF_ERRNO__FORMAT; + btf_ext_data = data; + } else if (sh->sh_type == SHT_SYMTAB) { + /* already processed during the first pass above */ + } else if (sh->sh_type == SHT_PROGBITS && data->d_size > 0) { + if (sh->sh_flags & SHF_EXECINSTR) { + if (strcmp(name, ".text") == 0) + obj->efile.text_shndx = idx; + err = bpf_object__add_programs(obj, data, name, idx); + if (err) + return err; + } else if (strcmp(name, DATA_SEC) == 0 || + str_has_pfx(name, DATA_SEC ".")) { + sec_desc->sec_type = SEC_DATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, RODATA_SEC) == 0 || + str_has_pfx(name, RODATA_SEC ".")) { + sec_desc->sec_type = SEC_RODATA; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (strcmp(name, STRUCT_OPS_SEC) == 0) { + obj->efile.st_ops_data = data; + obj->efile.st_ops_shndx = idx; + } else if (strcmp(name, STRUCT_OPS_LINK_SEC) == 0) { + obj->efile.st_ops_link_data = data; + obj->efile.st_ops_link_shndx = idx; + } else { + pr_info("elf: skipping unrecognized data section(%d) %s\n", + idx, name); + } + } else if (sh->sh_type == SHT_REL) { + int targ_sec_idx = sh->sh_info; /* points to other section */ + + if (sh->sh_entsize != sizeof(Elf64_Rel) || + targ_sec_idx >= obj->efile.sec_cnt) + return -LIBBPF_ERRNO__FORMAT; + + /* Only do relo for section with exec instructions */ + if (!section_have_execinstr(obj, targ_sec_idx) && + strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) && + strcmp(name, ".rel" MAPS_ELF_SEC)) { + pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n", + idx, name, targ_sec_idx, + elf_sec_name(obj, elf_sec_by_idx(obj, targ_sec_idx)) ?: ""); + continue; + } + + sec_desc->sec_type = SEC_RELO; + sec_desc->shdr = sh; + sec_desc->data = data; + } else if (sh->sh_type == SHT_NOBITS && (strcmp(name, BSS_SEC) == 0 || + str_has_pfx(name, BSS_SEC "."))) { + sec_desc->sec_type = SEC_BSS; + sec_desc->shdr = sh; + sec_desc->data = data; + } else { + pr_info("elf: skipping section(%d) %s (size %zu)\n", idx, name, + (size_t)sh->sh_size); + } + } + + if (!obj->efile.strtabidx || obj->efile.strtabidx > idx) { + pr_warn("elf: symbol strings section missing or invalid in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } + + /* sort BPF programs by section name and in-section instruction offset + * for faster search + */ + if (obj->nr_programs) + qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); + + return bpf_object__init_btf(obj, btf_data, btf_ext_data); +} + +static bool sym_is_extern(const Elf64_Sym *sym) +{ + int bind = ELF64_ST_BIND(sym->st_info); + /* externs are symbols w/ type=NOTYPE, bind=GLOBAL|WEAK, section=UND */ + return sym->st_shndx == SHN_UNDEF && + (bind == STB_GLOBAL || bind == STB_WEAK) && + ELF64_ST_TYPE(sym->st_info) == STT_NOTYPE; +} + +static bool sym_is_subprog(const Elf64_Sym *sym, int text_shndx) +{ + int bind = ELF64_ST_BIND(sym->st_info); + int type = ELF64_ST_TYPE(sym->st_info); + + /* in .text section */ + if (sym->st_shndx != text_shndx) + return false; + + /* local function */ + if (bind == STB_LOCAL && type == STT_SECTION) + return true; + + /* global function */ + return bind == STB_GLOBAL && type == STT_FUNC; +} + +static int find_extern_btf_id(const struct btf *btf, const char *ext_name) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if 
(!btf_is_var(t) && !btf_is_func(t)) + continue; + + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) + continue; + + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) + return -EINVAL; + + return i; + } + + return -ENOENT; +} + +static int find_extern_sec_btf_id(struct btf *btf, int ext_btf_id) { + const struct btf_var_secinfo *vs; + const struct btf_type *t; + int i, j, n; + + if (!btf) + return -ESRCH; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (!btf_is_datasec(t)) + continue; + + vs = btf_var_secinfos(t); + for (j = 0; j < btf_vlen(t); j++, vs++) { + if (vs->type == ext_btf_id) + return i; + } + } + + return -ENOENT; +} + +static enum kcfg_type find_kcfg_type(const struct btf *btf, int id, + bool *is_signed) +{ + const struct btf_type *t; + const char *name; + + t = skip_mods_and_typedefs(btf, id, NULL); + name = btf__name_by_offset(btf, t->name_off); + + if (is_signed) + *is_signed = false; + switch (btf_kind(t)) { + case BTF_KIND_INT: { + int enc = btf_int_encoding(t); + + if (enc & BTF_INT_BOOL) + return t->size == 1 ? KCFG_BOOL : KCFG_UNKNOWN; + if (is_signed) + *is_signed = enc & BTF_INT_SIGNED; + if (t->size == 1) + return KCFG_CHAR; + if (t->size < 1 || t->size > 8 || (t->size & (t->size - 1))) + return KCFG_UNKNOWN; + return KCFG_INT; + } + case BTF_KIND_ENUM: + if (t->size != 4) + return KCFG_UNKNOWN; + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ENUM64: + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; + case BTF_KIND_ARRAY: + if (btf_array(t)->nelems == 0) + return KCFG_UNKNOWN; + if (find_kcfg_type(btf, btf_array(t)->type, NULL) != KCFG_CHAR) + return KCFG_UNKNOWN; + return KCFG_CHAR_ARR; + default: + return KCFG_UNKNOWN; + } +} + +static int cmp_externs(const void *_a, const void *_b) +{ + const struct extern_desc *a = _a; + const struct extern_desc *b = _b; + + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + if (a->type == EXT_KCFG) { + /* descending order by alignment requirements */ + if (a->kcfg.align != b->kcfg.align) + return a->kcfg.align > b->kcfg.align ? -1 : 1; + /* ascending order by size, within same alignment class */ + if (a->kcfg.sz != b->kcfg.sz) + return a->kcfg.sz < b->kcfg.sz ? -1 : 1; + } + + /* resolve ties by name */ + return strcmp(a->name, b->name); +} + +static int find_int_btf_id(const struct btf *btf) +{ + const struct btf_type *t; + int i, n; + + n = btf__type_cnt(btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(btf, i); + + if (btf_is_int(t) && btf_int_bits(t) == 32) + return i; + } + + return 0; +} + +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + if (!btf) + return 0; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. 
*/ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + +static int bpf_object__collect_externs(struct bpf_object *obj) +{ + struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; + const struct btf_type *t; + struct extern_desc *ext; + int i, n, off, dummy_var_btf_id; + const char *ext_name, *sec_name; + Elf_Scn *scn; + Elf64_Shdr *sh; + + if (!obj->efile.symbols) + return 0; + + scn = elf_sec_by_idx(obj, obj->efile.symbols_shndx); + sh = elf_sec_hdr(obj, scn); + if (!sh || sh->sh_entsize != sizeof(Elf64_Sym)) + return -LIBBPF_ERRNO__FORMAT; + + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + + n = sh->sh_size / sh->sh_entsize; + pr_debug("looking for externs among %d symbols...\n", n); + + for (i = 0; i < n; i++) { + Elf64_Sym *sym = elf_sym_by_idx(obj, i); + + if (!sym) + return -LIBBPF_ERRNO__FORMAT; + if (!sym_is_extern(sym)) + continue; + ext_name = elf_sym_str(obj, sym->st_name); + if (!ext_name || !ext_name[0]) + continue; + + ext = obj->externs; + ext = libbpf_reallocarray(ext, obj->nr_extern + 1, sizeof(*ext)); + if (!ext) + return -ENOMEM; + obj->externs = ext; + ext = &ext[obj->nr_extern]; + memset(ext, 0, sizeof(*ext)); + obj->nr_extern++; + + ext->btf_id = find_extern_btf_id(obj->btf, ext_name); + if (ext->btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s': %d\n", + ext_name, ext->btf_id); + return ext->btf_id; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->sym_idx = i; + ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; + + ext->sec_btf_id = find_extern_sec_btf_id(obj->btf, ext->btf_id); + if (ext->sec_btf_id <= 0) { + pr_warn("failed to find BTF for extern '%s' [%d] section: %d\n", + ext_name, ext->btf_id, ext->sec_btf_id); + return ext->sec_btf_id; + } + sec = (void *)btf__type_by_id(obj->btf, ext->sec_btf_id); + sec_name = btf__name_by_offset(obj->btf, sec->name_off); + + if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } + kcfg_sec = sec; + ext->type = EXT_KCFG; + ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); + if (ext->kcfg.sz <= 0) { + pr_warn("failed to resolve size of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.sz); + return ext->kcfg.sz; + } + ext->kcfg.align = btf__align_of(obj->btf, t->type); + if (ext->kcfg.align <= 0) { + pr_warn("failed to determine alignment of extern (kcfg) '%s': %d\n", + ext_name, ext->kcfg.align); + return -EINVAL; + } + ext->kcfg.type = find_kcfg_type(obj->btf, t->type, + &ext->kcfg.is_signed); + if (ext->kcfg.type == KCFG_UNKNOWN) { + pr_warn("extern (kcfg) '%s': type is unsupported\n", ext_name); + return -ENOTSUP; + } + } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + ksym_sec = sec; + ext->type = EXT_KSYM; + skip_mods_and_typedefs(obj->btf, t->type, + &ext->ksym.type_id); + } else { + pr_warn("unrecognized extern section '%s'\n", sec_name); + return -ENOTSUP; + } + } + pr_debug("collected %d externs total\n", obj->nr_extern); + + if (!obj->nr_extern) + return 0; + + /* sort externs by type, for kcfg ones also by (align, size, name) */ + qsort(obj->externs, obj->nr_extern, sizeof(*ext), cmp_externs); + + /* for .ksyms section, 
we need to turn all externs into allocated + * variables in BTF to pass kernel verification; we do this by + * pretending that each extern is a 8-byte variable + */ + if (ksym_sec) { + /* find existing 4-byte integer type in BTF to use for fake + * extern variables in DATASEC + */ + int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM) + continue; + pr_debug("extern (ksym) #%d: symbol %d, name %s\n", + i, ext->sym_idx, ext->name); + } + + sec = ksym_sec; + n = btf_vlen(sec); + for (i = 0, off = 0; i < n; i++, off += sizeof(int)) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + struct btf_type *vt; + + vt = (void *)btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, vt->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); + return -ESRCH; + } + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. + */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } + vs->offset = off; + vs->size = sizeof(int); + } + sec->size = off; + } + + if (kcfg_sec) { + sec = kcfg_sec; + /* for kcfg externs calculate their offsets within a .kconfig map */ + off = 0; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KCFG) + continue; + + ext->kcfg.data_off = roundup(off, ext->kcfg.align); + off = ext->kcfg.data_off + ext->kcfg.sz; + pr_debug("extern (kcfg) #%d: symbol %d, off %u, name %s\n", + i, ext->sym_idx, ext->kcfg.data_off, ext->name); + } + sec->size = off; + n = btf_vlen(sec); + for (i = 0; i < n; i++) { + struct btf_var_secinfo *vs = btf_var_secinfos(sec) + i; + + t = btf__type_by_id(obj->btf, vs->type); + ext_name = btf__name_by_offset(obj->btf, t->name_off); + ext = find_extern_by_name(obj, ext_name); + if (!ext) { + pr_warn("failed to find extern definition for BTF var '%s'\n", + ext_name); + return -ESRCH; + } + btf_var(t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vs->offset = ext->kcfg.data_off; + } + } + return 0; +} + +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) +{ + return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; +} + +struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name) +{ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, obj) { + if (prog_is_subprog(obj, prog)) + continue; + if (!strcmp(prog->name, name)) + return prog; + } + return errno = ENOENT, NULL; +} + +static bool bpf_object__shndx_is_data(const struct bpf_object *obj, + int shndx) +{ + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + case SEC_DATA: + case SEC_RODATA: + return true; + default: + 
return false; + } +} + +static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, + int shndx) +{ + return shndx == obj->efile.btf_maps_shndx; +} + +static enum libbpf_map_type +bpf_object__section_to_libbpf_map_type(const struct bpf_object *obj, int shndx) +{ + if (shndx == obj->efile.symbols_shndx) + return LIBBPF_MAP_KCONFIG; + + switch (obj->efile.secs[shndx].sec_type) { + case SEC_BSS: + return LIBBPF_MAP_BSS; + case SEC_DATA: + return LIBBPF_MAP_DATA; + case SEC_RODATA: + return LIBBPF_MAP_RODATA; + default: + return LIBBPF_MAP_UNSPEC; + } +} + +static int bpf_program__record_reloc(struct bpf_program *prog, + struct reloc_desc *reloc_desc, + __u32 insn_idx, const char *sym_name, + const Elf64_Sym *sym, const Elf64_Rel *rel) +{ + struct bpf_insn *insn = &prog->insns[insn_idx]; + size_t map_idx, nr_maps = prog->obj->nr_maps; + struct bpf_object *obj = prog->obj; + __u32 shdr_idx = sym->st_shndx; + enum libbpf_map_type type; + const char *sym_sec_name; + struct bpf_map *map; + + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { + pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", + prog->name, sym_name, insn_idx, insn->code); + return -LIBBPF_ERRNO__RELOC; + } + + if (sym_is_extern(sym)) { + int sym_idx = ELF64_R_SYM(rel->r_info); + int i, n = obj->nr_extern; + struct extern_desc *ext; + + for (i = 0; i < n; i++) { + ext = &obj->externs[i]; + if (ext->sym_idx == sym_idx) + break; + } + if (i >= n) { + pr_warn("prog '%s': extern relo failed to find extern for '%s' (%d)\n", + prog->name, sym_name, sym_idx); + return -LIBBPF_ERRNO__RELOC; + } + pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", + prog->name, i, ext->name, ext->sym_idx, insn_idx); + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_CALL; + else + reloc_desc->type = RELO_EXTERN_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->ext_idx = i; + return 0; + } + + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { + pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", + prog->name, sym_name, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + + /* loading subprog addresses */ + if (sym_is_subprog(sym, obj->efile.text_shndx)) { + /* global_func: sym->st_value = offset in the section, insn->imm = 0. + * local_func: sym->st_value = 0, insn->imm = offset in the section. 
+ */ + if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) { + pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n", + prog->name, sym_name, (size_t)sym->st_value, insn->imm); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_SUBPROG_ADDR; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + + type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + + /* generic map reference relocation */ + if (type == LIBBPF_MAP_UNSPEC) { + if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { + pr_warn("prog '%s': bad map relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || + map->sec_idx != sym->st_shndx || + map->sec_offset != sym->st_value) + continue; + pr_debug("prog '%s': found map %zd (%s, sec %d, off %zu) for insn #%u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': map relo failed to find map for section '%s', off %zu\n", + prog->name, sym_sec_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_LD64; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = 0; /* sym->st_value determines map_idx */ + return 0; + } + + /* global data map relocation */ + if (!bpf_object__shndx_is_data(obj, shdr_idx)) { + pr_warn("prog '%s': bad data relo against section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + map = &obj->maps[map_idx]; + if (map->libbpf_type != type || map->sec_idx != sym->st_shndx) + continue; + pr_debug("prog '%s': found data map %zd (%s, sec %d, off %zu) for insn %u\n", + prog->name, map_idx, map->name, map->sec_idx, + map->sec_offset, insn_idx); + break; + } + if (map_idx >= nr_maps) { + pr_warn("prog '%s': data relo failed to find map for section '%s'\n", + prog->name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = map_idx; + reloc_desc->sym_off = sym->st_value; + return 0; +} + +static bool prog_contains_insn(const struct bpf_program *prog, size_t insn_idx) +{ + return insn_idx >= prog->sec_insn_off && + insn_idx < prog->sec_insn_off + prog->sec_insn_cnt; +} + +static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, + size_t sec_idx, size_t insn_idx) +{ + int l = 0, r = obj->nr_programs - 1, m; + struct bpf_program *prog; + + if (!obj->nr_programs) + return NULL; + + while (l < r) { + m = l + (r - l + 1) / 2; + prog = &obj->programs[m]; + + if (prog->sec_idx < sec_idx || + (prog->sec_idx == sec_idx && prog->sec_insn_off <= insn_idx)) + l = m; + else + r = m - 1; + } + /* matching program could be at index l, but it still might be the + * wrong one, so we need to double check conditions for the last time + */ + prog = &obj->programs[l]; + if (prog->sec_idx == sec_idx && prog_contains_insn(prog, insn_idx)) + return prog; + return NULL; +} + +static int +bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) +{ + const char *relo_sec_name, *sec_name; + size_t sec_idx = shdr->sh_info, sym_idx; + struct bpf_program *prog; + struct reloc_desc *relos; + int err, i, nrels; + 
const char *sym_name; + __u32 insn_idx; + Elf_Scn *scn; + Elf_Data *scn_data; + Elf64_Sym *sym; + Elf64_Rel *rel; + + if (sec_idx >= obj->efile.sec_cnt) + return -EINVAL; + + scn = elf_sec_by_idx(obj, sec_idx); + scn_data = elf_sec_data(obj, scn); + + relo_sec_name = elf_sec_str(obj, shdr->sh_name); + sec_name = elf_sec_name(obj, scn); + if (!relo_sec_name || !sec_name) + return -EINVAL; + + pr_debug("sec '%s': collecting relocation for section(%zu) '%s'\n", + relo_sec_name, sec_idx, sec_name); + nrels = shdr->sh_size / shdr->sh_entsize; + + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn("sec '%s': failed to get relo #%d\n", relo_sec_name, i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym_idx = ELF64_R_SYM(rel->r_info); + sym = elf_sym_by_idx(obj, sym_idx); + if (!sym) { + pr_warn("sec '%s': symbol #%zu not found for relo #%d\n", + relo_sec_name, sym_idx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (sym->st_shndx >= obj->efile.sec_cnt) { + pr_warn("sec '%s': corrupted symbol #%zu pointing to invalid section #%zu for relo #%d\n", + relo_sec_name, sym_idx, (size_t)sym->st_shndx, i); + return -LIBBPF_ERRNO__FORMAT; + } + + if (rel->r_offset % BPF_INSN_SZ || rel->r_offset >= scn_data->d_size) { + pr_warn("sec '%s': invalid offset 0x%zx for relo #%d\n", + relo_sec_name, (size_t)rel->r_offset, i); + return -LIBBPF_ERRNO__FORMAT; + } + + insn_idx = rel->r_offset / BPF_INSN_SZ; + /* relocations against static functions are recorded as + * relocations against the section that contains a function; + * in such case, symbol will be STT_SECTION and sym.st_name + * will point to empty string (0), so fetch section name + * instead + */ + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && sym->st_name == 0) + sym_name = elf_sec_name(obj, elf_sec_by_idx(obj, sym->st_shndx)); + else + sym_name = elf_sym_str(obj, sym->st_name); + sym_name = sym_name ?: "<?"; + + pr_debug("sec '%s': relo #%d: insn #%u against '%s'\n", + relo_sec_name, i, insn_idx, sym_name); + + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + pr_debug("sec '%s': relo #%d: couldn't find program in section '%s' for insn #%u, probably overridden weak function, skipping...\n", + relo_sec_name, i, sec_name, insn_idx); + continue; + } + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + prog->reloc_desc = relos; + + /* adjust insn_idx to local BPF program frame of reference */ + insn_idx -= prog->sec_insn_off; + err = bpf_program__record_reloc(prog, &relos[prog->nr_reloc], + insn_idx, sym_name, sym, rel); + if (err) + return err; + + prog->nr_reloc++; + } + return 0; +} + +static int map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map) +{ + int id; + + if (!obj->btf) + return -ENOENT; + + /* if it's BTF-defined map, we don't need to search for type IDs. + * For struct_ops map, it does not need btf_key_type_id and + * btf_value_type_id. + */ + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) + return 0; + + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; + + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; + + map->btf_key_type_id = 0; + map->btf_value_type_id = id; + return 0; +} + +static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) +{ + char file[PATH_MAX], buff[4096]; + FILE *fp; + __u32 val; + int err; + + snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); + memset(info, 0, sizeof(*info)); + + fp = fopen(file, "re"); + if (!fp) { + err = -errno; + pr_warn("failed to open %s: %d. 
No procfs support?\n", file, + err); + return err; + } + + while (fgets(buff, sizeof(buff), fp)) { + if (sscanf(buff, "map_type:\t%u", &val) == 1) + info->type = val; + else if (sscanf(buff, "key_size:\t%u", &val) == 1) + info->key_size = val; + else if (sscanf(buff, "value_size:\t%u", &val) == 1) + info->value_size = val; + else if (sscanf(buff, "max_entries:\t%u", &val) == 1) + info->max_entries = val; + else if (sscanf(buff, "map_flags:\t%i", &val) == 1) + info->map_flags = val; + } + + fclose(fp); + + return 0; +} + +bool bpf_map__autocreate(const struct bpf_map *map) +{ + return map->autocreate; +} + +int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->autocreate = autocreate; + return 0; +} + +int bpf_map__reuse_fd(struct bpf_map *map, int fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info), name_len; + int new_fd, err; + char *new_name; + + memset(&info, 0, len); + err = bpf_map_get_info_by_fd(fd, &info, &len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(fd, &info); + if (err) + return libbpf_err(err); + + name_len = strlen(info.name); + if (name_len == BPF_OBJ_NAME_LEN - 1 && strncmp(map->name, info.name, name_len) == 0) + new_name = strdup(map->name); + else + new_name = strdup(info.name); + + if (!new_name) + return libbpf_err(-errno); + + /* + * Like dup(), but make sure new FD is >= 3 and has O_CLOEXEC set. + * This is similar to what we do in ensure_good_fd(), but without + * closing original FD. + */ + new_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + if (new_fd < 0) { + err = -errno; + goto err_free_new_name; + } + + err = zclose(map->fd); + if (err) { + err = -errno; + goto err_close_new_fd; + } + free(map->name); + + map->fd = new_fd; + map->name = new_name; + map->def.type = info.type; + map->def.key_size = info.key_size; + map->def.value_size = info.value_size; + map->def.max_entries = info.max_entries; + map->def.map_flags = info.map_flags; + map->btf_key_type_id = info.btf_key_type_id; + map->btf_value_type_id = info.btf_value_type_id; + map->reused = true; + map->map_extra = info.map_extra; + + return 0; + +err_close_new_fd: + close(new_fd); +err_free_new_name: + free(new_name); + return libbpf_err(err); +} + +__u32 bpf_map__max_entries(const struct bpf_map *map) +{ + return map->def.max_entries; +} + +struct bpf_map *bpf_map__inner_map(struct bpf_map *map) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) + return errno = EINVAL, NULL; + + return map->inner_map; +} + +int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->def.max_entries = max_entries; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + + return 0; +} + +static int +bpf_object__probe_loading(struct bpf_object *obj) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + if (obj->gen_loader) + return 0; + + ret = bump_rlimit_memlock(); + if (ret) + pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret); + + /* make sure basic loading works */ + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + if (ret < 0) { + 
ret = errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't load trivial BPF " + "program. Make sure your kernel supports BPF " + "(CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is " + "set to big enough value.\n", __func__, cp, ret); + return -ret; + } + close(ret); + + return 0; +} + +static int probe_fd(int fd) +{ + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static int probe_kern_prog_name(void) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_name); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); + + /* make sure loading with name works */ + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); + return probe_fd(ret); +} + +static int probe_kern_global_data(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + insns[0].imm = map; + + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + close(map); + return probe_fd(ret); +} + +static int probe_kern_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_func_global(void) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_datasec(void) +{ + static const char strs[] = "\0x\0.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC val */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return 
probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_float(void) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_decl_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* attr */ + BTF_TYPE_DECL_TAG_ENC(1, 2, -1), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_btf_type_tag(void) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* attr */ + BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ + /* ptr */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_array_mmap(void) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); +} + +static int probe_kern_exp_attach_type(void) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + /* use any valid combination of program type and (optional) + * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) + * to see if kernel supports expected_attach_type field for + * BPF_PROG_LOAD command + */ + fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_kern_probe_read_kernel(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ + BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ + BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(fd); +} + +static int probe_prog_bind_map(void) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, NULL); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). 
Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + +static int probe_module_btf(void) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. + */ + err = bpf_btf_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + +static int probe_perf_link(void) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", + insns, ARRAY_SIZE(insns), NULL); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(ret); +} + +static int probe_kern_btf_enum64(void) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs))); +} + +static int probe_kern_syscall_wrapper(void); + +enum kern_feature_result { + FEAT_UNKNOWN = 0, + FEAT_SUPPORTED = 1, + FEAT_MISSING = 2, +}; + +typedef int (*feature_probe_fn)(void); + +static struct kern_feature_desc { + const char *desc; + feature_probe_fn probe; + enum kern_feature_result res; +} feature_probes[__FEAT_CNT] = { + [FEAT_PROG_NAME] = { + "BPF program name", probe_kern_prog_name, + }, + [FEAT_GLOBAL_DATA] = { + "global variables", probe_kern_global_data, + }, + [FEAT_BTF] = { + "minimal BTF", probe_kern_btf, + }, + [FEAT_BTF_FUNC] = { + "BTF functions", probe_kern_btf_func, + }, + [FEAT_BTF_GLOBAL_FUNC] = { + "BTF global function", probe_kern_btf_func_global, + }, + [FEAT_BTF_DATASEC] = { + "BTF data section and variable", probe_kern_btf_datasec, + }, + [FEAT_ARRAY_MMAP] = { + "ARRAY map mmap()", probe_kern_array_mmap, + }, + [FEAT_EXP_ATTACH_TYPE] = { + "BPF_PROG_LOAD expected_attach_type attribute", + probe_kern_exp_attach_type, + }, + [FEAT_PROBE_READ_KERN] = { + "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT 
support", probe_kern_btf_float, + }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + }, + [FEAT_BTF_DECL_TAG] = { + "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, + }, + [FEAT_BTF_TYPE_TAG] = { + "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, + }, + [FEAT_MEMCG_ACCOUNT] = { + "memcg-based memory accounting", probe_memcg_account, + }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, +}; + +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) +{ + struct kern_feature_desc *feat = &feature_probes[feat_id]; + int ret; + + if (obj && obj->gen_loader) + /* To generate loader program assume the latest kernel + * to avoid doing extra prog_load, map_create syscalls. + */ + return true; + + if (READ_ONCE(feat->res) == FEAT_UNKNOWN) { + ret = feat->probe(); + if (ret > 0) { + WRITE_ONCE(feat->res, FEAT_SUPPORTED); + } else if (ret == 0) { + WRITE_ONCE(feat->res, FEAT_MISSING); + } else { + pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); + WRITE_ONCE(feat->res, FEAT_MISSING); + } + } + + return READ_ONCE(feat->res) == FEAT_SUPPORTED; +} + +static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) +{ + struct bpf_map_info map_info; + char msg[STRERR_BUFSIZE]; + __u32 map_info_len = sizeof(map_info); + int err; + + memset(&map_info, 0, map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len); + if (err && errno == EINVAL) + err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); + if (err) { + pr_warn("failed to get map info for map FD %d: %s\n", map_fd, + libbpf_strerror_r(errno, msg, sizeof(msg))); + return false; + } + + return (map_info.type == map->def.type && + map_info.key_size == map->def.key_size && + map_info.value_size == map->def.value_size && + map_info.max_entries == map->def.max_entries && + map_info.map_flags == map->def.map_flags && + map_info.map_extra == map->map_extra); +} + +static int +bpf_object__reuse_map(struct bpf_map *map) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err, pin_fd; + + pin_fd = bpf_obj_get(map->pin_path); + if (pin_fd < 0) { + err = -errno; + if (err == -ENOENT) { + pr_debug("found no pinned map to reuse at '%s'\n", + map->pin_path); + return 0; + } + + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("couldn't retrieve pinned map '%s': %s\n", + map->pin_path, cp); + return err; + } + + if (!map_is_reuse_compat(map, pin_fd)) { + pr_warn("couldn't reuse pinned map at '%s': parameter mismatch\n", + map->pin_path); + close(pin_fd); + return -EINVAL; + } + + err = bpf_map__reuse_fd(map, pin_fd); + close(pin_fd); + if (err) + return err; + + map->pinned = true; + pr_debug("reused pinned map at '%s'\n", map->pin_path); + + return 0; +} + +static int +bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) +{ + enum libbpf_map_type map_type = map->libbpf_type; + char *cp, errmsg[STRERR_BUFSIZE]; + int err, zero = 0; + + if (obj->gen_loader) { + bpf_gen__map_update_elem(obj->gen_loader, map - obj->maps, + map->mmaped, map->def.value_size); + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) + bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); + return 0; + } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, 
sizeof(errmsg)); + pr_warn("Error setting initial map(%s) contents: %s\n", + map->name, cp); + return err; + } + + /* Freeze .rodata and .kconfig map as read-only from syscall side. */ + if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG) { + err = bpf_map_freeze(map->fd); + if (err) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error freezing map(%s) as read-only: %s\n", + map->name, cp); + return err; + } + } + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map); + +static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) +{ + LIBBPF_OPTS(bpf_map_create_opts, create_attr); + struct bpf_map_def *def = &map->def; + const char *map_name = NULL; + int err = 0; + + if (kernel_supports(obj, FEAT_PROG_NAME)) + map_name = map->name; + create_attr.map_ifindex = map->map_ifindex; + create_attr.map_flags = def->map_flags; + create_attr.numa_node = map->numa_node; + create_attr.map_extra = map->map_extra; + + if (bpf_map__is_struct_ops(map)) + create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; + + if (obj->btf && btf__fd(obj->btf) >= 0) { + create_attr.btf_fd = btf__fd(obj->btf); + create_attr.btf_key_type_id = map->btf_key_type_id; + create_attr.btf_value_type_id = map->btf_value_type_id; + } + + if (bpf_map_type__is_map_in_map(def->type)) { + if (map->inner_map) { + err = bpf_object__create_map(obj, map->inner_map, true); + if (err) { + pr_warn("map '%s': failed to create inner map: %d\n", + map->name, err); + return err; + } + map->inner_map_fd = bpf_map__fd(map->inner_map); + } + if (map->inner_map_fd >= 0) + create_attr.inner_map_fd = map->inner_map_fd; + } + + switch (def->type) { + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_STACK_TRACE: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + default: + break; + } + + if (obj->gen_loader) { + bpf_gen__map_create(obj->gen_loader, def->type, map_name, + def->key_size, def->value_size, def->max_entries, + &create_attr, is_inner ? -1 : map - obj->maps); + /* Pretend to have valid FD to pass various fd >= 0 checks. + * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + */ + map->fd = 0; + } else { + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + if (map->fd < 0 && (create_attr.btf_key_type_id || + create_attr.btf_value_type_id)) { + char *cp, errmsg[STRERR_BUFSIZE]; + + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", + map->name, cp, err); + create_attr.btf_fd = 0; + create_attr.btf_key_type_id = 0; + create_attr.btf_value_type_id = 0; + map->btf_key_type_id = 0; + map->btf_value_type_id = 0; + map->fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); + } + + err = map->fd < 0 ? 
-errno : 0; + + if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { + if (obj->gen_loader) + map->inner_map->fd = -1; + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + return err; +} + +static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_map *targ_map; + unsigned int i; + int fd, err = 0; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_map = map->init_slots[i]; + fd = bpf_map__fd(targ_map); + + if (obj->gen_loader) { + bpf_gen__populate_outer_map(obj->gen_loader, + map - obj->maps, i, + targ_map - obj->maps); + } else { + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + } + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n", + map->name, i, targ_map->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n", + map->name, i, targ_map->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int init_prog_array_slots(struct bpf_object *obj, struct bpf_map *map) +{ + const struct bpf_program *targ_prog; + unsigned int i; + int fd, err; + + if (obj->gen_loader) + return -ENOTSUP; + + for (i = 0; i < map->init_slots_sz; i++) { + if (!map->init_slots[i]) + continue; + + targ_prog = map->init_slots[i]; + fd = bpf_program__fd(targ_prog); + + err = bpf_map_update_elem(map->fd, &i, &fd, 0); + if (err) { + err = -errno; + pr_warn("map '%s': failed to initialize slot [%d] to prog '%s' fd=%d: %d\n", + map->name, i, targ_prog->name, fd, err); + return err; + } + pr_debug("map '%s': slot [%d] set to prog '%s' fd=%d\n", + map->name, i, targ_prog->name, fd); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + return 0; +} + +static int bpf_object_init_prog_arrays(struct bpf_object *obj) +{ + struct bpf_map *map; + int i, err; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!map->init_slots_sz || map->def.type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + + err = init_prog_array_slots(obj, map); + if (err < 0) { + zclose(map->fd); + return err; + } + } + return 0; +} + +static int map_set_def_max_entries(struct bpf_map *map) +{ + if (map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY && !map->def.max_entries) { + int nr_cpus; + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + pr_warn("map '%s': failed to determine number of system CPUs: %d\n", + map->name, nr_cpus); + return nr_cpus; + } + pr_debug("map '%s': setting size to %d\n", map->name, nr_cpus); + map->def.max_entries = nr_cpus; + } + + return 0; +} + +static int +bpf_object__create_maps(struct bpf_object *obj) +{ + struct bpf_map *map; + char *cp, errmsg[STRERR_BUFSIZE]; + unsigned int i, j; + int err; + bool retried; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + /* To support old kernels, we skip creating global data maps + * (.rodata, .data, .kconfig, etc); later on, during program + * loading, if we detect that at least one of the to-be-loaded + * programs is referencing any global data map, we'll error + * out with program name and relocation index logged. + * This approach allows to accommodate Clang emitting + * unnecessary .rodata.str1.1 sections for string literals, + * but also it allows to have CO-RE applications that use + * global variables in some of BPF programs, but not others. 
+ * If those global variable-using programs are not loaded at + * runtime due to bpf_program__set_autoload(prog, false), + * bpf_object loading will succeed just fine even on old + * kernels. + */ + if (bpf_map__is_internal(map) && !kernel_supports(obj, FEAT_GLOBAL_DATA)) + map->autocreate = false; + + if (!map->autocreate) { + pr_debug("map '%s': skipped auto-creating...\n", map->name); + continue; + } + + err = map_set_def_max_entries(map); + if (err) + goto err_out; + + retried = false; +retry: + if (map->pin_path) { + err = bpf_object__reuse_map(map); + if (err) { + pr_warn("map '%s': error reusing pinned map\n", + map->name); + goto err_out; + } + if (retried && map->fd < 0) { + pr_warn("map '%s': cannot find pinned map\n", + map->name); + err = -ENOENT; + goto err_out; + } + } + + if (map->fd >= 0) { + pr_debug("map '%s': skipping creation (preset fd=%d)\n", + map->name, map->fd); + } else { + err = bpf_object__create_map(obj, map, false); + if (err) + goto err_out; + + pr_debug("map '%s': created successfully, fd=%d\n", + map->name, map->fd); + + if (bpf_map__is_internal(map)) { + err = bpf_object__populate_internal_map(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + + if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { + err = init_map_in_map_slots(obj, map); + if (err < 0) { + zclose(map->fd); + goto err_out; + } + } + } + + if (map->pin_path && !map->pinned) { + err = bpf_map__pin(map, NULL); + if (err) { + zclose(map->fd); + if (!retried && err == -EEXIST) { + retried = true; + goto retry; + } + pr_warn("map '%s': failed to auto-pin at '%s': %d\n", + map->name, map->pin_path, err); + goto err_out; + } + } + } + + return 0; + +err_out: + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("map '%s': failed to create: %s(%d)\n", map->name, cp, err); + pr_perm_msg(err); + for (j = 0; j < i; j++) + zclose(obj->maps[j].fd); + return err; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +/* Given 'some_struct_name___with_flavor' return the length of a name prefix + * before last triple underscore. Struct name part after last triple + * underscore is ignored by BPF CO-RE relocation during relocation matching. 
+ */ +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} + +void bpf_core_free_cands(struct bpf_core_cand_list *cands) +{ + if (!cands) + return; + + free(cands->cands); + free(cands); +} + +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands) +{ + struct bpf_core_cand *new_cands, *cand; + const struct btf_type *t, *local_t; + const char *targ_name, *local_name; + size_t targ_essent_len; + int n, i; + + local_t = btf__type_by_id(local_cand->btf, local_cand->id); + local_name = btf__str_by_offset(local_cand->btf, local_t->name_off); + + n = btf__type_cnt(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf__type_by_id(targ_btf, i); + if (!btf_kind_core_compat(t, local_t)) + continue; + + targ_name = btf__name_by_offset(targ_btf, t->name_off); + if (str_is_empty(targ_name)) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + + if (strncmp(local_name, targ_name, local_essent_len) != 0) + continue; + + pr_debug("CO-RE relocating [%d] %s %s: found target candidate [%d] %s %s in [%s]\n", + local_cand->id, btf_kind_str(local_t), + local_name, i, btf_kind_str(t), targ_name, + targ_btf_name); + new_cands = libbpf_reallocarray(cands->cands, cands->len + 1, + sizeof(*cands->cands)); + if (!new_cands) + return -ENOMEM; + + cand = &new_cands[cands->len]; + cand->btf = targ_btf; + cand->id = i; + + cands->cands = new_cands; + cands->len++; + } + return 0; +} + +static int load_module_btfs(struct bpf_object *obj) +{ + struct bpf_btf_info info; + struct module_btf *mod_btf; + struct btf *btf; + char name[64]; + __u32 id = 0, len; + int err, fd; + + if (obj->btf_modules_loaded) + return 0; + + if (obj->gen_loader) + return 0; + + /* don't do this again, even if we find no module BTFs */ + obj->btf_modules_loaded = true; + + /* kernel too old to support module BTFs */ + if (!kernel_supports(obj, FEAT_MODULE_BTF)) + return 0; + + while (true) { + err = bpf_btf_get_next_id(id, &id); + if (err && errno == ENOENT) + return 0; + if (err && errno == EPERM) { + pr_debug("skipping module BTFs loading, missing privileges\n"); + return 0; + } + if (err) { + err = -errno; + pr_warn("failed to iterate BTF objects: %d\n", err); + return err; + } + + fd = bpf_btf_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; /* expected race: BTF was unloaded */ + err = -errno; + pr_warn("failed to get BTF object #%d FD: %d\n", id, err); + return err; + } + + len = sizeof(info); + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + err = bpf_btf_get_info_by_fd(fd, &info, &len); + if (err) { + err = -errno; + pr_warn("failed to get BTF object #%d info: %d\n", id, err); + goto err_out; + } + + /* ignore non-module BTFs */ + if (!info.kernel_btf || strcmp(name, "vmlinux") == 0) { + close(fd); + continue; + } + + btf = btf_get_from_fd(fd, obj->btf_vmlinux); + err = libbpf_get_error(btf); + if (err) { + pr_warn("failed to load module [%s]'s BTF object #%d: %d\n", + name, id, err); + goto err_out; + } + + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + if (err) + goto err_out; + + mod_btf = 
&obj->btf_modules[obj->btf_module_cnt++]; + + mod_btf->btf = btf; + mod_btf->id = id; + mod_btf->fd = fd; + mod_btf->name = strdup(name); + if (!mod_btf->name) { + err = -ENOMEM; + goto err_out; + } + continue; + +err_out: + close(fd); + return err; + } + + return 0; +} + +static struct bpf_core_cand_list * +bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 local_type_id) +{ + struct bpf_core_cand local_cand = {}; + struct bpf_core_cand_list *cands; + const struct btf *main_btf; + const struct btf_type *local_t; + const char *local_name; + size_t local_essent_len; + int err, i; + + local_cand.btf = local_btf; + local_cand.id = local_type_id; + local_t = btf__type_by_id(local_btf, local_type_id); + if (!local_t) + return ERR_PTR(-EINVAL); + + local_name = btf__name_by_offset(local_btf, local_t->name_off); + if (str_is_empty(local_name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(local_name); + + cands = calloc(1, sizeof(*cands)); + if (!cands) + return ERR_PTR(-ENOMEM); + + /* Attempt to find target candidates in vmlinux BTF first */ + main_btf = obj->btf_vmlinux_override ?: obj->btf_vmlinux; + err = bpf_core_add_cands(&local_cand, local_essent_len, main_btf, "vmlinux", 1, cands); + if (err) + goto err_out; + + /* if vmlinux BTF has any candidate, don't got for module BTFs */ + if (cands->len) + return cands; + + /* if vmlinux BTF was overridden, don't attempt to load module BTFs */ + if (obj->btf_vmlinux_override) + return cands; + + /* now look through module BTFs, trying to still find candidates */ + err = load_module_btfs(obj); + if (err) + goto err_out; + + for (i = 0; i < obj->btf_module_cnt; i++) { + err = bpf_core_add_cands(&local_cand, local_essent_len, + obj->btf_modules[i].btf, + obj->btf_modules[i].name, + btf__type_cnt(obj->btf_vmlinux), + cands); + if (err) + goto err_out; + } + + return cands; +err_out: + bpf_core_free_cands(cands); + return ERR_PTR(err); +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. 
+ */ +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); +} + +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} + +static size_t bpf_core_hash_fn(const long key, void *ctx) +{ + return key; +} + +static bool bpf_core_equal_fn(const long k1, const long k2, void *ctx) +{ + return k1 == k2; +} + +static int record_relo_core(struct bpf_program *prog, + const struct bpf_core_relo *core_relo, int insn_idx) +{ + struct reloc_desc *relos, *relo; + + relos = libbpf_reallocarray(prog->reloc_desc, + prog->nr_reloc + 1, sizeof(*relos)); + if (!relos) + return -ENOMEM; + relo = &relos[prog->nr_reloc]; + relo->type = RELO_CORE; + relo->insn_idx = insn_idx; + relo->core_relo = core_relo; + prog->reloc_desc = relos; + prog->nr_reloc++; + return 0; +} + +static const struct bpf_core_relo *find_relo_core(struct bpf_program *prog, int insn_idx) +{ + struct reloc_desc *relo; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + relo = &prog->reloc_desc[i]; + if (relo->type != RELO_CORE || relo->insn_idx != insn_idx) + continue; + + return relo->core_relo; + } + + return NULL; +} + +static int bpf_core_resolve_relo(struct bpf_program *prog, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct hashmap *cand_cache, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec specs_scratch[3] = {}; + struct bpf_core_cand_list *cands = NULL; + const char *prog_name = prog->name; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id = relo->type_id; + int err; + + local_type = btf__type_by_id(local_btf, local_id); + if (!local_type) + return -EINVAL; + + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && + !hashmap__find(cand_cache, local_id, &cands)) { + cands = bpf_core_find_cands(prog->obj, local_btf, local_id); + if (IS_ERR(cands)) { + pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + local_name, PTR_ERR(cands)); + return PTR_ERR(cands); + } + err = hashmap__set(cand_cache, local_id, cands, NULL, NULL); + if (err) { + bpf_core_free_cands(cands); + return err; + } + } + + return bpf_core_calc_relo_insn(prog_name, relo, relo_idx, local_btf, cands, specs_scratch, + targ_res); +} + +static int +bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) +{ + const struct btf_ext_info_sec *sec; + struct bpf_core_relo_res targ_res; + const struct bpf_core_relo *rec; + const struct btf_ext_info *seg; + struct hashmap_entry *entry; + struct hashmap *cand_cache = NULL; + struct bpf_program *prog; + struct bpf_insn *insn; + const char *sec_name; + int i, err = 0, insn_idx, sec_idx, sec_num; + + if (obj->btf_ext->core_relo_info.len == 0) + return 0; + + if (targ_btf_path) { + obj->btf_vmlinux_override = btf__parse(targ_btf_path, NULL); + err = libbpf_get_error(obj->btf_vmlinux_override); + if (err) { + pr_warn("failed to parse target BTF: %d\n", err); + return err; + } + } + + cand_cache = hashmap__new(bpf_core_hash_fn, bpf_core_equal_fn, NULL); + if (IS_ERR(cand_cache)) { + err = PTR_ERR(cand_cache); + goto out; + } + + seg = 
&obj->btf_ext->core_relo_info; + sec_num = 0; + for_each_btf_ext_sec(seg, sec) { + sec_idx = seg->sec_idxs[sec_num]; + sec_num++; + + sec_name = btf__name_by_offset(obj->btf, sec->sec_name_off); + if (str_is_empty(sec_name)) { + err = -EINVAL; + goto out; + } + + pr_debug("sec '%s': found %d CO-RE relocations\n", sec_name, sec->num_info); + + for_each_btf_ext_rec(seg, sec, i, rec) { + if (rec->insn_off % BPF_INSN_SZ) + return -EINVAL; + insn_idx = rec->insn_off / BPF_INSN_SZ; + prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); + if (!prog) { + /* When __weak subprog is "overridden" by another instance + * of the subprog from a different object file, linker still + * appends all the .BTF.ext info that used to belong to that + * eliminated subprogram. + * This is similar to what x86-64 linker does for relocations. + * So just ignore such relocations just like we ignore + * subprog instructions when discovering subprograms. + */ + pr_debug("sec '%s': skipping CO-RE relocation #%d for insn #%d belonging to eliminated weak subprogram\n", + sec_name, i, insn_idx); + continue; + } + /* no need to apply CO-RE relocation if the program is + * not going to be loaded + */ + if (!prog->autoload) + continue; + + /* adjust insn_idx from section frame of reference to the local + * program's frame of reference; (sub-)program code is not yet + * relocated, so it's enough to just subtract in-section offset + */ + insn_idx = insn_idx - prog->sec_insn_off; + if (insn_idx >= prog->insns_cnt) + return -EINVAL; + insn = &prog->insns[insn_idx]; + + err = record_relo_core(prog, rec, insn_idx); + if (err) { + pr_warn("prog '%s': relo #%d: failed to record relocation: %d\n", + prog->name, i, err); + goto out; + } + + if (prog->obj->gen_loader) + continue; + + err = bpf_core_resolve_relo(prog, rec, i, obj->btf, cand_cache, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to relocate: %d\n", + prog->name, i, err); + goto out; + } + + err = bpf_core_patch_insn(prog->name, insn, insn_idx, rec, i, &targ_res); + if (err) { + pr_warn("prog '%s': relo #%d: failed to patch insn #%u: %d\n", + prog->name, i, insn_idx, err); + goto out; + } + } + } + +out: + /* obj->btf_vmlinux and module BTFs are freed after object load */ + btf__free(obj->btf_vmlinux_override); + obj->btf_vmlinux_override = NULL; + + if (!IS_ERR_OR_NULL(cand_cache)) { + hashmap__for_each_entry(cand_cache, entry, i) { + bpf_core_free_cands(entry->pvalue); + } + hashmap__free(cand_cache); + } + return err; +} + +/* base map load ldimm64 special constant, used also for log fixup logic */ +#define POISON_LDIMM64_MAP_BASE 2001000000 +#define POISON_LDIMM64_MAP_PFX "200100" + +static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int map_idx, const struct bpf_map *map) +{ + int i; + + pr_debug("prog '%s': relo #%d: poisoning insn #%d that loads map #%d '%s'\n", + prog->name, relo_idx, insn_idx, map_idx, map->name); + + /* we turn single ldimm64 into two identical invalid calls */ + for (i = 0; i < 2; i++) { + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is map index into obj->maps[] array + */ + insn->imm = POISON_LDIMM64_MAP_BASE + map_idx; + + insn++; + } +} + +/* unresolved kfunc call special constant, used also for log fixup logic */ +#define POISON_CALL_KFUNC_BASE 2002000000 
+#define POISON_CALL_KFUNC_PFX "2002" + +static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int ext_idx, const struct extern_desc *ext) +{ + pr_debug("prog '%s': relo #%d: poisoning insn #%d that calls kfunc '%s'\n", + prog->name, relo_idx, insn_idx, ext->name); + + /* we turn kfunc call into invalid helper call with identifiable constant */ + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is extern index into obj->externs[] array + */ + insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; +} + +/* Relocate data references within program code: + * - map references; + * - global variable references; + * - extern references. + */ +static int +bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) +{ + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + const struct bpf_map *map; + struct extern_desc *ext; + + switch (relo->type) { + case RELO_LD64: + map = &obj->maps[relo->map_idx]; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_FD; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_DATA: + map = &obj->maps[relo->map_idx]; + insn[1].imm = insn[0].imm + relo->sym_off; + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = relo->map_idx; + } else if (map->autocreate) { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); + } + break; + case RELO_EXTERN_LD64: + ext = &obj->externs[relo->ext_idx]; + if (ext->type == EXT_KCFG) { + if (obj->gen_loader) { + insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; + insn[0].imm = obj->kconfig_map_idx; + } else { + insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; + insn[0].imm = obj->maps[obj->kconfig_map_idx].fd; + } + insn[1].imm = ext->kcfg.data_off; + } else /* EXT_KSYM */ { + if (ext->ksym.type_id && ext->is_set) { /* typed ksyms */ + insn[0].src_reg = BPF_PSEUDO_BTF_ID; + insn[0].imm = ext->ksym.kernel_btf_id; + insn[1].imm = ext->ksym.kernel_btf_obj_fd; + } else { /* typeless ksyms or unresolved typed ksyms */ + insn[0].imm = (__u32)ext->ksym.addr; + insn[1].imm = ext->ksym.addr >> 32; + } + } + break; + case RELO_EXTERN_CALL: + ext = &obj->externs[relo->ext_idx]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + if (ext->is_set) { + insn[0].imm = ext->ksym.kernel_btf_id; + insn[0].off = ext->ksym.btf_fd_idx; + } else { /* unresolved weak kfunc call */ + poison_kfunc_call(prog, i, relo->insn_idx, insn, + relo->ext_idx, ext); + } + break; + case RELO_SUBPROG_ADDR: + if (insn[0].src_reg != BPF_PSEUDO_FUNC) { + pr_warn("prog '%s': relo #%d: bad insn\n", + prog->name, i); + return -EINVAL; + } + /* handled already */ + break; + case RELO_CALL: + /* handled already */ + break; + case RELO_CORE: + /* will be handled by bpf_program_record_relos() */ + break; + default: + pr_warn("prog '%s': relo #%d: bad relo type %d\n", + prog->name, i, relo->type); + return -EINVAL; + } + } + + return 0; +} + +static int adjust_prog_btf_ext_info(const struct bpf_object *obj, + const 
struct bpf_program *prog, + const struct btf_ext_info *ext_info, + void **prog_info, __u32 *prog_rec_cnt, + __u32 *prog_rec_sz) +{ + void *copy_start = NULL, *copy_end = NULL; + void *rec, *rec_end, *new_prog_info; + const struct btf_ext_info_sec *sec; + size_t old_sz, new_sz; + int i, sec_num, sec_idx, off_adj; + + sec_num = 0; + for_each_btf_ext_sec(ext_info, sec) { + sec_idx = ext_info->sec_idxs[sec_num]; + sec_num++; + if (prog->sec_idx != sec_idx) + continue; + + for_each_btf_ext_rec(ext_info, sec, i, rec) { + __u32 insn_off = *(__u32 *)rec / BPF_INSN_SZ; + + if (insn_off < prog->sec_insn_off) + continue; + if (insn_off >= prog->sec_insn_off + prog->sec_insn_cnt) + break; + + if (!copy_start) + copy_start = rec; + copy_end = rec + ext_info->rec_size; + } + + if (!copy_start) + return -ENOENT; + + /* append func/line info of a given (sub-)program to the main + * program func/line info + */ + old_sz = (size_t)(*prog_rec_cnt) * ext_info->rec_size; + new_sz = old_sz + (copy_end - copy_start); + new_prog_info = realloc(*prog_info, new_sz); + if (!new_prog_info) + return -ENOMEM; + *prog_info = new_prog_info; + *prog_rec_cnt = new_sz / ext_info->rec_size; + memcpy(new_prog_info + old_sz, copy_start, copy_end - copy_start); + + /* Kernel instruction offsets are in units of 8-byte + * instructions, while .BTF.ext instruction offsets generated + * by Clang are in units of bytes. So convert Clang offsets + * into kernel offsets and adjust offset according to program + * relocated position. + */ + off_adj = prog->sub_insn_off - prog->sec_insn_off; + rec = new_prog_info + old_sz; + rec_end = new_prog_info + new_sz; + for (; rec < rec_end; rec += ext_info->rec_size) { + __u32 *insn_off = rec; + + *insn_off = *insn_off / BPF_INSN_SZ + off_adj; + } + *prog_rec_sz = ext_info->rec_size; + return 0; + } + + return -ENOENT; +} + +static int +reloc_prog_func_and_line_info(const struct bpf_object *obj, + struct bpf_program *main_prog, + const struct bpf_program *prog) +{ + int err; + + /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't + * supprot func/line info + */ + if (!obj->btf_ext || !kernel_supports(obj, FEAT_BTF_FUNC)) + return 0; + + /* only attempt func info relocation if main program's func_info + * relocation was successful + */ + if (main_prog != prog && !main_prog->func_info) + goto line_info; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->func_info, + &main_prog->func_info, + &main_prog->func_info_cnt, + &main_prog->func_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext function info: %d\n", + prog->name, err); + return err; + } + if (main_prog->func_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext function info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. 
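+ * (this is not fatal: the program is still loaded, just without any
+ * .BTF.ext func info)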
*/ + pr_warn("prog '%s': missing .BTF.ext function info for the main program, skipping all of .BTF.ext func info.\n", + prog->name); + } + +line_info: + /* don't relocate line info if main program's relocation failed */ + if (main_prog != prog && !main_prog->line_info) + return 0; + + err = adjust_prog_btf_ext_info(obj, prog, &obj->btf_ext->line_info, + &main_prog->line_info, + &main_prog->line_info_cnt, + &main_prog->line_info_rec_size); + if (err) { + if (err != -ENOENT) { + pr_warn("prog '%s': error relocating .BTF.ext line info: %d\n", + prog->name, err); + return err; + } + if (main_prog->line_info) { + /* + * Some info has already been found but has problem + * in the last btf_ext reloc. Must have to error out. + */ + pr_warn("prog '%s': missing .BTF.ext line info.\n", prog->name); + return err; + } + /* Have problem loading the very first info. Ignore the rest. */ + pr_warn("prog '%s': missing .BTF.ext line info for the main program, skipping all of .BTF.ext line info.\n", + prog->name); + } + return 0; +} + +static int cmp_relo_by_insn_idx(const void *key, const void *elem) +{ + size_t insn_idx = *(const size_t *)key; + const struct reloc_desc *relo = elem; + + if (insn_idx == relo->insn_idx) + return 0; + return insn_idx < relo->insn_idx ? -1 : 1; +} + +static struct reloc_desc *find_prog_insn_relo(const struct bpf_program *prog, size_t insn_idx) +{ + if (!prog->nr_reloc) + return NULL; + return bsearch(&insn_idx, prog->reloc_desc, prog->nr_reloc, + sizeof(*prog->reloc_desc), cmp_relo_by_insn_idx); +} + +static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_program *subprog) +{ + int new_cnt = main_prog->nr_reloc + subprog->nr_reloc; + struct reloc_desc *relos; + int i; + + if (main_prog == subprog) + return 0; + relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (!relos && new_cnt) + return -ENOMEM; + if (subprog->nr_reloc) + memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, + sizeof(*relos) * subprog->nr_reloc); + + for (i = main_prog->nr_reloc; i < new_cnt; i++) + relos[i].insn_idx += subprog->sub_insn_off; + /* After insn_idx adjustment the 'relos' array is still sorted + * by insn_idx and doesn't break bsearch. 
+ */ + main_prog->reloc_desc = relos; + main_prog->nr_reloc = new_cnt; + return 0; +} + +static int +bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *prog) +{ + size_t sub_insn_idx, insn_idx, new_cnt; + struct bpf_program *subprog; + struct bpf_insn *insns, *insn; + struct reloc_desc *relo; + int err; + + err = reloc_prog_func_and_line_info(obj, main_prog, prog); + if (err) + return err; + + for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) { + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn)) + continue; + + relo = find_prog_insn_relo(prog, insn_idx); + if (relo && relo->type == RELO_EXTERN_CALL) + /* kfunc relocations will be handled later + * in bpf_object__relocate_data() + */ + continue; + if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) { + pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n", + prog->name, insn_idx, relo->type); + return -LIBBPF_ERRNO__RELOC; + } + if (relo) { + /* sub-program instruction index is a combination of + * an offset of a symbol pointed to by relocation and + * call instruction's imm field; for global functions, + * call always has imm = -1, but for static functions + * relocation is against STT_SECTION and insn->imm + * points to a start of a static function + * + * for subprog addr relocation, the relo->sym_off + insn->imm is + * the byte offset in the corresponding section. + */ + if (relo->type == RELO_CALL) + sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1; + else + sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ; + } else if (insn_is_pseudo_func(insn)) { + /* + * RELO_SUBPROG_ADDR relo is always emitted even if both + * functions are in the same section, so it shouldn't reach here. + */ + pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n", + prog->name, insn_idx); + return -LIBBPF_ERRNO__RELOC; + } else { + /* if subprogram call is to a static function within + * the same ELF section, there won't be any relocation + * emitted, but it also means there is no additional + * offset necessary, insns->imm is relative to + * instruction's original position within the section + */ + sub_insn_idx = prog->sec_insn_off + insn_idx + insn->imm + 1; + } + + /* we enforce that sub-programs should be in .text section */ + subprog = find_prog_by_sec_insn(obj, obj->efile.text_shndx, sub_insn_idx); + if (!subprog) { + pr_warn("prog '%s': no .text section found yet sub-program call exists\n", + prog->name); + return -LIBBPF_ERRNO__RELOC; + } + + /* if it's the first call instruction calling into this + * subprogram (meaning this subprog hasn't been processed + * yet) within the context of current main program: + * - append it at the end of main program's instructions blog; + * - process is recursively, while current program is put on hold; + * - if that subprogram calls some other not yet processes + * subprogram, same thing will happen recursively until + * there are no more unprocesses subprograms left to append + * and relocate. 
+ */ + if (subprog->sub_insn_off == 0) { + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + /* The subprog insns are now appended. Append its relos too. */ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + err = bpf_object__reloc_code(obj, main_prog, subprog); + if (err) + return err; + } + + /* main_prog->insns memory could have been re-allocated, so + * calculate pointer again + */ + insn = &main_prog->insns[prog->sub_insn_off + insn_idx]; + /* calculate correct instruction position within current main + * prog; each main prog can have a different set of + * subprograms appended (potentially in different order as + * well), so position of any subprog can be different for + * different main programs + */ + insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; + + pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", + prog->name, insn_idx, insn->imm, subprog->name, subprog->sub_insn_off); + } + + return 0; +} + +/* + * Relocate sub-program calls. + * + * Algorithm operates as follows. Each entry-point BPF program (referred to as + * main prog) is processed separately. For each subprog (non-entry functions, + * that can be called from either entry progs or other subprogs) gets their + * sub_insn_off reset to zero. This serves as indicator that this subprogram + * hasn't been yet appended and relocated within current main prog. Once its + * relocated, sub_insn_off will point at the position within current main prog + * where given subprog was appended. This will further be used to relocate all + * the call instructions jumping into this subprog. + * + * We start with main program and process all call instructions. If the call + * is into a subprog that hasn't been processed (i.e., subprog->sub_insn_off + * is zero), subprog instructions are appended at the end of main program's + * instruction array. Then main program is "put on hold" while we recursively + * process newly appended subprogram. If that subprogram calls into another + * subprogram that hasn't been appended, new subprogram is appended again to + * the *main* prog's instructions (subprog's instructions are always left + * untouched, as they need to be in unmodified state for subsequent main progs + * and subprog instructions are always sent only as part of a main prog) and + * the process continues recursively. Once all the subprogs called from a main + * prog or any of its subprogs are appended (and relocated), all their + * positions within finalized instructions array are known, so it's easy to + * rewrite call instructions with correct relative offsets, corresponding to + * desired target subprog. + * + * Its important to realize that some subprogs might not be called from some + * main prog and any of its called/used subprogs. Those will keep their + * subprog->sub_insn_off as zero at all times and won't be appended to current + * main prog and won't be relocated within the context of current main prog. 
+ * They might still be used from other main progs later. + * + * Visually this process can be shown as below. Suppose we have two main + * programs mainA and mainB and BPF object contains three subprogs: subA, + * subB, and subC. mainA calls only subA, mainB calls only subC, but subA and + * subC both call subB: + * + * +--------+ +-------+ + * | v v | + * +--+---+ +--+-+-+ +---+--+ + * | subA | | subB | | subC | + * +--+---+ +------+ +---+--+ + * ^ ^ + * | | + * +---+-------+ +------+----+ + * | mainA | | mainB | + * +-----------+ +-----------+ + * + * We'll start relocating mainA, will find subA, append it and start + * processing sub A recursively: + * + * +-----------+------+ + * | mainA | subA | + * +-----------+------+ + * + * At this point we notice that subB is used from subA, so we append it and + * relocate (there are no further subcalls from subB): + * + * +-----------+------+------+ + * | mainA | subA | subB | + * +-----------+------+------+ + * + * At this point, we relocate subA calls, then go one level up and finish with + * relocatin mainA calls. mainA is done. + * + * For mainB process is similar but results in different order. We start with + * mainB and skip subA and subB, as mainB never calls them (at least + * directly), but we see subC is needed, so we append and start processing it: + * + * +-----------+------+ + * | mainB | subC | + * +-----------+------+ + * Now we see subC needs subB, so we go back to it, append and relocate it: + * + * +-----------+------+------+ + * | mainB | subC | subB | + * +-----------+------+------+ + * + * At this point we unwind recursion, relocate calls in subC, then in mainB. + */ +static int +bpf_object__relocate_calls(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_program *subprog; + int i, err; + + /* mark all subprogs as not relocated (yet) within the context of + * current main program + */ + for (i = 0; i < obj->nr_programs; i++) { + subprog = &obj->programs[i]; + if (!prog_is_subprog(obj, subprog)) + continue; + + subprog->sub_insn_off = 0; + } + + err = bpf_object__reloc_code(obj, prog, prog); + if (err) + return err; + + return 0; +} + +static void +bpf_object__free_relocs(struct bpf_object *obj) +{ + struct bpf_program *prog; + int i; + + /* free up relocation descriptors */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + zfree(&prog->reloc_desc); + prog->nr_reloc = 0; + } +} + +static int cmp_relocs(const void *_a, const void *_b) +{ + const struct reloc_desc *a = _a; + const struct reloc_desc *b = _b; + + if (a->insn_idx != b->insn_idx) + return a->insn_idx < b->insn_idx ? -1 : 1; + + /* no two relocations should have the same insn_idx, but ... */ + if (a->type != b->type) + return a->type < b->type ? -1 : 1; + + return 0; +} + +static void bpf_object__sort_relos(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_programs; i++) { + struct bpf_program *p = &obj->programs[i]; + + if (!p->nr_reloc) + continue; + + qsort(p->reloc_desc, p->nr_reloc, sizeof(*p->reloc_desc), cmp_relocs); + } +} + +static int +bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) +{ + struct bpf_program *prog; + size_t i, j; + int err; + + if (obj->btf_ext) { + err = bpf_object__relocate_core(obj, targ_btf_path); + if (err) { + pr_warn("failed to perform CO-RE relocations: %d\n", + err); + return err; + } + bpf_object__sort_relos(obj); + } + + /* Before relocating calls pre-process relocations and mark + * few ld_imm64 instructions that points to subprogs. 
+ * Otherwise bpf_object__reloc_code() later would have to consider + * all ld_imm64 insns as relocation candidates. That would + * reduce relocation speed, since amount of find_prog_insn_relo() + * would increase and most of them will fail to find a relo. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + for (j = 0; j < prog->nr_reloc; j++) { + struct reloc_desc *relo = &prog->reloc_desc[j]; + struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + + /* mark the insn, so it's recognized by insn_is_pseudo_func() */ + if (relo->type == RELO_SUBPROG_ADDR) + insn[0].src_reg = BPF_PSEUDO_FUNC; + } + } + + /* relocate subprogram calls and append used subprograms to main + * programs; each copy of subprogram code needs to be relocated + * differently for each main program, because its code location might + * have changed. + * Append subprog relos to main programs to allow data relos to be + * processed after text is completely relocated. + */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + /* sub-program's sub-calls are relocated within the context of + * its main program only + */ + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + + err = bpf_object__relocate_calls(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate calls: %d\n", + prog->name, err); + return err; + } + } + /* Process data relos for main programs */ + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) + continue; + err = bpf_object__relocate_data(obj, prog); + if (err) { + pr_warn("prog '%s': failed to relocate data references: %d\n", + prog->name, err); + return err; + } + } + + return 0; +} + +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data); + +static int bpf_object__collect_map_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data) +{ + const int bpf_ptr_sz = 8, host_ptr_sz = sizeof(void *); + int i, j, nrels, new_sz; + const struct btf_var_secinfo *vi = NULL; + const struct btf_type *sec, *var, *def; + struct bpf_map *map = NULL, *targ_map = NULL; + struct bpf_program *targ_prog = NULL; + bool is_prog_array, is_map_in_map; + const struct btf_member *member; + const char *name, *mname, *type; + unsigned int moff; + Elf64_Sym *sym; + Elf64_Rel *rel; + void *tmp; + + if (!obj->efile.btf_maps_sec_btf_id || !obj->btf) + return -EINVAL; + sec = btf__type_by_id(obj->btf, obj->efile.btf_maps_sec_btf_id); + if (!sec) + return -EINVAL; + + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn(".maps relo #%d: failed to get ELF relo\n", i); + return -LIBBPF_ERRNO__FORMAT; + } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn(".maps relo #%d: symbol %zx not found\n", + i, (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + name = elf_sym_str(obj, sym->st_name) ?: ""; + + pr_debug(".maps relo #%d: for %zd value %zd rel->r_offset %zu name %d ('%s')\n", + i, (ssize_t)(rel->r_info >> 32), (size_t)sym->st_value, + (size_t)rel->r_offset, sym->st_name, name); + + for (j = 0; j < obj->nr_maps; j++) { + map = &obj->maps[j]; + if (map->sec_idx != obj->efile.btf_maps_shndx) + continue; + + vi = btf_var_secinfos(sec) + map->btf_var_idx; + if (vi->offset <= rel->r_offset && + rel->r_offset + bpf_ptr_sz <= vi->offset + vi->size) + break; + } + if (j == obj->nr_maps) { + 
pr_warn(".maps relo #%d: cannot find map '%s' at rel->r_offset %zu\n", + i, name, (size_t)rel->r_offset); + return -EINVAL; + } + + is_map_in_map = bpf_map_type__is_map_in_map(map->def.type); + is_prog_array = map->def.type == BPF_MAP_TYPE_PROG_ARRAY; + type = is_map_in_map ? "map" : "prog"; + if (is_map_in_map) { + if (sym->st_shndx != obj->efile.btf_maps_shndx) { + pr_warn(".maps relo #%d: '%s' isn't a BTF-defined map\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + if (map->def.type == BPF_MAP_TYPE_HASH_OF_MAPS && + map->def.key_size != sizeof(int)) { + pr_warn(".maps relo #%d: hash-of-maps '%s' should have key size %zu.\n", + i, map->name, sizeof(int)); + return -EINVAL; + } + targ_map = bpf_object__find_map_by_name(obj, name); + if (!targ_map) { + pr_warn(".maps relo #%d: '%s' isn't a valid map reference\n", + i, name); + return -ESRCH; + } + } else if (is_prog_array) { + targ_prog = bpf_object__find_program_by_name(obj, name); + if (!targ_prog) { + pr_warn(".maps relo #%d: '%s' isn't a valid program reference\n", + i, name); + return -ESRCH; + } + if (targ_prog->sec_idx != sym->st_shndx || + targ_prog->sec_insn_off * 8 != sym->st_value || + prog_is_subprog(obj, targ_prog)) { + pr_warn(".maps relo #%d: '%s' isn't an entry-point program\n", + i, name); + return -LIBBPF_ERRNO__RELOC; + } + } else { + return -EINVAL; + } + + var = btf__type_by_id(obj->btf, vi->type); + def = skip_mods_and_typedefs(obj->btf, var->type, NULL); + if (btf_vlen(def) == 0) + return -EINVAL; + member = btf_members(def) + btf_vlen(def) - 1; + mname = btf__name_by_offset(obj->btf, member->name_off); + if (strcmp(mname, "values")) + return -EINVAL; + + moff = btf_member_bit_offset(def, btf_vlen(def) - 1) / 8; + if (rel->r_offset - vi->offset < moff) + return -EINVAL; + + moff = rel->r_offset - vi->offset - moff; + /* here we use BPF pointer size, which is always 64 bit, as we + * are parsing ELF that was built for BPF target + */ + if (moff % bpf_ptr_sz) + return -EINVAL; + moff /= bpf_ptr_sz; + if (moff >= map->init_slots_sz) { + new_sz = moff + 1; + tmp = libbpf_reallocarray(map->init_slots, new_sz, host_ptr_sz); + if (!tmp) + return -ENOMEM; + map->init_slots = tmp; + memset(map->init_slots + map->init_slots_sz, 0, + (new_sz - map->init_slots_sz) * host_ptr_sz); + map->init_slots_sz = new_sz; + } + map->init_slots[moff] = is_map_in_map ? 
(void *)targ_map : (void *)targ_prog; + + pr_debug(".maps relo #%d: map '%s' slot [%d] points to %s '%s'\n", + i, map->name, moff, type, name); + } + + return 0; +} + +static int bpf_object__collect_relos(struct bpf_object *obj) +{ + int i, err; + + for (i = 0; i < obj->efile.sec_cnt; i++) { + struct elf_sec_desc *sec_desc = &obj->efile.secs[i]; + Elf64_Shdr *shdr; + Elf_Data *data; + int idx; + + if (sec_desc->sec_type != SEC_RELO) + continue; + + shdr = sec_desc->shdr; + data = sec_desc->data; + idx = shdr->sh_info; + + if (shdr->sh_type != SHT_REL) { + pr_warn("internal error at %d\n", __LINE__); + return -LIBBPF_ERRNO__INTERNAL; + } + + if (idx == obj->efile.st_ops_shndx || idx == obj->efile.st_ops_link_shndx) + err = bpf_object__collect_st_ops_relos(obj, shdr, data); + else if (idx == obj->efile.btf_maps_shndx) + err = bpf_object__collect_map_relos(obj, shdr, data); + else + err = bpf_object__collect_prog_relos(obj, shdr, data); + if (err) + return err; + } + + bpf_object__sort_relos(obj); + return 0; +} + +static bool insn_is_helper_call(struct bpf_insn *insn, enum bpf_func_id *func_id) +{ + if (BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_CALL && + BPF_SRC(insn->code) == BPF_K && + insn->src_reg == 0 && + insn->dst_reg == 0) { + *func_id = insn->imm; + return true; + } + return false; +} + +static int bpf_object__sanitize_prog(struct bpf_object *obj, struct bpf_program *prog) +{ + struct bpf_insn *insn = prog->insns; + enum bpf_func_id func_id; + int i; + + if (obj->gen_loader) + return 0; + + for (i = 0; i < prog->insns_cnt; i++, insn++) { + if (!insn_is_helper_call(insn, &func_id)) + continue; + + /* on kernels that don't yet support + * bpf_probe_read_{kernel,user}[_str] helpers, fall back + * to bpf_probe_read() which works well for old kernels + */ + switch (func_id) { + case BPF_FUNC_probe_read_kernel: + case BPF_FUNC_probe_read_user: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read; + break; + case BPF_FUNC_probe_read_kernel_str: + case BPF_FUNC_probe_read_user_str: + if (!kernel_supports(obj, FEAT_PROBE_READ_KERN)) + insn->imm = BPF_FUNC_probe_read_str; + break; + default: + break; + } + } + return 0; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id); + +/* this is called as prog->sec_def->prog_prepare_load_fn for libbpf-supported sec_defs */ +static int libbpf_prepare_prog_load(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie) +{ + enum sec_def_flags def = cookie; + + /* old kernels might not support specifying expected_attach_type */ + if ((def & SEC_EXP_ATTACH_OPT) && !kernel_supports(prog->obj, FEAT_EXP_ATTACH_TYPE)) + opts->expected_attach_type = 0; + + if (def & SEC_SLEEPABLE) + opts->prog_flags |= BPF_F_SLEEPABLE; + + if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) + opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; + + if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { + int btf_obj_fd = 0, btf_type_id = 0, err; + const char *attach_name; + + attach_name = strchr(prog->sec_name, '/'); + if (!attach_name) { + /* if BPF program is annotated with just SEC("fentry") + * (or similar) without declaratively specifying + * target, then it is expected that target will be + * specified with bpf_program__set_attach_target() at + * runtime before BPF object load step. If not, then + * there is nothing to load into the kernel as BPF + * verifier won't be able to validate BPF program + * correctness anyways. 
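+ *
+ * Illustrative sketch of the two usage patterns (the function name is
+ * just an example):
+ *   SEC("fentry/do_unlinkat") - attach target is derived from sec_name
+ *                               right here;
+ *   SEC("fentry")             - attach target must be set before load:
+ *       bpf_program__set_attach_target(prog, 0, "do_unlinkat");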
+ */ + pr_warn("prog '%s': no BTF-based attach target is specified, use bpf_program__set_attach_target()\n", + prog->name); + return -EINVAL; + } + attach_name++; /* skip over / */ + + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id); + if (err) + return err; + + /* cache resolved BTF FD and BTF type ID in the prog */ + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_btf_id = btf_type_id; + + /* but by now libbpf common logic is not utilizing + * prog->atach_btf_obj_fd/prog->attach_btf_id anymore because + * this callback is called after opts were populated by + * libbpf, so this callback has to update opts explicitly here + */ + opts->attach_btf_obj_fd = btf_obj_fd; + opts->attach_btf_id = btf_type_id; + } + return 0; +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); + +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_attr); + const char *prog_name = NULL; + char *cp, errmsg[STRERR_BUFSIZE]; + size_t log_buf_size = 0; + char *log_buf = NULL, *tmp; + int btf_fd, ret, err; + bool own_log_buf = true; + __u32 log_level = prog->log_level; + + if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* + * The program type must be set. Most likely we couldn't find a proper + * section definition at load time, and thus we didn't infer the type. + */ + pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + if (!insns || !insns_cnt) + return -EINVAL; + + load_attr.expected_attach_type = prog->expected_attach_type; + if (kernel_supports(obj, FEAT_PROG_NAME)) + prog_name = prog->name; + load_attr.attach_prog_fd = prog->attach_prog_fd; + load_attr.attach_btf_obj_fd = prog->attach_btf_obj_fd; + load_attr.attach_btf_id = prog->attach_btf_id; + load_attr.kern_version = kern_version; + load_attr.prog_ifindex = prog->prog_ifindex; + + /* specify func_info/line_info only if kernel supports them */ + btf_fd = bpf_object__btf_fd(obj); + if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf_fd; + load_attr.func_info = prog->func_info; + load_attr.func_info_rec_size = prog->func_info_rec_size; + load_attr.func_info_cnt = prog->func_info_cnt; + load_attr.line_info = prog->line_info; + load_attr.line_info_rec_size = prog->line_info_rec_size; + load_attr.line_info_cnt = prog->line_info_cnt; + } + load_attr.log_level = log_level; + load_attr.prog_flags = prog->prog_flags; + load_attr.fd_array = obj->fd_array; + + /* adjust load_attr if sec_def provides custom preload callback */ + if (prog->sec_def && prog->sec_def->prog_prepare_load_fn) { + err = prog->sec_def->prog_prepare_load_fn(prog, &load_attr, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to prepare load attributes: %d\n", + prog->name, err); + return err; + } + insns = prog->insns; + insns_cnt = prog->insns_cnt; + } + + if (obj->gen_loader) { + bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name, + license, insns, insns_cnt, &load_attr, + prog - obj->programs); + *prog_fd = -1; + return 0; + } + +retry_load: + /* if log_level is zero, we don't request logs initially even if + * custom log_buf is specified; if the program load fails, then we'll + * bump log_level to 1 and use either custom log_buf or we'll allocate + * our own and retry the load to get details on what failed + */ + if 
(log_level) { + if (prog->log_buf) { + log_buf = prog->log_buf; + log_buf_size = prog->log_size; + own_log_buf = false; + } else if (obj->log_buf) { + log_buf = obj->log_buf; + log_buf_size = obj->log_size; + own_log_buf = false; + } else { + log_buf_size = max((size_t)BPF_LOG_BUF_SIZE, log_buf_size * 2); + tmp = realloc(log_buf, log_buf_size); + if (!tmp) { + ret = -ENOMEM; + goto out; + } + log_buf = tmp; + log_buf[0] = '\0'; + own_log_buf = true; + } + } + + load_attr.log_buf = log_buf; + load_attr.log_size = log_buf_size; + load_attr.log_level = log_level; + + ret = bpf_prog_load(prog->type, prog_name, license, insns, insns_cnt, &load_attr); + if (ret >= 0) { + if (log_level && own_log_buf) { + pr_debug("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + + if (obj->has_rodata && kernel_supports(obj, FEAT_PROG_BIND_MAP)) { + struct bpf_map *map; + int i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &prog->obj->maps[i]; + if (map->libbpf_type != LIBBPF_MAP_RODATA) + continue; + + if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to bind map '%s': %s\n", + prog->name, map->real_name, cp); + /* Don't fail hard if can't bind rodata. */ + } + } + } + + *prog_fd = ret; + ret = 0; + goto out; + } + + if (log_level == 0) { + log_level = 1; + goto retry_load; + } + /* On ENOSPC, increase log buffer size and retry, unless custom + * log_buf is specified. + * Be careful to not overflow u32, though. Kernel's log buf size limit + * isn't part of UAPI so it can always be bumped to full 4GB. So don't + * multiply by 2 unless we are sure we'll fit within 32 bits. + * Currently, we'll get -EINVAL when we reach (UINT_MAX >> 2). + */ + if (own_log_buf && errno == ENOSPC && log_buf_size <= UINT_MAX / 2) + goto retry_load; + + ret = -errno; + + /* post-process verifier log to improve error descriptions */ + fixup_verifier_log(prog, log_buf, log_buf_size); + + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': BPF program load failed: %s\n", prog->name, cp); + pr_perm_msg(ret); + + if (own_log_buf && log_buf && log_buf[0] != '\0') { + pr_warn("prog '%s': -- BEGIN PROG LOAD LOG --\n%s-- END PROG LOAD LOG --\n", + prog->name, log_buf); + } + +out: + if (own_log_buf) + free(log_buf); + return ret; +} + +static char *find_prev_line(char *buf, char *cur) +{ + char *p; + + if (cur == buf) /* end of a log buf */ + return NULL; + + p = cur - 1; + while (p - 1 >= buf && *(p - 1) != '\n') + p--; + + return p; +} + +static void patch_log(char *buf, size_t buf_sz, size_t log_sz, + char *orig, size_t orig_sz, const char *patch) +{ + /* size of the remaining log content to the right from the to-be-replaced part */ + size_t rem_sz = (buf + log_sz) - (orig + orig_sz); + size_t patch_sz = strlen(patch); + + if (patch_sz != orig_sz) { + /* If patch line(s) are longer than original piece of verifier log, + * shift log contents by (patch_sz - orig_sz) bytes to the right + * starting from after to-be-replaced part of the log. + * + * If patch line(s) are shorter than original piece of verifier log, + * shift log contents by (orig_sz - patch_sz) bytes to the left + * starting from after to-be-replaced part of the log + * + * We need to be careful about not overflowing available + * buf_sz capacity. If that's the case, we'll truncate the end + * of the original log, as necessary. 
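+ *
+ * A small worked example with made-up sizes: replacing a 10-byte piece
+ * of the log with a 25-byte patch shifts the tail right by 15 bytes
+ * (truncating it if that would exceed buf_sz), while a 4-byte patch
+ * shifts the tail left by 6 bytes.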
+ */ + if (patch_sz > orig_sz) { + if (orig + patch_sz >= buf + buf_sz) { + /* patch is big enough to cover remaining space completely */ + patch_sz -= (orig + patch_sz) - (buf + buf_sz) + 1; + rem_sz = 0; + } else if (patch_sz - orig_sz > buf_sz - log_sz) { + /* patch causes part of remaining log to be truncated */ + rem_sz -= (patch_sz - orig_sz) - (buf_sz - log_sz); + } + } + /* shift remaining log to the right by calculated amount */ + memmove(orig + patch_sz, orig + orig_sz, rem_sz); + } + + memcpy(orig, patch, patch_sz); +} + +static void fixup_log_failed_core_relo(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded CO-RE relocation: + * line1 -> 123: (85) call unknown#195896080 + * line2 -> invalid func unknown#195896080 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. We extract + * instruction index to find corresponding CO-RE relocation and + * replace this part of the log with more relevant information about + * failed CO-RE relocation. + */ + const struct bpf_core_relo *relo; + struct bpf_core_spec spec; + char patch[512], spec_buf[256]; + int insn_idx, err, spec_len; + + if (sscanf(line1, "%d: (%*d) call unknown#195896080\n", &insn_idx) != 1) + return; + + relo = find_relo_core(prog, insn_idx); + if (!relo) + return; + + err = bpf_core_parse_spec(prog->name, prog->obj->btf, relo, &spec); + if (err) + return; + + spec_len = bpf_core_format_spec(spec_buf, sizeof(spec_buf), &spec); + snprintf(patch, sizeof(patch), + "%d: \n" + "failed to resolve CO-RE relocation %s%s\n", + insn_idx, spec_buf, spec_len >= sizeof(spec_buf) ? "..." : ""); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_map_load(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded map reference: + * line1 -> 123: (85) call unknown#2001000345 + * line2 -> invalid func unknown#2001000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2001000345" is a map index in obj->maps to fetch map name. + */ + struct bpf_object *obj = prog->obj; + const struct bpf_map *map; + int insn_idx, map_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &map_idx) != 2) + return; + + map_idx -= POISON_LDIMM64_MAP_BASE; + if (map_idx < 0 || map_idx >= obj->nr_maps) + return; + map = &obj->maps[map_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "BPF map '%s' is referenced but wasn't created\n", + insn_idx, map->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_kfunc_call(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded kfunc call: + * line1 -> 123: (85) call unknown#2002000345 + * line2 -> invalid func unknown#2002000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2002000345" is an extern index in obj->externs to fetch kfunc name. 
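+ *
+ * (i.e., the extern index is recovered by subtracting
+ * POISON_CALL_KFUNC_BASE from the poisoned constant, and the kfunc name
+ * for the patched message is then taken from obj->externs[])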
+ */ + struct bpf_object *obj = prog->obj; + const struct extern_desc *ext; + int insn_idx, ext_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &ext_idx) != 2) + return; + + ext_idx -= POISON_CALL_KFUNC_BASE; + if (ext_idx < 0 || ext_idx >= obj->nr_extern) + return; + ext = &obj->externs[ext_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "kfunc '%s' is referenced but wasn't resolved\n", + insn_idx, ext->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) +{ + /* look for familiar error patterns in last N lines of the log */ + const size_t max_last_line_cnt = 10; + char *prev_line, *cur_line, *next_line; + size_t log_sz; + int i; + + if (!buf) + return; + + log_sz = strlen(buf) + 1; + next_line = buf + log_sz - 1; + + for (i = 0; i < max_last_line_cnt; i++, next_line = cur_line) { + cur_line = find_prev_line(buf, next_line); + if (!cur_line) + return; + + if (str_has_pfx(cur_line, "invalid func unknown#195896080\n")) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* failed CO-RE relocation case */ + fixup_log_failed_core_relo(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_LDIMM64_MAP_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to uncreated BPF map */ + fixup_log_missing_map_load(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_CALL_KFUNC_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to unresolved kfunc */ + fixup_log_missing_kfunc_call(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } + } +} + +static int bpf_program_record_relos(struct bpf_program *prog) +{ + struct bpf_object *obj = prog->obj; + int i; + + for (i = 0; i < prog->nr_reloc; i++) { + struct reloc_desc *relo = &prog->reloc_desc[i]; + struct extern_desc *ext = &obj->externs[relo->ext_idx]; + int kind; + + switch (relo->type) { + case RELO_EXTERN_LD64: + if (ext->type != EXT_KSYM) + continue; + kind = btf_is_var(btf__type_by_id(obj->btf, ext->btf_id)) ? 
+ BTF_KIND_VAR : BTF_KIND_FUNC; + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, !ext->ksym.type_id, + true, kind, relo->insn_idx); + break; + case RELO_EXTERN_CALL: + bpf_gen__record_extern(obj->gen_loader, ext->name, + ext->is_weak, false, false, BTF_KIND_FUNC, + relo->insn_idx); + break; + case RELO_CORE: { + struct bpf_core_relo cr = { + .insn_off = relo->insn_idx * 8, + .type_id = relo->core_relo->type_id, + .access_str_off = relo->core_relo->access_str_off, + .kind = relo->core_relo->kind, + }; + + bpf_gen__record_relo_core(obj->gen_loader, &cr); + break; + } + default: + continue; + } + } + return 0; +} + +static int +bpf_object__load_progs(struct bpf_object *obj, int log_level) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + err = bpf_object__sanitize_prog(obj, prog); + if (err) + return err; + } + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + if (prog_is_subprog(obj, prog)) + continue; + if (!prog->autoload) { + pr_debug("prog '%s': skipped loading\n", prog->name); + continue; + } + prog->log_level |= log_level; + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); + return err; + } + } + + bpf_object__free_relocs(obj); + return 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name); + +static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts) +{ + struct bpf_program *prog; + int err; + + bpf_object__for_each_program(prog, obj) { + prog->sec_def = find_sec_def(prog->sec_name); + if (!prog->sec_def) { + /* couldn't guess, but user might manually specify */ + pr_debug("prog '%s': unrecognized ELF section name '%s'\n", + prog->name, prog->sec_name); + continue; + } + + prog->type = prog->sec_def->prog_type; + prog->expected_attach_type = prog->sec_def->expected_attach_type; + + /* sec_def can have custom callback which should be called + * after bpf_program is initialized to adjust its properties + */ + if (prog->sec_def->prog_setup_fn) { + err = prog->sec_def->prog_setup_fn(prog, prog->sec_def->cookie); + if (err < 0) { + pr_warn("prog '%s': failed to initialize: %d\n", + prog->name, err); + return err; + } + } + } + + return 0; +} + +static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + const char *obj_name, *kconfig, *btf_tmp_path; + struct bpf_object *obj; + char tmp_name[64]; + int err; + char *log_buf; + size_t log_size; + __u32 log_level; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", + path ? 
: "(mem buf)"); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); + } + + if (!OPTS_VALID(opts, bpf_object_open_opts)) + return ERR_PTR(-EINVAL); + + obj_name = OPTS_GET(opts, object_name, NULL); + if (obj_buf) { + if (!obj_name) { + snprintf(tmp_name, sizeof(tmp_name), "%lx-%lx", + (unsigned long)obj_buf, + (unsigned long)obj_buf_sz); + obj_name = tmp_name; + } + path = obj_name; + pr_debug("loading object '%s' from buffer\n", obj_name); + } + + log_buf = OPTS_GET(opts, kernel_log_buf, NULL); + log_size = OPTS_GET(opts, kernel_log_size, 0); + log_level = OPTS_GET(opts, kernel_log_level, 0); + if (log_size > UINT_MAX) + return ERR_PTR(-EINVAL); + if (log_size && !log_buf) + return ERR_PTR(-EINVAL); + + obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); + if (IS_ERR(obj)) + return obj; + + obj->log_buf = log_buf; + obj->log_size = log_size; + obj->log_level = log_level; + + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); + if (btf_tmp_path) { + if (strlen(btf_tmp_path) >= PATH_MAX) { + err = -ENAMETOOLONG; + goto out; + } + obj->btf_custom_path = strdup(btf_tmp_path); + if (!obj->btf_custom_path) { + err = -ENOMEM; + goto out; + } + } + + kconfig = OPTS_GET(opts, kconfig, NULL); + if (kconfig) { + obj->kconfig = strdup(kconfig); + if (!obj->kconfig) { + err = -ENOMEM; + goto out; + } + } + + err = bpf_object__elf_init(obj); + err = err ? : bpf_object__check_endianness(obj); + err = err ? : bpf_object__elf_collect(obj); + err = err ? : bpf_object__collect_externs(obj); + err = err ? : bpf_object_fixup_btf(obj); + err = err ? : bpf_object__init_maps(obj, opts); + err = err ? : bpf_object_init_progs(obj, opts); + err = err ? : bpf_object__collect_relos(obj); + if (err) + goto out; + + bpf_object__elf_finish(obj); + + return obj; +out: + bpf_object__close(obj); + return ERR_PTR(err); +} + +struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) +{ + if (!path) + return libbpf_err_ptr(-EINVAL); + + pr_debug("loading %s\n", path); + + return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); +} + +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + +struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts) +{ + if (!obj_buf || obj_buf_sz == 0) + return libbpf_err_ptr(-EINVAL); + + return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); +} + +static int bpf_object_unload(struct bpf_object *obj) +{ + size_t i; + + if (!obj) + return libbpf_err(-EINVAL); + + for (i = 0; i < obj->nr_maps; i++) { + zclose(obj->maps[i].fd); + if (obj->maps[i].st_ops) + zfree(&obj->maps[i].st_ops->kern_vdata); + } + + for (i = 0; i < obj->nr_programs; i++) + bpf_program__unload(&obj->programs[i]); + + return 0; +} + +static int bpf_object__sanitize_maps(struct bpf_object *obj) +{ + struct bpf_map *m; + + bpf_object__for_each_map(m, obj) { + if (!bpf_map__is_internal(m)) + continue; + if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) + m->def.map_flags &= ~BPF_F_MMAPABLE; + } + + return 0; +} + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +{ + char sym_type, sym_name[500]; + unsigned long long sym_addr; + int ret, err = 0; + FILE *f; + + f = fopen("/proc/kallsyms", "re"); + if (!f) { + err = -errno; + pr_warn("failed to open /proc/kallsyms: %d\n", err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %c %499s%*[^\n]\n", + &sym_addr, &sym_type, sym_name); + if (ret == EOF && feof(f)) + break; + if (ret != 3) { + 
pr_warn("failed to read kallsyms entry: %d\n", ret); + err = -EINVAL; + break; + } + + err = cb(sym_addr, sym_type, sym_name, ctx); + if (err) + break; + } + + fclose(f); + return err; +} + +static int kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct bpf_object *obj = ctx; + const struct btf_type *t; + struct extern_desc *ext; + + ext = find_extern_by_name(obj, sym_name); + if (!ext || ext->type != EXT_KSYM) + return 0; + + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + return 0; + + if (ext->is_set && ext->ksym.addr != sym_addr) { + pr_warn("extern (ksym) '%s': resolution is ambiguous: 0x%llx or 0x%llx\n", + sym_name, ext->ksym.addr, sym_addr); + return -EINVAL; + } + if (!ext->is_set) { + ext->is_set = true; + ext->ksym.addr = sym_addr; + pr_debug("extern (ksym) '%s': set to 0x%llx\n", sym_name, sym_addr); + } + return 0; +} + +static int bpf_object__read_kallsyms_file(struct bpf_object *obj) +{ + return libbpf_kallsyms_parse(kallsyms_cb, obj); +} + +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + struct module_btf **res_mod_btf) +{ + struct module_btf *mod_btf; + struct btf *btf; + int i, id, err; + + btf = obj->btf_vmlinux; + mod_btf = NULL; + id = btf__find_by_name_kind(btf, ksym_name, kind); + + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; + + for (i = 0; i < obj->btf_module_cnt; i++) { + /* we assume module_btf's BTF FD is always >0 */ + mod_btf = &obj->btf_modules[i]; + btf = mod_btf->btf; + id = btf__find_by_name_kind_own(btf, ksym_name, kind); + if (id != -ENOENT) + break; + } + } + if (id <= 0) + return -ESRCH; + + *res_btf = btf; + *res_mod_btf = mod_btf; + return id; +} + +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + struct module_btf *mod_btf = NULL; + const char *targ_var_name; + struct btf *btf = NULL; + int id, err; + + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &mod_btf); + if (id < 0) { + if (id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (var ksym) '%s': not found in kernel BTF\n", + ext->name); + return id; + } + + /* find local type_id */ + local_type_id = ext->ksym.type_id; + + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); + + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if (err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); + + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = mod_btf ? 
mod_btf->fd : 0; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + + return 0; +} + +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + struct module_btf *mod_btf = NULL; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, &kern_btf, &mod_btf); + if (kfunc_id < 0) { + if (kfunc_id == -ESRCH && ext->is_weak) + return 0; + pr_warn("extern (func ksym) '%s': not found in kernel or module BTFs\n", + ext->name); + return kfunc_id; + } + + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with %s [%d]\n", + ext->name, local_func_proto_id, + mod_btf ? mod_btf->name : "vmlinux", kfunc_proto_id); + return -EINVAL; + } + + /* set index for module BTF fd in fd_array, if unset */ + if (mod_btf && !mod_btf->fd_array_idx) { + /* insn->off is s16 */ + if (obj->fd_array_cnt == INT16_MAX) { + pr_warn("extern (func ksym) '%s': module BTF fd index %d too big to fit in bpf_insn offset\n", + ext->name, mod_btf->fd_array_idx); + return -E2BIG; + } + /* Cannot use index 0 for module BTF fd */ + if (!obj->fd_array_cnt) + obj->fd_array_cnt = 1; + + ret = libbpf_ensure_mem((void **)&obj->fd_array, &obj->fd_array_cap, sizeof(int), + obj->fd_array_cnt + 1); + if (ret) + return ret; + mod_btf->fd_array_idx = obj->fd_array_cnt; + /* we assume module BTF FD is always >0 */ + obj->fd_array[obj->fd_array_cnt++] = mod_btf->fd; + } + + ext->is_set = true; + ext->ksym.kernel_btf_id = kfunc_id; + ext->ksym.btf_fd_idx = mod_btf ? mod_btf->fd_array_idx : 0; + /* Also set kernel_btf_obj_fd to make sure that bpf_object__relocate_data() + * populates FD into ld_imm64 insn when it's used to point to kfunc. + * {kernel_btf_id, btf_fd_idx} -> fixup bpf_call. + * {kernel_btf_id, kernel_btf_obj_fd} -> fixup ld_imm64. + */ + ext->ksym.kernel_btf_obj_fd = mod_btf ? mod_btf->fd : 0; + pr_debug("extern (func ksym) '%s': resolved to %s [%d]\n", + ext->name, mod_btf ? 
mod_btf->name : "vmlinux", kfunc_id); + + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + if (obj->gen_loader) { + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = 0; + ext->ksym.kernel_btf_id = 0; + continue; + } + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); + if (err) + return err; + } + return 0; +} + +static int bpf_object__resolve_externs(struct bpf_object *obj, + const char *extra_kconfig) +{ + bool need_config = false, need_kallsyms = false; + bool need_vmlinux_btf = false; + struct extern_desc *ext; + void *kcfg_data = NULL; + int err, i; + + if (obj->nr_extern == 0) + return 0; + + if (obj->kconfig_map_idx >= 0) + kcfg_data = obj->maps[obj->kconfig_map_idx].mmaped; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (ext->type == EXT_KSYM) { + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; + continue; + } else if (ext->type == EXT_KCFG) { + void *ext_ptr = kcfg_data + ext->kcfg.data_off; + __u64 value = 0; + + /* Kconfig externs need actual /proc/config.gz */ + if (str_has_pfx(ext->name, "CONFIG_")) { + need_config = true; + continue; + } + + /* Virtual kcfg externs are customly handled by libbpf */ + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + value = get_kernel_version(); + if (!value) { + pr_warn("extern (kcfg) '%s': failed to get kernel version\n", ext->name); + return -EINVAL; + } + } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { + value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (strcmp(ext->name, "LINUX_HAS_SYSCALL_WRAPPER") == 0) { + value = kernel_supports(obj, FEAT_SYSCALL_WRAPPER); + } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { + /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed + * __kconfig externs, where LINUX_ ones are virtual and filled out + * customly by libbpf (their values don't come from Kconfig). + * If LINUX_xxx variable is not recognized by libbpf, but is marked + * __weak, it defaults to zero value, just like for CONFIG_xxx + * externs. 
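+ *
+ * In BPF program code such externs are typically declared along these
+ * lines (a sketch relying on the __kconfig/__weak helpers from
+ * bpf_helpers.h):
+ *   extern unsigned long CONFIG_HZ __kconfig;
+ *   extern unsigned int LINUX_KERNEL_VERSION __kconfig;
+ *   extern bool LINUX_HAS_BPF_COOKIE __kconfig __weak;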
+ */ + pr_warn("extern (kcfg) '%s': unrecognized virtual extern\n", ext->name); + return -EINVAL; + } + + err = set_kcfg_value_num(ext, ext_ptr, value); + if (err) + return err; + pr_debug("extern (kcfg) '%s': set to 0x%llx\n", + ext->name, (long long)value); + } else { + pr_warn("extern '%s': unrecognized extern kind\n", ext->name); + return -EINVAL; + } + } + if (need_config && extra_kconfig) { + err = bpf_object__read_kconfig_mem(obj, extra_kconfig, kcfg_data); + if (err) + return -EINVAL; + need_config = false; + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type == EXT_KCFG && !ext->is_set) { + need_config = true; + break; + } + } + } + if (need_config) { + err = bpf_object__read_kconfig_file(obj, kcfg_data); + if (err) + return -EINVAL; + } + if (need_kallsyms) { + err = bpf_object__read_kallsyms_file(obj); + if (err) + return -EINVAL; + } + if (need_vmlinux_btf) { + err = bpf_object__resolve_ksyms_btf_id(obj); + if (err) + return -EINVAL; + } + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + + if (!ext->is_set && !ext->is_weak) { + pr_warn("extern '%s' (strong): not resolved\n", ext->name); + return -ESRCH; + } else if (!ext->is_set) { + pr_debug("extern '%s' (weak): not resolved, defaulting to zero\n", + ext->name); + } + } + + return 0; +} + +static void bpf_map_prepare_vdata(const struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + __u32 i; + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } +} + +static int bpf_object_prepare_struct_ops(struct bpf_object *obj) +{ + int i; + + for (i = 0; i < obj->nr_maps; i++) + if (bpf_map__is_struct_ops(&obj->maps[i])) + bpf_map_prepare_vdata(&obj->maps[i]); + + return 0; +} + +static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) +{ + int err, i; + + if (!obj) + return libbpf_err(-EINVAL); + + if (obj->loaded) { + pr_warn("object '%s': load can't be attempted twice\n", obj->name); + return libbpf_err(-EINVAL); + } + + if (obj->gen_loader) + bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps); + + err = bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__load_progs(obj, extra_log_level); + err = err ? : bpf_object_init_prog_arrays(obj); + err = err ? 
: bpf_object_prepare_struct_ops(obj); + + if (obj->gen_loader) { + /* reset FDs */ + if (obj->btf) + btf__set_fd(obj->btf, -1); + for (i = 0; i < obj->nr_maps; i++) + obj->maps[i].fd = -1; + if (!err) + err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); + } + + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + free(obj->btf_modules); + + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; + + obj->loaded = true; /* doesn't matter if successfully or not */ + + if (err) + goto out; + + return 0; +out: + /* unpin any maps that were auto-pinned during load */ + for (i = 0; i < obj->nr_maps; i++) + if (obj->maps[i].pinned && !obj->maps[i].reused) + bpf_map__unpin(&obj->maps[i], NULL); + + bpf_object_unload(obj); + pr_warn("failed to load object '%s'\n", obj->path); + return libbpf_err(err); +} + +int bpf_object__load(struct bpf_object *obj) +{ + return bpf_object_load(obj, 0, NULL); +} + +static int make_parent_dir(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + char *dname, *dir; + int err = 0; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (mkdir(dir, 0700) && errno != EEXIST) + err = -errno; + + free(dname); + if (err) { + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to mkdir %s: %s\n", path, cp); + } + return err; +} + +static int check_path(const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct statfs st_fs; + char *dname, *dir; + int err = 0; + + if (path == NULL) + return -EINVAL; + + dname = strdup(path); + if (dname == NULL) + return -ENOMEM; + + dir = dirname(dname); + if (statfs(dir, &st_fs)) { + cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); + pr_warn("failed to statfs %s: %s\n", dir, cp); + err = -errno; + } + free(dname); + + if (!err && st_fs.f_type != BPF_FS_MAGIC) { + pr_warn("specified path %s is not on BPF FS\n", path); + err = -EINVAL; + } + + return err; +} + +int bpf_program__pin(struct bpf_program *prog, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + + err = check_path(path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(prog->fd, path)) { + err = -errno; + cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); + return libbpf_err(err); + } + + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); + return 0; +} + +int bpf_program__unpin(struct bpf_program *prog, const char *path) +{ + int err; + + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err) + return libbpf_err(-errno); + + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); + return 0; +} + +int bpf_map__pin(struct bpf_map *map, const char *path) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' 
already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_debug("map '%s' already pinned at '%s'; not re-pinning\n", + bpf_map__name(map), map->pin_path); + return 0; + } + } else { + if (!path) { + pr_warn("missing a path to pin map '%s' at\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_warn("map '%s' already pinned\n", bpf_map__name(map)); + return libbpf_err(-EEXIST); + } + + map->pin_path = strdup(path); + if (!map->pin_path) { + err = -errno; + goto out_err; + } + } + + err = make_parent_dir(map->pin_path); + if (err) + return libbpf_err(err); + + err = check_path(map->pin_path); + if (err) + return libbpf_err(err); + + if (bpf_obj_pin(map->fd, map->pin_path)) { + err = -errno; + goto out_err; + } + + map->pinned = true; + pr_debug("pinned map '%s'\n", map->pin_path); + + return 0; + +out_err: + cp = libbpf_strerror_r(-err, errmsg, sizeof(errmsg)); + pr_warn("failed to pin map: %s\n", cp); + return libbpf_err(err); +} + +int bpf_map__unpin(struct bpf_map *map, const char *path) +{ + int err; + + if (map == NULL) { + pr_warn("invalid map pointer\n"); + return libbpf_err(-EINVAL); + } + + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } + path = map->pin_path; + } else if (!path) { + pr_warn("no path to unpin map '%s' from\n", + bpf_map__name(map)); + return libbpf_err(-EINVAL); + } + + err = check_path(path); + if (err) + return libbpf_err(err); + + err = unlink(path); + if (err != 0) + return libbpf_err(-errno); + + map->pinned = false; + pr_debug("unpinned map '%s' from '%s'\n", bpf_map__name(map), path); + + return 0; +} + +int bpf_map__set_pin_path(struct bpf_map *map, const char *path) +{ + char *new = NULL; + + if (path) { + new = strdup(path); + if (!new) + return libbpf_err(-errno); + } + + free(map->pin_path); + map->pin_path = new; + return 0; +} + +__alias(bpf_map__pin_path) +const char *bpf_map__get_pin_path(const struct bpf_map *map); + +const char *bpf_map__pin_path(const struct bpf_map *map) +{ + return map->pin_path; +} + +bool bpf_map__is_pinned(const struct bpf_map *map) +{ + return map->pinned; +} + +static void sanitize_pin_path(char *s) +{ + /* bpffs disallows periods in path names */ + while (*s) { + if (*s == '.') + *s = '_'; + s++; + } +} + +int bpf_object__pin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (!map->autocreate) + continue; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + goto err_unpin_maps; + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__pin(map, pin_path); + if (err) + goto err_unpin_maps; + } + + return 0; + +err_unpin_maps: + while ((map = bpf_object__prev_map(obj, map))) { + if (!map->pin_path) + continue; + + bpf_map__unpin(map, NULL); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) +{ + struct bpf_map *map; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + 
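/*
 * Tutorial annotation (not part of upstream libbpf): a minimal sketch of how an
 * application might drive the map-pinning APIs implemented in this file. The
 * object file name "example.bpf.o" and the bpffs directory "/sys/fs/bpf/demo"
 * are placeholders, not anything defined by this patch.
 *
 *   static int pin_demo(void)
 *   {
 *       struct bpf_object *obj;
 *       int err;
 *
 *       obj = bpf_object__open_file("example.bpf.o", NULL);
 *       if (libbpf_get_error(obj))
 *           return -1;
 *       // pinning requires a loaded object (see the loaded check above)
 *       err = bpf_object__load(obj);
 *       if (!err)
 *           // pins every auto-created map under one bpffs directory; note that
 *           // sanitize_pin_path() rewrites '.' in map names to '_'
 *           err = bpf_object__pin_maps(obj, "/sys/fs/bpf/demo");
 *       // ... later, possibly from another process, drop the pins again:
 *       if (!err)
 *           err = bpf_object__unpin_maps(obj, "/sys/fs/bpf/demo");
 *       bpf_object__close(obj);
 *       return err;
 *   }
 */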
bpf_object__for_each_map(map, obj) { + char *pin_path = NULL; + char buf[PATH_MAX]; + + if (path) { + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); + sanitize_pin_path(buf); + pin_path = buf; + } else if (!map->pin_path) { + continue; + } + + err = bpf_map__unpin(map, pin_path); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + char buf[PATH_MAX]; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + if (!obj->loaded) { + pr_warn("object not yet loaded; load it first\n"); + return libbpf_err(-ENOENT); + } + + bpf_object__for_each_program(prog, obj) { + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + goto err_unpin_programs; + + err = bpf_program__pin(prog, buf); + if (err) + goto err_unpin_programs; + } + + return 0; + +err_unpin_programs: + while ((prog = bpf_object__prev_program(obj, prog))) { + if (pathname_concat(buf, sizeof(buf), path, prog->name)) + continue; + + bpf_program__unpin(prog, buf); + } + + return libbpf_err(err); +} + +int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) +{ + struct bpf_program *prog; + int err; + + if (!obj) + return libbpf_err(-ENOENT); + + bpf_object__for_each_program(prog, obj) { + char buf[PATH_MAX]; + + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); + + err = bpf_program__unpin(prog, buf); + if (err) + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__pin(struct bpf_object *obj, const char *path) +{ + int err; + + err = bpf_object__pin_maps(obj, path); + if (err) + return libbpf_err(err); + + err = bpf_object__pin_programs(obj, path); + if (err) { + bpf_object__unpin_maps(obj, path); + return libbpf_err(err); + } + + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map) +{ + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + + zfree(&map->init_slots); + map->init_slots_sz = 0; + + if (map->mmaped) { + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + munmap(map->mmaped, mmap_sz); + map->mmaped = NULL; + } + + if (map->st_ops) { + zfree(&map->st_ops->data); + zfree(&map->st_ops->progs); + zfree(&map->st_ops->kern_func_off); + zfree(&map->st_ops); + } + + zfree(&map->name); + zfree(&map->real_name); + zfree(&map->pin_path); + + if (map->fd >= 0) + zclose(map->fd); +} + +void bpf_object__close(struct bpf_object *obj) +{ + size_t i; + + if (IS_ERR_OR_NULL(obj)) + return; + + usdt_manager_free(obj->usdt_man); + obj->usdt_man = NULL; + + bpf_gen__free(obj->gen_loader); + bpf_object__elf_finish(obj); + bpf_object_unload(obj); + btf__free(obj->btf); + btf_ext__free(obj->btf_ext); + + for (i = 0; i < obj->nr_maps; i++) + bpf_map__destroy(&obj->maps[i]); + + zfree(&obj->btf_custom_path); + zfree(&obj->kconfig); + zfree(&obj->externs); + obj->nr_extern = 0; + + zfree(&obj->maps); + obj->nr_maps = 0; + + if (obj->programs && obj->nr_programs) { + for (i = 0; i < obj->nr_programs; i++) + bpf_program__exit(&obj->programs[i]); + } + zfree(&obj->programs); + + free(obj); +} + +const char *bpf_object__name(const struct bpf_object *obj) +{ + return obj ? obj->name : libbpf_err_ptr(-EINVAL); +} + +unsigned int bpf_object__kversion(const struct bpf_object *obj) +{ + return obj ? obj->kern_version : 0; +} + +struct btf *bpf_object__btf(const struct bpf_object *obj) +{ + return obj ? 
obj->btf : NULL; +} + +int bpf_object__btf_fd(const struct bpf_object *obj) +{ + return obj->btf ? btf__fd(obj->btf) : -1; +} + +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return libbpf_err(-EINVAL); + + obj->kern_version = kern_version; + + return 0; +} + +int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) +{ + struct bpf_gen *gen; + + if (!opts) + return -EFAULT; + if (!OPTS_VALID(opts, gen_loader_opts)) + return -EINVAL; + gen = calloc(sizeof(*gen), 1); + if (!gen) + return -ENOMEM; + gen->opts = opts; + obj->gen_loader = gen; + return 0; +} + +static struct bpf_program * +__bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, + bool forward) +{ + size_t nr_programs = obj->nr_programs; + ssize_t idx; + + if (!nr_programs) + return NULL; + + if (!p) + /* Iter from the beginning */ + return forward ? &obj->programs[0] : + &obj->programs[nr_programs - 1]; + + if (p->obj != obj) { + pr_warn("error: program handler doesn't match object\n"); + return errno = EINVAL, NULL; + } + + idx = (p - obj->programs) + (forward ? 1 : -1); + if (idx >= obj->nr_programs || idx < 0) + return NULL; + return &obj->programs[idx]; +} + +struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) +{ + struct bpf_program *prog = prev; + + do { + prog = __bpf_program__iter(prog, obj, true); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) +{ + struct bpf_program *prog = next; + + do { + prog = __bpf_program__iter(prog, obj, false); + } while (prog && prog_is_subprog(obj, prog)); + + return prog; +} + +void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) +{ + prog->prog_ifindex = ifindex; +} + +const char *bpf_program__name(const struct bpf_program *prog) +{ + return prog->name; +} + +const char *bpf_program__section_name(const struct bpf_program *prog) +{ + return prog->sec_name; +} + +bool bpf_program__autoload(const struct bpf_program *prog) +{ + return prog->autoload; +} + +int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) +{ + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + prog->autoload = autoload; + return 0; +} + +bool bpf_program__autoattach(const struct bpf_program *prog) +{ + return prog->autoattach; +} + +void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach) +{ + prog->autoattach = autoattach; +} + +const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) +{ + return prog->insns; +} + +size_t bpf_program__insn_cnt(const struct bpf_program *prog) +{ + return prog->insns_cnt; +} + +int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt) +{ + struct bpf_insn *insns; + + if (prog->obj->loaded) + return -EBUSY; + + insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); + /* NULL is a valid return from reallocarray if the new count is zero */ + if (!insns && new_insn_cnt) { + pr_warn("prog '%s': failed to realloc prog code\n", prog->name); + return -ENOMEM; + } + memcpy(insns, new_insns, new_insn_cnt * sizeof(*insns)); + + prog->insns = insns; + prog->insns_cnt = new_insn_cnt; + return 0; +} + +int bpf_program__fd(const struct bpf_program *prog) +{ + if (!prog) + return libbpf_err(-EINVAL); + + if (prog->fd < 0) + return libbpf_err(-ENOENT); + + return prog->fd; +} + 
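/*
 * Tutorial annotation (not part of upstream libbpf): the per-program accessors
 * above are what an application typically calls between bpf_object__open() and
 * bpf_object__load(). A small sketch, where `obj` is assumed to be an
 * already-opened (not yet loaded) object and the section name
 * "kprobe/do_unlinkat" is just an example:
 *
 *   struct bpf_program *prog;
 *
 *   bpf_object__for_each_program(prog, obj) {
 *       // subprograms are skipped by the iterator; only entry programs appear
 *       printf("%s (SEC(\"%s\")): %zu insns\n",
 *              bpf_program__name(prog),
 *              bpf_program__section_name(prog),
 *              bpf_program__insn_cnt(prog));
 *       // only load the program we actually plan to attach
 *       bpf_program__set_autoload(prog,
 *              strcmp(bpf_program__section_name(prog), "kprobe/do_unlinkat") == 0);
 *   }
 */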
+__alias(bpf_program__type) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); + +enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) +{ + return prog->type; +} + +static size_t custom_sec_def_cnt; +static struct bpf_sec_def *custom_sec_defs; +static struct bpf_sec_def custom_fallback_def; +static bool has_custom_fallback_def; +static int last_custom_sec_def_handler_id; + +int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + /* if type is not changed, do nothing */ + if (prog->type == type) + return 0; + + prog->type = type; + + /* If a program type was changed, we need to reset associated SEC() + * handler, as it will be invalid now. The only exception is a generic + * fallback handler, which by definition is program type-agnostic and + * is a catch-all custom handler, optionally set by the application, + * so should be able to handle any type of BPF program. + */ + if (prog->sec_def != &custom_fallback_def) + prog->sec_def = NULL; + return 0; +} + +__alias(bpf_program__expected_attach_type) +enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); + +enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program *prog) +{ + return prog->expected_attach_type; +} + +int bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->expected_attach_type = type; + return 0; +} + +__u32 bpf_program__flags(const struct bpf_program *prog) +{ + return prog->prog_flags; +} + +int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->prog_flags = flags; + return 0; +} + +__u32 bpf_program__log_level(const struct bpf_program *prog) +{ + return prog->log_level; +} + +int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) +{ + if (prog->obj->loaded) + return libbpf_err(-EBUSY); + + prog->log_level = log_level; + return 0; +} + +const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size) +{ + *log_size = prog->log_size; + return prog->log_buf; +} + +int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size) +{ + if (log_size && !log_buf) + return -EINVAL; + if (prog->log_size > UINT_MAX) + return -EINVAL; + if (prog->obj->loaded) + return -EBUSY; + + prog->log_buf = log_buf; + prog->log_size = log_size; + return 0; +} + +#define SEC_DEF(sec_pfx, ptype, atype, flags, ...) 
{ \ + .sec = (char *)sec_pfx, \ + .prog_type = BPF_PROG_TYPE_##ptype, \ + .expected_attach_type = atype, \ + .cookie = (long)(flags), \ + .prog_prepare_load_fn = libbpf_prepare_prog_load, \ + __VA_ARGS__ \ +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); + +static const struct bpf_sec_def section_defs[] = { + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), + SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), + SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), + SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), + SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, 
attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), + SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), + SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), + SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), + SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), + SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), + SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), + SEC_DEF("struct_ops+", STRUCT_OPS, 
0, SEC_NONE), + SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), + SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), +}; + +int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts) +{ + struct bpf_sec_def *sec_def; + + if (!OPTS_VALID(opts, libbpf_prog_handler_opts)) + return libbpf_err(-EINVAL); + + if (last_custom_sec_def_handler_id == INT_MAX) /* prevent overflow */ + return libbpf_err(-E2BIG); + + if (sec) { + sec_def = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt + 1, + sizeof(*sec_def)); + if (!sec_def) + return libbpf_err(-ENOMEM); + + custom_sec_defs = sec_def; + sec_def = &custom_sec_defs[custom_sec_def_cnt]; + } else { + if (has_custom_fallback_def) + return libbpf_err(-EBUSY); + + sec_def = &custom_fallback_def; + } + + sec_def->sec = sec ? strdup(sec) : NULL; + if (sec && !sec_def->sec) + return libbpf_err(-ENOMEM); + + sec_def->prog_type = prog_type; + sec_def->expected_attach_type = exp_attach_type; + sec_def->cookie = OPTS_GET(opts, cookie, 0); + + sec_def->prog_setup_fn = OPTS_GET(opts, prog_setup_fn, NULL); + sec_def->prog_prepare_load_fn = OPTS_GET(opts, prog_prepare_load_fn, NULL); + sec_def->prog_attach_fn = OPTS_GET(opts, prog_attach_fn, NULL); + + sec_def->handler_id = ++last_custom_sec_def_handler_id; + + if (sec) + custom_sec_def_cnt++; + else + has_custom_fallback_def = true; + + return sec_def->handler_id; +} + +int libbpf_unregister_prog_handler(int handler_id) +{ + struct bpf_sec_def *sec_defs; + int i; + + if (handler_id <= 0) + return libbpf_err(-EINVAL); + + if (has_custom_fallback_def && custom_fallback_def.handler_id == handler_id) { + memset(&custom_fallback_def, 0, sizeof(custom_fallback_def)); + has_custom_fallback_def = false; + return 0; + } + + for (i = 0; i < custom_sec_def_cnt; i++) { + if (custom_sec_defs[i].handler_id == handler_id) + break; + } + + if (i == custom_sec_def_cnt) + return libbpf_err(-ENOENT); + + free(custom_sec_defs[i].sec); + for (i = i + 1; i < custom_sec_def_cnt; i++) + custom_sec_defs[i - 1] = custom_sec_defs[i]; + custom_sec_def_cnt--; + + /* try to shrink the array, but it's ok if we couldn't */ + sec_defs = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt, sizeof(*sec_defs)); + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (sec_defs || custom_sec_def_cnt == 0) + custom_sec_defs = sec_defs; + + return 0; +} + +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) +{ + size_t len = strlen(sec_def->sec); + + /* "type/" always has to have proper SEC("type/extras") form */ + if (sec_def->sec[len - 1] == '/') { + if (str_has_pfx(sec_name, sec_def->sec)) + return true; + return false; + } + + /* "type+" means it can be either exact SEC("type") or + * well-formed SEC("type/extras") with proper '/' separator + */ + if (sec_def->sec[len - 1] == '+') { + len--; + /* not even a prefix */ + if (strncmp(sec_name, sec_def->sec, len) != 0) + return false; + /* exact match or has '/' separator */ + if (sec_name[len] == '\0' || sec_name[len] == '/') + return true; + return false; + } + + return strcmp(sec_name, sec_def->sec) == 0; +} + +static const struct bpf_sec_def *find_sec_def(const char *sec_name) +{ + const struct bpf_sec_def *sec_def; 
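/*
 * Tutorial annotation (not part of upstream libbpf): sec_def_matches() above is
 * what gives SEC() strings their semantics -- a definition ending in '+'
 * (e.g. "kprobe+") accepts both SEC("kprobe") and SEC("kprobe/something"), one
 * ending in '/' requires the "type/extras" form, and anything else must match
 * exactly. An application can query the resulting mapping; the section string
 * below is only an example:
 *
 *   enum bpf_prog_type ptype;
 *   enum bpf_attach_type atype;
 *
 *   if (!libbpf_prog_type_by_name("kprobe/do_unlinkat", &ptype, &atype))
 *       printf("resolved to prog type %s\n", libbpf_bpf_prog_type_str(ptype));
 */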
+ int i, n; + + n = custom_sec_def_cnt; + for (i = 0; i < n; i++) { + sec_def = &custom_sec_defs[i]; + if (sec_def_matches(sec_def, sec_name)) + return sec_def; + } + + n = ARRAY_SIZE(section_defs); + for (i = 0; i < n; i++) { + sec_def = &section_defs[i]; + if (sec_def_matches(sec_def, sec_name)) + return sec_def; + } + + if (has_custom_fallback_def) + return &custom_fallback_def; + + return NULL; +} + +#define MAX_TYPE_NAME_SIZE 32 + +static char *libbpf_get_type_names(bool attach_type) +{ + int i, len = ARRAY_SIZE(section_defs) * MAX_TYPE_NAME_SIZE; + char *buf; + + buf = malloc(len); + if (!buf) + return NULL; + + buf[0] = '\0'; + /* Forge string buf with all available names */ + for (i = 0; i < ARRAY_SIZE(section_defs); i++) { + const struct bpf_sec_def *sec_def = &section_defs[i]; + + if (attach_type) { + if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load) + continue; + + if (!(sec_def->cookie & SEC_ATTACHABLE)) + continue; + } + + if (strlen(buf) + strlen(section_defs[i].sec) + 2 > len) { + free(buf); + return NULL; + } + strcat(buf, " "); + strcat(buf, section_defs[i].sec); + } + + return buf; +} + +int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type) +{ + const struct bpf_sec_def *sec_def; + char *type_names; + + if (!name) + return libbpf_err(-EINVAL); + + sec_def = find_sec_def(name); + if (sec_def) { + *prog_type = sec_def->prog_type; + *expected_attach_type = sec_def->expected_attach_type; + return 0; + } + + pr_debug("failed to guess program type from ELF section '%s'\n", name); + type_names = libbpf_get_type_names(false); + if (type_names != NULL) { + pr_debug("supported section(type) names are:%s\n", type_names); + free(type_names); + } + + return libbpf_err(-ESRCH); +} + +const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(attach_type_name)) + return NULL; + + return attach_type_name[t]; +} + +const char *libbpf_bpf_link_type_str(enum bpf_link_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(link_type_name)) + return NULL; + + return link_type_name[t]; +} + +const char *libbpf_bpf_map_type_str(enum bpf_map_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(map_type_name)) + return NULL; + + return map_type_name[t]; +} + +const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(prog_type_name)) + return NULL; + + return prog_type_name[t]; +} + +static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, + int sec_idx, + size_t offset) +{ + struct bpf_map *map; + size_t i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + if (!bpf_map__is_struct_ops(map)) + continue; + if (map->sec_idx == sec_idx && + map->sec_offset <= offset && + offset - map->sec_offset < map->def.value_size) + return map; + } + + return NULL; +} + +/* Collect the reloc from ELF and populate the st_ops->progs[] */ +static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, + Elf64_Shdr *shdr, Elf_Data *data) +{ + const struct btf_member *member; + struct bpf_struct_ops *st_ops; + struct bpf_program *prog; + unsigned int shdr_idx; + const struct btf *btf; + struct bpf_map *map; + unsigned int moff, insn_idx; + const char *name; + __u32 member_idx; + Elf64_Sym *sym; + Elf64_Rel *rel; + int i, nrels; + + btf = obj->btf; + nrels = shdr->sh_size / shdr->sh_entsize; + for (i = 0; i < nrels; i++) { + rel = elf_rel_by_idx(data, i); + if (!rel) { + pr_warn("struct_ops reloc: failed to get %d reloc\n", i); + return -LIBBPF_ERRNO__FORMAT;
+ } + + sym = elf_sym_by_idx(obj, ELF64_R_SYM(rel->r_info)); + if (!sym) { + pr_warn("struct_ops reloc: symbol %zx not found\n", + (size_t)ELF64_R_SYM(rel->r_info)); + return -LIBBPF_ERRNO__FORMAT; + } + + name = elf_sym_str(obj, sym->st_name) ?: ""; + map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset); + if (!map) { + pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n", + (size_t)rel->r_offset); + return -EINVAL; + } + + moff = rel->r_offset - map->sec_offset; + shdr_idx = sym->st_shndx; + st_ops = map->st_ops; + pr_debug("struct_ops reloc %s: for %lld value %lld shdr_idx %u rel->r_offset %zu map->sec_offset %zu name %d (\'%s\')\n", + map->name, + (long long)(rel->r_info >> 32), + (long long)sym->st_value, + shdr_idx, (size_t)rel->r_offset, + map->sec_offset, sym->st_name, name); + + if (shdr_idx >= SHN_LORESERVE) { + pr_warn("struct_ops reloc %s: rel->r_offset %zu shdr_idx %u unsupported non-static function\n", + map->name, (size_t)rel->r_offset, shdr_idx); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("struct_ops reloc %s: invalid target program offset %llu\n", + map->name, (unsigned long long)sym->st_value); + return -LIBBPF_ERRNO__FORMAT; + } + insn_idx = sym->st_value / BPF_INSN_SZ; + + member = find_member_by_offset(st_ops->type, moff * 8); + if (!member) { + pr_warn("struct_ops reloc %s: cannot find member at moff %u\n", + map->name, moff); + return -EINVAL; + } + member_idx = member - btf_members(st_ops->type); + name = btf__name_by_offset(btf, member->name_off); + + if (!resolve_func_ptr(btf, member->type, NULL)) { + pr_warn("struct_ops reloc %s: cannot relocate non func ptr %s\n", + map->name, name); + return -EINVAL; + } + + prog = find_prog_by_sec_insn(obj, shdr_idx, insn_idx); + if (!prog) { + pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n", + map->name, shdr_idx, name); + return -EINVAL; + } + + /* prevent the use of BPF prog with invalid type */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("struct_ops reloc %s: prog %s is not struct_ops BPF program\n", + map->name, prog->name); + return -EINVAL; + } + + /* if we haven't yet processed this BPF program, record proper + * attach_btf_id and member_idx + */ + if (!prog->attach_btf_id) { + prog->attach_btf_id = st_ops->type_id; + prog->expected_attach_type = member_idx; + } + + /* struct_ops BPF prog can be re-used between multiple + * .struct_ops & .struct_ops.link as long as it's the + * same struct_ops struct definition and the same + * function pointer field + */ + if (prog->attach_btf_id != st_ops->type_id || + prog->expected_attach_type != member_idx) { + pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", + map->name, prog->name, prog->sec_name, prog->type, + prog->attach_btf_id, prog->expected_attach_type, name); + return -EINVAL; + } + + st_ops->progs[member_idx] = prog; + } + + return 0; +} + +#define BTF_TRACE_PREFIX "btf_trace_" +#define BTF_LSM_PREFIX "bpf_lsm_" +#define BTF_ITER_PREFIX "bpf_iter_" +#define BTF_MAX_NAME_SIZE 128 + +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind) +{ + switch (attach_type) { + case BPF_TRACE_RAW_TP: + *prefix = BTF_TRACE_PREFIX; + *kind = BTF_KIND_TYPEDEF; + break; + case BPF_LSM_MAC: + case BPF_LSM_CGROUP: + *prefix = BTF_LSM_PREFIX; + *kind = BTF_KIND_FUNC; + break; + case BPF_TRACE_ITER: + *prefix = BTF_ITER_PREFIX; + *kind = 
BTF_KIND_FUNC; + break; + default: + *prefix = ""; + *kind = BTF_KIND_FUNC; + } +} + +static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, + const char *name, __u32 kind) +{ + char btf_type_name[BTF_MAX_NAME_SIZE]; + int ret; + + ret = snprintf(btf_type_name, sizeof(btf_type_name), + "%s%s", prefix, name); + /* snprintf returns the number of characters written excluding the + * terminating null. So, if >= BTF_MAX_NAME_SIZE are written, it + * indicates truncation. + */ + if (ret < 0 || ret >= sizeof(btf_type_name)) + return -ENAMETOOLONG; + return btf__find_by_name_kind(btf, btf_type_name, kind); +} + +static inline int find_attach_btf_id(struct btf *btf, const char *name, + enum bpf_attach_type attach_type) +{ + const char *prefix; + int kind; + + btf_get_kernel_prefix_kind(attach_type, &prefix, &kind); + return find_btf_by_prefix_kind(btf, prefix, name, kind); +} + +int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type) +{ + struct btf *btf; + int err; + + btf = btf__load_vmlinux_btf(); + err = libbpf_get_error(btf); + if (err) { + pr_warn("vmlinux BTF is not found\n"); + return libbpf_err(err); + } + + err = find_attach_btf_id(btf, name, attach_type); + if (err <= 0) + pr_warn("%s is not found in vmlinux BTF\n", name); + + btf__free(btf); + return libbpf_err(err); +} + +static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + struct btf *btf; + int err; + + memset(&info, 0, info_len); + err = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len); + if (err) { + pr_warn("failed bpf_prog_get_info_by_fd for FD %d: %d\n", + attach_prog_fd, err); + return err; + } + + err = -EINVAL; + if (!info.btf_id) { + pr_warn("The target program doesn't have BTF\n"); + goto out; + } + btf = btf__load_from_kernel_by_id(info.btf_id); + err = libbpf_get_error(btf); + if (err) { + pr_warn("Failed to get BTF %d of the program: %d\n", info.btf_id, err); + goto out; + } + err = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC); + btf__free(btf); + if (err <= 0) { + pr_warn("%s is not found in prog's BTF\n", name); + goto out; + } +out: + return err; +} + +static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, + enum bpf_attach_type attach_type, + int *btf_obj_fd, int *btf_type_id) +{ + int ret, i; + + ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; + + ret = load_module_btfs(obj); + if (ret) + return ret; + + for (i = 0; i < obj->btf_module_cnt; i++) { + const struct module_btf *mod = &obj->btf_modules[i]; + + ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (ret > 0) { + *btf_obj_fd = mod->fd; + *btf_type_id = ret; + return 0; + } + if (ret == -ENOENT) + continue; + + return ret; + } + + return -ESRCH; +} + +static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attach_name, + int *btf_obj_fd, int *btf_type_id) +{ + enum bpf_attach_type attach_type = prog->expected_attach_type; + __u32 attach_prog_fd = prog->attach_prog_fd; + int err = 0; + + /* BPF program's BTF ID */ + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } + err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); + if (err < 0) { + pr_warn("prog '%s': failed to find BPF 
program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); + return err; + } + *btf_obj_fd = 0; + *btf_type_id = err; + return 0; + } + + /* kernel/module BTF ID */ + if (prog->obj->gen_loader) { + bpf_gen__record_attach_target(prog->obj->gen_loader, attach_name, attach_type); + *btf_obj_fd = 0; + *btf_type_id = 1; + } else { + err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + } + if (err) { + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); + return err; + } + return 0; +} + +int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type) +{ + char *type_names; + const struct bpf_sec_def *sec_def; + + if (!name) + return libbpf_err(-EINVAL); + + sec_def = find_sec_def(name); + if (!sec_def) { + pr_debug("failed to guess attach type based on ELF section name '%s'\n", name); + type_names = libbpf_get_type_names(true); + if (type_names != NULL) { + pr_debug("attachable section(type) names are:%s\n", type_names); + free(type_names); + } + + return libbpf_err(-EINVAL); + } + + if (sec_def->prog_prepare_load_fn != libbpf_prepare_prog_load) + return libbpf_err(-EINVAL); + if (!(sec_def->cookie & SEC_ATTACHABLE)) + return libbpf_err(-EINVAL); + + *attach_type = sec_def->expected_attach_type; + return 0; +} + +int bpf_map__fd(const struct bpf_map *map) +{ + return map ? map->fd : libbpf_err(-EINVAL); +} + +static bool map_uses_real_name(const struct bpf_map *map) +{ + /* Since libbpf started to support custom .data.* and .rodata.* maps, + * their user-visible name differs from kernel-visible name. Users see + * such map's corresponding ELF section name as a map name. + * This check distinguishes .data/.rodata from .data.* and .rodata.* + * maps to know which name has to be returned to the user. 
+ */ + if (map->libbpf_type == LIBBPF_MAP_DATA && strcmp(map->real_name, DATA_SEC) != 0) + return true; + if (map->libbpf_type == LIBBPF_MAP_RODATA && strcmp(map->real_name, RODATA_SEC) != 0) + return true; + return false; +} + +const char *bpf_map__name(const struct bpf_map *map) +{ + if (!map) + return NULL; + + if (map_uses_real_name(map)) + return map->real_name; + + return map->name; +} + +enum bpf_map_type bpf_map__type(const struct bpf_map *map) +{ + return map->def.type; +} + +int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.type = type; + return 0; +} + +__u32 bpf_map__map_flags(const struct bpf_map *map) +{ + return map->def.map_flags; +} + +int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.map_flags = flags; + return 0; +} + +__u64 bpf_map__map_extra(const struct bpf_map *map) +{ + return map->map_extra; +} + +int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_extra = map_extra; + return 0; +} + +__u32 bpf_map__numa_node(const struct bpf_map *map) +{ + return map->numa_node; +} + +int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->numa_node = numa_node; + return 0; +} + +__u32 bpf_map__key_size(const struct bpf_map *map) +{ + return map->def.key_size; +} + +int bpf_map__set_key_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->def.key_size = size; + return 0; +} + +__u32 bpf_map__value_size(const struct bpf_map *map) +{ + return map->def.value_size; +} + +static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) +{ + struct btf *btf; + struct btf_type *datasec_type, *var_type; + struct btf_var_secinfo *var; + const struct btf_type *array_type; + const struct btf_array *array; + int vlen, element_sz, new_array_id; + __u32 nr_elements; + + /* check btf existence */ + btf = bpf_object__btf(map->obj); + if (!btf) + return -ENOENT; + + /* verify map is datasec */ + datasec_type = btf_type_by_id(btf, bpf_map__btf_value_type_id(map)); + if (!btf_is_datasec(datasec_type)) { + pr_warn("map '%s': cannot be resized, map value type is not a datasec\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify datasec has at least one var */ + vlen = btf_vlen(datasec_type); + if (vlen == 0) { + pr_warn("map '%s': cannot be resized, map value datasec is empty\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify last var in the datasec is an array */ + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + array_type = skip_mods_and_typedefs(btf, var_type->type, NULL); + if (!btf_is_array(array_type)) { + pr_warn("map '%s': cannot be resized, last var must be an array\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify request size aligns with array */ + array = btf_array(array_type); + element_sz = btf__resolve_size(btf, array->type); + if (element_sz <= 0 || (size - var->offset) % element_sz != 0) { + pr_warn("map '%s': cannot be resized, element size (%d) doesn't align with new total size (%u)\n", + bpf_map__name(map), element_sz, size); + return -EINVAL; + } + + /* create a new array based on the existing array, but with new length */ + nr_elements = (size - var->offset) / element_sz; + new_array_id = btf__add_array(btf, array->index_type, array->type, nr_elements); + if 
(new_array_id < 0) + return new_array_id; + + /* adding a new btf type invalidates existing pointers to btf objects, + * so refresh pointers before proceeding + */ + datasec_type = btf_type_by_id(btf, map->btf_value_type_id); + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + + /* finally update btf info */ + datasec_type->size = size; + var->size = size - var->offset; + var_type->type = new_array_id; + + return 0; +} + +int bpf_map__set_value_size(struct bpf_map *map, __u32 size) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + + if (map->mmaped) { + int err; + size_t mmap_old_sz, mmap_new_sz; + + mmap_old_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + mmap_new_sz = bpf_map_mmap_sz(size, map->def.max_entries); + err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); + if (err) { + pr_warn("map '%s': failed to resize memory-mapped region: %d\n", + bpf_map__name(map), err); + return err; + } + err = map_btf_datasec_resize(map, size); + if (err && err != -ENOENT) { + pr_warn("map '%s': failed to adjust resized BTF, clearing BTF key/value info: %d\n", + bpf_map__name(map), err); + map->btf_value_type_id = 0; + map->btf_key_type_id = 0; + } + } + + map->def.value_size = size; + return 0; +} + +__u32 bpf_map__btf_key_type_id(const struct bpf_map *map) +{ + return map ? map->btf_key_type_id : 0; +} + +__u32 bpf_map__btf_value_type_id(const struct bpf_map *map) +{ + return map ? map->btf_value_type_id : 0; +} + +int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size) +{ + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || + size != map->def.value_size || map->fd >= 0) + return libbpf_err(-EINVAL); + + memcpy(map->mmaped, data, size); + return 0; +} + +void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +{ + if (!map->mmaped) + return NULL; + *psize = map->def.value_size; + return map->mmaped; +} + +bool bpf_map__is_internal(const struct bpf_map *map) +{ + return map->libbpf_type != LIBBPF_MAP_UNSPEC; +} + +__u32 bpf_map__ifindex(const struct bpf_map *map) +{ + return map->map_ifindex; +} + +int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex) +{ + if (map->fd >= 0) + return libbpf_err(-EBUSY); + map->map_ifindex = ifindex; + return 0; +} + +int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd) +{ + if (!bpf_map_type__is_map_in_map(map->def.type)) { + pr_warn("error: unsupported map type\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map_fd != -1) { + pr_warn("error: inner_map_fd already specified\n"); + return libbpf_err(-EINVAL); + } + if (map->inner_map) { + bpf_map__destroy(map->inner_map); + zfree(&map->inner_map); + } + map->inner_map_fd = fd; + return 0; +} + +static struct bpf_map * +__bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) +{ + ssize_t idx; + struct bpf_map *s, *e; + + if (!obj || !obj->maps) + return errno = EINVAL, NULL; + + s = obj->maps; + e = obj->maps + obj->nr_maps; + + if ((m < s) || (m >= e)) { + pr_warn("error in %s: map handler doesn't belong to object\n", + __func__); + return errno = EINVAL, NULL; + } + + idx = (m - obj->maps) + i; + if (idx >= obj->nr_maps || idx < 0) + return NULL; + return &obj->maps[idx]; +} + +struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) +{ + if (prev == NULL) + return obj->maps; + + return __bpf_map__iter(prev, obj, 1); +} + +struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map 
*next) +{ + if (next == NULL) { + if (!obj->nr_maps) + return NULL; + return obj->maps + obj->nr_maps - 1; + } + + return __bpf_map__iter(next, obj, -1); +} + +struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name) +{ + struct bpf_map *pos; + + bpf_object__for_each_map(pos, obj) { + /* if it's a special internal map name (which always starts + * with dot) then check if that special name matches the + * real map name (ELF section name) + */ + if (name[0] == '.') { + if (pos->real_name && strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + /* otherwise map name has to be an exact match */ + if (map_uses_real_name(pos)) { + if (strcmp(pos->real_name, name) == 0) + return pos; + continue; + } + if (strcmp(pos->name, name) == 0) + return pos; + } + return errno = ENOENT, NULL; +} + +int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) +{ + return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); +} + +static int validate_map_op(const struct bpf_map *map, size_t key_sz, + size_t value_sz, bool check_value_sz) +{ + if (map->fd <= 0) + return -ENOENT; + + if (map->def.key_size != key_sz) { + pr_warn("map '%s': unexpected key size %zu provided, expected %u\n", + map->name, key_sz, map->def.key_size); + return -EINVAL; + } + + if (!check_value_sz) + return 0; + + switch (map->def.type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: { + int num_cpu = libbpf_num_possible_cpus(); + size_t elem_sz = roundup(map->def.value_size, 8); + + if (value_sz != num_cpu * elem_sz) { + pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", + map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); + return -EINVAL; + } + break; + } + default: + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + return 0; +} + +int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_update_elem(map->fd, key, value, flags); +} + +int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return libbpf_err(err); + + return bpf_map_delete_elem_flags(map->fd, key, flags); +} + +int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; + + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_lookup_and_delete_elem_flags(map->fd, key, value, flags); +} + +int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) + return 
libbpf_err(err); + + return bpf_map_get_next_key(map->fd, cur_key, next_key); +} + +long libbpf_get_error(const void *ptr) +{ + if (!IS_ERR_OR_NULL(ptr)) + return 0; + + if (IS_ERR(ptr)) + errno = -PTR_ERR(ptr); + + /* If ptr == NULL, then errno should be already set by the failing + * API, because libbpf never returns NULL on success and it now always + * sets errno on error. So no extra errno handling for ptr == NULL + * case. + */ + return -errno; +} + +/* Replace link's underlying BPF program with the new one */ +int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) +{ + int ret; + + ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + return libbpf_err_errno(ret); +} + +/* Release "ownership" of underlying BPF resource (typically, BPF program + * attached to some BPF hook, e.g., tracepoint, kprobe, etc). Disconnected + * link, when destructed through bpf_link__destroy() call won't attempt to + * detach/unregisted that BPF resource. This is useful in situations where, + * say, attached BPF program has to outlive userspace program that attached it + * in the system. Depending on type of BPF program, though, there might be + * additional steps (like pinning BPF program in BPF FS) necessary to ensure + * exit of userspace program doesn't trigger automatic detachment and clean up + * inside the kernel. + */ +void bpf_link__disconnect(struct bpf_link *link) +{ + link->disconnected = true; +} + +int bpf_link__destroy(struct bpf_link *link) +{ + int err = 0; + + if (IS_ERR_OR_NULL(link)) + return 0; + + if (!link->disconnected && link->detach) + err = link->detach(link); + if (link->pin_path) + free(link->pin_path); + if (link->dealloc) + link->dealloc(link); + else + free(link); + + return libbpf_err(err); +} + +int bpf_link__fd(const struct bpf_link *link) +{ + return link->fd; +} + +const char *bpf_link__pin_path(const struct bpf_link *link) +{ + return link->pin_path; +} + +static int bpf_link__detach_fd(struct bpf_link *link) +{ + return libbpf_err_errno(close(link->fd)); +} + +struct bpf_link *bpf_link__open(const char *path) +{ + struct bpf_link *link; + int fd; + + fd = bpf_obj_get(path); + if (fd < 0) { + fd = -errno; + pr_warn("failed to open link at %s: %d\n", path, fd); + return libbpf_err_ptr(fd); + } + + link = calloc(1, sizeof(*link)); + if (!link) { + close(fd); + return libbpf_err_ptr(-ENOMEM); + } + link->detach = &bpf_link__detach_fd; + link->fd = fd; + + link->pin_path = strdup(path); + if (!link->pin_path) { + bpf_link__destroy(link); + return libbpf_err_ptr(-ENOMEM); + } + + return link; +} + +int bpf_link__detach(struct bpf_link *link) +{ + return bpf_link_detach(link->fd) ? 
-errno : 0; +} + +int bpf_link__pin(struct bpf_link *link, const char *path) +{ + int err; + + if (link->pin_path) + return libbpf_err(-EBUSY); + err = make_parent_dir(path); + if (err) + return libbpf_err(err); + err = check_path(path); + if (err) + return libbpf_err(err); + + link->pin_path = strdup(path); + if (!link->pin_path) + return libbpf_err(-ENOMEM); + + if (bpf_obj_pin(link->fd, link->pin_path)) { + err = -errno; + zfree(&link->pin_path); + return libbpf_err(err); + } + + pr_debug("link fd=%d: pinned at %s\n", link->fd, link->pin_path); + return 0; +} + +int bpf_link__unpin(struct bpf_link *link) +{ + int err; + + if (!link->pin_path) + return libbpf_err(-EINVAL); + + err = unlink(link->pin_path); + if (err != 0) + return -errno; + + pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path); + zfree(&link->pin_path); + return 0; +} + +struct bpf_link_perf { + struct bpf_link link; + int perf_event_fd; + /* legacy kprobe support: keep track of probe identifier and type */ + char *legacy_probe_name; + bool legacy_is_kprobe; + bool legacy_is_retprobe; +}; + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe); +static int remove_uprobe_event_legacy(const char *probe_name, bool retprobe); + +static int bpf_link_perf_detach(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + int err = 0; + + if (ioctl(perf_link->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0) < 0) + err = -errno; + + if (perf_link->perf_event_fd != link->fd) + close(perf_link->perf_event_fd); + close(link->fd); + + /* legacy uprobe/kprobe needs to be removed after perf event fd closure */ + if (perf_link->legacy_probe_name) { + if (perf_link->legacy_is_kprobe) { + err = remove_kprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } else { + err = remove_uprobe_event_legacy(perf_link->legacy_probe_name, + perf_link->legacy_is_retprobe); + } + } + + return err; +} + +static void bpf_link_perf_dealloc(struct bpf_link *link) +{ + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + free(perf_link->legacy_probe_name); + free(perf_link); +} + +struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link_perf *link; + int prog_fd, link_fd = -1, err; + bool force_ioctl_attach; + + if (!OPTS_VALID(opts, bpf_perf_event_opts)) + return libbpf_err_ptr(-EINVAL); + + if (pfd < 0) { + pr_warn("prog '%s': invalid perf event FD %d\n", + prog->name, pfd); + return libbpf_err_ptr(-EINVAL); + } + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->link.detach = &bpf_link_perf_detach; + link->link.dealloc = &bpf_link_perf_dealloc; + link->perf_event_fd = pfd; + + force_ioctl_attach = OPTS_GET(opts, force_ioctl_attach, false); + if (kernel_supports(prog->obj, FEAT_PERF_LINK) && !force_ioctl_attach) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, + .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); + + link_fd = bpf_link_create(prog_fd, pfd, BPF_PERF_EVENT, &link_opts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create BPF link for perf_event FD %d: %d (%s)\n", + prog->name, pfd, + err, 
libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link->link.fd = link_fd; + } else { + if (OPTS_GET(opts, bpf_cookie, 0)) { + pr_warn("prog '%s': user context value is not supported\n", prog->name); + err = -EOPNOTSUPP; + goto err_out; + } + + if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach to perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + if (err == -EPROTO) + pr_warn("prog '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n", + prog->name, pfd); + goto err_out; + } + link->link.fd = pfd; + } + if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("prog '%s': failed to enable perf_event FD %d: %s\n", + prog->name, pfd, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + return &link->link; +err_out: + if (link_fd >= 0) + close(link_fd); + free(link); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd) +{ + return bpf_program__attach_perf_event_opts(prog, pfd, NULL); +} + +/* + * this function is expected to parse integer in the range of [0, 2^31-1] from + * given file using scanf format string fmt. If actual parsed value is + * negative, the result might be indistinguishable from error + */ +static int parse_uint_from_file(const char *file, const char *fmt) +{ + char buf[STRERR_BUFSIZE]; + int err, ret; + FILE *f; + + f = fopen(file, "re"); + if (!f) { + err = -errno; + pr_debug("failed to open '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + return err; + } + err = fscanf(f, fmt, &ret); + if (err != 1) { + err = err == EOF ? -EIO : -errno; + pr_debug("failed to parse '%s': %s\n", file, + libbpf_strerror_r(err, buf, sizeof(buf))); + fclose(f); + return err; + } + fclose(f); + return ret; +} + +static int determine_kprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_uprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_kprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/kprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +static int determine_uprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +#define PERF_UPROBE_REF_CTR_OFFSET_BITS 32 +#define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 + +static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, + uint64_t offset, int pid, size_t ref_ctr_off) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd; + + if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) + return -EINVAL; + + memset(&attr, 0, attr_sz); + + type = uprobe ? determine_uprobe_perf_type() + : determine_kprobe_perf_type(); + if (type < 0) { + pr_warn("failed to determine %s perf type: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(type, errmsg, sizeof(errmsg))); + return type; + } + if (retprobe) { + int bit = uprobe ? 
determine_uprobe_retprobe_bit() + : determine_kprobe_retprobe_bit(); + + if (bit < 0) { + pr_warn("failed to determine %s retprobe bit: %s\n", + uprobe ? "uprobe" : "kprobe", + libbpf_strerror_r(bit, errmsg, sizeof(errmsg))); + return bit; + } + attr.config |= 1 << bit; + } + attr.size = attr_sz; + attr.type = type; + attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; + attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ + attr.config2 = offset; /* kprobe_addr or probe_offset */ + + /* pid filter is meaningful only for uprobes */ + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid /* pid */, + pid == -1 ? 0 : -1 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + return pfd >= 0 ? pfd : -errno; +} + +static int append_to_file(const char *file, const char *fmt, ...) +{ + int fd, n, err = 0; + va_list ap; + char buf[1024]; + + va_start(ap, fmt); + n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + if (n < 0 || n >= sizeof(buf)) + return -EINVAL; + + fd = open(file, O_WRONLY | O_APPEND | O_CLOEXEC, 0); + if (fd < 0) + return -errno; + + if (write(fd, buf, n) < 0) + err = -errno; + + close(fd); + return err; +} + +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) +{ + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) +{ + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_kprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/kprobe_events" : TRACEFS"/kprobe_events"; +} + +static const char *tracefs_uprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; +} + +static const char *tracefs_available_filter_functions(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions" + : TRACEFS"/available_filter_functions"; +} + +static const char *tracefs_available_filter_functions_addrs(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions_addrs" + : TRACEFS"/available_filter_functions_addrs"; +} + +static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *kfunc_name, size_t offset) +{ + static int index = 0; + int i; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, + __sync_fetch_and_add(&index, 1)); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } +} + +static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset) +{ + return append_to_file(tracefs_kprobe_events(), "%c:%s/%s %s+0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "kretprobes" : "kprobes", + probe_name, kfunc_name, offset); +} + +static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_kprobe_events(), "-:%s/%s", + retprobe ? "kretprobes" : "kprobes", probe_name); +} + +static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[256]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? 
"kretprobes" : "kprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, + const char *kfunc_name, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int type, pfd, err; + + err = add_kprobe_event_legacy(probe_name, retprobe, kfunc_name, offset); + if (err < 0) { + pr_warn("failed to add legacy kprobe event for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + type = determine_kprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", + kfunc_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy kprobe perf_event_open() failed: %s\n", + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; +} + +static const char *arch_specific_syscall_pfx(void) +{ +#if defined(__x86_64__) + return "x64"; +#elif defined(__i386__) + return "ia32"; +#elif defined(__s390x__) + return "s390x"; +#elif defined(__s390__) + return "s390"; +#elif defined(__arm__) + return "arm"; +#elif defined(__aarch64__) + return "arm64"; +#elif defined(__mips__) + return "mips"; +#elif defined(__riscv) + return "riscv"; +#elif defined(__powerpc__) + return "powerpc"; +#elif defined(__powerpc64__) + return "powerpc64"; +#else + return NULL; +#endif +} + +static int probe_kern_syscall_wrapper(void) +{ + char syscall_name[64]; + const char *ksys_pfx; + + ksys_pfx = arch_specific_syscall_pfx(); + if (!ksys_pfx) + return 0; + + snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx); + + if (determine_kprobe_perf_type() >= 0) { + int pfd; + + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + if (pfd >= 0) + close(pfd); + + return pfd >= 0 ? 
1 : 0; + } else { /* legacy mode */ + char probe_name[128]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0); + if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0) + return 0; + + (void)remove_kprobe_event_legacy(probe_name, false); + return 1; + } +} + +struct bpf_link * +bpf_program__attach_kprobe_opts(const struct bpf_program *prog, + const char *func_name, + const struct bpf_kprobe_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char errmsg[STRERR_BUFSIZE]; + char *legacy_probe = NULL; + struct bpf_link *link; + size_t offset; + bool retprobe, legacy; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_kprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); + retprobe = OPTS_GET(opts, retprobe, false); + offset = OPTS_GET(opts, offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + legacy = determine_kprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + + if (!legacy) { + pfd = perf_event_open_probe(false /* uprobe */, retprobe, + func_name, offset, + -1 /* pid */, 0 /* ref_ctr_off */); + } else { + char probe_name[256]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), + func_name, offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_kprobe_open_legacy(legacy_probe, retprobe, func_name, + offset, -1 /* pid */); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s+0x%zx' perf event: %s\n", + prog->name, retprobe ? "kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s+0x%zx': %s\n", + prog->name, retprobe ? 
"kretprobe" : "kprobe", + func_name, offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = true; + perf_link->legacy_is_retprobe = retprobe; + } + + return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog, + bool retprobe, + const char *func_name) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe, + ); + + return bpf_program__attach_kprobe_opts(prog, func_name, &opts); +} + +struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts) +{ + LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts); + char func_name[128]; + + if (!OPTS_VALID(opts, bpf_ksyscall_opts)) + return libbpf_err_ptr(-EINVAL); + + if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) { + /* arch_specific_syscall_pfx() should never return NULL here + * because it is guarded by kernel_supports(). However, since + * compiler does not know that we have an explicit conditional + * as well. + */ + snprintf(func_name, sizeof(func_name), "__%s_sys_%s", + arch_specific_syscall_pfx() ? : "", syscall_name); + } else { + snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name); + } + + kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false); + kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts); +} + +/* Adapted from perf/util/string.c */ +static bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*pat == '?') { /* Matches any single character */ + str++; + pat++; + continue; + } + if (*str != *pat) + return false; + str++; + pat++; + } + /* Check wild card */ + if (*pat == '*') { + while (*pat == '*') + pat++; + if (!*pat) /* Tail wild card matches all */ + return true; + while (*str) + if (glob_match(str++, pat)) + return true; + } + return !*str && !*pat; +} + +struct kprobe_multi_resolve { + const char *pattern; + unsigned long *addrs; + size_t cap; + size_t cnt; +}; + +struct avail_kallsyms_data { + char **syms; + size_t cnt; + struct kprobe_multi_resolve *res; +}; + +static int avail_func_cmp(const void *a, const void *b) +{ + return strcmp(*(const char **)a, *(const char **)b); +} + +static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct avail_kallsyms_data *data = ctx; + struct kprobe_multi_resolve *res = data->res; + int err; + + if (!bsearch(&sym_name, data->syms, data->cnt, sizeof(*data->syms), avail_func_cmp)) + return 0; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, sizeof(*res->addrs), res->cnt + 1); + if (err) + return err; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + return 0; +} + +static int libbpf_available_kallsyms_parse(struct kprobe_multi_resolve *res) +{ + const char *available_functions_file = tracefs_available_filter_functions(); + struct avail_kallsyms_data data; + char sym_name[500]; + FILE *f; + int err = 0, ret, i; + char **syms = NULL; + size_t cap = 0, cnt = 0; + + f = fopen(available_functions_file, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", 
available_functions_file, err); + return err; + } + + while (true) { + char *name; + + ret = fscanf(f, "%499s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 1) { + pr_warn("failed to parse available_filter_functions entry: %d\n", ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&syms, &cap, sizeof(*syms), cnt + 1); + if (err) + goto cleanup; + + name = strdup(sym_name); + if (!name) { + err = -errno; + goto cleanup; + } + + syms[cnt++] = name; + } + + /* no entries found, bail out */ + if (cnt == 0) { + err = -ENOENT; + goto cleanup; + } + + /* sort available functions */ + qsort(syms, cnt, sizeof(*syms), avail_func_cmp); + + data.syms = syms; + data.res = res; + data.cnt = cnt; + libbpf_kallsyms_parse(avail_kallsyms_cb, &data); + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + for (i = 0; i < cnt; i++) + free((char *)syms[i]); + free(syms); + + fclose(f); + return err; +} + +static bool has_available_filter_functions_addrs(void) +{ + return access(tracefs_available_filter_functions_addrs(), R_OK) != -1; +} + +static int libbpf_available_kprobes_parse(struct kprobe_multi_resolve *res) +{ + const char *available_path = tracefs_available_filter_functions_addrs(); + char sym_name[500]; + FILE *f; + int ret, err = 0; + unsigned long long sym_addr; + + f = fopen(available_path, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_path, err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %499s%*[^\n]\n", &sym_addr, sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 2) { + pr_warn("failed to parse available_filter_functions_addrs entry: %d\n", + ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, + sizeof(*res->addrs), res->cnt + 1); + if (err) + goto cleanup; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + } + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + fclose(f); + return err; +} + +struct bpf_link * +bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, + const char *pattern, + const struct bpf_kprobe_multi_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct kprobe_multi_resolve res = { + .pattern = pattern, + }; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + const unsigned long *addrs; + int err, link_fd, prog_fd; + const __u64 *cookies; + const char **syms; + bool retprobe; + size_t cnt; + + if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + syms = OPTS_GET(opts, syms, false); + addrs = OPTS_GET(opts, addrs, false); + cnt = OPTS_GET(opts, cnt, false); + cookies = OPTS_GET(opts, cookies, false); + + if (!pattern && !addrs && !syms) + return libbpf_err_ptr(-EINVAL); + if (pattern && (addrs || syms || cookies || cnt)) + return libbpf_err_ptr(-EINVAL); + if (!pattern && !cnt) + return libbpf_err_ptr(-EINVAL); + if (addrs && syms) + return libbpf_err_ptr(-EINVAL); + + if (pattern) { + if (has_available_filter_functions_addrs()) + err = libbpf_available_kprobes_parse(&res); + else + err = libbpf_available_kallsyms_parse(&res); + if (err) + goto error; + addrs = res.addrs; + cnt = res.cnt; + } + + retprobe = OPTS_GET(opts, retprobe, false); + + lopts.kprobe_multi.syms = syms; + lopts.kprobe_multi.addrs = addrs; + lopts.kprobe_multi.cookies = cookies; + lopts.kprobe_multi.cnt = cnt; + lopts.kprobe_multi.flags = retprobe ? 
BPF_F_KPROBE_MULTI_RETURN : 0; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + prog_fd = bpf_program__fd(prog); + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto error; + } + link->fd = link_fd; + free(res.addrs); + return link; + +error: + free(link); + free(res.addrs); + return libbpf_err_ptr(err); +} + +static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_kprobe_opts, opts); + unsigned long offset = 0; + const char *func_name; + char *func; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe") and SEC("kretprobe") */ + if (strcmp(prog->sec_name, "kprobe") == 0 || strcmp(prog->sec_name, "kretprobe") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe/"); + if (opts.retprobe) + func_name = prog->sec_name + sizeof("kretprobe/") - 1; + else + func_name = prog->sec_name + sizeof("kprobe/") - 1; + + n = sscanf(func_name, "%m[a-zA-Z0-9_.]+%li", &func, &offset); + if (n < 1) { + pr_warn("kprobe name is invalid: %s\n", func_name); + return -EINVAL; + } + if (opts.retprobe && offset != 0) { + free(func); + pr_warn("kretprobes do not support offset specification\n"); + return -EINVAL; + } + + opts.offset = offset; + *link = bpf_program__attach_kprobe_opts(prog, func, &opts); + free(func); + return libbpf_get_error(*link); +} + +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_ksyscall_opts, opts); + const char *syscall_name; + + *link = NULL; + + /* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */ + if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/"); + if (opts.retprobe) + syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1; + else + syscall_name = prog->sec_name + sizeof("ksyscall/") - 1; + + *link = bpf_program__attach_ksyscall(prog, syscall_name, &opts); + return *link ? 
0 : -errno; +} + +static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.multi") and SEC("kretprobe.multi") */ + if (strcmp(prog->sec_name, "kprobe.multi") == 0 || + strcmp(prog->sec_name, "kretprobe.multi") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe.multi/"); + if (opts.retprobe) + spec = prog->sec_name + sizeof("kretprobe.multi/") - 1; + else + spec = prog->sec_name + sizeof("kprobe.multi/") - 1; + + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return libbpf_get_error(*link); +} + +static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, + const char *binary_path, uint64_t offset) +{ + int i; + + snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx", getpid(), binary_path, (size_t)offset); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } +} + +static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset) +{ + return append_to_file(tracefs_uprobe_events(), "%c:%s/%s %s:0x%zx", + retprobe ? 'r' : 'p', + retprobe ? "uretprobes" : "uprobes", + probe_name, binary_path, offset); +} + +static inline int remove_uprobe_event_legacy(const char *probe_name, bool retprobe) +{ + return append_to_file(tracefs_uprobe_events(), "-:%s/%s", + retprobe ? "uretprobes" : "uprobes", probe_name); +} + +static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retprobe) +{ + char file[512]; + + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "uretprobes" : "uprobes", probe_name); + + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, + const char *binary_path, size_t offset, int pid) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + int type, pfd, err; + + err = add_uprobe_event_legacy(probe_name, retprobe, binary_path, offset); + if (err < 0) { + pr_warn("failed to add legacy uprobe event for %s:0x%zx: %d\n", + binary_path, (size_t)offset, err); + return err; + } + type = determine_uprobe_perf_type_legacy(probe_name, retprobe); + if (type < 0) { + err = type; + pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", + binary_path, offset, err); + goto err_clean_legacy; + } + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = type; + attr.type = PERF_TYPE_TRACEPOINT; + + pfd = syscall(__NR_perf_event_open, &attr, + pid < 0 ? -1 : pid, /* pid */ + pid == -1 ? 0 : -1, /* cpu */ + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); + goto err_clean_legacy; + } + return pfd; + +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. 
*/ +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} + +/* Find offset of function name in the provided ELF object. "binary_path" is + * the path to the ELF binary represented by "elf", and only used for error + * reporting matters. "name" matches symbol name or name@@LIB for library + * functions. + */ +static long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name) +{ + int i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + bool is_shared_lib, is_name_qualified; + long ret = -ENOENT; + size_t name_len; + GElf_Ehdr ehdr; + + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; + + name_len = strlen(name); + /* Does name specify "@@LIB"? */ + is_name_qualified = strstr(name, "@@") != NULL; + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. + */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + size_t nr_syms, strtabidx, idx; + Elf_Data *symbols = NULL; + Elf_Scn *scn = NULL; + int last_bind = -1; + const char *sname; + GElf_Shdr sh; + + scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); + if (!scn) { + pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", + binary_path); + continue; + } + if (!gelf_getshdr(scn, &sh)) + continue; + strtabidx = sh.sh_link; + symbols = elf_getdata(scn, 0); + if (!symbols) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + nr_syms = symbols->d_size / sh.sh_entsize; + + for (idx = 0; idx < nr_syms; idx++) { + int curr_bind; + GElf_Sym sym; + Elf_Scn *sym_scn; + GElf_Shdr sym_sh; + + if (!gelf_getsym(symbols, idx, &sym)) + continue; + + if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) + continue; + + sname = elf_strptr(elf, strtabidx, sym.st_name); + if (!sname) + continue; + + curr_bind = GELF_ST_BIND(sym.st_info); + + /* User can specify func, func@@LIB or func@@LIB_VERSION. */ + if (strncmp(sname, name, name_len) != 0) + continue; + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') + continue; + + if (ret >= 0) { + /* handle multiple matches */ + if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sname, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (curr_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + + /* Transform symbol's virtual address (absolute for + * binaries and relative for shared libs) into file + * offset, which is what kernel is expecting for + * uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more + * details. 
+ * This is done by looking up symbol's containing + * section's header and using it's virtual address + * (sh_addr) and corresponding file offset (sh_offset) + * to transform sym.st_value (virtual address) into + * desired final file offset. + */ + sym_scn = elf_getscn(elf, sym.st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &sym_sh)) + continue; + + ret = sym.st_value - sym_sh.sh_addr + sym_sh.sh_offset; + last_bind = curr_bind; + } + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + return ret; +} + +/* Find offset of function name in ELF object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +static long elf_find_func_offset_from_file(const char *binary_path, const char *name) +{ + char errmsg[STRERR_BUFSIZE]; + long ret = -ENOENT; + Elf *elf; + int fd; + + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + + ret = elf_find_func_offset(elf, binary_path, name); + elf_end(elf); + close(fd); + return ret; +} + +/* Find offset of function name in archive specified by path. Currently + * supported are .zip files that do not compress their contents, as used on + * Android in the form of APKs, for example. "file_name" is the name of the ELF + * file inside the archive. "func_name" matches symbol name or name@@LIB for + * library functions. 
+ * + * An overview of the APK format specifically provided here: + * https://en.wikipedia.org/w/index.php?title=Apk_(file_format)&oldid=1139099120#Package_contents + */ +static long elf_find_func_offset_from_archive(const char *archive_path, const char *file_name, + const char *func_name) +{ + struct zip_archive *archive; + struct zip_entry entry; + long ret; + Elf *elf; + + archive = zip_archive_open(archive_path); + if (IS_ERR(archive)) { + ret = PTR_ERR(archive); + pr_warn("zip: failed to open %s: %ld\n", archive_path, ret); + return ret; + } + + ret = zip_archive_find_entry(archive, file_name, &entry); + if (ret) { + pr_warn("zip: could not find archive member %s in %s: %ld\n", file_name, + archive_path, ret); + goto out; + } + pr_debug("zip: found entry for %s in %s at 0x%lx\n", file_name, archive_path, + (unsigned long)entry.data_offset); + + if (entry.compression) { + pr_warn("zip: entry %s of %s is compressed and cannot be handled\n", file_name, + archive_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + + elf = elf_memory((void *)entry.data, entry.data_length); + if (!elf) { + pr_warn("elf: could not read elf file %s from %s: %s\n", file_name, archive_path, + elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__LIBELF; + goto out; + } + + ret = elf_find_func_offset(elf, file_name, func_name); + if (ret > 0) { + pr_debug("elf: symbol address match for %s of %s in %s: 0x%x + 0x%lx = 0x%lx\n", + func_name, file_name, archive_path, entry.data_offset, ret, + ret + entry.data_offset); + ret += entry.data_offset; + } + elf_end(elf); + +out: + zip_archive_close(archive); + return ret; +} + +static const char *arch_specific_lib_paths(void) +{ + /* + * Based on https://packages.debian.org/sid/libc6. + * + * Assume that the traced program is built for the same architecture + * as libbpf, which should cover the vast majority of cases. + */ +#if defined(__x86_64__) + return "/lib/x86_64-linux-gnu"; +#elif defined(__i386__) + return "/lib/i386-linux-gnu"; +#elif defined(__s390x__) + return "/lib/s390x-linux-gnu"; +#elif defined(__s390__) + return "/lib/s390-linux-gnu"; +#elif defined(__arm__) && defined(__SOFTFP__) + return "/lib/arm-linux-gnueabi"; +#elif defined(__arm__) && !defined(__SOFTFP__) + return "/lib/arm-linux-gnueabihf"; +#elif defined(__aarch64__) + return "/lib/aarch64-linux-gnu"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 64 + return "/lib/mips64el-linux-gnuabi64"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 32 + return "/lib/mipsel-linux-gnu"; +#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return "/lib/powerpc64le-linux-gnu"; +#elif defined(__sparc__) && defined(__arch64__) + return "/lib/sparc64-linux-gnu"; +#elif defined(__riscv) && __riscv_xlen == 64 + return "/lib/riscv64-linux-gnu"; +#else + return NULL; +#endif +} + +/* Get full path to program/shared library. 
*/ +static int resolve_full_path(const char *file, char *result, size_t result_sz) +{ + const char *search_paths[3] = {}; + int i, perm; + + if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { + search_paths[0] = getenv("LD_LIBRARY_PATH"); + search_paths[1] = "/usr/lib64:/usr/lib"; + search_paths[2] = arch_specific_lib_paths(); + perm = R_OK; + } else { + search_paths[0] = getenv("PATH"); + search_paths[1] = "/usr/bin:/usr/sbin"; + perm = R_OK | X_OK; + } + + for (i = 0; i < ARRAY_SIZE(search_paths); i++) { + const char *s; + + if (!search_paths[i]) + continue; + for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { + char *next_path; + int seg_len; + + if (s[0] == ':') + s++; + next_path = strchr(s, ':'); + seg_len = next_path ? next_path - s : strlen(s); + if (!seg_len) + continue; + snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); + /* ensure it has required permissions */ + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) + continue; + pr_debug("resolved '%s' to '%s'\n", file, result); + return 0; + } + } + return -ENOENT; +} + +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts) +{ + const char *archive_path = NULL, *archive_sep = NULL; + char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char full_path[PATH_MAX]; + struct bpf_link *link; + size_t ref_ctr_off; + int pfd, err; + bool retprobe, legacy; + const char *func_name; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); + retprobe = OPTS_GET(opts, retprobe, false); + ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + /* Check if "binary_path" refers to an archive. 
*/ + archive_sep = strstr(binary_path, "!/"); + if (archive_sep) { + full_path[0] = '\0'; + libbpf_strlcpy(full_path, binary_path, + min(sizeof(full_path), (size_t)(archive_sep - binary_path + 1))); + archive_path = full_path; + binary_path = archive_sep + 2; + } else if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_path, sizeof(full_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = full_path; + } + func_name = OPTS_GET(opts, func_name, NULL); + if (func_name) { + long sym_off; + + if (archive_path) { + sym_off = elf_find_func_offset_from_archive(archive_path, binary_path, + func_name); + binary_path = archive_path; + } else { + sym_off = elf_find_func_offset_from_file(binary_path, func_name); + } + if (sym_off < 0) + return libbpf_err_ptr(sym_off); + func_offset += sym_off; + } + + legacy = determine_uprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + + if (!legacy) { + pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, + func_offset, pid, ref_ctr_off); + } else { + char probe_name[PATH_MAX + 64]; + + if (ref_ctr_off) + return libbpf_err_ptr(-EINVAL); + + gen_uprobe_legacy_event_name(probe_name, sizeof(probe_name), + binary_path, func_offset); + + legacy_probe = strdup(probe_name); + if (!legacy_probe) + return libbpf_err_ptr(-ENOMEM); + + pfd = perf_event_uprobe_open_legacy(legacy_probe, retprobe, + binary_path, func_offset, pid); + } + if (pfd < 0) { + err = -errno; + pr_warn("prog '%s': failed to create %s '%s:0x%zx' perf event: %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_out; + } + + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to %s '%s:0x%zx': %s\n", + prog->name, retprobe ? "uretprobe" : "uprobe", + binary_path, func_offset, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; + } + if (legacy) { + struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); + + perf_link->legacy_probe_name = legacy_probe; + perf_link->legacy_is_kprobe = false; + perf_link->legacy_is_retprobe = retprobe; + } + return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); +err_out: + free(legacy_probe); + return libbpf_err_ptr(err); +} + +/* Format of u[ret]probe section definition supporting auto-attach: + * u[ret]probe/binary:function[+offset] + * + * binary can be an absolute/relative path or a filename; the latter is resolved to a + * full binary path via bpf_program__attach_uprobe_opts. + * + * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be + * specified (and auto-attach is not possible) or the above format is specified for + * auto-attach. 
+ */ +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + int n, ret = -EINVAL; + long offset = 0; + + *link = NULL; + + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", + &probe_type, &binary_path, &func_name, &offset); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 2: + pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n", + prog->name, prog->sec_name); + break; + case 3: + case 4: + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; + if (opts.retprobe && offset != 0) { + pr_warn("prog '%s': uretprobes do not support offset specification\n", + prog->name); + break; + } + opts.func_name = func_name; + *link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, + prog->sec_name); + break; + } + free(probe_type); + free(binary_path); + free(func_name); + + return ret; +} + +struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog, + bool retprobe, pid_t pid, + const char *binary_path, + size_t func_offset) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts, .retprobe = retprobe); + + return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts); +} + +struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts) +{ + char resolved_path[512]; + struct bpf_object *obj = prog->obj; + struct bpf_link *link; + __u64 usdt_cookie; + int err; + + if (!OPTS_VALID(opts, bpf_uprobe_opts)) + return libbpf_err_ptr(-EINVAL); + + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = resolved_path; + } + + /* USDT manager is instantiated lazily on first USDT attach. It will + * be destroyed together with BPF object in bpf_object__close(). 
+ */ + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + if (!obj->usdt_man) { + obj->usdt_man = usdt_manager_new(obj); + if (IS_ERR(obj->usdt_man)) + return libbpf_ptr(obj->usdt_man); + } + + usdt_cookie = OPTS_GET(opts, usdt_cookie, 0); + link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path, + usdt_provider, usdt_name, usdt_cookie); + err = libbpf_get_error(link); + if (err) + return libbpf_err_ptr(err); + return link; +} + +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *path = NULL, *provider = NULL, *name = NULL; + const char *sec_name; + int n, err; + + sec_name = bpf_program__section_name(prog); + if (strcmp(sec_name, "usdt") == 0) { + /* no auto-attach for just SEC("usdt") */ + *link = NULL; + return 0; + } + + n = sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name); + if (n != 3) { + pr_warn("invalid section '%s', expected SEC(\"usdt/::\")\n", + sec_name); + err = -EINVAL; + } else { + *link = bpf_program__attach_usdt(prog, -1 /* any process */, path, + provider, name, NULL); + err = libbpf_get_error(*link); + } + free(path); + free(provider); + free(name); + return err; +} + +static int determine_tracepoint_id(const char *tp_category, + const char *tp_name) +{ + char file[PATH_MAX]; + int ret; + + ret = snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), tp_category, tp_name); + if (ret < 0) + return -errno; + if (ret >= sizeof(file)) { + pr_debug("tracepoint %s/%s path is too long\n", + tp_category, tp_name); + return -E2BIG; + } + return parse_uint_from_file(file, "%d\n"); +} + +static int perf_event_open_tracepoint(const char *tp_category, + const char *tp_name) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + char errmsg[STRERR_BUFSIZE]; + int tp_id, pfd, err; + + tp_id = determine_tracepoint_id(tp_category, tp_name); + if (tp_id < 0) { + pr_warn("failed to determine tracepoint '%s/%s' perf event ID: %s\n", + tp_category, tp_name, + libbpf_strerror_r(tp_id, errmsg, sizeof(errmsg))); + return tp_id; + } + + memset(&attr, 0, attr_sz); + attr.type = PERF_TYPE_TRACEPOINT; + attr.size = attr_sz; + attr.config = tp_id; + + pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, + -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); + if (pfd < 0) { + err = -errno; + pr_warn("tracepoint '%s/%s' perf_event_open() failed: %s\n", + tp_category, tp_name, + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + return err; + } + return pfd; +} + +struct bpf_link *bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int pfd, err; + + if (!OPTS_VALID(opts, bpf_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + + pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + pfd = perf_event_open_tracepoint(tp_category, tp_name); + if (pfd < 0) { + pr_warn("prog '%s': failed to create tracepoint '%s/%s' perf event: %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link = bpf_program__attach_perf_event_opts(prog, pfd, &pe_opts); + err = libbpf_get_error(link); + if (err) { + close(pfd); + pr_warn("prog '%s': failed to attach to tracepoint '%s/%s': %s\n", + prog->name, tp_category, tp_name, + libbpf_strerror_r(err, errmsg, 
sizeof(errmsg))); + return libbpf_err_ptr(err); + } + return link; +} + +struct bpf_link *bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name) +{ + return bpf_program__attach_tracepoint_opts(prog, tp_category, tp_name, NULL); +} + +static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *sec_name, *tp_cat, *tp_name; + + *link = NULL; + + /* no auto-attach for SEC("tp") or SEC("tracepoint") */ + if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + return 0; + + sec_name = strdup(prog->sec_name); + if (!sec_name) + return -ENOMEM; + + /* extract "tp//" or "tracepoint//" */ + if (str_has_pfx(prog->sec_name, "tp/")) + tp_cat = sec_name + sizeof("tp/") - 1; + else + tp_cat = sec_name + sizeof("tracepoint/") - 1; + tp_name = strchr(tp_cat, '/'); + if (!tp_name) { + free(sec_name); + return -EINVAL; + } + *tp_name = '\0'; + tp_name++; + + *link = bpf_program__attach_tracepoint(prog, tp_cat, tp_name); + free(sec_name); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to raw tracepoint '%s': %s\n", + prog->name, tp_name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + static const char *const prefixes[] = { + "raw_tp", + "raw_tracepoint", + "raw_tp.w", + "raw_tracepoint.w", + }; + size_t i; + const char *tp_name = NULL; + + *link = NULL; + + for (i = 0; i < ARRAY_SIZE(prefixes); i++) { + size_t pfx_len; + + if (!str_has_pfx(prog->sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + /* no auto-attach case of, e.g., SEC("raw_tp") */ + if (prog->sec_name[pfx_len] == '\0') + return 0; + + if (prog->sec_name[pfx_len] != '/') + continue; + + tp_name = prog->sec_name + pfx_len + 1; + break; + } + + if (!tp_name) { + pr_warn("prog '%s': invalid section name '%s'\n", + prog->name, prog->sec_name); + return -EINVAL; + } + + *link = bpf_program__attach_raw_tracepoint(prog, tp_name); + return libbpf_get_error(*link); +} + +/* Common logic for all BPF program types that attach to a btf_id */ +static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, pfd; + + if (!OPTS_VALID(opts, bpf_trace_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + /* libbpf is smart enough to redirect to BPF_RAW_TRACEPOINT_OPEN on old kernels */ + 
link_opts.tracing.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts); + if (pfd < 0) { + pfd = -errno; + free(link); + pr_warn("prog '%s': failed to attach: %s\n", + prog->name, libbpf_strerror_r(pfd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(pfd); + } + link->fd = pfd; + return link; +} + +struct bpf_link *bpf_program__attach_trace(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +struct bpf_link *bpf_program__attach_trace_opts(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + return bpf_program__attach_btf_id(prog, opts); +} + +struct bpf_link *bpf_program__attach_lsm(const struct bpf_program *prog) +{ + return bpf_program__attach_btf_id(prog, NULL); +} + +static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_trace(prog); + return libbpf_get_error(*link); +} + +static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_lsm(prog); + return libbpf_get_error(*link); +} + +static struct bpf_link * +bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id, + const char *target_name) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .target_btf_id = btf_id); + enum bpf_attach_type attach_type; + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + attach_type = bpf_program__expected_attach_type(prog); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to %s: %s\n", + prog->name, target_name, + libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program__attach_fd(prog, cgroup_fd, 0, "cgroup"); +} + +struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) +{ + return bpf_program__attach_fd(prog, netns_fd, 0, "netns"); +} + +struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) +{ + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program__attach_fd(prog, ifindex, 0, "xdp"); +} + +struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, + const char *attach_func_name) +{ + int btf_id; + + if (!!target_fd != !!attach_func_name) { + pr_warn("prog '%s': supply none or both of target_fd and attach_func_name\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (prog->type != BPF_PROG_TYPE_EXT) { + pr_warn("prog '%s': only BPF_PROG_TYPE_EXT can attach as freplace", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + if (target_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd); + if (btf_id < 0) + return libbpf_err_ptr(btf_id); + + return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace"); + } else { + /* no target, so use raw_tracepoint_open for compatibility + * with old kernels + */ + return 
bpf_program__attach_trace(prog); + } +} + +struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + char errmsg[STRERR_BUFSIZE]; + struct bpf_link *link; + int prog_fd, link_fd; + __u32 target_fd = 0; + + if (!OPTS_VALID(opts, bpf_iter_attach_opts)) + return libbpf_err_ptr(-EINVAL); + + link_create_opts.iter_info = OPTS_GET(opts, link_info, (void *)0); + link_create_opts.iter_info_len = OPTS_GET(opts, link_info_len, 0); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + link->detach = &bpf_link__detach_fd; + + link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER, + &link_create_opts); + if (link_fd < 0) { + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to iterator: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + return link; +} + +static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + *link = bpf_program__attach_iter(prog, NULL); + return libbpf_get_error(*link); +} + +struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct bpf_link *link; + int prog_fd, link_fd; + + if (!OPTS_VALID(opts, bpf_netfilter_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + + link->detach = &bpf_link__detach_fd; + + lopts.netfilter.pf = OPTS_GET(opts, pf, 0); + lopts.netfilter.hooknum = OPTS_GET(opts, hooknum, 0); + lopts.netfilter.priority = OPTS_GET(opts, priority, 0); + lopts.netfilter.flags = OPTS_GET(opts, flags, 0); + + link_fd = bpf_link_create(prog_fd, 0, BPF_NETFILTER, &lopts); + if (link_fd < 0) { + char errmsg[STRERR_BUFSIZE]; + + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to netfilter: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + + return link; +} + +struct bpf_link *bpf_program__attach(const struct bpf_program *prog) +{ + struct bpf_link *link = NULL; + int err; + + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + return libbpf_err_ptr(-EOPNOTSUPP); + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); + if (err) + return libbpf_err_ptr(err); + + /* When calling bpf_program__attach() explicitly, auto-attach support + * is expected to work, so NULL returned link is considered an error. + * This is different for skeleton's attach, see comment in + * bpf_object__attach_skeleton(). 
+ */ + if (!link) + return libbpf_err_ptr(-EOPNOTSUPP); + + return link; +} + +struct bpf_link_struct_ops { + struct bpf_link link; + int map_fd; +}; + +static int bpf_link__detach_struct_ops(struct bpf_link *link) +{ + struct bpf_link_struct_ops *st_link; + __u32 zero = 0; + + st_link = container_of(link, struct bpf_link_struct_ops, link); + + if (st_link->map_fd < 0) + /* w/o a real link */ + return bpf_map_delete_elem(link->fd, &zero); + + return close(link->fd); +} + +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + struct bpf_link_struct_ops *link; + __u32 zero = 0; + int err, fd; + + if (!bpf_map__is_struct_ops(map) || map->fd == -1) + return libbpf_err_ptr(-EINVAL); + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-EINVAL); + + /* kern_vdata should be prepared during the loading phase. */ + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) { + free(link); + return libbpf_err_ptr(err); + } + + link->link.detach = bpf_link__detach_struct_ops; + + if (!(map->def.map_flags & BPF_F_LINK)) { + /* w/o a real link */ + link->link.fd = map->fd; + link->map_fd = -1; + return &link->link; + } + + fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + if (fd < 0) { + free(link); + return libbpf_err_ptr(fd); + } + + link->link.fd = fd; + link->map_fd = map->fd; + + return &link->link; +} + +/* + * Swap the back struct_ops of a link with a new struct_ops map. + */ +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) +{ + struct bpf_link_struct_ops *st_ops_link; + __u32 zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map) || map->fd < 0) + return -EINVAL; + + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); + /* Ensure the type of a link is correct */ + if (st_ops_link->map_fd < 0) + return -EINVAL; + + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. 
+ */ + if (err && err != -EBUSY) + return err; + + err = bpf_link_update(link->fd, map->fd, NULL); + if (err < 0) + return err; + + st_ops_link->map_fd = map->fd; + + return 0; +} + +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + +static enum bpf_perf_event_ret +perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, + void **copy_mem, size_t *copy_size, + bpf_perf_event_print_t fn, void *private_data) +{ + struct perf_event_mmap_page *header = mmap_mem; + __u64 data_head = ring_buffer_read_head(header); + __u64 data_tail = header->data_tail; + void *base = ((__u8 *)header) + page_size; + int ret = LIBBPF_PERF_EVENT_CONT; + struct perf_event_header *ehdr; + size_t ehdr_size; + + while (data_head != data_tail) { + ehdr = base + (data_tail & (mmap_size - 1)); + ehdr_size = ehdr->size; + + if (((void *)ehdr) + ehdr_size > base + mmap_size) { + void *copy_start = ehdr; + size_t len_first = base + mmap_size - copy_start; + size_t len_secnd = ehdr_size - len_first; + + if (*copy_size < ehdr_size) { + free(*copy_mem); + *copy_mem = malloc(ehdr_size); + if (!*copy_mem) { + *copy_size = 0; + ret = LIBBPF_PERF_EVENT_ERROR; + break; + } + *copy_size = ehdr_size; + } + + memcpy(*copy_mem, copy_start, len_first); + memcpy(*copy_mem + len_first, base, len_secnd); + ehdr = *copy_mem; + } + + ret = fn(ehdr, private_data); + data_tail += ehdr_size; + if (ret != LIBBPF_PERF_EVENT_CONT) + break; + } + + ring_buffer_write_tail(header, data_tail); + return libbpf_err(ret); +} + +struct perf_buffer; + +struct perf_buffer_params { + struct perf_event_attr *attr; + /* if event_cb is specified, it takes precendence */ + perf_buffer_event_fn event_cb; + /* sample_cb and lost_cb are higher-level common-case callbacks */ + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; + int cpu_cnt; + int *cpus; + int *map_keys; +}; + +struct perf_cpu_buf { + struct perf_buffer *pb; + void *base; /* mmap()'ed memory */ + void *buf; /* for reconstructing segmented data */ + size_t buf_size; + int fd; + int cpu; + int map_key; +}; + +struct perf_buffer { + perf_buffer_event_fn event_cb; + perf_buffer_sample_fn sample_cb; + perf_buffer_lost_fn lost_cb; + void *ctx; /* passed into callbacks */ + + size_t page_size; + size_t mmap_size; + struct perf_cpu_buf **cpu_bufs; + struct epoll_event *events; + int cpu_cnt; /* number of allocated CPU buffers */ + int epoll_fd; /* perf event FD */ + int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */ +}; + +static void perf_buffer__free_cpu_buf(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + if (!cpu_buf) + return; + if (cpu_buf->base && + munmap(cpu_buf->base, pb->mmap_size + pb->page_size)) + pr_warn("failed to munmap cpu_buf #%d\n", cpu_buf->cpu); + if (cpu_buf->fd >= 0) { + ioctl(cpu_buf->fd, PERF_EVENT_IOC_DISABLE, 0); + close(cpu_buf->fd); + } + free(cpu_buf->buf); + free(cpu_buf); +} + +void perf_buffer__free(struct perf_buffer *pb) +{ + int i; + + if (IS_ERR_OR_NULL(pb)) + return; + if (pb->cpu_bufs) { + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + bpf_map_delete_elem(pb->map_fd, &cpu_buf->map_key); + perf_buffer__free_cpu_buf(pb, cpu_buf); + } + free(pb->cpu_bufs); + } + if (pb->epoll_fd >= 0) + close(pb->epoll_fd); + free(pb->events); + free(pb); +} + +static struct perf_cpu_buf * +perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, + int cpu, int 
map_key) +{ + struct perf_cpu_buf *cpu_buf; + char msg[STRERR_BUFSIZE]; + int err; + + cpu_buf = calloc(1, sizeof(*cpu_buf)); + if (!cpu_buf) + return ERR_PTR(-ENOMEM); + + cpu_buf->pb = pb; + cpu_buf->cpu = cpu; + cpu_buf->map_key = map_key; + + cpu_buf->fd = syscall(__NR_perf_event_open, attr, -1 /* pid */, cpu, + -1, PERF_FLAG_FD_CLOEXEC); + if (cpu_buf->fd < 0) { + err = -errno; + pr_warn("failed to open perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + cpu_buf->base = mmap(NULL, pb->mmap_size + pb->page_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu_buf->fd, 0); + if (cpu_buf->base == MAP_FAILED) { + cpu_buf->base = NULL; + err = -errno; + pr_warn("failed to mmap perf buffer on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (ioctl(cpu_buf->fd, PERF_EVENT_IOC_ENABLE, 0) < 0) { + err = -errno; + pr_warn("failed to enable perf buffer event on cpu #%d: %s\n", + cpu, libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + return cpu_buf; + +error: + perf_buffer__free_cpu_buf(pb, cpu_buf); + return (struct perf_cpu_buf *)ERR_PTR(err); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p); + +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_buffer_params p = {}; + struct perf_event_attr attr; + __u32 sample_period; + + if (!OPTS_VALID(opts, perf_buffer_opts)) + return libbpf_err_ptr(-EINVAL); + + sample_period = OPTS_GET(opts, sample_period, 1); + if (!sample_period) + sample_period = 1; + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; + attr.config = PERF_COUNT_SW_BPF_OUTPUT; + attr.type = PERF_TYPE_SOFTWARE; + attr.sample_type = PERF_SAMPLE_RAW; + attr.sample_period = sample_period; + attr.wakeup_events = sample_period; + + p.attr = &attr; + p.sample_cb = sample_cb; + p.lost_cb = lost_cb; + p.ctx = ctx; + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) +{ + struct perf_buffer_params p = {}; + + if (!attr) + return libbpf_err_ptr(-EINVAL); + + if (!OPTS_VALID(opts, perf_buffer_raw_opts)) + return libbpf_err_ptr(-EINVAL); + + p.attr = attr; + p.event_cb = event_cb; + p.ctx = ctx; + p.cpu_cnt = OPTS_GET(opts, cpu_cnt, 0); + p.cpus = OPTS_GET(opts, cpus, NULL); + p.map_keys = OPTS_GET(opts, map_keys, NULL); + + return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); +} + +static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, + struct perf_buffer_params *p) +{ + const char *online_cpus_file = "/sys/devices/system/cpu/online"; + struct bpf_map_info map; + char msg[STRERR_BUFSIZE]; + struct perf_buffer *pb; + bool *online = NULL; + __u32 map_info_len; + int err, i, j, n; + + if (page_cnt == 0 || (page_cnt & (page_cnt - 1))) { + pr_warn("page count should be power of two, but is %zu\n", + page_cnt); + return ERR_PTR(-EINVAL); + } + + /* best-effort sanity checks */ + memset(&map, 0, sizeof(map)); + map_info_len = sizeof(map); + err = bpf_map_get_info_by_fd(map_fd, &map, &map_info_len); + if (err) { + err = -errno; + /* if BPF_OBJ_GET_INFO_BY_FD is supported, will return + * -EBADFD, 
-EFAULT, or -E2BIG on real error + */ + if (err != -EINVAL) { + pr_warn("failed to get map info for map FD %d: %s\n", + map_fd, libbpf_strerror_r(err, msg, sizeof(msg))); + return ERR_PTR(err); + } + pr_debug("failed to get map info for FD %d; API not supported? Ignoring...\n", + map_fd); + } else { + if (map.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_warn("map '%s' should be BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map.name); + return ERR_PTR(-EINVAL); + } + } + + pb = calloc(1, sizeof(*pb)); + if (!pb) + return ERR_PTR(-ENOMEM); + + pb->event_cb = p->event_cb; + pb->sample_cb = p->sample_cb; + pb->lost_cb = p->lost_cb; + pb->ctx = p->ctx; + + pb->page_size = getpagesize(); + pb->mmap_size = pb->page_size * page_cnt; + pb->map_fd = map_fd; + + pb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (pb->epoll_fd < 0) { + err = -errno; + pr_warn("failed to create epoll instance: %s\n", + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + if (p->cpu_cnt > 0) { + pb->cpu_cnt = p->cpu_cnt; + } else { + pb->cpu_cnt = libbpf_num_possible_cpus(); + if (pb->cpu_cnt < 0) { + err = pb->cpu_cnt; + goto error; + } + if (map.max_entries && map.max_entries < pb->cpu_cnt) + pb->cpu_cnt = map.max_entries; + } + + pb->events = calloc(pb->cpu_cnt, sizeof(*pb->events)); + if (!pb->events) { + err = -ENOMEM; + pr_warn("failed to allocate events: out of memory\n"); + goto error; + } + pb->cpu_bufs = calloc(pb->cpu_cnt, sizeof(*pb->cpu_bufs)); + if (!pb->cpu_bufs) { + err = -ENOMEM; + pr_warn("failed to allocate buffers: out of memory\n"); + goto error; + } + + err = parse_cpu_mask_file(online_cpus_file, &online, &n); + if (err) { + pr_warn("failed to get online CPU mask: %d\n", err); + goto error; + } + + for (i = 0, j = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf; + int cpu, map_key; + + cpu = p->cpu_cnt > 0 ? p->cpus[i] : i; + map_key = p->cpu_cnt > 0 ? 
p->map_keys[i] : i; + + /* in case user didn't explicitly requested particular CPUs to + * be attached to, skip offline/not present CPUs + */ + if (p->cpu_cnt <= 0 && (cpu >= n || !online[cpu])) + continue; + + cpu_buf = perf_buffer__open_cpu_buf(pb, p->attr, cpu, map_key); + if (IS_ERR(cpu_buf)) { + err = PTR_ERR(cpu_buf); + goto error; + } + + pb->cpu_bufs[j] = cpu_buf; + + err = bpf_map_update_elem(pb->map_fd, &map_key, + &cpu_buf->fd, 0); + if (err) { + err = -errno; + pr_warn("failed to set cpu #%d, key %d -> perf FD %d: %s\n", + cpu, map_key, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + + pb->events[j].events = EPOLLIN; + pb->events[j].data.ptr = cpu_buf; + if (epoll_ctl(pb->epoll_fd, EPOLL_CTL_ADD, cpu_buf->fd, + &pb->events[j]) < 0) { + err = -errno; + pr_warn("failed to epoll_ctl cpu #%d perf FD %d: %s\n", + cpu, cpu_buf->fd, + libbpf_strerror_r(err, msg, sizeof(msg))); + goto error; + } + j++; + } + pb->cpu_cnt = j; + free(online); + + return pb; + +error: + free(online); + if (pb) + perf_buffer__free(pb); + return ERR_PTR(err); +} + +struct perf_sample_raw { + struct perf_event_header header; + uint32_t size; + char data[]; +}; + +struct perf_sample_lost { + struct perf_event_header header; + uint64_t id; + uint64_t lost; + uint64_t sample_id; +}; + +static enum bpf_perf_event_ret +perf_buffer__process_record(struct perf_event_header *e, void *ctx) +{ + struct perf_cpu_buf *cpu_buf = ctx; + struct perf_buffer *pb = cpu_buf->pb; + void *data = e; + + /* user wants full control over parsing perf event */ + if (pb->event_cb) + return pb->event_cb(pb->ctx, cpu_buf->cpu, e); + + switch (e->type) { + case PERF_RECORD_SAMPLE: { + struct perf_sample_raw *s = data; + + if (pb->sample_cb) + pb->sample_cb(pb->ctx, cpu_buf->cpu, s->data, s->size); + break; + } + case PERF_RECORD_LOST: { + struct perf_sample_lost *s = data; + + if (pb->lost_cb) + pb->lost_cb(pb->ctx, cpu_buf->cpu, s->lost); + break; + } + default: + pr_warn("unknown perf sample type %d\n", e->type); + return LIBBPF_PERF_EVENT_ERROR; + } + return LIBBPF_PERF_EVENT_CONT; +} + +static int perf_buffer__process_records(struct perf_buffer *pb, + struct perf_cpu_buf *cpu_buf) +{ + enum bpf_perf_event_ret ret; + + ret = perf_event_read_simple(cpu_buf->base, pb->mmap_size, + pb->page_size, &cpu_buf->buf, + &cpu_buf->buf_size, + perf_buffer__process_record, cpu_buf); + if (ret != LIBBPF_PERF_EVENT_CONT) + return ret; + return 0; +} + +int perf_buffer__epoll_fd(const struct perf_buffer *pb) +{ + return pb->epoll_fd; +} + +int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms) +{ + int i, cnt, err; + + cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms); + if (cnt < 0) + return -errno; + + for (i = 0; i < cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("error while processing records: %d\n", err); + return libbpf_err(err); + } + } + return cnt; +} + +/* Return number of PERF_EVENT_ARRAY map slots set up by this perf_buffer + * manager. + */ +size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb) +{ + return pb->cpu_cnt; +} + +/* + * Return perf_event FD of a ring buffer in *buf_idx* slot of + * PERF_EVENT_ARRAY BPF map. This FD can be polled for new data using + * select()/poll()/epoll() Linux syscalls. 
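/*
 * Usage sketch (illustrative, not part of libbpf): poll a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY map with the high-level callbacks. The map
 * FD comes from the caller; the sample layout is whatever the BPF side
 * submitted via bpf_perf_event_output().
 */
#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static void on_sample(void *ctx, int cpu, void *data, __u32 size)
{
	fprintf(stderr, "cpu %d: sample of %u bytes\n", cpu, size);
}

static void on_lost(void *ctx, int cpu, __u64 cnt)
{
	fprintf(stderr, "cpu %d: lost %llu samples\n", cpu, (unsigned long long)cnt);
}

static int poll_perf_events(int map_fd)
{
	struct perf_buffer *pb;
	int err;

	/* 8 pages per CPU; page_cnt must be a power of two */
	pb = perf_buffer__new(map_fd, 8, on_sample, on_lost, NULL, NULL);
	if (!pb)
		return -errno;

	/* returns number of ready CPU buffers, or a negative error (e.g. -EINTR) */
	while ((err = perf_buffer__poll(pb, 100 /* ms */)) >= 0)
		;

	perf_buffer__free(pb);
	return err;
}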
+ */ +int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return cpu_buf->fd; +} + +int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, size_t *buf_size) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + *buf = cpu_buf->base; + *buf_size = pb->mmap_size; + return 0; +} + +/* + * Consume data from perf ring buffer corresponding to slot *buf_idx* in + * PERF_EVENT_ARRAY BPF map without waiting/polling. If there is no data to + * consume, do nothing and return success. + * Returns: + * - 0 on success; + * - <0 on failure. + */ +int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + return perf_buffer__process_records(pb, cpu_buf); +} + +int perf_buffer__consume(struct perf_buffer *pb) +{ + int i, err; + + for (i = 0; i < pb->cpu_cnt; i++) { + struct perf_cpu_buf *cpu_buf = pb->cpu_bufs[i]; + + if (!cpu_buf) + continue; + + err = perf_buffer__process_records(pb, cpu_buf); + if (err) { + pr_warn("perf_buffer: failed to process records in buffer #%d: %d\n", i, err); + return libbpf_err(err); + } + } + return 0; +} + +int bpf_program__set_attach_target(struct bpf_program *prog, + int attach_prog_fd, + const char *attach_func_name) +{ + int btf_obj_fd = 0, btf_id = 0, err; + + if (!prog || attach_prog_fd < 0) + return libbpf_err(-EINVAL); + + if (prog->obj->loaded) + return libbpf_err(-EINVAL); + + if (attach_prog_fd && !attach_func_name) { + /* remember attach_prog_fd and let bpf_program__load() find + * BTF ID during the program load + */ + prog->attach_prog_fd = attach_prog_fd; + return 0; + } + + if (attach_prog_fd) { + btf_id = libbpf_find_prog_btf_id(attach_func_name, + attach_prog_fd); + if (btf_id < 0) + return libbpf_err(btf_id); + } else { + if (!attach_func_name) + return libbpf_err(-EINVAL); + + /* load btf_vmlinux, if not yet */ + err = bpf_object__load_vmlinux_btf(prog->obj, true); + if (err) + return libbpf_err(err); + err = find_kernel_btf_id(prog->obj, attach_func_name, + prog->expected_attach_type, + &btf_obj_fd, &btf_id); + if (err) + return libbpf_err(err); + } + + prog->attach_btf_id = btf_id; + prog->attach_btf_obj_fd = btf_obj_fd; + prog->attach_prog_fd = attach_prog_fd; + return 0; +} + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) +{ + int err = 0, n, len, start, end = -1; + bool *tmp; + + *mask = NULL; + *mask_sz = 0; + + /* Each sub string separated by ',' has format \d+-\d+ or \d+ */ + while (*s) { + if (*s == ',' || *s == '\n') { + s++; + continue; + } + n = sscanf(s, "%d%n-%d%n", &start, &len, &end, &len); + if (n <= 0 || n > 2) { + pr_warn("Failed to get CPU range %s: %d\n", s, n); + err = -EINVAL; + goto cleanup; + } else if (n == 1) { + end = start; + } + if (start < 0 || start > end) { + pr_warn("Invalid CPU range [%d,%d] in %s\n", + start, end, s); + err = -EINVAL; + goto cleanup; + } + tmp = realloc(*mask, end + 1); + if (!tmp) { + err = -ENOMEM; + goto cleanup; + } + *mask = tmp; + memset(tmp + *mask_sz, 0, start - *mask_sz); + memset(tmp + start, 1, end - start + 1); + *mask_sz = end + 1; + 
s += len; + } + if (!*mask_sz) { + pr_warn("Empty CPU range\n"); + return -EINVAL; + } + return 0; +cleanup: + free(*mask); + *mask = NULL; + return err; +} + +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz) +{ + int fd, err = 0, len; + char buf[128]; + + fd = open(fcpu, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("Failed to open cpu mask file %s: %d\n", fcpu, err); + return err; + } + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len <= 0) { + err = len ? -errno : -EINVAL; + pr_warn("Failed to read cpu mask from %s: %d\n", fcpu, err); + return err; + } + if (len >= sizeof(buf)) { + pr_warn("CPU mask is too big in file %s\n", fcpu); + return -E2BIG; + } + buf[len] = '\0'; + + return parse_cpu_mask_str(buf, mask, mask_sz); +} + +int libbpf_num_possible_cpus(void) +{ + static const char *fcpu = "/sys/devices/system/cpu/possible"; + static int cpus; + int err, n, i, tmp_cpus; + bool *mask; + + tmp_cpus = READ_ONCE(cpus); + if (tmp_cpus > 0) + return tmp_cpus; + + err = parse_cpu_mask_file(fcpu, &mask, &n); + if (err) + return libbpf_err(err); + + tmp_cpus = 0; + for (i = 0; i < n; i++) { + if (mask[i]) + tmp_cpus++; + } + free(mask); + + WRITE_ONCE(cpus, tmp_cpus); + return tmp_cpus; +} + +static int populate_skeleton_maps(const struct bpf_object *obj, + struct bpf_map_skeleton *maps, + size_t map_cnt) +{ + int i; + + for (i = 0; i < map_cnt; i++) { + struct bpf_map **map = maps[i].map; + const char *name = maps[i].name; + void **mmaped = maps[i].mmaped; + + *map = bpf_object__find_map_by_name(obj, name); + if (!*map) { + pr_warn("failed to find skeleton map '%s'\n", name); + return -ESRCH; + } + + /* externs shouldn't be pre-setup from user code */ + if (mmaped && (*map)->libbpf_type != LIBBPF_MAP_KCONFIG) + *mmaped = (*map)->mmaped; + } + return 0; +} + +static int populate_skeleton_progs(const struct bpf_object *obj, + struct bpf_prog_skeleton *progs, + size_t prog_cnt) +{ + int i; + + for (i = 0; i < prog_cnt; i++) { + struct bpf_program **prog = progs[i].prog; + const char *name = progs[i].name; + + *prog = bpf_object__find_program_by_name(obj, name); + if (!*prog) { + pr_warn("failed to find skeleton program '%s'\n", name); + return -ESRCH; + } + } + return 0; +} + +int bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts) +{ + DECLARE_LIBBPF_OPTS(bpf_object_open_opts, skel_opts, + .object_name = s->name, + ); + struct bpf_object *obj; + int err; + + /* Attempt to preserve opts->object_name, unless overriden by user + * explicitly. Overwriting object name for skeletons is discouraged, + * as it breaks global data maps, because they contain object name + * prefix as their own map name prefix. When skeleton is generated, + * bpftool is making an assumption that this name will stay the same. 
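/*
 * Usage sketch (illustrative, not part of libbpf): the open/load/attach
 * skeleton helpers here are normally driven through a bpftool-generated
 * header. "myprog" is a placeholder object name; bpftool gen skeleton emits
 * thin myprog_bpf__*() wrappers around the bpf_object__*_skeleton() calls.
 */
#include "myprog.skel.h"	/* hypothetical generated header */

static int run_with_skeleton(void)
{
	struct myprog_bpf *skel;
	int err;

	skel = myprog_bpf__open();		/* bpf_object__open_skeleton() */
	if (!skel)
		return -1;
	err = myprog_bpf__load(skel);		/* bpf_object__load_skeleton() */
	if (!err)
		err = myprog_bpf__attach(skel);	/* bpf_object__attach_skeleton() */
	/* ... read skel->bss / skel->maps while attached ... */
	myprog_bpf__destroy(skel);		/* detach + close */
	return err;
}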
+ */ + if (opts) { + memcpy(&skel_opts, opts, sizeof(*opts)); + if (!opts->object_name) + skel_opts.object_name = s->name; + } + + obj = bpf_object__open_mem(s->data, s->data_sz, &skel_opts); + err = libbpf_get_error(obj); + if (err) { + pr_warn("failed to initialize skeleton BPF object '%s': %d\n", + s->name, err); + return libbpf_err(err); + } + + *s->obj = obj; + err = populate_skeleton_maps(obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate skeleton maps for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate skeleton progs for '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + return 0; +} + +int bpf_object__open_subskeleton(struct bpf_object_subskeleton *s) +{ + int err, len, var_idx, i; + const char *var_name; + const struct bpf_map *map; + struct btf *btf; + __u32 map_type_id; + const struct btf_type *map_type, *var_type; + const struct bpf_var_skeleton *var_skel; + struct btf_var_secinfo *var; + + if (!s->obj) + return libbpf_err(-EINVAL); + + btf = bpf_object__btf(s->obj); + if (!btf) { + pr_warn("subskeletons require BTF at runtime (object %s)\n", + bpf_object__name(s->obj)); + return libbpf_err(-errno); + } + + err = populate_skeleton_maps(s->obj, s->maps, s->map_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + err = populate_skeleton_progs(s->obj, s->progs, s->prog_cnt); + if (err) { + pr_warn("failed to populate subskeleton maps: %d\n", err); + return libbpf_err(err); + } + + for (var_idx = 0; var_idx < s->var_cnt; var_idx++) { + var_skel = &s->vars[var_idx]; + map = *var_skel->map; + map_type_id = bpf_map__btf_value_type_id(map); + map_type = btf__type_by_id(btf, map_type_id); + + if (!btf_is_datasec(map_type)) { + pr_warn("type for map '%1$s' is not a datasec: %2$s", + bpf_map__name(map), + __btf_kind_str(btf_kind(map_type))); + return libbpf_err(-EINVAL); + } + + len = btf_vlen(map_type); + var = btf_var_secinfos(map_type); + for (i = 0; i < len; i++, var++) { + var_type = btf__type_by_id(btf, var->type); + var_name = btf__name_by_offset(btf, var_type->name_off); + if (strcmp(var_name, var_skel->name) == 0) { + *var_skel->addr = map->mmaped + var->offset; + break; + } + } + } + return 0; +} + +void bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s) +{ + if (!s) + return; + free(s->maps); + free(s->progs); + free(s->vars); + free(s); +} + +int bpf_object__load_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + err = bpf_object__load(*s->obj); + if (err) { + pr_warn("failed to load BPF skeleton '%s': %d\n", s->name, err); + return libbpf_err(err); + } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map *map = *s->maps[i].map; + size_t mmap_sz = bpf_map_mmap_sz(map->def.value_size, map->def.max_entries); + int prot, map_fd = bpf_map__fd(map); + void **mmaped = s->maps[i].mmaped; + + if (!mmaped) + continue; + + if (!(map->def.map_flags & BPF_F_MMAPABLE)) { + *mmaped = NULL; + continue; + } + + if (map->def.map_flags & BPF_F_RDONLY_PROG) + prot = PROT_READ; + else + prot = PROT_READ | PROT_WRITE; + + /* Remap anonymous mmap()-ed "map initialization image" as + * a BPF map-backed mmap()-ed memory, but preserving the same + * memory address. 
This will cause kernel to change process' + * page table to point to a different piece of kernel memory, + * but from userspace point of view memory address (and its + * contents, being identical at this point) will stay the + * same. This mapping will be released by bpf_object__close() + * as per normal clean up procedure, so we don't need to worry + * about it from skeleton's clean up perspective. + */ + *mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0); + if (*mmaped == MAP_FAILED) { + err = -errno; + *mmaped = NULL; + pr_warn("failed to re-mmap() map '%s': %d\n", + bpf_map__name(map), err); + return libbpf_err(err); + } + } + + return 0; +} + +int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) +{ + int i, err; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_program *prog = *s->progs[i].prog; + struct bpf_link **link = s->progs[i].link; + + if (!prog->autoload || !prog->autoattach) + continue; + + /* auto-attaching not supported for this program */ + if (!prog->sec_def || !prog->sec_def->prog_attach_fn) + continue; + + /* if user already set the link manually, don't attempt auto-attach */ + if (*link) + continue; + + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, link); + if (err) { + pr_warn("prog '%s': failed to auto-attach: %d\n", + bpf_program__name(prog), err); + return libbpf_err(err); + } + + /* It's possible that for some SEC() definitions auto-attach + * is supported in some cases (e.g., if definition completely + * specifies target information), but is not in other cases. + * SEC("uprobe") is one such case. If user specified target + * binary and function name, such BPF program can be + * auto-attached. But if not, it shouldn't trigger skeleton's + * attach to fail. It should just be skipped. + * attach_fn signals such case with returning 0 (no error) and + * setting link to NULL. + */ + } + + return 0; +} + +void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) +{ + int i; + + for (i = 0; i < s->prog_cnt; i++) { + struct bpf_link **link = s->progs[i].link; + + bpf_link__destroy(*link); + *link = NULL; + } +} + +void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) +{ + if (!s) + return; + + if (s->progs) + bpf_object__detach_skeleton(s); + if (s->obj) + bpf_object__close(*s->obj); + free(s->maps); + free(s->progs); + free(s); +} diff --git a/third_party/libbpf/src/libbpf.h b/third_party/libbpf/src/libbpf.h new file mode 100644 index 0000000..10642ad --- /dev/null +++ b/third_party/libbpf/src/libbpf.h @@ -0,0 +1,1685 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common eBPF ELF object loading operations. + * + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. 
+ */ +#ifndef __LIBBPF_LIBBPF_H +#define __LIBBPF_LIBBPF_H + +#include +#include +#include +#include +#include // for size_t +#include + +#include "libbpf_common.h" +#include "libbpf_legacy.h" + +#ifdef __cplusplus +extern "C" { +#endif + +LIBBPF_API __u32 libbpf_major_version(void); +LIBBPF_API __u32 libbpf_minor_version(void); +LIBBPF_API const char *libbpf_version_string(void); + +enum libbpf_errno { + __LIBBPF_ERRNO__START = 4000, + + /* Something wrong in libelf */ + LIBBPF_ERRNO__LIBELF = __LIBBPF_ERRNO__START, + LIBBPF_ERRNO__FORMAT, /* BPF object format invalid */ + LIBBPF_ERRNO__KVERSION, /* Incorrect or no 'version' section */ + LIBBPF_ERRNO__ENDIAN, /* Endian mismatch */ + LIBBPF_ERRNO__INTERNAL, /* Internal error in libbpf */ + LIBBPF_ERRNO__RELOC, /* Relocation failed */ + LIBBPF_ERRNO__LOAD, /* Load program failure for unknown reason */ + LIBBPF_ERRNO__VERIFY, /* Kernel verifier blocks program loading */ + LIBBPF_ERRNO__PROG2BIG, /* Program too big */ + LIBBPF_ERRNO__KVER, /* Incorrect kernel version */ + LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ + LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ + LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ + LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */ + __LIBBPF_ERRNO__END, +}; + +LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); + +/** + * @brief **libbpf_bpf_attach_type_str()** converts the provided attach type + * value into a textual representation. + * @param t The attach type. + * @return Pointer to a static string identifying the attach type. NULL is + * returned for unknown **bpf_attach_type** values. + */ +LIBBPF_API const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t); + +/** + * @brief **libbpf_bpf_link_type_str()** converts the provided link type value + * into a textual representation. + * @param t The link type. + * @return Pointer to a static string identifying the link type. NULL is + * returned for unknown **bpf_link_type** values. + */ +LIBBPF_API const char *libbpf_bpf_link_type_str(enum bpf_link_type t); + +/** + * @brief **libbpf_bpf_map_type_str()** converts the provided map type value + * into a textual representation. + * @param t The map type. + * @return Pointer to a static string identifying the map type. NULL is + * returned for unknown **bpf_map_type** values. + */ +LIBBPF_API const char *libbpf_bpf_map_type_str(enum bpf_map_type t); + +/** + * @brief **libbpf_bpf_prog_type_str()** converts the provided program type + * value into a textual representation. + * @param t The program type. + * @return Pointer to a static string identifying the program type. NULL is + * returned for unknown **bpf_prog_type** values. + */ +LIBBPF_API const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t); + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, + const char *, va_list ap); + +/** + * @brief **libbpf_set_print()** sets user-provided log callback function to + * be used for libbpf warnings and informational messages. + * @param fn The log print function. If NULL, libbpf won't print anything. + * @return Pointer to old print function. + * + * This function is thread-safe. 
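/*
 * Usage sketch (illustrative, not part of libbpf): install a log callback
 * that forwards warnings/info to stderr and drops debug messages.
 */
#include <stdarg.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static int my_libbpf_print(enum libbpf_print_level level, const char *fmt,
			   va_list args)
{
	if (level == LIBBPF_DEBUG)
		return 0;
	return vfprintf(stderr, fmt, args);
}

/* call once, early in main(): libbpf_set_print(my_libbpf_print); */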
+ */ +LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); + +/* Hide internal to user */ +struct bpf_object; + +struct bpf_object_open_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* object name override, if provided: + * - for object open from file, this will override setting object + * name from file path's base name; + * - for object open from memory buffer, this will specify an object + * name and will override default "-" name; + */ + const char *object_name; + /* parse map definitions non-strictly, allowing extra attributes/data */ + bool relaxed_maps; + /* maps that set the 'pinning' attribute in their definition will have + * their pin_path attribute set to a file in this directory, and be + * auto-pinned to that path on load; defaults to "/sys/fs/bpf". + */ + const char *pin_root_path; + + __u32 :32; /* stub out now removed attach_prog_fd */ + + /* Additional kernel config content that augments and overrides + * system Kconfig for CONFIG_xxx externs. + */ + const char *kconfig; + /* Path to the custom BTF to be used for BPF CO-RE relocations. + * This custom BTF completely replaces the use of vmlinux BTF + * for the purpose of CO-RE relocations. + * NOTE: any other BPF feature (e.g., fentry/fexit programs, + * struct_ops, etc) will need actual kernel BTF at /sys/kernel/btf/vmlinux. + */ + const char *btf_custom_path; + /* Pointer to a buffer for storing kernel logs for applicable BPF + * commands. Valid kernel_log_size has to be specified as well and are + * passed-through to bpf() syscall. Keep in mind that kernel might + * fail operation with -ENOSPC error if provided buffer is too small + * to contain entire log output. + * See the comment below for kernel_log_level for interaction between + * log_buf and log_level settings. + * + * If specified, this log buffer will be passed for: + * - each BPF progral load (BPF_PROG_LOAD) attempt, unless overriden + * with bpf_program__set_log() on per-program level, to get + * BPF verifier log output. + * - during BPF object's BTF load into kernel (BPF_BTF_LOAD) to get + * BTF sanity checking log. + * + * Each BPF command (BPF_BTF_LOAD or BPF_PROG_LOAD) will overwrite + * previous contents, so if you need more fine-grained control, set + * per-program buffer with bpf_program__set_log_buf() to preserve each + * individual program's verification log. Keep using kernel_log_buf + * for BTF verification log, if necessary. + */ + char *kernel_log_buf; + size_t kernel_log_size; + /* + * Log level can be set independently from log buffer. Log_level=0 + * means that libbpf will attempt loading BTF or program without any + * logging requested, but will retry with either its own or custom log + * buffer, if provided, and log_level=1 on any error. + * And vice versa, setting log_level>0 will request BTF or prog + * loading with verbose log from the first attempt (and as such also + * for successfully loaded BTF or program), and the actual log buffer + * could be either libbpf's own auto-allocated log buffer, if + * kernel_log_buffer is NULL, or user-provided custom kernel_log_buf. + * If user didn't provide custom log buffer, libbpf will emit captured + * logs through its print callback. + */ + __u32 kernel_log_level; + + size_t :0; +}; +#define bpf_object_open_opts__last_field kernel_log_level + +/** + * @brief **bpf_object__open()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path. 
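/*
 * Usage sketch (illustrative, not part of libbpf): open an object file with
 * non-default open options; kernel_log_level = 1 requests a verbose verifier
 * log from the first load attempt. The path is a placeholder.
 */
#include <bpf/libbpf.h>

static struct bpf_object *open_with_verbose_log(void)
{
	LIBBPF_OPTS(bpf_object_open_opts, opts,
		.kernel_log_level = 1,
	);

	return bpf_object__open_file("myprog.bpf.o", &opts); /* NULL + errno on error */
}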
+ * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object *bpf_object__open(const char *path); + +/** + * @brief **bpf_object__open_file()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path + * @param opts options for how to load the bpf object, this parameter is + * optional and can be set to NULL + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__open_mem()** creates a bpf_object by reading + * the BPF objects raw bytes from a memory buffer containing a valid + * BPF ELF object file. + * @param obj_buf pointer to the buffer containing ELF file bytes + * @param obj_buf_sz number of bytes in the buffer + * @param opts options for how to load the bpf object + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_object * +bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, + const struct bpf_object_open_opts *opts); + +/** + * @brief **bpf_object__load()** loads BPF object into kernel. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** APIs + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); + +/** + * @brief **bpf_object__close()** closes a BPF object and releases all + * resources. + * @param obj Pointer to a valid BPF object + */ +LIBBPF_API void bpf_object__close(struct bpf_object *obj); + +/** + * @brief **bpf_object__pin_maps()** pins each map contained within + * the BPF object at the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where maps should be pinned. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__pin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. + */ +LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path); + +/** + * @brief **bpf_object__unpin_maps()** unpins each map contained within + * the BPF object found in the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where pinned maps should be searched for. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__unpin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. 
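/*
 * Usage sketch (illustrative, not part of libbpf): pin every map of a loaded
 * object under a bpffs directory, then remove the pins. The directory is a
 * placeholder and must reside on a mounted bpffs.
 */
#include <bpf/libbpf.h>

static int pin_then_unpin(struct bpf_object *obj)
{
	int err = bpf_object__pin_maps(obj, "/sys/fs/bpf/myprog");

	if (err)
		return err;
	/* pinned maps now outlive this process until unpinned */
	return bpf_object__unpin_maps(obj, "/sys/fs/bpf/myprog");
}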
+ */ +LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, + const char *path); +LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); + +LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); +LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); + +struct btf; +LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); +LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); + +LIBBPF_API struct bpf_program * +bpf_object__find_program_by_name(const struct bpf_object *obj, + const char *name); + +LIBBPF_API int +libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, + enum bpf_attach_type *expected_attach_type); +LIBBPF_API int libbpf_attach_type_by_name(const char *name, + enum bpf_attach_type *attach_type); +LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, + enum bpf_attach_type attach_type); + +/* Accessors of bpf_program */ +struct bpf_program; + +LIBBPF_API struct bpf_program * +bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); + +#define bpf_object__for_each_program(pos, obj) \ + for ((pos) = bpf_object__next_program((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_program((obj), (pos))) + +LIBBPF_API struct bpf_program * +bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); + +LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, + __u32 ifindex); + +LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); +LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); +LIBBPF_API bool bpf_program__autoattach(const struct bpf_program *prog); +LIBBPF_API void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach); + +struct bpf_insn; + +/** + * @brief **bpf_program__insns()** gives read-only access to BPF program's + * underlying BPF instructions. + * @param prog BPF program for which to return instructions + * @return a pointer to an array of BPF instructions that belong to the + * specified BPF program + * + * Returned pointer is always valid and not NULL. Number of `struct bpf_insn` + * pointed to can be fetched using **bpf_program__insn_cnt()** API. + * + * Keep in mind, libbpf can modify and append/delete BPF program's + * instructions as it processes BPF object file and prepares everything for + * uploading into the kernel. So depending on the point in BPF object + * lifetime, **bpf_program__insns()** can return different sets of + * instructions. As an example, during BPF object load phase BPF program + * instructions will be CO-RE-relocated, BPF subprograms instructions will be + * appended, ldimm64 instructions will have FDs embedded, etc. So instructions + * returned before **bpf_object__load()** and after it might be quite + * different. + */ +LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_insns()** can set BPF program's underlying + * BPF instructions. 
+ * + * WARNING: This is a very advanced libbpf API and users need to know + * what they are doing. This should be used from prog_prepare_load_fn + * callback only. + * + * @param prog BPF program for which to return instructions + * @param new_insns a pointer to an array of BPF instructions + * @param new_insn_cnt number of `struct bpf_insn`'s that form + * specified BPF program + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt); + +/** + * @brief **bpf_program__insn_cnt()** returns number of `struct bpf_insn`'s + * that form specified BPF program. + * @param prog BPF program for which to return number of BPF instructions + * + * See **bpf_program__insns()** documentation for notes on how libbpf can + * change instructions and their count during different phases of + * **bpf_object** lifetime. + */ +LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); + +LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); + +/** + * @brief **bpf_program__pin()** pins the BPF program to a file + * in the BPF FS specified by a path. This increments the programs + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param prog BPF program to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path); + +/** + * @brief **bpf_program__unpin()** unpins the BPF program from a file + * in the BPFFS specified by a path. This decrements the programs + * reference count. + * + * The file pinning the BPF program can also be unlinked by a different + * process in which case this function will return an error. + * + * @param prog BPF program to unpin + * @param path file path to the pin in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__unpin(struct bpf_program *prog, const char *path); +LIBBPF_API void bpf_program__unload(struct bpf_program *prog); + +struct bpf_link; + +LIBBPF_API struct bpf_link *bpf_link__open(const char *path); +LIBBPF_API int bpf_link__fd(const struct bpf_link *link); +LIBBPF_API const char *bpf_link__pin_path(const struct bpf_link *link); +/** + * @brief **bpf_link__pin()** pins the BPF link to a file + * in the BPF FS specified by a path. This increments the links + * reference count, allowing it to stay loaded after the process + * which loaded it has exited. + * + * @param link BPF link to pin, must already be loaded + * @param path file path in a BPF file system + * @return 0, on success; negative error code, otherwise + */ + +LIBBPF_API int bpf_link__pin(struct bpf_link *link, const char *path); + +/** + * @brief **bpf_link__unpin()** unpins the BPF link from a file + * in the BPFFS specified by a path. This decrements the links + * reference count. + * + * The file pinning the BPF link can also be unlinked by a different + * process in which case this function will return an error. 
+ * + * @param prog BPF program to unpin + * @param path file path to the pin in a BPF file system + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_link__unpin(struct bpf_link *link); +LIBBPF_API int bpf_link__update_program(struct bpf_link *link, + struct bpf_program *prog); +LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); +LIBBPF_API int bpf_link__detach(struct bpf_link *link); +LIBBPF_API int bpf_link__destroy(struct bpf_link *link); + +/** + * @brief **bpf_program__attach()** is a generic function for attaching + * a BPF program based on auto-detection of program type, attach type, + * and extra paremeters, where applicable. + * + * @param prog BPF program to attach + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + * + * This is supported for: + * - kprobe/kretprobe (depends on SEC() definition) + * - uprobe/uretprobe (depends on SEC() definition) + * - tracepoint + * - raw tracepoint + * - tracing programs (typed raw TP/fentry/fexit/fmod_ret) + */ +LIBBPF_API struct bpf_link * +bpf_program__attach(const struct bpf_program *prog); + +struct bpf_perf_event_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* don't use BPF link when attach BPF program */ + bool force_ioctl_attach; + size_t :0; +}; +#define bpf_perf_event_opts__last_field force_ioctl_attach + +LIBBPF_API struct bpf_link * +bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd); + +LIBBPF_API struct bpf_link * +bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, + const struct bpf_perf_event_opts *opts); + +/** + * enum probe_attach_mode - the mode to attach kprobe/uprobe + * + * force libbpf to attach kprobe/uprobe in specific mode, -ENOTSUP will + * be returned if it is not supported by the kernel. 
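/*
 * Usage sketch (illustrative, not part of libbpf): force a kretprobe to be
 * attached through the legacy debugfs/tracefs interface. The kernel function
 * name is only an example; on failure NULL is returned with errno set
 * (ENOTSUP if the requested mode is unsupported).
 */
#include <bpf/libbpf.h>

static struct bpf_link *attach_legacy_kretprobe(struct bpf_program *prog)
{
	LIBBPF_OPTS(bpf_kprobe_opts, kopts,
		.retprobe = true,
		.attach_mode = PROBE_ATTACH_MODE_LEGACY,
	);

	return bpf_program__attach_kprobe_opts(prog, "do_unlinkat", &kopts);
}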
+ */ +enum probe_attach_mode { + /* attach probe in latest supported mode by kernel */ + PROBE_ATTACH_MODE_DEFAULT = 0, + /* attach probe in legacy mode, using debugfs/tracefs */ + PROBE_ATTACH_MODE_LEGACY, + /* create perf event with perf_event_open() syscall */ + PROBE_ATTACH_MODE_PERF, + /* attach probe with BPF link */ + PROBE_ATTACH_MODE_LINK, +}; + +struct bpf_kprobe_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* function's offset to install kprobe to */ + size_t offset; + /* kprobe is return probe */ + bool retprobe; + /* kprobe attach mode */ + enum probe_attach_mode attach_mode; + size_t :0; +}; +#define bpf_kprobe_opts__last_field attach_mode + +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe, + const char *func_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_opts(const struct bpf_program *prog, + const char *func_name, + const struct bpf_kprobe_opts *opts); + +struct bpf_kprobe_multi_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* array of function symbols to attach */ + const char **syms; + /* array of function addresses to attach */ + const unsigned long *addrs; + /* array of user-provided values fetchable through bpf_get_attach_cookie */ + const __u64 *cookies; + /* number of elements in syms/addrs/cookies arrays */ + size_t cnt; + /* create return kprobes */ + bool retprobe; + size_t :0; +}; + +#define bpf_kprobe_multi_opts__last_field retprobe + +LIBBPF_API struct bpf_link * +bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, + const char *pattern, + const struct bpf_kprobe_multi_opts *opts); + +struct bpf_ksyscall_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* attach as return probe? */ + bool retprobe; + size_t :0; +}; +#define bpf_ksyscall_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_ksyscall()** attaches a BPF program + * to kernel syscall handler of a specified syscall. Optionally it's possible + * to request to install retprobe that will be triggered at syscall exit. It's + * also possible to associate BPF cookie (though options). + * + * Libbpf automatically will determine correct full kernel function name, + * which depending on system architecture and kernel version/configuration + * could be of the form ___sys_ or __se_sys_, and will + * attach specified program using kprobe/kretprobe mechanism. + * + * **bpf_program__attach_ksyscall()** is an API counterpart of declarative + * **SEC("ksyscall/")** annotation of BPF programs. + * + * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do + * not handle all the calling convention quirks for mmap(), clone() and compat + * syscalls. It also only attaches to "native" syscall interfaces. If host + * system supports compat syscalls or defines 32-bit syscalls in 64-bit + * kernel, such syscall interfaces won't be attached to by libbpf. + * + * These limitations may or may not change in the future. Therefore it is + * recommended to use SEC("kprobe") for these syscalls or if working with + * compat and 32-bit interfaces is required. 
+ * + * @param prog BPF program to attach + * @param syscall_name Symbolic name of the syscall (e.g., "bpf") + * @param opts Additional options (see **struct bpf_ksyscall_opts**) + * @return Reference to the newly created BPF link; or NULL is returned on + * error, error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts); + +struct bpf_uprobe_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* offset of kernel reference counted USDT semaphore, added in + * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + size_t ref_ctr_offset; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* uprobe is return probe, invoked at function return time */ + bool retprobe; + /* Function name to attach to. Could be an unqualified ("abc") or library-qualified + * "abc@LIBXYZ" name. To specify function entry, func_name should be set while + * func_offset argument to bpf_prog__attach_uprobe_opts() should be 0. To trace an + * offset within a function, specify func_name and use func_offset argument to specify + * offset within the function. Shared library functions must specify the shared library + * binary_path. + */ + const char *func_name; + /* uprobe attach mode */ + enum probe_attach_mode attach_mode; + size_t :0; +}; +#define bpf_uprobe_opts__last_field attach_mode + +/** + * @brief **bpf_program__attach_uprobe()** attaches a BPF program + * to the userspace function which is found by binary path and + * offset. You can optionally specify a particular proccess to attach + * to. You can also optionally attach the program to the function + * exit instead of entry. + * + * @param prog BPF program to attach + * @param retprobe Attach to function exit + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe(const struct bpf_program *prog, bool retprobe, + pid_t pid, const char *binary_path, + size_t func_offset); + +/** + * @brief **bpf_program__attach_uprobe_opts()** is just like + * bpf_program__attach_uprobe() except with a options struct + * for various configurations. 
+ * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains the function symbol + * @param func_offset Offset within the binary of the function symbol + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, + const char *binary_path, size_t func_offset, + const struct bpf_uprobe_opts *opts); + +struct bpf_usdt_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value accessible through usdt_cookie() */ + __u64 usdt_cookie; + size_t :0; +}; +#define bpf_usdt_opts__last_field usdt_cookie + +/** + * @brief **bpf_program__attach_usdt()** is just like + * bpf_program__attach_uprobe_opts() except it covers USDT (User-space + * Statically Defined Tracepoint) attachment, instead of attaching to + * user-space function entry or exit. + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary that contains provided USDT probe + * @param usdt_provider USDT provider name + * @param usdt_name USDT probe name + * @param opts Options for altering program attachment + * @return Reference to the newly created BPF link; or NULL is returned on error, + * error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_usdt(const struct bpf_program *prog, + pid_t pid, const char *binary_path, + const char *usdt_provider, const char *usdt_name, + const struct bpf_usdt_opts *opts); + +struct bpf_tracepoint_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; +}; +#define bpf_tracepoint_opts__last_field bpf_cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, + const char *tp_category, + const char *tp_name, + const struct bpf_tracepoint_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name); + +struct bpf_trace_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 cookie; +}; +#define bpf_trace_opts__last_field cookie + +LIBBPF_API struct bpf_link * +bpf_program__attach_trace(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_trace_opts(const struct bpf_program *prog, const struct bpf_trace_opts *opts); + +LIBBPF_API struct bpf_link * +bpf_program__attach_lsm(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); +LIBBPF_API struct bpf_link * +bpf_program__attach_freplace(const struct bpf_program *prog, + int target_fd, const 
char *attach_func_name); + +struct bpf_netfilter_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; +}; +#define bpf_netfilter_opts__last_field flags + +LIBBPF_API struct bpf_link * +bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts); + +struct bpf_map; + +LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map); +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map); + +struct bpf_iter_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + union bpf_iter_link_info *link_info; + __u32 link_info_len; +}; +#define bpf_iter_attach_opts__last_field link_info_len + +LIBBPF_API struct bpf_link * +bpf_program__attach_iter(const struct bpf_program *prog, + const struct bpf_iter_attach_opts *opts); + +LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_type()** sets the program + * type of the passed BPF program. + * @param prog BPF program to set the program type for + * @param type program type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int bpf_program__set_type(struct bpf_program *prog, + enum bpf_prog_type type); + +LIBBPF_API enum bpf_attach_type +bpf_program__expected_attach_type(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_expected_attach_type()** sets the + * attach type of the passed BPF program. This is used for + * auto-detection of attachment when programs are loaded. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error. An error occurs + * if the object is already loaded. + * + * This must be called before the BPF object is loaded, + * otherwise it has no effect and an error is returned. + */ +LIBBPF_API int +bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type); + +LIBBPF_API __u32 bpf_program__flags(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_flags(struct bpf_program *prog, __u32 flags); + +/* Per-program log level and log buffer getters/setters. + * See bpf_object_open_opts comments regarding log_level and log_buf + * interactions. + */ +LIBBPF_API __u32 bpf_program__log_level(const struct bpf_program *prog); +LIBBPF_API int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level); +LIBBPF_API const char *bpf_program__log_buf(const struct bpf_program *prog, size_t *log_size); +LIBBPF_API int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log_size); + +/** + * @brief **bpf_program__set_attach_target()** sets BTF-based attach target + * for supported BPF program types: + * - BTF-aware raw tracepoints (tp_btf); + * - fentry/fexit/fmod_ret; + * - lsm; + * - freplace. + * @param prog BPF program to set the attach type for + * @param type attach type to set the BPF map to have + * @return error code; or 0 if no error occurred. 
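/*
 * Usage sketch (illustrative, not part of libbpf): resolve a fentry/fexit
 * program's target in kernel BTF before load, then attach it. The function
 * name is only an example; attach_prog_fd = 0 means "look up the name in
 * kernel (vmlinux/module) BTF".
 */
#include <bpf/libbpf.h>

static struct bpf_link *load_and_attach_fentry(struct bpf_object *obj,
					       struct bpf_program *prog)
{
	int err;

	err = bpf_program__set_attach_target(prog, 0, "do_unlinkat");
	if (err)
		return NULL;
	err = bpf_object__load(obj);	/* must come after set_attach_target() */
	if (err)
		return NULL;
	return bpf_program__attach_trace(prog);	/* NULL + errno on error */
}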
+ */ +LIBBPF_API int +bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, + const char *attach_func_name); + +/** + * @brief **bpf_object__find_map_by_name()** returns BPF map of + * the given name, if it exists within the passed BPF object + * @param obj BPF object + * @param name name of the BPF map + * @return BPF map instance, if such map exists within the BPF object; + * or NULL otherwise. + */ +LIBBPF_API struct bpf_map * +bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API int +bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); + +LIBBPF_API struct bpf_map * +bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); + +#define bpf_object__for_each_map(pos, obj) \ + for ((pos) = bpf_object__next_map((obj), NULL); \ + (pos) != NULL; \ + (pos) = bpf_object__next_map((obj), (pos))) +#define bpf_map__for_each bpf_object__for_each_map + +LIBBPF_API struct bpf_map * +bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); + +/** + * @brief **bpf_map__set_autocreate()** sets whether libbpf has to auto-create + * BPF map during BPF object load phase. + * @param map the BPF map instance + * @param autocreate whether to create BPF map during BPF object load + * @return 0 on success; -EBUSY if BPF object was already loaded + * + * **bpf_map__set_autocreate()** allows to opt-out from libbpf auto-creating + * BPF map. By default, libbpf will attempt to create every single BPF map + * defined in BPF object file using BPF_MAP_CREATE command of bpf() syscall + * and fill in map FD in BPF instructions. + * + * This API allows to opt-out of this process for specific map instance. This + * can be useful if host kernel doesn't support such BPF map type or used + * combination of flags and user application wants to avoid creating such + * a map in the first place. User is still responsible to make sure that their + * BPF-side code that expects to use such missing BPF map is recognized by BPF + * verifier as dead code, otherwise BPF verifier will reject such BPF program. 
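/*
 * Usage sketch (illustrative, not part of libbpf): opt out of creating a map
 * that the running kernel may not support; must be done before
 * bpf_object__load(). The map name is a placeholder.
 */
#include <errno.h>
#include <bpf/libbpf.h>

static int skip_optional_map(struct bpf_object *obj)
{
	struct bpf_map *map = bpf_object__find_map_by_name(obj, "optional_map");

	if (!map)
		return -ESRCH;
	return bpf_map__set_autocreate(map, false);
}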
+ */ +LIBBPF_API int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate); +LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); + +/** + * @brief **bpf_map__fd()** gets the file descriptor of the passed + * BPF map + * @param map the BPF map instance + * @return the file descriptor; or -EINVAL in case of an error + */ +LIBBPF_API int bpf_map__fd(const struct bpf_map *map); +LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +/* get map name */ +LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); +/* get/set map type */ +LIBBPF_API enum bpf_map_type bpf_map__type(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); +/* get/set map size (max_entries) */ +LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); +/* get/set map flags */ +LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); +/* get/set map NUMA node */ +LIBBPF_API __u32 bpf_map__numa_node(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node); +/* get/set map key size */ +LIBBPF_API __u32 bpf_map__key_size(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_key_size(struct bpf_map *map, __u32 size); +/* get map value size */ +LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); +/** + * @brief **bpf_map__set_value_size()** sets map value size. + * @param map the BPF map instance + * @return 0, on success; negative error, otherwise + * + * There is a special case for maps with associated memory-mapped regions, like + * the global data section maps (bss, data, rodata). When this function is used + * on such a map, the mapped region is resized. Afterward, an attempt is made to + * adjust the corresponding BTF info. This attempt is best-effort and can only + * succeed if the last variable of the data section map is an array. The array + * BTF type is replaced by a new BTF array type with a different length. + * Any previously existing pointers returned from bpf_map__initial_value() or + * corresponding data section skeleton pointer must be reinitialized. 
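/* Illustrative usage sketch, not part of the vendored libbpf sources: tuning
 * maps between open and load with the setters above - opting one map out of
 * auto-creation and resizing another. The map names are hypothetical.
 */
#include <stdbool.h>
#include <bpf/libbpf.h>

static int tune_maps(struct bpf_object *obj)
{
	struct bpf_map *optional_map, *events_map;
	int err;

	/* must run after bpf_object__open*() and before bpf_object__load() */
	optional_map = bpf_object__find_map_by_name(obj, "optional_map");
	events_map = bpf_object__find_map_by_name(obj, "events");
	if (!optional_map || !events_map)
		return -1;

	err = bpf_map__set_autocreate(optional_map, false);
	if (err)
		return err;

	return bpf_map__set_max_entries(events_map, 16384);
}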
+ */ +LIBBPF_API int bpf_map__set_value_size(struct bpf_map *map, __u32 size); +/* get map key/value BTF type IDs */ +LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); +LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map); +/* get/set map if_index */ +LIBBPF_API __u32 bpf_map__ifindex(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); +/* get/set map map_extra flags */ +LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); +LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); + +LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, + const void *data, size_t size); +LIBBPF_API void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); + +/** + * @brief **bpf_map__is_internal()** tells the caller whether or not the + * passed map is a special map created by libbpf automatically for things like + * global variables, __ksym externs, Kconfig values, etc + * @param map the bpf_map + * @return true, if the map is an internal map; false, otherwise + */ +LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); + +/** + * @brief **bpf_map__set_pin_path()** sets the path attribute that tells where the + * BPF map should be pinned. This does not actually create the 'pin'. + * @param map The bpf_map + * @param path The path + * @return 0, on success; negative error, otherwise + */ +LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__pin_path()** gets the path attribute that tells where the + * BPF map should be pinned. + * @param map The bpf_map + * @return The path string; which can be NULL + */ +LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); + +/** + * @brief **bpf_map__is_pinned()** tells the caller whether or not the + * passed map has been pinned via a 'pin' file. + * @param map The bpf_map + * @return true, if the map is pinned; false, otherwise + */ +LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); + +/** + * @brief **bpf_map__pin()** creates a file that serves as a 'pin' + * for the BPF map. This increments the reference count on the + * BPF map which will keep the BPF map loaded even after the + * userspace process which loaded it has exited. + * @param map The bpf_map to pin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * If `path` is NULL the maps `pin_path` attribute will be used. If this is + * also NULL, an error will be returned and the map will not be pinned. + */ +LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__unpin()** removes the file that serves as a + * 'pin' for the BPF map. + * @param map The bpf_map to unpin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * The `path` parameter can be NULL, in which case the `pin_path` + * map attribute is unpinned. If both the `path` parameter and + * `pin_path` map attribute are set, they must be equal. + */ +LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); + +LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); +LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); + +/** + * @brief **bpf_map__lookup_elem()** allows to lookup BPF map value + * corresponding to provided key. 
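/* Illustrative usage sketch, not part of the vendored libbpf sources: pinning a
 * map under bpffs so it outlives the loading process. Assumes the object is
 * already loaded and bpffs is mounted at /sys/fs/bpf; the pin path is
 * hypothetical.
 */
#include <bpf/libbpf.h>

static int pin_map(struct bpf_map *map)
{
	int err;

	err = bpf_map__set_pin_path(map, "/sys/fs/bpf/my_map");
	if (err)
		return err;

	if (bpf_map__is_pinned(map))
		return 0;

	/* NULL path => fall back to the pin_path attribute set above */
	return bpf_map__pin(map, NULL);
}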
+ * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_elem()** is high-level equivalent of + * **bpf_map_lookup_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__update_elem()** allows to insert or update value in BPF + * map that corresponds to provided key. + * @param map BPF map to insert to or update element in + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory containing bytes of the value + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__update_elem()** is high-level equivalent of + * **bpf_map_update_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__delete_elem()** allows to delete element in BPF map that + * corresponds to provided key. + * @param map BPF map to delete element from + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__delete_elem()** is high-level equivalent of + * **bpf_map_delete_elem()** API with added check for key size. + */ +LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags); + +/** + * @brief **bpf_map__lookup_and_delete_elem()** allows to lookup BPF map value + * corresponding to provided key and atomically delete it afterwards. 
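/* Illustrative usage sketch, not part of the vendored libbpf sources: writing,
 * reading back and deleting one entry through the size-checked wrappers above.
 * Assumes a hypothetical non-per-CPU map with 4-byte keys and 8-byte values.
 */
#include <stdio.h>
#include <bpf/libbpf.h>

static int update_then_lookup(struct bpf_map *map)
{
	__u32 key = 42;
	__u64 value = 1, out = 0;
	int err;

	err = bpf_map__update_elem(map, &key, sizeof(key), &value, sizeof(value), BPF_ANY);
	if (err)
		return err;

	err = bpf_map__lookup_elem(map, &key, sizeof(key), &out, sizeof(out), 0);
	if (err)
		return err; /* e.g. -ENOENT if the key vanished, -EINVAL on size mismatch */

	printf("key %u -> %llu\n", key, (unsigned long long)out);
	return bpf_map__delete_elem(map, &key, sizeof(key), 0);
}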
+ * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of + * **bpf_map_lookup_and_delete_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__get_next_key()** allows to iterate BPF map keys by + * fetching next key that follows current key. + * @param map BPF map to fetch next key from + * @param cur_key pointer to memory containing bytes of current key or NULL to + * fetch the first key + * @param next_key pointer to memory to write next key into + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @return 0, on success; -ENOENT if **cur_key** is the last key in BPF map; + * negative error, otherwise + * + * **bpf_map__get_next_key()** is high-level equivalent of + * **bpf_map_get_next_key()** API with added check for key size. 
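/* Illustrative usage sketch, not part of the vendored libbpf sources: iterating
 * all keys of a map with bpf_map__get_next_key(), assuming hypothetical 4-byte
 * keys. NULL as cur_key fetches the first key; -ENOENT marks the end.
 */
#include <errno.h>
#include <stdio.h>
#include <bpf/libbpf.h>

static int dump_keys(const struct bpf_map *map)
{
	__u32 cur, next;
	int err;

	err = bpf_map__get_next_key(map, NULL, &next, sizeof(next));
	while (!err) {
		printf("key: %u\n", next);
		cur = next;
		err = bpf_map__get_next_key(map, &cur, &next, sizeof(next));
	}

	return err == -ENOENT ? 0 : err;
}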
+ */ +LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz); + +struct bpf_xdp_set_link_opts { + size_t sz; + int old_fd; + size_t :0; +}; +#define bpf_xdp_set_link_opts__last_field old_fd + +struct bpf_xdp_attach_opts { + size_t sz; + int old_prog_fd; + size_t :0; +}; +#define bpf_xdp_attach_opts__last_field old_prog_fd + +struct bpf_xdp_query_opts { + size_t sz; + __u32 prog_id; /* output */ + __u32 drv_prog_id; /* output */ + __u32 hw_prog_id; /* output */ + __u32 skb_prog_id; /* output */ + __u8 attach_mode; /* output */ + __u64 feature_flags; /* output */ + size_t :0; +}; +#define bpf_xdp_query_opts__last_field feature_flags + +LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_detach(int ifindex, __u32 flags, + const struct bpf_xdp_attach_opts *opts); +LIBBPF_API int bpf_xdp_query(int ifindex, int flags, struct bpf_xdp_query_opts *opts); +LIBBPF_API int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id); + +/* TC related API */ +enum bpf_tc_attach_point { + BPF_TC_INGRESS = 1 << 0, + BPF_TC_EGRESS = 1 << 1, + BPF_TC_CUSTOM = 1 << 2, +}; + +#define BPF_TC_PARENT(a, b) \ + ((((a) << 16) & 0xFFFF0000U) | ((b) & 0x0000FFFFU)) + +enum bpf_tc_flags { + BPF_TC_F_REPLACE = 1 << 0, +}; + +struct bpf_tc_hook { + size_t sz; + int ifindex; + enum bpf_tc_attach_point attach_point; + __u32 parent; + size_t :0; +}; +#define bpf_tc_hook__last_field parent + +struct bpf_tc_opts { + size_t sz; + int prog_fd; + __u32 flags; + __u32 prog_id; + __u32 handle; + __u32 priority; + size_t :0; +}; +#define bpf_tc_opts__last_field priority + +LIBBPF_API int bpf_tc_hook_create(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_hook_destroy(struct bpf_tc_hook *hook); +LIBBPF_API int bpf_tc_attach(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts); +LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, + struct bpf_tc_opts *opts); + +/* Ring buffer APIs */ +struct ring_buffer; +struct user_ring_buffer; + +typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); + +struct ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define ring_buffer_opts__last_field sz + +LIBBPF_API struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts); +LIBBPF_API void ring_buffer__free(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx); +LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); +LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); + +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/** + * @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. 
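/* Illustrative usage sketch, not part of the vendored libbpf sources: consuming
 * a BPF_MAP_TYPE_RINGBUF map with the ring_buffer API above. The event layout
 * (struct event) is hypothetical and must match what the BPF side submits.
 */
#include <stdio.h>
#include <bpf/libbpf.h>

struct event { __u32 pid; char comm[16]; };

static int handle_event(void *ctx, void *data, size_t size)
{
	const struct event *e = data;

	if (size < sizeof(*e))
		return 0;
	printf("pid=%u comm=%s\n", e->pid, e->comm);
	return 0; /* non-zero would abort ring_buffer__poll()/consume() */
}

static int consume_ringbuf(struct bpf_map *rb_map)
{
	struct ring_buffer *rb;
	int err = 0;

	rb = ring_buffer__new(bpf_map__fd(rb_map), handle_event, NULL, NULL);
	if (!rb)
		return -1;

	while (err >= 0) /* poll returns the number of consumed records */
		err = ring_buffer__poll(rb, 100 /* timeout, ms */);

	ring_buffer__free(rb);
	return err;
}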
+ */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/** + * @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/** + * @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. 
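/* Illustrative usage sketch, not part of the vendored libbpf sources: producing
 * one sample into a BPF_MAP_TYPE_USER_RINGBUF map. The sample layout is
 * hypothetical; a BPF program would drain it with bpf_user_ringbuf_drain().
 */
#include <errno.h>
#include <bpf/libbpf.h>

struct request { __u32 op; __u32 arg; };

static int send_request(struct bpf_map *urb_map, __u32 op, __u32 arg)
{
	struct user_ring_buffer *urb;
	struct request *req;
	int err = 0;

	urb = user_ring_buffer__new(bpf_map__fd(urb_map), NULL);
	if (!urb)
		return -errno;

	req = user_ring_buffer__reserve(urb, sizeof(*req));
	if (!req) {
		err = -errno; /* E2BIG, ENOSPC, ... as documented above */
	} else {
		req->op = op;
		req->arg = arg;
		user_ring_buffer__submit(urb, req);
	}

	user_ring_buffer__free(urb);
	return err;
}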
+ */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/** + * @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + +/* Perf buffer APIs */ +struct perf_buffer; + +typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, + void *data, __u32 size); +typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); + +/* common use perf buffer options */ +struct perf_buffer_opts { + size_t sz; + __u32 sample_period; + size_t :0; +}; +#define perf_buffer_opts__last_field sample_period + +/** + * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified + * BPF_PERF_EVENT_ARRAY map + * @param map_fd FD of BPF_PERF_EVENT_ARRAY BPF map that will be used by BPF + * code to send data over to user-space + * @param page_cnt number of memory pages allocated for each per-CPU buffer + * @param sample_cb function called on each received data record + * @param lost_cb function called when record loss has occurred + * @param ctx user-provided extra context passed into *sample_cb* and *lost_cb* + * @return a new instance of struct perf_buffer on success, NULL on error with + * *errno* containing an error code + */ +LIBBPF_API struct perf_buffer * +perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, + const struct perf_buffer_opts *opts); + +enum bpf_perf_event_ret { + LIBBPF_PERF_EVENT_DONE = 0, + LIBBPF_PERF_EVENT_ERROR = -1, + LIBBPF_PERF_EVENT_CONT = -2, +}; + +struct perf_event_header; + +typedef enum bpf_perf_event_ret +(*perf_buffer_event_fn)(void *ctx, int cpu, struct perf_event_header *event); + +/* raw perf buffer options, giving most power and control */ +struct perf_buffer_raw_opts { + size_t sz; + long :0; + long :0; + /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of + * max_entries of given PERF_EVENT_ARRAY map) + */ + int cpu_cnt; + /* if cpu_cnt > 0, cpus is an array of CPUs to open ring buffers on */ + int *cpus; + /* if cpu_cnt > 0, map_keys specify map keys to set per-CPU FDs for */ + int *map_keys; +}; +#define perf_buffer_raw_opts__last_field map_keys + +struct perf_event_attr; + +LIBBPF_API struct perf_buffer * +perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts); + +LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); +LIBBPF_API int 
perf_buffer__consume(struct perf_buffer *pb); +LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx); +LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); +LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); +/** + * @brief **perf_buffer__buffer()** returns the per-cpu raw mmap()'ed underlying + * memory region of the ring buffer. + * This ring buffer can be used to implement a custom events consumer. + * The ring buffer starts with the *struct perf_event_mmap_page*, which + * holds the ring buffer management fields; when accessing the header + * structure it's important to be SMP aware. + * You can refer to *perf_event_read_simple* for a simple example. + * @param pb the perf buffer structure + * @param buf_idx the buffer index to retrieve + * @param buf (out) gets the base pointer of the mmap()'ed memory + * @param buf_size (out) gets the size of the mmap()'ed region + * @return 0 on success, negative error code for failure + */ +LIBBPF_API int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, + size_t *buf_size); + +struct bpf_prog_linfo; +struct bpf_prog_info; + +LIBBPF_API void bpf_prog_linfo__free(struct bpf_prog_linfo *prog_linfo); +LIBBPF_API struct bpf_prog_linfo * +bpf_prog_linfo__new(const struct bpf_prog_info *info); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind_addr_func(const struct bpf_prog_linfo *prog_linfo, + __u64 addr, __u32 func_idx, __u32 nr_skip); +LIBBPF_API const struct bpf_line_info * +bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, + __u32 insn_off, __u32 nr_skip); + +/* + * Probe for supported system features + * + * Note that running many of these probes in a short amount of time can cause + * the kernel to reach the maximal size of lockable memory allowed for the + * user, causing subsequent probes to fail. In this case, the caller may want + * to adjust that limit with setrlimit(). + */ + +/** + * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports + * BPF programs of a given type. + * @param prog_type BPF program type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given program type is supported; 0, if given program type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_map_type()** detects if host kernel supports + * BPF maps of a given type. + * @param map_type BPF map type to detect kernel support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given map type is supported; 0, if given map type is + * not supported; negative error code if feature detection failed or can't be + * performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts); +/** + * @brief **libbpf_probe_bpf_helper()** detects if host kernel supports the + * use of a given BPF helper from specified BPF program type.
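/* Illustrative usage sketch, not part of the vendored libbpf sources: probing
 * kernel features before deciding which program and map types to use.
 */
#include <stdio.h>
#include <bpf/libbpf.h>

static void probe_features(void)
{
	int ret;

	ret = libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_KPROBE, NULL);
	printf("kprobe programs: %s\n", ret == 1 ? "supported" :
	       ret == 0 ? "not supported" : "probe failed");

	ret = libbpf_probe_bpf_map_type(BPF_MAP_TYPE_RINGBUF, NULL);
	printf("ringbuf maps: %s\n", ret == 1 ? "supported" :
	       ret == 0 ? "not supported" : "probe failed");
}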
+ * @param prog_type BPF program type used to check the support of BPF helper + * @param helper_id BPF helper ID (enum bpf_func_id) to check support for + * @param opts reserved for future extensibility, should be NULL + * @return 1, if given combination of program type and helper is supported; 0, + * if the combination is not supported; negative error code if feature + * detection for provided input arguments failed or can't be performed + * + * Make sure the process has required set of CAP_* permissions (or runs as + * root) when performing feature checking. + */ +LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, + enum bpf_func_id helper_id, const void *opts); + +/** + * @brief **libbpf_num_possible_cpus()** is a helper function to get the + * number of possible CPUs that the host kernel supports and expects. + * @return number of possible CPUs; or error code on failure + * + * Example usage: + * + * int ncpus = libbpf_num_possible_cpus(); + * if (ncpus < 0) { + * // error handling + * } + * long values[ncpus]; + * bpf_map_lookup_elem(per_cpu_map_fd, key, values); + */ +LIBBPF_API int libbpf_num_possible_cpus(void); + +struct bpf_map_skeleton { + const char *name; + struct bpf_map **map; + void **mmaped; +}; + +struct bpf_prog_skeleton { + const char *name; + struct bpf_program **prog; + struct bpf_link **link; +}; + +struct bpf_object_skeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const char *name; + const void *data; + size_t data_sz; + + struct bpf_object **obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; +}; + +LIBBPF_API int +bpf_object__open_skeleton(struct bpf_object_skeleton *s, + const struct bpf_object_open_opts *opts); +LIBBPF_API int bpf_object__load_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API int bpf_object__attach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__detach_skeleton(struct bpf_object_skeleton *s); +LIBBPF_API void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s); + +struct bpf_var_skeleton { + const char *name; + struct bpf_map **map; + void **addr; +}; + +struct bpf_object_subskeleton { + size_t sz; /* size of this struct, for forward/backward compatibility */ + + const struct bpf_object *obj; + + int map_cnt; + int map_skel_sz; /* sizeof(struct bpf_map_skeleton) */ + struct bpf_map_skeleton *maps; + + int prog_cnt; + int prog_skel_sz; /* sizeof(struct bpf_prog_skeleton) */ + struct bpf_prog_skeleton *progs; + + int var_cnt; + int var_skel_sz; /* sizeof(struct bpf_var_skeleton) */ + struct bpf_var_skeleton *vars; +}; + +LIBBPF_API int +bpf_object__open_subskeleton(struct bpf_object_subskeleton *s); +LIBBPF_API void +bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s); + +struct gen_loader_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ + const char *data; + const char *insns; + __u32 data_sz; + __u32 insns_sz; +}; + +#define gen_loader_opts__last_field insns_sz +LIBBPF_API int bpf_object__gen_loader(struct bpf_object *obj, + struct gen_loader_opts *opts); + +enum libbpf_tristate { + TRI_NO = 0, + TRI_YES = 1, + TRI_MODULE = 2, +}; + +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker_file_opts { + /* size of this struct, for 
forward/backward compatibility */ + size_t sz; +}; +#define bpf_linker_file_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, + const char *filename, + const struct bpf_linker_file_opts *opts); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + +/* + * Custom handling of BPF program's SEC() definitions + */ + +struct bpf_prog_load_opts; /* defined in bpf.h */ + +/* Called during bpf_object__open() for each recognized BPF program. Callback + * can use various bpf_program__set_*() setters to adjust whatever properties + * are necessary. + */ +typedef int (*libbpf_prog_setup_fn_t)(struct bpf_program *prog, long cookie); + +/* Called right before libbpf performs bpf_prog_load() to load BPF program + * into the kernel. Callback can adjust opts as necessary. + */ +typedef int (*libbpf_prog_prepare_load_fn_t)(struct bpf_program *prog, + struct bpf_prog_load_opts *opts, long cookie); + +/* Called during skeleton attach or through bpf_program__attach(). If + * auto-attach is not supported, callback should return 0 and set link to + * NULL (it's not considered an error during skeleton attach, but it will be + * an error for bpf_program__attach() calls). On error, error should be + * returned directly and link set to NULL. On success, return 0 and set link + * to a valid struct bpf_link. + */ +typedef int (*libbpf_prog_attach_fn_t)(const struct bpf_program *prog, long cookie, + struct bpf_link **link); + +struct libbpf_prog_handler_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* User-provided value that is passed to prog_setup_fn, + * prog_prepare_load_fn, and prog_attach_fn callbacks. Allows user to + * register one set of callbacks for multiple SEC() definitions and + * still be able to distinguish them, if necessary. For example, + * libbpf itself is using this to pass necessary flags (e.g., + * sleepable flag) to a common internal SEC() handler. + */ + long cookie; + /* BPF program initialization callback (see libbpf_prog_setup_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_setup_fn_t prog_setup_fn; + /* BPF program loading callback (see libbpf_prog_prepare_load_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_prepare_load_fn_t prog_prepare_load_fn; + /* BPF program attach callback (see libbpf_prog_attach_fn_t). + * Callback is optional, pass NULL if it's not necessary. + */ + libbpf_prog_attach_fn_t prog_attach_fn; +}; +#define libbpf_prog_handler_opts__last_field prog_attach_fn + +/** + * @brief **libbpf_register_prog_handler()** registers a custom BPF program + * SEC() handler. + * @param sec section prefix for which custom handler is registered + * @param prog_type BPF program type associated with specified section + * @param exp_attach_type Expected BPF attach type associated with specified section + * @param opts optional cookie, callbacks, and other extra options + * @return Non-negative handler ID is returned on success. This handler ID has + * to be passed to *libbpf_unregister_prog_handler()* to unregister such + * custom handler. Negative error code is returned on error. + * + * *sec* defines which SEC() definitions are handled by this custom handler + * registration. 
*sec* can have few different forms: + * - if *sec* is just a plain string (e.g., "abc"), it will match only + * SEC("abc"). If BPF program specifies SEC("abc/whatever") it will result + * in an error; + * - if *sec* is of the form "abc/", proper SEC() form is + * SEC("abc/something"), where acceptable "something" should be checked by + * *prog_init_fn* callback, if there are additional restrictions; + * - if *sec* is of the form "abc+", it will successfully match both + * SEC("abc") and SEC("abc/whatever") forms; + * - if *sec* is NULL, custom handler is registered for any BPF program that + * doesn't match any of the registered (custom or libbpf's own) SEC() + * handlers. There could be only one such generic custom handler registered + * at any given time. + * + * All custom handlers (except the one with *sec* == NULL) are processed + * before libbpf's own SEC() handlers. It is allowed to "override" libbpf's + * SEC() handlers by registering custom ones for the same section prefix + * (i.e., it's possible to have custom SEC("perf_event/LLC-load-misses") + * handler). + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. + */ +LIBBPF_API int libbpf_register_prog_handler(const char *sec, + enum bpf_prog_type prog_type, + enum bpf_attach_type exp_attach_type, + const struct libbpf_prog_handler_opts *opts); +/** + * @brief *libbpf_unregister_prog_handler()* unregisters previously registered + * custom BPF program SEC() handler. + * @param handler_id handler ID returned by *libbpf_register_prog_handler()* + * after successful registration + * @return 0 on success, negative error code if handler isn't found + * + * Note, like much of global libbpf APIs (e.g., libbpf_set_print(), + * libbpf_set_strict_mode(), etc)) these APIs are not thread-safe. User needs + * to ensure synchronization if there is a risk of running this API from + * multiple threads simultaneously. 
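/* Illustrative usage sketch, not part of the vendored libbpf sources:
 * registering a custom handler for a hypothetical SEC("myprobe/...") prefix and
 * removing it again. Uses the LIBBPF_OPTS() helper from libbpf_common.h; the
 * section name, cookie value and setup logic are made up for the example.
 */
#include <bpf/libbpf.h>

static int setup_my_prog(struct bpf_program *prog, long cookie)
{
	/* called for every SEC("myprobe...") program during bpf_object__open() */
	return bpf_program__set_log_level(prog, (__u32)cookie);
}

static int register_my_sec(void)
{
	LIBBPF_OPTS(libbpf_prog_handler_opts, opts,
		    .cookie = 1,
		    .prog_setup_fn = setup_my_prog);
	int handler_id;

	handler_id = libbpf_register_prog_handler("myprobe+", BPF_PROG_TYPE_KPROBE,
						  0 /* exp_attach_type */, &opts);
	if (handler_id < 0)
		return handler_id;

	/* ... open/load/attach objects that use SEC("myprobe/...") ... */

	return libbpf_unregister_prog_handler(handler_id);
}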
+ */ +LIBBPF_API int libbpf_unregister_prog_handler(int handler_id); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LIBBPF_H */ diff --git a/third_party/libbpf/src/libbpf.map b/third_party/libbpf/src/libbpf.map new file mode 100644 index 0000000..d9ec440 --- /dev/null +++ b/third_party/libbpf/src/libbpf.map @@ -0,0 +1,399 @@ +LIBBPF_0.0.1 { + global: + bpf_btf_get_fd_by_id; + bpf_map__btf_key_type_id; + bpf_map__btf_value_type_id; + bpf_map__fd; + bpf_map__name; + bpf_map__pin; + bpf_map__reuse_fd; + bpf_map__set_ifindex; + bpf_map__set_inner_map_fd; + bpf_map__unpin; + bpf_map_delete_elem; + bpf_map_get_fd_by_id; + bpf_map_get_next_id; + bpf_map_get_next_key; + bpf_map_lookup_and_delete_elem; + bpf_map_lookup_elem; + bpf_map_update_elem; + bpf_obj_get; + bpf_obj_get_info_by_fd; + bpf_obj_pin; + bpf_object__btf_fd; + bpf_object__close; + bpf_object__find_map_by_name; + bpf_object__kversion; + bpf_object__load; + bpf_object__name; + bpf_object__open; + bpf_object__pin; + bpf_object__pin_maps; + bpf_object__pin_programs; + bpf_object__unpin_maps; + bpf_object__unpin_programs; + bpf_prog_attach; + bpf_prog_detach; + bpf_prog_detach2; + bpf_prog_get_fd_by_id; + bpf_prog_get_next_id; + bpf_prog_query; + bpf_program__fd; + bpf_program__pin; + bpf_program__set_expected_attach_type; + bpf_program__set_ifindex; + bpf_program__set_type; + bpf_program__unload; + bpf_program__unpin; + bpf_prog_linfo__free; + bpf_prog_linfo__new; + bpf_prog_linfo__lfind_addr_func; + bpf_prog_linfo__lfind; + bpf_raw_tracepoint_open; + bpf_task_fd_query; + btf__fd; + btf__find_by_name; + btf__free; + btf__name_by_offset; + btf__new; + btf__resolve_size; + btf__resolve_type; + btf__type_by_id; + libbpf_attach_type_by_name; + libbpf_get_error; + libbpf_prog_type_by_name; + libbpf_set_print; + libbpf_strerror; + local: + *; +}; + +LIBBPF_0.0.2 { + global: + bpf_map_lookup_elem_flags; + bpf_object__btf; + bpf_object__find_map_fd_by_name; + btf__get_raw_data; + btf_ext__free; + btf_ext__get_raw_data; + btf_ext__new; +} LIBBPF_0.0.1; + +LIBBPF_0.0.3 { + global: + bpf_map__is_internal; + bpf_map_freeze; +} LIBBPF_0.0.2; + +LIBBPF_0.0.4 { + global: + bpf_link__destroy; + bpf_program__attach_kprobe; + bpf_program__attach_perf_event; + bpf_program__attach_raw_tracepoint; + bpf_program__attach_tracepoint; + bpf_program__attach_uprobe; + btf_dump__dump_type; + btf_dump__free; + btf__parse_elf; + libbpf_num_possible_cpus; + perf_buffer__free; + perf_buffer__poll; +} LIBBPF_0.0.3; + +LIBBPF_0.0.5 { + global: + bpf_btf_get_next_id; +} LIBBPF_0.0.4; + +LIBBPF_0.0.6 { + global: + bpf_map__get_pin_path; + bpf_map__is_pinned; + bpf_map__set_pin_path; + bpf_object__open_file; + bpf_object__open_mem; + bpf_program__attach_trace; + bpf_program__get_expected_attach_type; + bpf_program__get_type; + btf__find_by_name_kind; + libbpf_find_vmlinux_btf_id; +} LIBBPF_0.0.5; + +LIBBPF_0.0.7 { + global: + btf_dump__emit_type_decl; + bpf_link__disconnect; + bpf_map__attach_struct_ops; + bpf_map_delete_batch; + bpf_map_lookup_and_delete_batch; + bpf_map_lookup_batch; + bpf_map_update_batch; + bpf_object__find_program_by_name; + bpf_object__attach_skeleton; + bpf_object__destroy_skeleton; + bpf_object__detach_skeleton; + bpf_object__load_skeleton; + bpf_object__open_skeleton; + bpf_program__attach; + bpf_program__name; + btf__align_of; + libbpf_find_kernel_btf; +} LIBBPF_0.0.6; + +LIBBPF_0.0.8 { + global: + bpf_link__fd; + bpf_link__open; + bpf_link__pin; + bpf_link__pin_path; + bpf_link__unpin; + bpf_link__update_program; + 
bpf_link_create; + bpf_link_update; + bpf_map__set_initial_value; + bpf_prog_attach_opts; + bpf_program__attach_cgroup; + bpf_program__attach_lsm; + bpf_program__set_attach_target; +} LIBBPF_0.0.7; + +LIBBPF_0.0.9 { + global: + bpf_enable_stats; + bpf_iter_create; + bpf_link_get_fd_by_id; + bpf_link_get_next_id; + bpf_program__attach_iter; + bpf_program__attach_netns; + perf_buffer__consume; + ring_buffer__add; + ring_buffer__consume; + ring_buffer__free; + ring_buffer__new; + ring_buffer__poll; +} LIBBPF_0.0.8; + +LIBBPF_0.1.0 { + global: + bpf_link__detach; + bpf_link_detach; + bpf_map__ifindex; + bpf_map__key_size; + bpf_map__map_flags; + bpf_map__max_entries; + bpf_map__numa_node; + bpf_map__set_key_size; + bpf_map__set_map_flags; + bpf_map__set_max_entries; + bpf_map__set_numa_node; + bpf_map__set_type; + bpf_map__set_value_size; + bpf_map__type; + bpf_map__value_size; + bpf_program__attach_xdp; + bpf_program__autoload; + bpf_program__set_autoload; + btf__parse; + btf__parse_raw; + btf__pointer_size; + btf__set_fd; + btf__set_pointer_size; +} LIBBPF_0.0.9; + +LIBBPF_0.2.0 { + global: + bpf_prog_bind_map; + bpf_prog_test_run_opts; + bpf_program__attach_freplace; + bpf_program__section_name; + btf__add_array; + btf__add_const; + btf__add_enum; + btf__add_enum_value; + btf__add_datasec; + btf__add_datasec_var_info; + btf__add_field; + btf__add_func; + btf__add_func_param; + btf__add_func_proto; + btf__add_fwd; + btf__add_int; + btf__add_ptr; + btf__add_restrict; + btf__add_str; + btf__add_struct; + btf__add_typedef; + btf__add_union; + btf__add_var; + btf__add_volatile; + btf__endianness; + btf__find_str; + btf__new_empty; + btf__set_endianness; + btf__str_by_offset; + perf_buffer__buffer_cnt; + perf_buffer__buffer_fd; + perf_buffer__epoll_fd; + perf_buffer__consume_buffer; +} LIBBPF_0.1.0; + +LIBBPF_0.3.0 { + global: + btf__base_btf; + btf__parse_elf_split; + btf__parse_raw_split; + btf__parse_split; + btf__new_empty_split; + btf__new_split; + ring_buffer__epoll_fd; +} LIBBPF_0.2.0; + +LIBBPF_0.4.0 { + global: + btf__add_float; + btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; + bpf_map__inner_map; + bpf_object__set_kversion; + bpf_tc_attach; + bpf_tc_detach; + bpf_tc_hook_create; + bpf_tc_hook_destroy; + bpf_tc_query; +} LIBBPF_0.3.0; + +LIBBPF_0.5.0 { + global: + bpf_map__initial_value; + bpf_map__pin_path; + bpf_map_lookup_and_delete_elem_flags; + bpf_program__attach_kprobe_opts; + bpf_program__attach_perf_event_opts; + bpf_program__attach_tracepoint_opts; + bpf_program__attach_uprobe_opts; + bpf_object__gen_loader; + btf__load_from_kernel_by_id; + btf__load_from_kernel_by_id_split; + btf__load_into_kernel; + btf__load_module_btf; + btf__load_vmlinux_btf; + btf_dump__dump_type_data; + libbpf_set_strict_mode; +} LIBBPF_0.4.0; + +LIBBPF_0.6.0 { + global: + bpf_map__map_extra; + bpf_map__set_map_extra; + bpf_map_create; + bpf_object__next_map; + bpf_object__next_program; + bpf_object__prev_map; + bpf_object__prev_program; + bpf_prog_load; + bpf_program__flags; + bpf_program__insn_cnt; + bpf_program__insns; + bpf_program__set_flags; + btf__add_btf; + btf__add_decl_tag; + btf__add_type_tag; + btf__dedup; + btf__raw_data; + btf__type_cnt; + btf_dump__new; + libbpf_major_version; + libbpf_minor_version; + libbpf_version_string; + perf_buffer__new; + perf_buffer__new_raw; +} LIBBPF_0.5.0; + +LIBBPF_0.7.0 { + global: + bpf_btf_load; + bpf_program__expected_attach_type; + bpf_program__log_buf; + bpf_program__log_level; + 
bpf_program__set_log_buf; + bpf_program__set_log_level; + bpf_program__type; + bpf_xdp_attach; + bpf_xdp_detach; + bpf_xdp_query; + bpf_xdp_query_id; + btf_ext__raw_data; + libbpf_probe_bpf_helper; + libbpf_probe_bpf_map_type; + libbpf_probe_bpf_prog_type; + libbpf_set_memlock_rlim; +} LIBBPF_0.6.0; + +LIBBPF_0.8.0 { + global: + bpf_map__autocreate; + bpf_map__get_next_key; + bpf_map__delete_elem; + bpf_map__lookup_and_delete_elem; + bpf_map__lookup_elem; + bpf_map__set_autocreate; + bpf_map__update_elem; + bpf_map_delete_elem_flags; + bpf_object__destroy_subskeleton; + bpf_object__open_subskeleton; + bpf_program__attach_kprobe_multi_opts; + bpf_program__attach_trace_opts; + bpf_program__attach_usdt; + bpf_program__set_insns; + libbpf_register_prog_handler; + libbpf_unregister_prog_handler; +} LIBBPF_0.7.0; + +LIBBPF_1.0.0 { + global: + bpf_obj_get_opts; + bpf_prog_query_opts; + bpf_program__attach_ksyscall; + bpf_program__autoattach; + bpf_program__set_autoattach; + btf__add_enum64; + btf__add_enum64_value; + libbpf_bpf_attach_type_str; + libbpf_bpf_link_type_str; + libbpf_bpf_map_type_str; + libbpf_bpf_prog_type_str; + perf_buffer__buffer; +} LIBBPF_0.8.0; + +LIBBPF_1.1.0 { + global: + bpf_btf_get_fd_by_id_opts; + bpf_link_get_fd_by_id_opts; + bpf_map_get_fd_by_id_opts; + bpf_prog_get_fd_by_id_opts; + user_ring_buffer__discard; + user_ring_buffer__free; + user_ring_buffer__new; + user_ring_buffer__reserve; + user_ring_buffer__reserve_blocking; + user_ring_buffer__submit; +} LIBBPF_1.0.0; + +LIBBPF_1.2.0 { + global: + bpf_btf_get_info_by_fd; + bpf_link__update_map; + bpf_link_get_info_by_fd; + bpf_map_get_info_by_fd; + bpf_prog_get_info_by_fd; +} LIBBPF_1.1.0; + +LIBBPF_1.3.0 { + global: + bpf_obj_pin_opts; + bpf_program__attach_netfilter; +} LIBBPF_1.2.0; diff --git a/third_party/libbpf/src/libbpf.pc.template b/third_party/libbpf/src/libbpf.pc.template new file mode 100644 index 0000000..b45ed53 --- /dev/null +++ b/third_party/libbpf/src/libbpf.pc.template @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +prefix=@PREFIX@ +libdir=@LIBDIR@ +includedir=${prefix}/include + +Name: libbpf +Description: BPF library +Version: @VERSION@ +Libs: -L${libdir} -lbpf +Requires.private: libelf zlib +Cflags: -I${includedir} diff --git a/third_party/libbpf/src/libbpf_common.h b/third_party/libbpf/src/libbpf_common.h new file mode 100644 index 0000000..9a7937f --- /dev/null +++ b/third_party/libbpf/src/libbpf_common.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Common user-facing libbpf helpers. + * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_COMMON_H +#define __LIBBPF_LIBBPF_COMMON_H + +#include +#include "libbpf_version.h" + +#ifndef LIBBPF_API +#define LIBBPF_API __attribute__((visibility("default"))) +#endif + +#define LIBBPF_DEPRECATED(msg) __attribute__((deprecated(msg))) + +/* Mark a symbol as deprecated when libbpf version is >= {major}.{minor} */ +#define LIBBPF_DEPRECATED_SINCE(major, minor, msg) \ + __LIBBPF_MARK_DEPRECATED_ ## major ## _ ## minor \ + (LIBBPF_DEPRECATED("libbpf v" # major "." # minor "+: " msg)) + +#define __LIBBPF_CURRENT_VERSION_GEQ(major, minor) \ + (LIBBPF_MAJOR_VERSION > (major) || \ + (LIBBPF_MAJOR_VERSION == (major) && LIBBPF_MINOR_VERSION >= (minor))) + +/* Add checks for other versions below when planning deprecation of API symbols + * with the LIBBPF_DEPRECATED_SINCE macro. 
+ */ +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X +#else +#define __LIBBPF_MARK_DEPRECATED_1_0(X) +#endif + +/* This set of internal macros allows to do "function overloading" based on + * number of arguments provided by used in backwards-compatible way during the + * transition to libbpf 1.0 + * It's ugly but necessary evil that will be cleaned up when we get to 1.0. + * See bpf_prog_load() overload for example. + */ +#define ___libbpf_cat(A, B) A ## B +#define ___libbpf_select(NAME, NUM) ___libbpf_cat(NAME, NUM) +#define ___libbpf_nth(_1, _2, _3, _4, _5, _6, N, ...) N +#define ___libbpf_cnt(...) ___libbpf_nth(__VA_ARGS__, 6, 5, 4, 3, 2, 1) +#define ___libbpf_overload(NAME, ...) ___libbpf_select(NAME, ___libbpf_cnt(__VA_ARGS__))(__VA_ARGS__) + +/* Helper macro to declare and initialize libbpf options struct + * + * This dance with uninitialized declaration, followed by memset to zero, + * followed by assignment using compound literal syntax is done to preserve + * ability to use a nice struct field initialization syntax and **hopefully** + * have all the padding bytes initialized to zero. It's not guaranteed though, + * when copying literal, that compiler won't copy garbage in literal's padding + * bytes, but that's the best way I've found and it seems to work in practice. + * + * Macro declares opts struct of given type and name, zero-initializes, + * including any extra padding, it with memset() and then assigns initial + * values provided by users in struct initializer-syntax as varargs. + */ +#define LIBBPF_OPTS(TYPE, NAME, ...) \ + struct TYPE NAME = ({ \ + memset(&NAME, 0, sizeof(struct TYPE)); \ + (struct TYPE) { \ + .sz = sizeof(struct TYPE), \ + __VA_ARGS__ \ + }; \ + }) + +#endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/third_party/libbpf/src/libbpf_errno.c b/third_party/libbpf/src/libbpf_errno.c new file mode 100644 index 0000000..6b18017 --- /dev/null +++ b/third_party/libbpf/src/libbpf_errno.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Copyright (C) 2013-2015 Alexei Starovoitov + * Copyright (C) 2015 Wang Nan + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. 
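/* Illustrative usage sketch, not part of the vendored libbpf sources: how a
 * caller typically uses the LIBBPF_OPTS() macro defined in libbpf_common.h
 * above, here with the TC hook/opts structs from libbpf.h. The ifindex and
 * prog_fd are assumed to come from the surrounding application.
 */
#include <errno.h>
#include <bpf/libbpf.h>

static int attach_tc_ingress(int ifindex, int prog_fd)
{
	LIBBPF_OPTS(bpf_tc_hook, hook,
		    .ifindex = ifindex,
		    .attach_point = BPF_TC_INGRESS);
	LIBBPF_OPTS(bpf_tc_opts, tc_opts,
		    .prog_fd = prog_fd);
	int err;

	err = bpf_tc_hook_create(&hook);
	if (err && err != -EEXIST) /* the qdisc/hook may already exist */
		return err;

	/* .sz was filled in by LIBBPF_OPTS, so libbpf can validate both structs */
	return bpf_tc_attach(&hook, &tc_opts);
}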
+ */ + +#undef _GNU_SOURCE +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START) +#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c) +#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START) + +static const char *libbpf_strerror_table[NR_ERRNO] = { + [ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf", + [ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid", + [ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost", + [ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch", + [ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf", + [ERRCODE_OFFSET(RELOC)] = "Relocation failed", + [ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading", + [ERRCODE_OFFSET(PROG2BIG)] = "Program too big", + [ERRCODE_OFFSET(KVER)] = "Incorrect kernel version", + [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", + [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", + [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", +}; + +int libbpf_strerror(int err, char *buf, size_t size) +{ + int ret; + + if (!buf || !size) + return libbpf_err(-EINVAL); + + err = err > 0 ? err : -err; + + if (err < __LIBBPF_ERRNO__START) { + ret = strerror_r(err, buf, size); + buf[size - 1] = '\0'; + return libbpf_err_errno(ret); + } + + if (err < __LIBBPF_ERRNO__END) { + const char *msg; + + msg = libbpf_strerror_table[ERRNO_OFFSET(err)]; + ret = snprintf(buf, size, "%s", msg); + buf[size - 1] = '\0'; + /* The length of the buf and msg is positive. + * A negative number may be returned only when the + * size exceeds INT_MAX. Not likely to appear. + */ + if (ret >= size) + return libbpf_err(-ERANGE); + return 0; + } + + ret = snprintf(buf, size, "Unknown libbpf error %d", err); + buf[size - 1] = '\0'; + if (ret >= size) + return libbpf_err(-ERANGE); + return libbpf_err(-ENOENT); +} diff --git a/third_party/libbpf/src/libbpf_internal.h b/third_party/libbpf/src/libbpf_internal.h new file mode 100644 index 0000000..e4d0566 --- /dev/null +++ b/third_party/libbpf/src/libbpf_internal.h @@ -0,0 +1,580 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Internal libbpf helpers. 
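/* Illustrative usage sketch, not part of the vendored libbpf sources: turning a
 * libbpf/kernel error code into a human-readable message via libbpf_strerror(),
 * whose implementation appears above.
 */
#include <stdio.h>
#include <bpf/libbpf.h>

static void report_error(const char *what, int err)
{
	char msg[128];

	libbpf_strerror(err, msg, sizeof(msg));
	fprintf(stderr, "%s failed: %s (%d)\n", what, msg, err);
}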
+ * + * Copyright (c) 2019 Facebook + */ + +#ifndef __LIBBPF_LIBBPF_INTERNAL_H +#define __LIBBPF_LIBBPF_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include "relo_core.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* prevent accidental re-addition of reallocarray() */ +#pragma GCC poison reallocarray + +#include "libbpf.h" +#include "btf.h" + +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_ABS64 +#define R_BPF_64_ABS64 2 +#endif +#ifndef R_BPF_64_ABS32 +#define R_BPF_64_ABS32 3 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif + +/* Older libelf all end up in this expression, for both 32 and 64 bit */ +#ifndef ELF64_ST_VISIBILITY +#define ELF64_ST_VISIBILITY(o) ((o) & 0x03) +#endif + +#define BTF_INFO_ENC(kind, kind_flag, vlen) \ + ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) +#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) +#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \ + ((encoding) << 24 | (bits_offset) << 16 | (nr_bits)) +#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ + BTF_INT_ENC(encoding, bits_offset, bits) +#define BTF_MEMBER_ENC(name, type, bits_offset) (name), (type), (bits_offset) +#define BTF_PARAM_ENC(name, type) (name), (type) +#define BTF_VAR_SECINFO_ENC(type, offset, size) (type), (offset), (size) +#define BTF_TYPE_FLOAT_ENC(name, sz) \ + BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FLOAT, 0, 0), sz) +#define BTF_TYPE_DECL_TAG_ENC(value, type, component_idx) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_DECL_TAG, 0, 0), type), (component_idx) +#define BTF_TYPE_TYPE_TAG_ENC(value, type) \ + BTF_TYPE_ENC(value, BTF_INFO_ENC(BTF_KIND_TYPE_TAG, 0, 0), type) + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef min +# define min(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef max +# define max(x, y) ((x) < (y) ? (y) : (x)) +#endif +#ifndef offsetofend +# define offsetofend(TYPE, FIELD) \ + (offsetof(TYPE, FIELD) + sizeof(((TYPE *)0)->FIELD)) +#endif +#ifndef __alias +#define __alias(symbol) __attribute__((alias(#symbol))) +#endif + +/* Check whether a string `str` has prefix `pfx`, regardless if `pfx` is + * a string literal known at compilation time or char * pointer known only at + * runtime. + */ +#define str_has_pfx(str, pfx) \ + (strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0) + +/* suffix check */ +static inline bool str_has_sfx(const char *str, const char *sfx) +{ + size_t str_len = strlen(str); + size_t sfx_len = strlen(sfx); + + if (sfx_len > str_len) + return false; + return strcmp(str + str_len - sfx_len, sfx) == 0; +} + +/* Symbol versioning is different between static and shared library. + * Properly versioned symbols are needed for shared library, but + * only the symbol of the new version is needed for static library. + * Starting with GNU C 10, use symver attribute instead of .symver assembler + * directive, which works better with GCC LTO builds. 
+ */ +#if defined(SHARED) && defined(__GNUC__) && __GNUC__ >= 10 + +#define DEFAULT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@@" #version))) +#define COMPAT_VERSION(internal_name, api_name, version) \ + __attribute__((symver(#api_name "@" #version))) + +#elif defined(SHARED) + +#define COMPAT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@" #version); +#define DEFAULT_VERSION(internal_name, api_name, version) \ + asm(".symver " #internal_name "," #api_name "@@" #version); + +#else /* !SHARED */ + +#define COMPAT_VERSION(internal_name, api_name, version) +#define DEFAULT_VERSION(internal_name, api_name, version) \ + extern typeof(internal_name) api_name \ + __attribute__((alias(#internal_name))); + +#endif + +extern void libbpf_print(enum libbpf_print_level level, + const char *format, ...) + __attribute__((format(printf, 2, 3))); + +#define __pr(level, fmt, ...) \ +do { \ + libbpf_print(level, "libbpf: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_warn(fmt, ...) __pr(LIBBPF_WARN, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +struct bpf_link { + int (*detach)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); + char *pin_path; /* NULL, if not pinned */ + int fd; /* hook FD, -1 if not applicable */ + bool disconnected; +}; + +/* + * Re-implement glibc's reallocarray() for libbpf internal-only use. + * reallocarray(), unfortunately, is not available in all versions of glibc, + * so requires extra feature detection and using reallocarray() stub from + * and COMPAT_NEED_REALLOCARRAY. All this complicates + * build of libbpf unnecessarily and is just a maintenance burden. Instead, + * it's trivial to implement libbpf-specific internal version and use it + * throughout libbpf. + */ +static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) +{ + size_t total; + +#if __has_builtin(__builtin_mul_overflow) + if (unlikely(__builtin_mul_overflow(nmemb, size, &total))) + return NULL; +#else + if (size == 0 || nmemb > ULONG_MAX / size) + return NULL; + total = nmemb * size; +#endif + return realloc(ptr, total); +} + +/* Copy up to sz - 1 bytes from zero-terminated src string and ensure that dst + * is zero-terminated string no matter what (unless sz == 0, in which case + * it's a no-op). It's conceptually close to FreeBSD's strlcpy(), but differs + * in what is returned. Given this is internal helper, it's trivial to extend + * this, when necessary. Use this instead of strncpy inside libbpf source code. 
+ */ +static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz) +{ + size_t i; + + if (sz == 0) + return; + + sz--; + for (i = 0; i < sz && src[i]; i++) + dst[i] = src[i]; + dst[i] = '\0'; +} + +__u32 get_kernel_version(void); + +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id); +const char *btf_kind_str(const struct btf_type *t); +const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); + +static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)(int)btf_vlen(t); +} + +static inline __u32 btf_type_info(int kind, int vlen, int kflag) +{ + return (kflag << 31) | (kind << 24) | vlen; +} + +enum map_def_parts { + MAP_DEF_MAP_TYPE = 0x001, + MAP_DEF_KEY_TYPE = 0x002, + MAP_DEF_KEY_SIZE = 0x004, + MAP_DEF_VALUE_TYPE = 0x008, + MAP_DEF_VALUE_SIZE = 0x010, + MAP_DEF_MAX_ENTRIES = 0x020, + MAP_DEF_MAP_FLAGS = 0x040, + MAP_DEF_NUMA_NODE = 0x080, + MAP_DEF_PINNING = 0x100, + MAP_DEF_INNER_MAP = 0x200, + MAP_DEF_MAP_EXTRA = 0x400, + + MAP_DEF_ALL = 0x7ff, /* combination of all above */ +}; + +struct btf_map_def { + enum map_def_parts parts; + __u32 map_type; + __u32 key_type_id; + __u32 key_size; + __u32 value_type_id; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + __u32 numa_node; + __u32 pinning; + __u64 map_extra; +}; + +int parse_btf_map_def(const char *map_name, struct btf *btf, + const struct btf_type *def_t, bool strict, + struct btf_map_def *map_def, struct btf_map_def *inner_def); + +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); + +static inline bool libbpf_is_mem_zeroed(const char *p, ssize_t len) +{ + while (len > 0) { + if (*p) + return false; + p++; + len--; + } + return true; +} + +static inline bool libbpf_validate_opts(const char *opts, + size_t opts_sz, size_t user_sz, + const char *type_name) +{ + if (user_sz < sizeof(size_t)) { + pr_warn("%s size (%zu) is too small\n", type_name, user_sz); + return false; + } + if (!libbpf_is_mem_zeroed(opts + opts_sz, (ssize_t)user_sz - opts_sz)) { + pr_warn("%s has non-zero extra bytes\n", type_name); + return false; + } + return true; +} + +#define OPTS_VALID(opts, type) \ + (!(opts) || libbpf_validate_opts((const char *)opts, \ + offsetofend(struct type, \ + type##__last_field), \ + (opts)->sz, #type)) +#define OPTS_HAS(opts, field) \ + ((opts) && opts->sz >= offsetofend(typeof(*(opts)), field)) +#define OPTS_GET(opts, field, fallback_value) \ + (OPTS_HAS(opts, field) ? (opts)->field : fallback_value) +#define OPTS_SET(opts, field, value) \ + do { \ + if (OPTS_HAS(opts, field)) \ + (opts)->field = value; \ + } while (0) + +#define OPTS_ZEROED(opts, last_nonzero_field) \ +({ \ + ssize_t __off = offsetofend(typeof(*(opts)), last_nonzero_field); \ + !(opts) || libbpf_is_mem_zeroed((const void *)opts + __off, \ + (opts)->sz - __off); \ +}) + +enum kern_feature_id { + /* v4.14: kernel support for program & map names. */ + FEAT_PROG_NAME, + /* v5.2: kernel support for global data sections. 
*/ + FEAT_GLOBAL_DATA, + /* BTF support */ + FEAT_BTF, + /* BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO support */ + FEAT_BTF_FUNC, + /* BTF_KIND_VAR and BTF_KIND_DATASEC support */ + FEAT_BTF_DATASEC, + /* BTF_FUNC_GLOBAL is supported */ + FEAT_BTF_GLOBAL_FUNC, + /* BPF_F_MMAPABLE is supported for arrays */ + FEAT_ARRAY_MMAP, + /* kernel support for expected_attach_type in BPF_PROG_LOAD */ + FEAT_EXP_ATTACH_TYPE, + /* bpf_probe_read_{kernel,user}[_str] helpers */ + FEAT_PROBE_READ_KERN, + /* BPF_PROG_BIND_MAP is supported */ + FEAT_PROG_BIND_MAP, + /* Kernel support for module BTFs */ + FEAT_MODULE_BTF, + /* BTF_KIND_FLOAT support */ + FEAT_BTF_FLOAT, + /* BPF perf link support */ + FEAT_PERF_LINK, + /* BTF_KIND_DECL_TAG support */ + FEAT_BTF_DECL_TAG, + /* BTF_KIND_TYPE_TAG support */ + FEAT_BTF_TYPE_TAG, + /* memcg-based accounting for BPF maps and progs */ + FEAT_MEMCG_ACCOUNT, + /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ + FEAT_BPF_COOKIE, + /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ + FEAT_BTF_ENUM64, + /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ + FEAT_SYSCALL_WRAPPER, + __FEAT_CNT, +}; + +int probe_memcg_account(void); +bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id); +int bump_rlimit_memlock(void); + +int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); +int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len); +int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level); + +struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); +void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, + const char **prefix, int *kind); + +struct btf_ext_info { + /* + * info points to the individual info section (e.g. func_info and + * line_info) from the .BTF.ext. It does not include the __u32 rec_size. + */ + void *info; + __u32 rec_size; + __u32 len; + /* optional (maintained internally by libbpf) mapping between .BTF.ext + * section and corresponding ELF section. This is used to join + * information like CO-RE relocation records with corresponding BPF + * programs defined in ELF sections + */ + __u32 *sec_idxs; + int sec_cnt; +}; + +#define for_each_btf_ext_sec(seg, sec) \ + for (sec = (seg)->info; \ + (void *)sec < (seg)->info + (seg)->len; \ + sec = (void *)sec + sizeof(struct btf_ext_info_sec) + \ + (seg)->rec_size * sec->num_info) + +#define for_each_btf_ext_rec(seg, sec, i, rec) \ + for (i = 0, rec = (void *)&(sec)->data; \ + i < (sec)->num_info; \ + i++, rec = (void *)rec + (seg)->rec_size) + +/* + * The .BTF.ext ELF section layout defined as + * struct btf_ext_header + * func_info subsection + * + * The func_info subsection layout: + * record size for struct bpf_func_info in the func_info subsection + * struct btf_sec_func_info for section #1 + * a list of bpf_func_info records for section #1 + * where struct bpf_func_info mimics one in include/uapi/linux/bpf.h + * but may not be identical + * struct btf_sec_func_info for section #2 + * a list of bpf_func_info records for section #2 + * ...... + * + * Note that the bpf_func_info record size in .BTF.ext may not + * be the same as the one defined in include/uapi/linux/bpf.h. + * The loader should ensure that record_size meets minimum + * requirement and pass the record as is to the kernel. The + * kernel will handle the func_info properly based on its contents. 
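+ *
+ * The line_info and (optional) core_relo subsections follow the same
+ * "record size + per-ELF-section info blocks" layout; that is what the
+ * for_each_btf_ext_sec()/for_each_btf_ext_rec() helpers above iterate over.
+ * Illustrative sketch (not part of the original source; handle_func_info()
+ * is a placeholder and btf_ext is an already-parsed struct btf_ext *):
+ *
+ *   struct btf_ext_info_sec *sec;
+ *   struct bpf_func_info_min *rec;
+ *   int i;
+ *
+ *   for_each_btf_ext_sec(&btf_ext->func_info, sec)
+ *           for_each_btf_ext_rec(&btf_ext->func_info, sec, i, rec)
+ *                   handle_func_info(sec->sec_name_off, rec->insn_off, rec->type_id);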
+ */ +struct btf_ext_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; + + /* optional part of .BTF.ext header */ + __u32 core_relo_off; + __u32 core_relo_len; +}; + +struct btf_ext { + union { + struct btf_ext_header *hdr; + void *data; + }; + struct btf_ext_info func_info; + struct btf_ext_info line_info; + struct btf_ext_info core_relo_info; + __u32 data_size; +}; + +struct btf_ext_info_sec { + __u32 sec_name_off; + __u32 num_info; + /* Followed by num_info * record_size number of bytes */ + __u8 data[]; +}; + +/* The minimum bpf_func_info checked by the loader */ +struct bpf_func_info_min { + __u32 insn_off; + __u32 type_id; +}; + +/* The minimum bpf_line_info checked by the loader */ +struct bpf_line_info_min { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + + +typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); +__s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, + __u32 kind); + +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); + +/* handle direct returned errors */ +static inline int libbpf_err(int ret) +{ + if (ret < 0) + errno = -ret; + return ret; +} + +/* handle errno-based (e.g., syscall or libc) errors according to libbpf's + * strict mode settings + */ +static inline int libbpf_err_errno(int ret) +{ + /* errno is already assumed to be set on error */ + return ret < 0 ? -errno : ret; +} + +/* handle error for pointer-returning APIs, err is assumed to be < 0 always */ +static inline void *libbpf_err_ptr(int err) +{ + /* set errno on error, this doesn't break anything */ + errno = -err; + return NULL; +} + +/* handle pointer-returning APIs' error handling */ +static inline void *libbpf_ptr(void *ret) +{ + /* set errno on error, this doesn't break anything */ + if (IS_ERR(ret)) + errno = -PTR_ERR(ret); + + return IS_ERR(ret) ? NULL : ret; +} + +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +/* if fd is stdin, stdout, or stderr, dup to a fd greater than 2 + * Takes ownership of the fd passed in, and closes it if calling + * fcntl(fd, F_DUPFD_CLOEXEC, 3). 
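+ * In other words: if the original fd was 0, 1 or 2 it is always closed here,
+ * and on success the caller gets back an equivalent fd numbered 3 or higher
+ * (fds >= 3 are returned unchanged); on failure errno reflects the fcntl()
+ * error.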
+ */ +static inline int ensure_good_fd(int fd) +{ + int old_fd = fd, saved_errno; + + if (fd < 0) + return fd; + if (fd < 3) { + fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + saved_errno = errno; + close(old_fd); + errno = saved_errno; + if (fd < 0) { + pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); + errno = saved_errno; + } + } + return fd; +} + +/* The following two functions are exposed to bpftool */ +int bpf_core_add_cands(struct bpf_core_cand *local_cand, + size_t local_essent_len, + const struct btf *targ_btf, + const char *targ_btf_name, + int targ_start_id, + struct bpf_core_cand_list *cands); +void bpf_core_free_cands(struct bpf_core_cand_list *cands); + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj); +void usdt_manager_free(struct usdt_manager *man); +struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, + const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie); + +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +#define PROG_LOAD_ATTEMPTS 5 +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); + +#endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/third_party/libbpf/src/libbpf_legacy.h b/third_party/libbpf/src/libbpf_legacy.h new file mode 100644 index 0000000..1e1be46 --- /dev/null +++ b/third_party/libbpf/src/libbpf_legacy.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * Libbpf legacy APIs (either discouraged or deprecated, as mentioned in [0]) + * + * [0] https://docs.google.com/document/d/1UyjTZuPFWiPFyKk1tV5an11_iaRuec6U-ZESZ54nNTY + * + * Copyright (C) 2021 Facebook + */ +#ifndef __LIBBPF_LEGACY_BPF_H +#define __LIBBPF_LEGACY_BPF_H + +#include +#include +#include +#include +#include "libbpf_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_struct_mode have + * no effect. But they are left in libbpf_legacy.h so that applications that + * prepared for libbpf 1.0 before final release by using + * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes. + */ +enum libbpf_strict_mode { + /* Turn on all supported strict features of libbpf to simulate libbpf + * v1.0 behavior. + * This will be the default behavior in libbpf v1.0. + */ + LIBBPF_STRICT_ALL = 0xffffffff, + + /* + * Disable any libbpf 1.0 behaviors. This is the default before libbpf + * v1.0. It won't be supported anymore in v1.0, please update your + * code so that it handles LIBBPF_STRICT_ALL mode before libbpf v1.0. + */ + LIBBPF_STRICT_NONE = 0x00, + /* + * Return NULL pointers on error, not ERR_PTR(err). + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_CLEAN_PTRS = 0x01, + /* + * Return actual error codes from low-level APIs directly, not just -1. + * Additionally, libbpf also always sets errno to corresponding Exx + * (positive) error code. + */ + LIBBPF_STRICT_DIRECT_ERRS = 0x02, + /* + * Enforce strict BPF program section (SEC()) names. + * E.g., while prefiously SEC("xdp_whatever") or SEC("perf_event_blah") were + * allowed, with LIBBPF_STRICT_SEC_PREFIX this will become + * unrecognized by libbpf and would have to be just SEC("xdp") and + * SEC("xdp") and SEC("perf_event"). + * + * Note, in this mode the program pin path will be based on the + * function name instead of section name. 
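+ *
+ * E.g., for a program whose function is named xdp_prog_simple and which is
+ * pinned under /sys/fs/bpf, the pin path in this mode becomes
+ * /sys/fs/bpf/xdp_prog_simple rather than a path derived from the section
+ * name.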
+ * + * Additionally, routines in the .text section are always considered + * sub-programs. Legacy behavior allows for a single routine in .text + * to be a program. + */ + LIBBPF_STRICT_SEC_NAME = 0x04, + /* + * Disable the global 'bpf_objects_list'. Maintaining this list adds + * a race condition to bpf_object__open() and bpf_object__close(). + * Clients can maintain it on their own if it is valuable for them. + */ + LIBBPF_STRICT_NO_OBJECT_LIST = 0x08, + /* + * Automatically bump RLIMIT_MEMLOCK using setrlimit() before the + * first BPF program or map creation operation. This is done only if + * kernel is too old to support memcg-based memory accounting for BPF + * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, + * but it can be overriden with libbpf_set_memlock_rlim() API. + * Note that libbpf_set_memlock_rlim() needs to be called before + * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() + * operation. + */ + LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK = 0x10, + /* + * Error out on any SEC("maps") map definition, which are deprecated + * in favor of BTF-defined map definitions in SEC(".maps"). + */ + LIBBPF_STRICT_MAP_DEFINITIONS = 0x20, + + __LIBBPF_STRICT_LAST, +}; + +LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); + +/** + * @brief **libbpf_get_error()** extracts the error code from the passed + * pointer + * @param ptr pointer returned from libbpf API function + * @return error code; or 0 if no error occured + * + * Note, as of libbpf 1.0 this function is not necessary and not recommended + * to be used. Libbpf doesn't return error code embedded into the pointer + * itself. Instead, NULL is returned on error and error code is passed through + * thread-local errno variable. **libbpf_get_error()** is just returning -errno + * value if it receives NULL, which is correct only if errno hasn't been + * modified between libbpf API call and corresponding **libbpf_get_error()** + * call. Prefer to check return for NULL and use errno directly. + * + * This API is left in libbpf 1.0 to allow applications that were 1.0-ready + * before final libbpf 1.0 without needing to change them. + */ +LIBBPF_API long libbpf_get_error(const void *ptr); + +#define DECLARE_LIBBPF_OPTS LIBBPF_OPTS + +/* "Discouraged" APIs which don't follow consistent libbpf naming patterns. + * They are normally a trivial aliases or wrappers for proper APIs and are + * left to minimize unnecessary disruption for users of libbpf. But they + * shouldn't be used going forward. + */ + +struct bpf_program; +struct bpf_map; +struct btf; +struct btf_ext; + +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); +LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); +LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); +LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_LEGACY_BPF_H */ diff --git a/third_party/libbpf/src/libbpf_probes.c b/third_party/libbpf/src/libbpf_probes.c new file mode 100644 index 0000000..9c4db90 --- /dev/null +++ b/third_party/libbpf/src/libbpf_probes.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. 
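+ *
+ * Feature-probing helpers: the libbpf_probe_bpf_prog_type(),
+ * libbpf_probe_bpf_map_type() and libbpf_probe_bpf_helper() APIs defined
+ * below attempt the corresponding BPF operation against the running kernel
+ * and report 1 (supported), 0 (not supported) or a negative error code.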
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, + * but Ubuntu provides /proc/version_signature file, as described at + * https://ubuntu.com/kernel, with an example contents below, which we + * can use to get a proper LINUX_VERSION_CODE. + * + * Ubuntu 5.4.0-12.15-generic 5.4.8 + * + * In the above, 5.4.8 is what kernel is actually expecting, while + * uname() call will return 5.4.0 in info.release. + */ +static __u32 get_ubuntu_kernel_version(void) +{ + const char *ubuntu_kver_file = "/proc/version_signature"; + __u32 major, minor, patch; + int ret; + FILE *f; + + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) != 0) + return 0; + + f = fopen(ubuntu_kver_file, "re"); + if (!f) + return 0; + + ret = fscanf(f, "%*s %*s %u.%u.%u\n", &major, &minor, &patch); + fclose(f); + if (ret != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +/* On Debian LINUX_VERSION_CODE doesn't correspond to info.release. + * Instead, it is provided in info.version. An example content of + * Debian 10 looks like the below. + * + * utsname::release 4.19.0-22-amd64 + * utsname::version #1 SMP Debian 4.19.260-1 (2022-09-29) + * + * In the above, 4.19.260 is what kernel is actually expecting, while + * uname() call will return 4.19.0 in info.release. + */ +static __u32 get_debian_kernel_version(struct utsname *info) +{ + __u32 major, minor, patch; + char *p; + + p = strstr(info->version, "Debian "); + if (!p) { + /* This is not a Debian kernel. */ + return 0; + } + + if (sscanf(p, "Debian %u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +__u32 get_kernel_version(void) +{ + __u32 major, minor, patch, version; + struct utsname info; + + /* Check if this is an Ubuntu kernel. */ + version = get_ubuntu_kernel_version(); + if (version != 0) + return version; + + uname(&info); + + /* Check if this is a Debian kernel. */ + version = get_debian_kernel_version(&info); + if (version != 0) + return version; + + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +static int probe_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, size_t insns_cnt, + char *log_buf, size_t log_buf_sz) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .log_buf = log_buf, + .log_size = log_buf_sz, + .log_level = log_buf ? 
1 : 0, + ); + int fd, err, exp_err = 0; + const char *exp_msg = NULL; + char buf[4096]; + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + opts.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + break; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + opts.expected_attach_type = BPF_CGROUP_GETSOCKOPT; + break; + case BPF_PROG_TYPE_SK_LOOKUP: + opts.expected_attach_type = BPF_SK_LOOKUP; + break; + case BPF_PROG_TYPE_KPROBE: + opts.kern_version = get_kernel_version(); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + opts.expected_attach_type = BPF_LIRC_MODE2; + break; + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + if (prog_type == BPF_PROG_TYPE_TRACING) + opts.expected_attach_type = BPF_TRACE_FENTRY; + else + opts.expected_attach_type = BPF_MODIFY_RETURN; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "attach_btf_id 1 is not a function"; + break; + case BPF_PROG_TYPE_EXT: + opts.log_buf = buf; + opts.log_size = sizeof(buf); + opts.log_level = 1; + opts.attach_btf_id = 1; + + exp_err = -EINVAL; + exp_msg = "Cannot replace kernel functions"; + break; + case BPF_PROG_TYPE_SYSCALL: + opts.prog_flags = BPF_F_SLEEPABLE; + break; + case BPF_PROG_TYPE_STRUCT_OPS: + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_PROG_TYPE_UNSPEC: + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + break; + case BPF_PROG_TYPE_NETFILTER: + opts.expected_attach_type = BPF_NETFILTER; + break; + default: + return -EOPNOTSUPP; + } + + fd = bpf_prog_load(prog_type, NULL, "GPL", insns, insns_cnt, &opts); + err = -errno; + if (fd >= 0) + close(fd); + if (exp_err) { + if (fd >= 0 || err != exp_err) + return 0; + if (exp_msg && !strstr(buf, exp_msg)) + return 0; + return 1; + } + return fd >= 0 ? 
1 : 0; +} + +int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); + return libbpf_err(ret); +} + +int libbpf__load_raw_btf(const char *raw_types, size_t types_len, + const char *str_sec, size_t str_len) +{ + struct btf_header hdr = { + .magic = BTF_MAGIC, + .version = BTF_VERSION, + .hdr_len = sizeof(struct btf_header), + .type_len = types_len, + .str_off = types_len, + .str_len = str_len, + }; + int btf_fd, btf_len; + __u8 *raw_btf; + + btf_len = hdr.hdr_len + hdr.type_len + hdr.str_len; + raw_btf = malloc(btf_len); + if (!raw_btf) + return -ENOMEM; + + memcpy(raw_btf, &hdr, sizeof(hdr)); + memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); + memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); + + btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); + + free(raw_btf); + return btf_fd; +} + +static int load_local_storage_btf(void) +{ + const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l"; + /* struct bpf_spin_lock { + * int val; + * }; + * struct val { + * int cnt; + * struct bpf_spin_lock l; + * }; + */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* struct bpf_spin_lock */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), + BTF_MEMBER_ENC(15, 1, 0), /* int val; */ + /* struct val */ /* [3] */ + BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8), + BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */ + BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */ + }; + + return libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs)); +} + +static int probe_map_create(enum bpf_map_type map_type) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts); + int key_size, value_size, max_entries; + __u32 btf_key_type_id = 0, btf_value_type_id = 0; + int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err = 0; + + key_size = sizeof(__u32); + value_size = sizeof(__u32); + max_entries = 1; + + switch (map_type) { + case BPF_MAP_TYPE_STACK_TRACE: + value_size = sizeof(__u64); + break; + case BPF_MAP_TYPE_LPM_TRIE: + key_size = sizeof(__u64); + value_size = sizeof(__u64); + opts.map_flags = BPF_F_NO_PREALLOC; + break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + key_size = sizeof(struct bpf_cgroup_storage_key); + value_size = sizeof(__u64); + max_entries = 0; + break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + key_size = 0; + break; + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: + btf_key_type_id = 1; + btf_value_type_id = 3; + value_size = 8; + max_entries = 0; + opts.map_flags = BPF_F_NO_PREALLOC; + btf_fd = load_local_storage_btf(); + if (btf_fd < 0) + return btf_fd; + break; + case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: + key_size = 0; + value_size = 0; + max_entries = sysconf(_SC_PAGE_SIZE); + break; + case BPF_MAP_TYPE_STRUCT_OPS: + /* we'll get -ENOTSUPP for invalid BTF type ID for struct_ops */ + opts.btf_vmlinux_value_type_id = 1; + exp_err = -524; /* -ENOTSUPP */ + break; + case BPF_MAP_TYPE_BLOOM_FILTER: + key_size = 0; + max_entries = 1; + break; + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case 
BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_DEVMAP_HASH: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + break; + case BPF_MAP_TYPE_UNSPEC: + default: + return -EOPNOTSUPP; + } + + if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, + sizeof(__u32), sizeof(__u32), 1, NULL); + if (fd_inner < 0) + goto cleanup; + + opts.inner_map_fd = fd_inner; + } + + if (btf_fd >= 0) { + opts.btf_fd = btf_fd; + opts.btf_key_type_id = btf_key_type_id; + opts.btf_value_type_id = btf_value_type_id; + } + + fd = bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); + err = -errno; + +cleanup: + if (fd >= 0) + close(fd); + if (fd_inner >= 0) + close(fd_inner); + if (btf_fd >= 0) + close(btf_fd); + + if (exp_err) + return fd < 0 && err == exp_err ? 1 : 0; + else + return fd >= 0 ? 1 : 0; +} + +int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) +{ + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + ret = probe_map_create(map_type); + return libbpf_err(ret); +} + +int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, + const void *opts) +{ + struct bpf_insn insns[] = { + BPF_EMIT_CALL((__u32)helper_id), + BPF_EXIT_INSN(), + }; + const size_t insn_cnt = ARRAY_SIZE(insns); + char buf[4096]; + int ret; + + if (opts) + return libbpf_err(-EINVAL); + + /* we can't successfully load all prog types to check for BPF helper + * support, so bail out with -EOPNOTSUPP error + */ + switch (prog_type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: + return -EOPNOTSUPP; + default: + break; + } + + buf[0] = '\0'; + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); + if (ret < 0) + return libbpf_err(ret); + + /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) + * at all, it will emit something like "invalid func unknown#181". + * If BPF verifier recognizes BPF helper but it's not supported for + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * In both cases, provided combination of BPF program type and BPF + * helper is not supported by the kernel. + * In all other cases, probe_prog_load() above will either succeed (e.g., + * because BPF helper happens to accept no input arguments or it + * accepts one input argument and initial PTR_TO_CTX is fine for + * that), or we'll get some more specific BPF verifier error about + * some unsatisfied conditions. 
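+	 *
+	 * Illustrative usage (not part of the original source):
+	 *
+	 *   bool has_redirect_map =
+	 *           libbpf_probe_bpf_helper(BPF_PROG_TYPE_XDP,
+	 *                                   BPF_FUNC_redirect_map, NULL) == 1;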
+ */ + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + return 0; + return 1; /* assume supported */ +} diff --git a/third_party/libbpf/src/libbpf_version.h b/third_party/libbpf/src/libbpf_version.h new file mode 100644 index 0000000..290411d --- /dev/null +++ b/third_party/libbpf/src/libbpf_version.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (C) 2021 Facebook */ +#ifndef __LIBBPF_VERSION_H +#define __LIBBPF_VERSION_H + +#define LIBBPF_MAJOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 3 + +#endif /* __LIBBPF_VERSION_H */ diff --git a/third_party/libbpf/src/linker.c b/third_party/libbpf/src/linker.c new file mode 100644 index 0000000..5ced96d --- /dev/null +++ b/third_party/libbpf/src/linker.c @@ -0,0 +1,2905 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +#define BTF_EXTERN_SEC ".extern" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is omitted from the final ELF file */ + bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. 
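+	 * (both the source and destination section arrays reserve slot 0, so
+	 * that a section id of 0 can be used as "no section"; see
+	 * add_src_sec() and add_dst_sec() below)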
*/ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; +}; + +struct glob_sym { + /* ELF symbol index */ + int sym_idx; + /* associated section id for .ksyms, .kconfig, etc, but not .extern */ + int sec_id; + /* extern name offset in STRTAB */ + int name_off; + /* optional associated BTF type ID */ + int btf_id; + /* BTF type ID to which VAR/FUNC type is pointing to; used for + * rewriting types when extern VAR/FUNC is resolved to a concrete + * definition + */ + int underlying_btf_id; + /* sec_var index in the corresponding dst_sec, if exists */ + int var_idx; + + /* extern or resolved/global symbol */ + bool is_extern; + /* weak or strong symbol, never goes back from strong to weak */ + bool is_weak; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; + + /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; + + /* global (including extern) ELF symbols */ + int glob_sym_cnt; + struct glob_sym *glob_syms; +}; + +#define pr_warn_elf(fmt, ...) 
\ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); + } + free(linker->secs); + + free(linker->glob_syms); + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return errno = EINVAL, NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return errno = EINVAL, NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return errno = ENOMEM, NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return linker; + +err_out: + bpf_linker__free(linker); + return errno = -err, NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? 
linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr) { + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = 
elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts) +{ + struct src_obj obj = {}; + int err = 0; + + if (!OPTS_VALID(opts, bpf_linker_file_opts)) + return libbpf_err(-EINVAL); + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = err ?: linker_load_obj_file(linker, filename, opts, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); + + /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return libbpf_err(err); +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? 
obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, + const struct bpf_linker_file_opts *opts, + struct src_obj *obj) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER__" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY | O_CLOEXEC); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 
0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); + + return err; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec; + int i, err; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: + err = linker_sanity_check_elf_symtab(obj, sec); + if (err) + return err; + break; + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case SHT_NOBITS: + break; + case SHT_REL: + err = linker_sanity_check_elf_relos(obj, sec); + if (err) + return err; + break; + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int linker_sanity_check_elf_symtab(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec; + Elf64_Sym *sym; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (i = 0; i < n; i++, sym++) { + int sym_type = ELF64_ST_TYPE(sym->st_info); + int sym_bind = ELF64_ST_BIND(sym->st_info); + int sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + + if (i == 0) { + if (sym->st_name != 0 || sym->st_info != 0 + || sym->st_other != 0 || sym->st_shndx != 0 + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #0 is invalid in %s\n", obj->filename); + return -EINVAL; + } + continue; + } + if (sym_bind != STB_LOCAL && sym_bind != STB_GLOBAL && sym_bind != STB_WEAK) { + pr_warn("ELF sym 
#%d in section #%zu has unsupported symbol binding %d\n", + i, sec->sec_idx, sym_bind); + return -EINVAL; + } + if (sym_vis != STV_DEFAULT && sym_vis != STV_HIDDEN) { + pr_warn("ELF sym #%d in section #%zu has unsupported symbol visibility %d\n", + i, sec->sec_idx, sym_vis); + return -EINVAL; + } + if (sym->st_shndx == 0) { + if (sym_type != STT_NOTYPE || sym_bind == STB_LOCAL + || sym->st_value != 0 || sym->st_size != 0) { + pr_warn("ELF sym #%d is invalid extern symbol in %s\n", + i, obj->filename); + + return -EINVAL; + } + continue; + } + if (sym->st_shndx < SHN_LORESERVE && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + i, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (sym_type == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + continue; + } + } + + return 0; +} + +static int linker_sanity_check_elf_relos(struct src_obj *obj, struct src_sec *sec) +{ + struct src_sec *link_sec, *sym_sec; + Elf64_Rel *relo; + int i, n; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel -> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + return 0; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (i = 0; i < n; i++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32 && + sym_type != R_BPF_64_ABS64 && sym_type != R_BPF_64_ABS32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + i, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + i, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + + return 0; +} + +static int 
check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id >= btf__type_cnt(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) + return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + dst_sec->ephemeral = src_sec->ephemeral; + + /* ephemeral sections are just thin section shells lacking most parts */ + if (src_sec->ephemeral) + return 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -ENOMEM; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->ephemeral || src->ephemeral) + return true; + + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, 
src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align, src_align; + size_t dst_align_sz, dst_final_sz; + int err; + + /* Ephemeral source section doesn't contribute anything to ELF + * section data. + */ + if (src->ephemeral) + return 0; + + /* Some sections (like .maps) can contain both externs (and thus be + * ephemeral) and non-externs (map definitions). So it's possible that + * it has to be "upgraded" from ephemeral to non-ephemeral when the + * first non-ephemeral entity appears. In such case, we add ELF + * section, data, etc. + */ + if (dst->ephemeral) { + err = init_sec(linker, dst, src); + if (err) + return err; + } + + dst_align = dst->shdr->sh_addralign; + src_align = src->shdr->sh_addralign; + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + /* If dst_align_sz == 0, realloc() behaves in a special way: + * 1. When dst->raw_data is NULL it returns: + * "either NULL or a pointer suitable to be passed to free()" [1]. + * 2. When dst->raw_data is not-NULL it frees dst->raw_data and returns NULL, + * thus invalidating any "pointer suitable to be passed to free()" obtained + * at step (1). + * + * The dst_align_sz > 0 check avoids error exit after (2), otherwise + * dst->raw_data would be freed again in bpf_linker__free(). + * + * [1] man 3 realloc + */ + if (!tmp && dst_align_sz > 0) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped || sec->ephemeral) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, 
"license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize, err; + int str_sec_idx = symtab->shdr->sh_link; + const char *sym_name; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + /* We already validated all-zero symbol #0 and we already + * appended it preventively to the final SYMTAB, so skip it. + */ + if (i == 0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -EINVAL; + } + + err = linker_append_elf_sym(linker, obj, sym, sym_name, i); + if (err) + return err; + } + + return 0; +} + +static Elf64_Sym *get_sym_by_idx(struct bpf_linker *linker, size_t sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms = symtab->raw_data; + + return &syms[sym_idx]; +} + +static struct glob_sym *find_glob_sym(struct bpf_linker *linker, const char *sym_name) +{ + struct glob_sym *glob_sym; + const char *name; + int i; + + for (i = 0; i < linker->glob_sym_cnt; i++) { + glob_sym = &linker->glob_syms[i]; + name = strset__data(linker->strtab_strs) + glob_sym->name_off; + + if (strcmp(name, sym_name) == 0) + return glob_sym; + } + + return NULL; +} + +static struct glob_sym *add_glob_sym(struct bpf_linker *linker) +{ + struct glob_sym *syms, *sym; + + syms = libbpf_reallocarray(linker->glob_syms, linker->glob_sym_cnt + 1, + sizeof(*linker->glob_syms)); + if (!syms) + return NULL; + + sym = &syms[linker->glob_sym_cnt]; + memset(sym, 0, sizeof(*sym)); + sym->var_idx = -1; + + linker->glob_syms = syms; + linker->glob_sym_cnt++; + + return sym; +} + +static bool glob_sym_btf_matches(const char *sym_name, bool exact, + const struct btf *btf1, __u32 id1, + const struct btf *btf2, __u32 id2) +{ + const struct btf_type *t1, *t2; + bool is_static1, is_static2; + const char *n1, *n2; + int i, n; + +recur: + n1 = n2 = NULL; + t1 = skip_mods_and_typedefs(btf1, id1, &id1); + t2 = skip_mods_and_typedefs(btf2, id2, &id2); + + /* check if only one side is FWD, otherwise handle with common logic */ + if (!exact && btf_is_fwd(t1) != btf_is_fwd(t2)) { + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible forward declaration names '%s' and '%s'\n", + sym_name, n1, n2); + return false; + } + /* validate if FWD kind matches concrete kind */ + if (btf_is_fwd(t1)) { + if (btf_kflag(t1) && btf_is_union(t2)) + return true; + if (!btf_kflag(t1) && btf_is_struct(t2)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t1) ? 
"union" : "struct", btf_kind_str(t2)); + } else { + if (btf_kflag(t2) && btf_is_union(t1)) + return true; + if (!btf_kflag(t2) && btf_is_struct(t1)) + return true; + pr_warn("global '%s': incompatible %s forward declaration and concrete kind %s\n", + sym_name, btf_kflag(t2) ? "union" : "struct", btf_kind_str(t1)); + } + return false; + } + + if (btf_kind(t1) != btf_kind(t2)) { + pr_warn("global '%s': incompatible BTF kinds %s and %s\n", + sym_name, btf_kind_str(t1), btf_kind_str(t2)); + return false; + } + + switch (btf_kind(t1)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_FWD: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + n1 = btf__str_by_offset(btf1, t1->name_off); + n2 = btf__str_by_offset(btf2, t2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible %s names '%s' and '%s'\n", + sym_name, btf_kind_str(t1), n1, n2); + return false; + } + break; + default: + break; + } + + switch (btf_kind(t1)) { + case BTF_KIND_UNKN: /* void */ + case BTF_KIND_FWD: + return true; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* ignore encoding for int and enum values for enum */ + if (t1->size != t2->size) { + pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", + sym_name, btf_kind_str(t1), n1, t1->size, t2->size); + return false; + } + return true; + case BTF_KIND_PTR: + /* just validate overall shape of the referenced type, so no + * contents comparison for struct/union, and allowd fwd vs + * struct/union + */ + exact = false; + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_ARRAY: + /* ignore index type and array size */ + id1 = btf_array(t1)->type; + id2 = btf_array(t2)->type; + goto recur; + case BTF_KIND_FUNC: + /* extern and global linkages are compatible */ + is_static1 = btf_func_linkage(t1) == BTF_FUNC_STATIC; + is_static2 = btf_func_linkage(t2) == BTF_FUNC_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible func '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_VAR: + /* extern and global linkages are compatible */ + is_static1 = btf_var(t1)->linkage == BTF_VAR_STATIC; + is_static2 = btf_var(t2)->linkage == BTF_VAR_STATIC; + if (is_static1 != is_static2) { + pr_warn("global '%s': incompatible var '%s' linkage\n", sym_name, n1); + return false; + } + + id1 = t1->type; + id2 = t2->type; + goto recur; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m1, *m2; + + if (!exact) + return true; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible number of %s fields %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_members(t1); + m2 = btf_members(t2); + for (i = 0; i < n; i++, m1++, m2++) { + n1 = btf__str_by_offset(btf1, m1->name_off); + n2 = btf__str_by_offset(btf2, m2->name_off); + if (strcmp(n1, n2) != 0) { + pr_warn("global '%s': incompatible field #%d names '%s' and '%s'\n", + sym_name, i, n1, n2); + return false; + } + if (m1->offset != m2->offset) { + pr_warn("global '%s': incompatible field #%d ('%s') offsets\n", + sym_name, i, n1); + return false; + } + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + return true; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m1, *m2; + + if (btf_vlen(t1) != btf_vlen(t2)) { + pr_warn("global '%s': incompatible 
number of %s params %u and %u\n", + sym_name, btf_kind_str(t1), btf_vlen(t1), btf_vlen(t2)); + return false; + } + + n = btf_vlen(t1); + m1 = btf_params(t1); + m2 = btf_params(t2); + for (i = 0; i < n; i++, m1++, m2++) { + /* ignore func arg names */ + if (!glob_sym_btf_matches(sym_name, exact, btf1, m1->type, btf2, m2->type)) + return false; + } + + /* now check return type as well */ + id1 = t1->type; + id2 = t2->type; + goto recur; + } + + /* skip_mods_and_typedefs() makes this impossible */ + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + /* DATASECs are never compared with each other */ + case BTF_KIND_DATASEC: + default: + pr_warn("global '%s': unsupported BTF kind %s\n", + sym_name, btf_kind_str(t1)); + return false; + } +} + +static bool map_defs_match(const char *sym_name, + const struct btf *main_btf, + const struct btf_map_def *main_def, + const struct btf_map_def *main_inner_def, + const struct btf *extra_btf, + const struct btf_map_def *extra_def, + const struct btf_map_def *extra_inner_def) +{ + const char *reason; + + if (main_def->map_type != extra_def->map_type) { + reason = "type"; + goto mismatch; + } + + /* check key type/size match */ + if (main_def->key_size != extra_def->key_size) { + reason = "key_size"; + goto mismatch; + } + if (!!main_def->key_type_id != !!extra_def->key_type_id) { + reason = "key type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_KEY_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->key_type_id, + extra_btf, extra_def->key_type_id)) { + reason = "key type"; + goto mismatch; + } + + /* validate value type/size match */ + if (main_def->value_size != extra_def->value_size) { + reason = "value_size"; + goto mismatch; + } + if (!!main_def->value_type_id != !!extra_def->value_type_id) { + reason = "value type"; + goto mismatch; + } + if ((main_def->parts & MAP_DEF_VALUE_TYPE) + && !glob_sym_btf_matches(sym_name, true /*exact*/, + main_btf, main_def->value_type_id, + extra_btf, extra_def->value_type_id)) { + reason = "value type"; + goto mismatch; + } + + if (main_def->max_entries != extra_def->max_entries) { + reason = "max_entries"; + goto mismatch; + } + if (main_def->map_flags != extra_def->map_flags) { + reason = "map_flags"; + goto mismatch; + } + if (main_def->numa_node != extra_def->numa_node) { + reason = "numa_node"; + goto mismatch; + } + if (main_def->pinning != extra_def->pinning) { + reason = "pinning"; + goto mismatch; + } + + if ((main_def->parts & MAP_DEF_INNER_MAP) != (extra_def->parts & MAP_DEF_INNER_MAP)) { + reason = "inner map"; + goto mismatch; + } + + if (main_def->parts & MAP_DEF_INNER_MAP) { + char inner_map_name[128]; + + snprintf(inner_map_name, sizeof(inner_map_name), "%s.inner", sym_name); + + return map_defs_match(inner_map_name, + main_btf, main_inner_def, NULL, + extra_btf, extra_inner_def, NULL); + } + + return true; + +mismatch: + pr_warn("global '%s': map %s mismatch\n", sym_name, reason); + return false; +} + +static bool glob_map_defs_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, int btf_id) +{ + struct btf_map_def dst_def = {}, dst_inner_def = {}; + struct btf_map_def src_def = {}, src_inner_def = {}; + const struct btf_type *t; + int err; + + t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(t)) { + pr_warn("global '%s': invalid map definition type [%d]\n", sym_name, btf_id); + return false; + } + t = skip_mods_and_typedefs(obj->btf,
t->type, NULL); + + err = parse_btf_map_def(sym_name, obj->btf, t, true /*strict*/, &src_def, &src_inner_def); + if (err) { + pr_warn("global '%s': invalid map definition\n", sym_name); + return false; + } + + /* re-parse existing map definition */ + t = btf__type_by_id(linker->btf, glob_sym->btf_id); + t = skip_mods_and_typedefs(linker->btf, t->type, NULL); + err = parse_btf_map_def(sym_name, linker->btf, t, true /*strict*/, &dst_def, &dst_inner_def); + if (err) { + /* this should not happen, because we already validated it */ + pr_warn("global '%s': invalid dst map definition\n", sym_name); + return false; + } + + /* Currently extern map definition has to be complete and match + * concrete map definition exactly. This restriction might be lifted + * in the future. + */ + return map_defs_match(sym_name, linker->btf, &dst_def, &dst_inner_def, + obj->btf, &src_def, &src_inner_def); +} + +static bool glob_syms_match(const char *sym_name, + struct bpf_linker *linker, struct glob_sym *glob_sym, + struct src_obj *obj, Elf64_Sym *sym, size_t sym_idx, int btf_id) +{ + const struct btf_type *src_t; + + /* if we are dealing with externs, BTF types describing both global + * and extern VARs/FUNCs should be completely present in all files + */ + if (!glob_sym->btf_id || !btf_id) { + pr_warn("BTF info is missing for global symbol '%s'\n", sym_name); + return false; + } + + src_t = btf__type_by_id(obj->btf, btf_id); + if (!btf_is_var(src_t) && !btf_is_func(src_t)) { + pr_warn("only extern variables and functions are supported, but got '%s' for '%s'\n", + btf_kind_str(src_t), sym_name); + return false; + } + + /* deal with .maps definitions specially */ + if (glob_sym->sec_id && strcmp(linker->secs[glob_sym->sec_id].sec_name, MAPS_ELF_SEC) == 0) + return glob_map_defs_match(sym_name, linker, glob_sym, obj, sym, btf_id); + + if (!glob_sym_btf_matches(sym_name, true /*exact*/, + linker->btf, glob_sym->btf_id, obj->btf, btf_id)) + return false; + + return true; +} + +static bool btf_is_non_static(const struct btf_type *t) +{ + return (btf_is_var(t) && btf_var(t)->linkage != BTF_VAR_STATIC) + || (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_STATIC); +} + +static int find_glob_sym_btf(struct src_obj *obj, Elf64_Sym *sym, const char *sym_name, + int *out_btf_sec_id, int *out_btf_id) +{ + int i, j, n, m, btf_id = 0; + const struct btf_type *t; + const struct btf_var_secinfo *vi; + const char *name; + + if (!obj->btf) { + pr_warn("failed to find BTF info for object '%s'\n", obj->filename); + return -EINVAL; + } + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* some global and extern FUNCs and VARs might not be associated with any + * DATASEC, so try to detect them in the same pass + */ + if (btf_is_non_static(t)) { + name = btf__str_by_offset(obj->btf, t->name_off); + if (strcmp(name, sym_name) != 0) + continue; + + /* remember and still try to find DATASEC */ + btf_id = i; + continue; + } + + if (!btf_is_datasec(t)) + continue; + + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + t = btf__type_by_id(obj->btf, vi->type); + name = btf__str_by_offset(obj->btf, t->name_off); + + if (strcmp(name, sym_name) != 0) + continue; + if (btf_is_var(t) && btf_var(t)->linkage == BTF_VAR_STATIC) + continue; + if (btf_is_func(t) && btf_func_linkage(t) == BTF_FUNC_STATIC) + continue; + + if (btf_id && btf_id != vi->type) { + pr_warn("global/extern '%s' BTF is ambiguous: both types #%d and #%u match\n", + sym_name, btf_id, vi->type); + 
return -EINVAL; + } + + *out_btf_sec_id = i; + *out_btf_id = vi->type; + + return 0; + } + } + + /* free-floating extern or global FUNC */ + if (btf_id) { + *out_btf_sec_id = 0; + *out_btf_id = btf_id; + return 0; + } + + pr_warn("failed to find BTF info for global/extern symbol '%s'\n", sym_name); + return -ENOENT; +} + +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static int complete_extern_btf_info(struct btf *dst_btf, int dst_id, + struct btf *src_btf, int src_id) +{ + struct btf_type *dst_t = btf_type_by_id(dst_btf, dst_id); + struct btf_type *src_t = btf_type_by_id(src_btf, src_id); + struct btf_param *src_p, *dst_p; + const char *s; + int i, n, off; + + /* We already made sure that source and destination types (FUNC or + * VAR) match in terms of types and argument names. + */ + if (btf_is_var(dst_t)) { + btf_var(dst_t)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + return 0; + } + + dst_t->info = btf_type_info(BTF_KIND_FUNC, BTF_FUNC_GLOBAL, 0); + + /* now onto FUNC_PROTO types */ + src_t = btf_type_by_id(src_btf, src_t->type); + dst_t = btf_type_by_id(dst_btf, dst_t->type); + + /* Fill in all the argument names, which for extern FUNCs are missing. + * We'll end up with two copies of FUNCs/VARs for externs, but that + * will be taken care of by BTF dedup at the very end. + * It might be that BTF types for an extern in one file have less/more BTF + * information (e.g., FWD instead of full STRUCT/UNION information), + * but that should be (in most cases, subject to BTF dedup rules) + * handled and resolved by the BTF dedup algorithm as well, so we won't + * worry about it. Our only job is to make sure that argument names + * are populated on both sides, otherwise BTF dedup will pedantically + * consider them different.
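+ * (Note: BTF dedup compares parameter name offsets when matching
+ * FUNC_PROTOs, so a named and an unnamed copy of the same prototype
+ * would otherwise never be merged.)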
+ */ + src_p = btf_params(src_t); + dst_p = btf_params(dst_t); + for (i = 0, n = btf_vlen(dst_t); i < n; i++, src_p++, dst_p++) { + if (!src_p->name_off) + continue; + + /* src_btf has more complete info, so add name to dst_btf */ + s = btf__str_by_offset(src_btf, src_p->name_off); + off = btf__add_str(dst_btf, s); + if (off < 0) + return off; + dst_p->name_off = off; + } + return 0; +} + +static void sym_update_bind(Elf64_Sym *sym, int sym_bind) +{ + sym->st_info = ELF64_ST_INFO(sym_bind, ELF64_ST_TYPE(sym->st_info)); +} + +static void sym_update_type(Elf64_Sym *sym, int sym_type) +{ + sym->st_info = ELF64_ST_INFO(ELF64_ST_BIND(sym->st_info), sym_type); +} + +static void sym_update_visibility(Elf64_Sym *sym, int sym_vis) +{ + /* libelf doesn't provide setters for ST_VISIBILITY, + * but it is stored in the lower 2 bits of st_other + */ + sym->st_other &= ~0x03; + sym->st_other |= sym_vis; +} + +static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, + Elf64_Sym *sym, const char *sym_name, int src_sym_idx) +{ + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + struct glob_sym *glob_sym = NULL; + int name_off, sym_type, sym_bind, sym_vis, err; + int btf_sec_id = 0, btf_id = 0; + size_t dst_sym_idx; + Elf64_Sym *dst_sym; + bool sym_is_extern; + + sym_type = ELF64_ST_TYPE(sym->st_info); + sym_bind = ELF64_ST_BIND(sym->st_info); + sym_vis = ELF64_ST_VISIBILITY(sym->st_other); + sym_is_extern = sym->st_shndx == SHN_UNDEF; + + if (sym_is_extern) { + if (!obj->btf) { + pr_warn("externs without BTF info are not supported\n"); + return -ENOTSUP; + } + } else if (sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + return 0; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (sym_type == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[src_sym_idx] = dst_sec->sec_sym_idx; + return 0; + } + } + + if (sym_bind == STB_LOCAL) + goto add_sym; + + /* find matching BTF info */ + err = find_glob_sym_btf(obj, sym, sym_name, &btf_sec_id, &btf_id); + if (err) + return err; + + if (sym_is_extern && btf_sec_id) { + const char *sec_name = NULL; + const struct btf_type *t; + + t = btf__type_by_id(obj->btf, btf_sec_id); + sec_name = btf__str_by_offset(obj->btf, t->name_off); + + /* Clang puts unannotated extern vars into + * '.extern' BTF DATASEC. Treat them the same + * as unannotated extern funcs (which are + * currently not put into any DATASECs). + * Those don't have associated src_sec/dst_sec. + */ + if (strcmp(sec_name, BTF_EXTERN_SEC) != 0) { + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("failed to find matching ELF sec '%s'\n", sec_name); + return -ENOENT; + } + dst_sec = &linker->secs[src_sec->dst_id]; + } + } + + glob_sym = find_glob_sym(linker, sym_name); + if (glob_sym) { + /* Preventively resolve to existing symbol. This is + * needed for further relocation symbol remapping in + * the next step of linking. + */ + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + + /* If both symbols are non-externs, at least one of + * them has to be STB_WEAK, otherwise they are in + * a conflict with each other. 
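+ * Two weak non-extern definitions are not a conflict: the first one
+ * seen is kept, unless a strong definition shows up later and
+ * overrides it.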
+ */ + if (!sym_is_extern && !glob_sym->is_extern + && !glob_sym->is_weak && sym_bind != STB_WEAK) { + pr_warn("conflicting non-weak symbol #%d (%s) definition in '%s'\n", + src_sym_idx, sym_name, obj->filename); + return -EINVAL; + } + + if (!glob_syms_match(sym_name, linker, glob_sym, obj, sym, src_sym_idx, btf_id)) + return -EINVAL; + + dst_sym = get_sym_by_idx(linker, glob_sym->sym_idx); + + /* If new symbol is strong, then force dst_sym to be strong as + * well; this way a mix of weak and non-weak extern + * definitions will end up being strong. + */ + if (sym_bind == STB_GLOBAL) { + /* We still need to preserve type (NOTYPE or + * OBJECT/FUNC, depending on whether the symbol is + * extern or not) + */ + sym_update_bind(dst_sym, STB_GLOBAL); + glob_sym->is_weak = false; + } + + /* Non-default visibility is "contaminating", with stricter + * visibility overwriting more permissive ones, even if more + * permissive visibility comes from just an extern definition. + * Currently only STV_DEFAULT and STV_HIDDEN are allowed and + * ensured by ELF symbol sanity checks above. + */ + if (sym_vis > ELF64_ST_VISIBILITY(dst_sym->st_other)) + sym_update_visibility(dst_sym, sym_vis); + + /* If the new symbol is extern, then regardless if + * existing symbol is extern or resolved global, just + * keep the existing one untouched. + */ + if (sym_is_extern) + return 0; + + /* If existing symbol is a strong resolved symbol, bail out, + * because we lost resolution battle have nothing to + * contribute. We already checked abover that there is no + * strong-strong conflict. We also already tightened binding + * and visibility, so nothing else to contribute at that point. + */ + if (!glob_sym->is_extern && sym_bind == STB_WEAK) + return 0; + + /* At this point, new symbol is strong non-extern, + * so overwrite glob_sym with new symbol information. + * Preserve binding and visibility. + */ + sym_update_type(dst_sym, sym_type); + dst_sym->st_shndx = dst_sec->sec_idx; + dst_sym->st_value = src_sec->dst_off + sym->st_value; + dst_sym->st_size = sym->st_size; + + /* see comment below about dst_sec->id vs dst_sec->sec_idx */ + glob_sym->sec_id = dst_sec->id; + glob_sym->is_extern = false; + + if (complete_extern_btf_info(linker->btf, glob_sym->btf_id, + obj->btf, btf_id)) + return -EINVAL; + + /* request updating VAR's/FUNC's underlying BTF type when appending BTF type */ + glob_sym->underlying_btf_id = 0; + + obj->sym_map[src_sym_idx] = glob_sym->sym_idx; + return 0; + } + +add_sym: + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = dst_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[src_sym_idx] = dst_sym_idx; + + if (sym_type == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + if (sym_bind != STB_LOCAL) { + glob_sym = add_glob_sym(linker); + if (!glob_sym) + return -ENOMEM; + + glob_sym->sym_idx = dst_sym_idx; + /* we use dst_sec->id (and not dst_sec->sec_idx), because + * ephemeral sections (.kconfig, .ksyms, etc) don't have + * sec_idx (as they don't have corresponding ELF section), but + * still have id. 
.extern doesn't have even ephemeral section + * associated with it, so dst_sec->id == dst_sec->sec_idx == 0. + */ + glob_sym->sec_id = dst_sec ? dst_sec->id : 0; + glob_sym->name_off = name_off; + /* we will fill btf_id in during BTF merging step */ + glob_sym->btf_id = 0; + glob_sym->is_extern = sym_is_extern; + glob_sym->is_weak = sym_bind == STB_WEAK; + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("sections %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(linker, dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx, dst_sym_idx, sym_type; + Elf64_Sym *src_sym; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. 
Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec *sec; + int i, j, n, m; + + if (!obj->btf) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig, .ksyms, .extern, which are used + * for special extern variables. + * + * For all but one such special (ephemeral) + * sections, we pre-create "section shells" to be able + * to keep track of extra per-section metadata later + * (e.g., those BTF extern variables). + * + * .extern is even more special, though, because it + * contains extern variables that need to be resolved + * by static linker, not libbpf and kernel. When such + * externs are resolved, we are going to remove them + * from .extern BTF section and might end up not + * needing it at all. Each resolved extern should have + * matching non-extern VAR/FUNC in other sections. + * + * We do support leaving some of the externs + * unresolved, though, to support cases of building + * libraries, which will later be linked against final + * BPF applications. So if at finalization we still + * see unresolved externs, we'll create .extern + * section on our own. 
+ */ + if (strcmp(sec_name, BTF_EXTERN_SEC) == 0) + continue; + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + int new_id = id_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); + return -EINVAL; + } + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + const char *name; + + if (!obj->btf) + return 0; + + start_id = btf__type_cnt(linker->btf); + n = btf__type_cnt(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i < n; i++) { + struct glob_sym *glob_sym = NULL; + + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + if (btf_is_non_static(t)) { + /* there should be glob_sym already */ + name = btf__str_by_offset(obj->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + + /* VARs without corresponding glob_sym are those that + * belong to skipped/deduplicated sections (i.e., + * license and version), so just skip them + */ + if (!glob_sym) + continue; + + /* linker_append_elf_sym() might have requested + * updating underlying type ID, if extern was resolved + * to strong symbol or weak got upgraded to non-weak + */ + if (glob_sym->underlying_btf_id == 0) + glob_sym->underlying_btf_id = -t->type; + + /* globals from previous object files that match our + * VAR/FUNC already have a corresponding associated + * BTF type, so just make sure to use it + */ + if (glob_sym->btf_id) { + /* reuse existing BTF type for global var/func */ + obj->btf_type_map[i] = glob_sym->btf_id; + continue; + } + } + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + + /* record just appended BTF type for var/func */ + if (glob_sym) { + glob_sym->btf_id = id; + glob_sym->underlying_btf_id = -t->type; + } + } + + /* remap all the types except DATASECs */ + n = btf__type_cnt(linker->btf); + for (i = start_id; i < n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's + * actual type), if necessary + */ + for (i = 
0; i < linker->glob_sym_cnt; i++) { + struct glob_sym *glob_sym = &linker->glob_syms[i]; + struct btf_type *glob_t; + + if (glob_sym->underlying_btf_id >= 0) + continue; + + glob_sym->underlying_btf_id = obj->btf_type_map[-glob_sym->underlying_btf_id]; + + glob_t = btf_type_by_id(linker->btf, glob_sym->btf_id); + glob_t->type = glob_sym->underlying_btf_id; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. + */ + dst_sec->has_btf = true; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + int new_id = obj->btf_type_map[src_var->type]; + struct glob_sym *glob_sym = NULL; + + t = btf_type_by_id(linker->btf, new_id); + if (btf_is_non_static(t)) { + name = btf__str_by_offset(linker->btf, t->name_off); + glob_sym = find_glob_sym(linker, name); + if (glob_sym->sec_id != dst_sec->id) { + pr_warn("global '%s': section mismatch %d vs %d\n", + name, glob_sym->sec_id, dst_sec->id); + return -EINVAL; + } + } + + /* If there is already a member (VAR or FUNC) mapped + * to the same type, don't add a duplicate entry. + * This will happen when multiple object files define + * the same extern VARs/FUNCs. + */ + if (glob_sym && glob_sym->var_idx >= 0) { + __s64 sz; + + dst_var = &dst_sec->sec_vars[glob_sym->var_idx]; + /* Because underlying BTF type might have + * changed, so might its size have changed, so + * re-calculate and update it in sec_var. 
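+ * (e.g., an extern declared with an incomplete array type that was
+ * resolved against a definition with a known size)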
+ */ + sz = btf__resolve_size(linker->btf, glob_sym->underlying_btf_id); + if (sz < 0) { + pr_warn("global '%s': failed to resolve size of underlying type: %d\n", + name, (int)sz); + return -EINVAL; + } + dst_var->size = sz; + continue; + } + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + + if (glob_sym) + glob_sym->var_idx = dst_sec->sec_var_cnt - 1; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine 
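+ * as is: it packs line and column numbers directly and references
+ * no BTF string offsets, so there is nothing to remap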
*/ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return libbpf_err(-EINVAL); + + err = finalize_btf(linker); + if (err) + return libbpf_err(err); + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return libbpf_err(err); + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return libbpf_err(err); + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + LIBBPF_OPTS(btf_dedup_opts, opts); + struct btf *btf = linker->btf; + 
const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__type_cnt(linker->btf) == 1) + return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->has_btf) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + opts.btf_ext = linker->btf_ext; + err = btf__dedup(linker->btf, &opts); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + raw_data = btf__raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != 
sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + goto out; + } + +out: + free(data); + return err; +} diff --git a/third_party/libbpf/src/netlink.c b/third_party/libbpf/src/netlink.c new file mode 100644 index 0000000..84dd5fa --- /dev/null +++ b/third_party/libbpf/src/netlink.c @@ -0,0 +1,917 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); + +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, + void *cookie); + +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + +struct xdp_id_md { + int ifindex; + __u32 flags; + struct xdp_link_info info; + __u64 feature_flags; +}; + +struct xdp_features_md { + int ifindex; + __u64 flags; +}; + +static int libbpf_netlink_open(__u32 *nl_pid, int proto) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; 
+ int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + pr_warn("Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static void libbpf_netlink_close(int sock) +{ + close(sock); +} + +enum { + NL_CONT, + NL_NEXT, + NL_DONE, +}; + +static int netlink_recvmsg(int sock, struct msghdr *mhdr, int flags) +{ + int len; + + do { + len = recvmsg(sock, mhdr, flags); + } while (len < 0 && (errno == EINTR || errno == EAGAIN)); + + if (len < 0) + return -errno; + return len; +} + +static int alloc_iov(struct iovec *iov, int len) +{ + void *nbuf; + + nbuf = realloc(iov->iov_base, len); + if (!nbuf) + return -ENOMEM; + + iov->iov_base = nbuf; + iov->iov_len = len; + return 0; +} + +static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct iovec iov = {}; + struct msghdr mhdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + bool multipart = true; + struct nlmsgerr *err; + struct nlmsghdr *nh; + int len, ret; + + ret = alloc_iov(&iov, 4096); + if (ret) + goto done; + + while (multipart) { +start: + multipart = false; + len = netlink_recvmsg(sock, &mhdr, MSG_PEEK | MSG_TRUNC); + if (len < 0) { + ret = len; + goto done; + } + + if (len > iov.iov_len) { + ret = alloc_iov(&iov, len); + if (ret) + goto done; + } + + len = netlink_recvmsg(sock, &mhdr, 0); + if (len < 0) { + ret = len; + goto done; + } + + if (len == 0) + break; + + for (nh = (struct nlmsghdr *)iov.iov_base; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + libbpf_nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + ret = 0; + goto done; + default: + break; + } + if (_fn) { + ret = _fn(nh, fn, cookie); + switch (ret) { + case NL_CONT: + break; + case NL_NEXT: + goto start; + case NL_DONE: + ret = 0; + goto done; + default: + goto done; + } + } + } + } + ret = 0; +done: + free(iov.iov_base); + return ret; +} + +static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, + int proto, __dump_nlmsg_t parse_msg, + libbpf_dump_nlmsg_t parse_attr, + void *cookie) +{ + __u32 nl_pid = 0; + int sock, ret; + + sock = libbpf_netlink_open(&nl_pid, proto); + if (sock < 0) + return sock; + + req->nh.nlmsg_pid = 0; + req->nh.nlmsg_seq = time(NULL); + + if (send(sock, req, req->nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto out; + } + + ret = libbpf_netlink_recv(sock, nl_pid, req->nh.nlmsg_seq, + parse_msg, parse_attr, cookie); +out: + libbpf_netlink_close(sock); + return ret; +} + +static int parse_genl_family_id(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = 
(struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[CTRL_ATTR_FAMILY_ID + 1]; + __u16 *id = cookie; + + libbpf_nla_parse(tb, CTRL_ATTR_FAMILY_ID, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + if (!tb[CTRL_ATTR_FAMILY_ID]) + return NL_CONT; + + *id = libbpf_nla_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]); + return NL_DONE; +} + +static int libbpf_netlink_resolve_genl_family_id(const char *name, + __u16 len, __u16 *id) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN), + .nh.nlmsg_type = GENL_ID_CTRL, + .nh.nlmsg_flags = NLM_F_REQUEST, + .gnl.cmd = CTRL_CMD_GETFAMILY, + .gnl.version = 2, + }; + int err; + + err = nlattr_add(&req, CTRL_ATTR_FAMILY_NAME, name, len); + if (err < 0) + return err; + + return libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_genl_family_id, NULL, id); +} + +static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, + __u32 flags) +{ + struct nlattr *nla; + int ret; + struct libbpf_nla_req req; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + nla = nlattr_begin_nested(&req, IFLA_XDP); + if (!nla) + return -EMSGSIZE; + ret = nlattr_add(&req, IFLA_XDP_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + if (flags) { + ret = nlattr_add(&req, IFLA_XDP_FLAGS, &flags, sizeof(flags)); + if (ret < 0) + return ret; + } + if (flags & XDP_FLAGS_REPLACE) { + ret = nlattr_add(&req, IFLA_XDP_EXPECTED_FD, &old_fd, + sizeof(old_fd)); + if (ret < 0) + return ret; + } + nlattr_end_nested(&req, nla); + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + int old_prog_fd, err; + + if (!OPTS_VALID(opts, bpf_xdp_attach_opts)) + return libbpf_err(-EINVAL); + + old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + if (old_prog_fd) + flags |= XDP_FLAGS_REPLACE; + else + old_prog_fd = -1; + + err = __bpf_set_link_xdp_fd_replace(ifindex, prog_fd, old_prog_fd, flags); + return libbpf_err(err); +} + +int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *opts) +{ + return bpf_xdp_attach(ifindex, -1, flags, opts); +} + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, + libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + + if (libbpf_nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb) +{ + struct nlattr *xdp_tb[IFLA_XDP_MAX + 1]; + struct xdp_id_md *xdp_id = cookie; + struct ifinfomsg *ifinfo = msg; + int ret; + + if (xdp_id->ifindex && xdp_id->ifindex != ifinfo->ifi_index) + return 0; + + if (!tb[IFLA_XDP]) + return 0; + + ret = libbpf_nla_parse_nested(xdp_tb, IFLA_XDP_MAX, tb[IFLA_XDP], NULL); + if (ret) + return ret; + + if (!xdp_tb[IFLA_XDP_ATTACHED]) + return 0; + + xdp_id->info.attach_mode = libbpf_nla_getattr_u8( + xdp_tb[IFLA_XDP_ATTACHED]); + + if (xdp_id->info.attach_mode == XDP_ATTACHED_NONE) + return 0; + + if (xdp_tb[IFLA_XDP_PROG_ID]) + xdp_id->info.prog_id = libbpf_nla_getattr_u32( + 
xdp_tb[IFLA_XDP_PROG_ID]); + + if (xdp_tb[IFLA_XDP_SKB_PROG_ID]) + xdp_id->info.skb_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_SKB_PROG_ID]); + + if (xdp_tb[IFLA_XDP_DRV_PROG_ID]) + xdp_id->info.drv_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_DRV_PROG_ID]); + + if (xdp_tb[IFLA_XDP_HW_PROG_ID]) + xdp_id->info.hw_prog_id = libbpf_nla_getattr_u32( + xdp_tb[IFLA_XDP_HW_PROG_ID]); + + return 0; +} + +static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[NETDEV_CMD_MAX + 1]; + struct xdp_features_md *md = cookie; + __u32 ifindex; + + libbpf_nla_parse(tb, NETDEV_CMD_MAX, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + + if (!tb[NETDEV_A_DEV_IFINDEX] || !tb[NETDEV_A_DEV_XDP_FEATURES]) + return NL_CONT; + + ifindex = libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_IFINDEX]); + if (ifindex != md->ifindex) + return NL_CONT; + + md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + return NL_DONE; +} + +int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifinfo.ifi_family = AF_PACKET, + }; + struct xdp_id_md xdp_id = {}; + struct xdp_features_md md = { + .ifindex = ifindex, + }; + __u16 id; + int err; + + if (!OPTS_VALID(opts, bpf_xdp_query_opts)) + return libbpf_err(-EINVAL); + + if (xdp_flags & ~XDP_FLAGS_MASK) + return libbpf_err(-EINVAL); + + /* Check whether the single {HW,DRV,SKB} mode is set */ + xdp_flags &= XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE; + if (xdp_flags & (xdp_flags - 1)) + return libbpf_err(-EINVAL); + + xdp_id.ifindex = ifindex; + xdp_id.flags = xdp_flags; + + err = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, __dump_link_nlmsg, + get_xdp_info, &xdp_id); + if (err) + return libbpf_err(err); + + OPTS_SET(opts, prog_id, xdp_id.info.prog_id); + OPTS_SET(opts, drv_prog_id, xdp_id.info.drv_prog_id); + OPTS_SET(opts, hw_prog_id, xdp_id.info.hw_prog_id); + OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id); + OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode); + + if (!OPTS_HAS(opts, feature_flags)) + return 0; + + err = libbpf_netlink_resolve_genl_family_id("netdev", sizeof("netdev"), &id); + if (err < 0) { + if (err == -ENOENT) { + opts->feature_flags = 0; + goto skip_feature_flags; + } + return libbpf_err(err); + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = id; + req.gnl.cmd = NETDEV_CMD_DEV_GET; + req.gnl.version = 2; + + err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex)); + if (err < 0) + return libbpf_err(err); + + err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_xdp_features, NULL, &md); + if (err) + return libbpf_err(err); + + opts->feature_flags = md.flags; + +skip_feature_flags: + return 0; +} + +int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) +{ + LIBBPF_OPTS(bpf_xdp_query_opts, opts); + int ret; + + ret = bpf_xdp_query(ifindex, flags, &opts); + if (ret) + return libbpf_err(ret); + + flags &= XDP_FLAGS_MODES; + + if (opts.attach_mode != XDP_ATTACHED_MULTI && !flags) + *prog_id = opts.prog_id; + else if (flags & XDP_FLAGS_DRV_MODE) + *prog_id = opts.drv_prog_id; + else if (flags & XDP_FLAGS_HW_MODE) + *prog_id = 
opts.hw_prog_id; + else if (flags & XDP_FLAGS_SKB_MODE) + *prog_id = opts.skb_prog_id; + else + *prog_id = 0; + + return 0; +} + + +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); + +static int clsact_config(struct libbpf_nla_req *req) +{ + req->tc.tcm_parent = TC_H_CLSACT; + req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); + + return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); +} + +static int attach_point_to_config(struct bpf_tc_hook *hook, + qdisc_config_t *config) +{ + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + case BPF_TC_INGRESS | BPF_TC_EGRESS: + if (OPTS_GET(hook, parent, 0)) + return -EINVAL; + *config = &clsact_config; + return 0; + case BPF_TC_CUSTOM: + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + +static int tc_get_tcm_parent(enum bpf_tc_attach_point attach_point, + __u32 *parent) +{ + switch (attach_point) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + if (*parent) + return -EINVAL; + *parent = TC_H_MAKE(TC_H_CLSACT, + attach_point == BPF_TC_INGRESS ? + TC_H_MIN_INGRESS : TC_H_MIN_EGRESS); + break; + case BPF_TC_CUSTOM: + if (!*parent) + return -EINVAL; + break; + default: + return -EINVAL; + } + return 0; +} + +static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) +{ + qdisc_config_t config; + int ret; + struct libbpf_nla_req req; + + ret = attach_point_to_config(hook, &config); + if (ret < 0) + return ret; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags; + req.nh.nlmsg_type = cmd; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = OPTS_GET(hook, ifindex, 0); + + ret = config(&req); + if (ret < 0) + return ret; + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_NEWQDISC, NLM_F_CREATE | NLM_F_EXCL); +} + +static int tc_qdisc_delete(struct bpf_tc_hook *hook) +{ + return tc_qdisc_modify(hook, RTM_DELQDISC, 0); +} + +int bpf_tc_hook_create(struct bpf_tc_hook *hook) +{ + int ret; + + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + ret = tc_qdisc_create_excl(hook); + return libbpf_err(ret); +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush); + +int bpf_tc_hook_destroy(struct bpf_tc_hook *hook) +{ + if (!hook || !OPTS_VALID(hook, bpf_tc_hook) || + OPTS_GET(hook, ifindex, 0) <= 0) + return libbpf_err(-EINVAL); + + switch (OPTS_GET(hook, attach_point, 0)) { + case BPF_TC_INGRESS: + case BPF_TC_EGRESS: + return libbpf_err(__bpf_tc_detach(hook, NULL, true)); + case BPF_TC_INGRESS | BPF_TC_EGRESS: + return libbpf_err(tc_qdisc_delete(hook)); + case BPF_TC_CUSTOM: + return libbpf_err(-EOPNOTSUPP); + default: + return libbpf_err(-EINVAL); + } +} + +struct bpf_cb_ctx { + struct bpf_tc_opts *opts; + bool processed; +}; + +static int __get_tc_info(void *cookie, struct tcmsg *tc, struct nlattr **tb, + bool unicast) +{ + struct nlattr *tbb[TCA_BPF_MAX + 1]; + struct bpf_cb_ctx *info = cookie; + + if (!info || !info->opts) + return -EINVAL; + if (unicast && info->processed) + return -EINVAL; + if (!tb[TCA_OPTIONS]) + return NL_CONT; + + libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL); + if (!tbb[TCA_BPF_ID]) + return -EINVAL; + + OPTS_SET(info->opts, prog_id, libbpf_nla_getattr_u32(tbb[TCA_BPF_ID])); + 
OPTS_SET(info->opts, handle, tc->tcm_handle); + OPTS_SET(info->opts, priority, TC_H_MAJ(tc->tcm_info) >> 16); + + info->processed = true; + return unicast ? NL_NEXT : NL_DONE; +} + +static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct tcmsg *tc = NLMSG_DATA(nh); + struct nlattr *tb[TCA_MAX + 1]; + + libbpf_nla_parse(tb, TCA_MAX, + (struct nlattr *)((void *)tc + NLMSG_ALIGN(sizeof(*tc))), + NLMSG_PAYLOAD(nh, sizeof(*tc)), NULL); + if (!tb[TCA_KIND]) + return NL_CONT; + return __get_tc_info(cookie, tc, tb, nh->nlmsg_flags & NLM_F_ECHO); +} + +static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) +{ + struct bpf_prog_info info; + __u32 info_len = sizeof(info); + char name[256]; + int len, ret; + + memset(&info, 0, info_len); + ret = bpf_prog_get_info_by_fd(fd, &info, &info_len); + if (ret < 0) + return ret; + + ret = nlattr_add(req, TCA_BPF_FD, &fd, sizeof(fd)); + if (ret < 0) + return ret; + len = snprintf(name, sizeof(name), "%s:[%u]", info.name, info.id); + if (len < 0) + return -errno; + if (len >= sizeof(name)) + return -ENAMETOOLONG; + return nlattr_add(req, TCA_BPF_NAME, name, len + 1); +} + +int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, bpf_flags, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + struct nlattr *nla; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || !prog_fd || prog_id) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + if (flags & ~BPF_TC_F_REPLACE) + return libbpf_err(-EINVAL); + + flags = (flags & BPF_TC_F_REPLACE) ? 
NLM_F_REPLACE : NLM_F_EXCL; + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | + NLM_F_ECHO | flags; + req.nh.nlmsg_type = RTM_NEWTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + nla = nlattr_begin_nested(&req, TCA_OPTIONS); + if (!nla) + return libbpf_err(-EMSGSIZE); + ret = tc_add_fd_and_name(&req, prog_fd); + if (ret < 0) + return libbpf_err(ret); + bpf_flags = TCA_BPF_FLAG_ACT_DIRECT; + ret = nlattr_add(&req, TCA_BPF_FLAGS, &bpf_flags, sizeof(bpf_flags)); + if (ret < 0) + return libbpf_err(ret); + nlattr_end_nested(&req, nla); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} + +static int __bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts, + const bool flush) +{ + __u32 protocol = 0, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct libbpf_nla_req req; + + if (!hook || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return -EINVAL; + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id) + return -EINVAL; + if (priority > UINT16_MAX) + return -EINVAL; + if (!flush) { + if (!handle || !priority) + return -EINVAL; + protocol = ETH_P_ALL; + } else { + if (handle || priority) + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_DELTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + if (!flush) { + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + } + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return ret; + req.tc.tcm_parent = parent; + + if (!flush) { + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return ret; + } + + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); +} + +int bpf_tc_detach(const struct bpf_tc_hook *hook, + const struct bpf_tc_opts *opts) +{ + int ret; + + if (!opts) + return libbpf_err(-EINVAL); + + ret = __bpf_tc_detach(hook, opts, false); + return libbpf_err(ret); +} + +int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) +{ + __u32 protocol, handle, priority, parent, prog_id, flags; + int ret, ifindex, attach_point, prog_fd; + struct bpf_cb_ctx info = {}; + struct libbpf_nla_req req; + + if (!hook || !opts || + !OPTS_VALID(hook, bpf_tc_hook) || + !OPTS_VALID(opts, bpf_tc_opts)) + return libbpf_err(-EINVAL); + + ifindex = OPTS_GET(hook, ifindex, 0); + parent = OPTS_GET(hook, parent, 0); + attach_point = OPTS_GET(hook, attach_point, 
0); + + handle = OPTS_GET(opts, handle, 0); + priority = OPTS_GET(opts, priority, 0); + prog_fd = OPTS_GET(opts, prog_fd, 0); + prog_id = OPTS_GET(opts, prog_id, 0); + flags = OPTS_GET(opts, flags, 0); + + if (ifindex <= 0 || flags || prog_fd || prog_id || + !handle || !priority) + return libbpf_err(-EINVAL); + if (priority > UINT16_MAX) + return libbpf_err(-EINVAL); + + protocol = ETH_P_ALL; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = RTM_GETTFILTER; + req.tc.tcm_family = AF_UNSPEC; + req.tc.tcm_ifindex = ifindex; + req.tc.tcm_handle = handle; + req.tc.tcm_info = TC_H_MAKE(priority << 16, htons(protocol)); + + ret = tc_get_tcm_parent(attach_point, &parent); + if (ret < 0) + return libbpf_err(ret); + req.tc.tcm_parent = parent; + + ret = nlattr_add(&req, TCA_KIND, "bpf", sizeof("bpf")); + if (ret < 0) + return libbpf_err(ret); + + info.opts = opts; + + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); + if (ret < 0) + return libbpf_err(ret); + if (!info.processed) + return libbpf_err(-ENOENT); + return ret; +} diff --git a/third_party/libbpf/src/nlattr.c b/third_party/libbpf/src/nlattr.c new file mode 100644 index 0000000..975e265 --- /dev/null +++ b/third_party/libbpf/src/nlattr.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#include +#include +#include +#include +#include "nlattr.h" +#include "libbpf_internal.h" + +static uint16_t nla_attr_minlen[LIBBPF_NLA_TYPE_MAX+1] = { + [LIBBPF_NLA_U8] = sizeof(uint8_t), + [LIBBPF_NLA_U16] = sizeof(uint16_t), + [LIBBPF_NLA_U32] = sizeof(uint32_t), + [LIBBPF_NLA_U64] = sizeof(uint64_t), + [LIBBPF_NLA_STRING] = 1, + [LIBBPF_NLA_FLAG] = 0, +}; + +static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) +{ + int totlen = NLA_ALIGN(nla->nla_len); + + *remaining -= totlen; + return (struct nlattr *)((void *)nla + totlen); +} + +static int nla_ok(const struct nlattr *nla, int remaining) +{ + return remaining >= (int)sizeof(*nla) && + nla->nla_len >= sizeof(*nla) && + nla->nla_len <= remaining; +} + +static int nla_type(const struct nlattr *nla) +{ + return nla->nla_type & NLA_TYPE_MASK; +} + +static int validate_nla(struct nlattr *nla, int maxtype, + struct libbpf_nla_policy *policy) +{ + struct libbpf_nla_policy *pt; + unsigned int minlen = 0; + int type = nla_type(nla); + + if (type < 0 || type > maxtype) + return 0; + + pt = &policy[type]; + + if (pt->type > LIBBPF_NLA_TYPE_MAX) + return 0; + + if (pt->minlen) + minlen = pt->minlen; + else if (pt->type != LIBBPF_NLA_UNSPEC) + minlen = nla_attr_minlen[pt->type]; + + if (libbpf_nla_len(nla) < minlen) + return -1; + + if (pt->maxlen && libbpf_nla_len(nla) > pt->maxlen) + return -1; + + if (pt->type == LIBBPF_NLA_STRING) { + char *data = libbpf_nla_data(nla); + + if (data[libbpf_nla_len(nla) - 1] != '\0') + return -1; + } + + return 0; +} + +static inline int nlmsg_len(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_len - NLMSG_HDRLEN; +} + +/** + * Create attribute index based on a stream of attributes. + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg head Head of attribute stream. + * @arg len Length of attribute stream. + * @arg policy Attribute validation policy. 
+ * + * Iterates over the stream of attributes and stores a pointer to each + * attribute in the index array using the attribute type as index to + * the array. Attribute with a type greater than the maximum type + * specified will be silently ignored in order to maintain backwards + * compatibility. If \a policy is not NULL, the attribute will be + * validated using the specified policy. + * + * @see nla_validate + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy) +{ + struct nlattr *nla; + int rem, err; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + libbpf_nla_for_each_attr(nla, head, len, rem) { + int type = nla_type(nla); + + if (type > maxtype) + continue; + + if (policy) { + err = validate_nla(nla, maxtype, policy); + if (err < 0) + goto errout; + } + + if (tb[type]) + pr_warn("Attribute of type %#x found multiple times in message, " + "previous attribute is being ignored.\n", type); + + tb[type] = nla; + } + + err = 0; +errout: + return err; +} + +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to libbpf_nla_parse(). + * + * @see libbpf_nla_parse + * @return 0 on success or a negative error code. + */ +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy) +{ + return libbpf_nla_parse(tb, maxtype, libbpf_nla_data(nla), + libbpf_nla_len(nla), policy); +} + +/* dump netlink extended ack error message */ +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) +{ + struct libbpf_nla_policy extack_policy[NLMSGERR_ATTR_MAX + 1] = { + [NLMSGERR_ATTR_MSG] = { .type = LIBBPF_NLA_STRING }, + [NLMSGERR_ATTR_OFFS] = { .type = LIBBPF_NLA_U32 }, + }; + struct nlattr *tb[NLMSGERR_ATTR_MAX + 1], *attr; + struct nlmsgerr *err; + char *errmsg = NULL; + int hlen, alen; + + /* no TLVs, nothing to do here */ + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return 0; + + err = (struct nlmsgerr *)NLMSG_DATA(nlh); + hlen = sizeof(*err); + + /* if NLM_F_CAPPED is set then the inner err msg was capped */ + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + hlen += nlmsg_len(&err->msg); + + attr = (struct nlattr *) ((void *) err + hlen); + alen = (void *)nlh + nlh->nlmsg_len - (void *)attr; + + if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, + extack_policy) != 0) { + pr_warn("Failed to parse extended error attributes\n"); + return 0; + } + + if (tb[NLMSGERR_ATTR_MSG]) + errmsg = (char *) libbpf_nla_data(tb[NLMSGERR_ATTR_MSG]); + + pr_warn("Kernel error message: %s\n", errmsg); + + return 0; +} diff --git a/third_party/libbpf/src/nlattr.h b/third_party/libbpf/src/nlattr.h new file mode 100644 index 0000000..d92d1c1 --- /dev/null +++ b/third_party/libbpf/src/nlattr.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * NETLINK Netlink attributes + * + * Copyright (c) 2003-2013 Thomas Graf + */ + +#ifndef __LIBBPF_NLATTR_H +#define __LIBBPF_NLATTR_H + +#include +#include +#include +#include +#include +#include + +/* avoid multiple definition of netlink features */ +#define __LINUX_NETLINK_H + +/** + * Standard attribute types to specify validation policy + */ +enum { + LIBBPF_NLA_UNSPEC, /**< 
Unspecified type, binary data chunk */ + LIBBPF_NLA_U8, /**< 8 bit integer */ + LIBBPF_NLA_U16, /**< 16 bit integer */ + LIBBPF_NLA_U32, /**< 32 bit integer */ + LIBBPF_NLA_U64, /**< 64 bit integer */ + LIBBPF_NLA_STRING, /**< NUL terminated character string */ + LIBBPF_NLA_FLAG, /**< Flag */ + LIBBPF_NLA_MSECS, /**< Micro seconds (64bit) */ + LIBBPF_NLA_NESTED, /**< Nested attributes */ + __LIBBPF_NLA_TYPE_MAX, +}; + +#define LIBBPF_NLA_TYPE_MAX (__LIBBPF_NLA_TYPE_MAX - 1) + +/** + * @ingroup attr + * Attribute validation policy. + * + * See section @core_doc{core_attr_parse,Attribute Parsing} for more details. + */ +struct libbpf_nla_policy { + /** Type of attribute or LIBBPF_NLA_UNSPEC */ + uint16_t type; + + /** Minimal length of payload required */ + uint16_t minlen; + + /** Maximal length of payload allowed */ + uint16_t maxlen; +}; + +struct libbpf_nla_req { + struct nlmsghdr nh; + union { + struct ifinfomsg ifinfo; + struct tcmsg tc; + struct genlmsghdr gnl; + }; + char buf[128]; +}; + +/** + * @ingroup attr + * Iterate over a stream of attributes + * @arg pos loop counter, set to current attribute + * @arg head head of attribute stream + * @arg len length of attribute stream + * @arg rem initialized to len, holds bytes currently remaining in stream + */ +#define libbpf_nla_for_each_attr(pos, head, len, rem) \ + for (pos = head, rem = len; \ + nla_ok(pos, rem); \ + pos = nla_next(pos, &(rem))) + +/** + * libbpf_nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *libbpf_nla_data(const struct nlattr *nla) +{ + return (void *)nla + NLA_HDRLEN; +} + +static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)libbpf_nla_data(nla); +} + +static inline uint16_t libbpf_nla_getattr_u16(const struct nlattr *nla) +{ + return *(uint16_t *)libbpf_nla_data(nla); +} + +static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)libbpf_nla_data(nla); +} + +static inline uint64_t libbpf_nla_getattr_u64(const struct nlattr *nla) +{ + return *(uint64_t *)libbpf_nla_data(nla); +} + +static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)libbpf_nla_data(nla); +} + +/** + * libbpf_nla_len - length of payload + * @nla: netlink attribute + */ +static inline int libbpf_nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int libbpf_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, + int len, struct libbpf_nla_policy *policy); +int libbpf_nla_parse_nested(struct nlattr *tb[], int maxtype, + struct nlattr *nla, + struct libbpf_nla_policy *policy); + +int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh); + +static inline struct nlattr *nla_data(struct nlattr *nla) +{ + return (struct nlattr *)((void *)nla + NLA_HDRLEN); +} + +static inline struct nlattr *req_tail(struct libbpf_nla_req *req) +{ + return (struct nlattr *)((void *)req + NLMSG_ALIGN(req->nh.nlmsg_len)); +} + +static inline int nlattr_add(struct libbpf_nla_req *req, int type, + const void *data, int len) +{ + struct nlattr *nla; + + if (NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(NLA_HDRLEN + len) > sizeof(*req)) + return -EMSGSIZE; + if (!!data != !!len) + return -EINVAL; + + nla = req_tail(req); + nla->nla_type = type; + nla->nla_len = NLA_HDRLEN + len; + if (data) + memcpy(nla_data(nla), data, len); + req->nh.nlmsg_len = NLMSG_ALIGN(req->nh.nlmsg_len) + NLA_ALIGN(nla->nla_len); + return 0; +} + +static inline struct nlattr *nlattr_begin_nested(struct 
libbpf_nla_req *req, int type) +{ + struct nlattr *tail; + + tail = req_tail(req); + if (nlattr_add(req, type | NLA_F_NESTED, NULL, 0)) + return NULL; + return tail; +} + +static inline void nlattr_end_nested(struct libbpf_nla_req *req, + struct nlattr *tail) +{ + tail->nla_len = (void *)req_tail(req) - (void *)tail; +} + +#endif /* __LIBBPF_NLATTR_H */ diff --git a/third_party/libbpf/src/relo_core.c b/third_party/libbpf/src/relo_core.c new file mode 100644 index 0000000..a26b2f5 --- /dev/null +++ b/third_party/libbpf/src/relo_core.c @@ -0,0 +1,1687 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Facebook */ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include "relo_core.h" + +static const char *btf_kind_str(const struct btf_type *t) +{ + return btf_type_str(t); +} + +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) +{ + return btf_type_skip_modifiers(btf, id, res_id); +} + +static const char *btf__name_by_offset(const struct btf *btf, u32 offset) +{ + return btf_name_by_offset(btf, offset); +} + +static s64 btf__resolve_size(const struct btf *btf, u32 type_id) +{ + const struct btf_type *t; + int size; + + t = btf_type_by_id(btf, type_id); + t = btf_resolve_size(btf, t, &size); + if (IS_ERR(t)) + return PTR_ERR(t); + return size; +} + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +#undef pr_warn +#undef pr_info +#undef pr_debug +#define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_debug(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define libbpf_print(level, fmt, ...) 
bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) +#else +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "bpf.h" +#include "btf.h" +#include "str_error.h" +#include "libbpf_internal.h" +#endif + +static bool is_flex_arr(const struct btf *btf, + const struct bpf_core_accessor *acc, + const struct btf_array *arr) +{ + const struct btf_type *t; + + /* not a flexible array, if not inside a struct or has non-zero size */ + if (!acc->name || arr->nelems > 0) + return false; + + /* has to be the last member of enclosing struct */ + t = btf_type_by_id(btf, acc->type_id); + return acc->idx == btf_vlen(t) - 1; +} + +static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: return "byte_off"; + case BPF_CORE_FIELD_BYTE_SIZE: return "byte_sz"; + case BPF_CORE_FIELD_EXISTS: return "field_exists"; + case BPF_CORE_FIELD_SIGNED: return "signed"; + case BPF_CORE_FIELD_LSHIFT_U64: return "lshift_u64"; + case BPF_CORE_FIELD_RSHIFT_U64: return "rshift_u64"; + case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; + case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; + case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; + case BPF_CORE_TYPE_SIZE: return "type_size"; + case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; + case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; + default: return "unknown"; + } +} + +static bool core_relo_is_field_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + case BPF_CORE_FIELD_BYTE_SIZE: + case BPF_CORE_FIELD_EXISTS: + case BPF_CORE_FIELD_SIGNED: + case BPF_CORE_FIELD_LSHIFT_U64: + case BPF_CORE_FIELD_RSHIFT_U64: + return true; + default: + return false; + } +} + +static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_TYPE_ID_LOCAL: + case BPF_CORE_TYPE_ID_TARGET: + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + case BPF_CORE_TYPE_SIZE: + return true; + default: + return false; + } +} + +static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) +{ + switch (kind) { + case BPF_CORE_ENUMVAL_EXISTS: + case BPF_CORE_ENUMVAL_VALUE: + return true; + default: + return false; + } +} + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = 
targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + +/* + * Turn bpf_core_relo into a low- and high-level spec representation, + * validating correctness along the way, as well as calculating resulting + * field bit offset, specified by accessor string. Low-level spec captures + * every single level of nestedness, including traversing anonymous + * struct/union members. High-level one only captures semantically meaningful + * "turning points": named fields and array indicies. + * E.g., for this case: + * + * struct sample { + * int __unimportant; + * struct { + * int __1; + * int __2; + * int a[7]; + * }; + * }; + * + * struct sample *s = ...; + * + * int x = &s->a[3]; // access string = '0:1:2:3' + * + * Low-level spec has 1:1 mapping with each element of access string (it's + * just a parsed access string representation): [0, 1, 2, 3]. + * + * High-level spec will capture only 3 points: + * - initial zero-index access by pointer (&s->... is the same as &s[0]...); + * - field 'a' access (corresponds to '2' in low-level spec); + * - array element #3 access (corresponds to '3' in low-level spec). + * + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, + * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their + * spec and raw_spec are kept empty. + * + * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access + * string to specify enumerator's value index that need to be relocated. 
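+ *
+ * For illustration only (assuming 4-byte 'int' and no extra padding in
+ * 'struct sample' above): parsing access string '0:1:2:3' yields
+ * raw_spec = [0, 1, 2, 3] (raw_len = 4), a high-level spec of 3 accessors
+ * (initial [0] dereference, field 'a', array index 3), and
+ * bit_offset = (4 + 8 + 3 * 4) * 8 = 192, i.e. byte offset 24 within *s.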
+ */ +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec) +{ + int access_idx, parsed_len, i; + struct bpf_core_accessor *acc; + const struct btf_type *t; + const char *name, *spec_str; + __u32 id, name_off; + __s64 sz; + + spec_str = btf__name_by_offset(btf, relo->access_str_off); + if (str_is_empty(spec_str) || *spec_str == ':') + return -EINVAL; + + memset(spec, 0, sizeof(*spec)); + spec->btf = btf; + spec->root_type_id = relo->type_id; + spec->relo_kind = relo->kind; + + /* type-based relocations don't have a field access string */ + if (core_relo_is_type_based(relo->kind)) { + if (strcmp(spec_str, "0")) + return -EINVAL; + return 0; + } + + /* parse spec_str="0:1:2:3:4" into array raw_spec=[0, 1, 2, 3, 4] */ + while (*spec_str) { + if (*spec_str == ':') + ++spec_str; + if (sscanf(spec_str, "%d%n", &access_idx, &parsed_len) != 1) + return -EINVAL; + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + spec_str += parsed_len; + spec->raw_spec[spec->raw_len++] = access_idx; + } + + if (spec->raw_len == 0) + return -EINVAL; + + t = skip_mods_and_typedefs(btf, relo->type_id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[0]; + acc = &spec->spec[0]; + acc->type_id = id; + acc->idx = access_idx; + spec->len++; + + if (core_relo_is_enumval_based(relo->kind)) { + if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + return -EINVAL; + + /* record enumerator name in a first accessor */ + name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off + : btf_enum64(t)[access_idx].name_off; + acc->name = btf__name_by_offset(btf, name_off); + return 0; + } + + if (!core_relo_is_field_based(relo->kind)) + return -EINVAL; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset = access_idx * sz * 8; + + for (i = 1; i < spec->raw_len; i++) { + t = skip_mods_and_typedefs(btf, id, &id); + if (!t) + return -EINVAL; + + access_idx = spec->raw_spec[i]; + acc = &spec->spec[spec->len]; + + if (btf_is_composite(t)) { + const struct btf_member *m; + __u32 bit_offset; + + if (access_idx >= btf_vlen(t)) + return -EINVAL; + + bit_offset = btf_member_bit_offset(t, access_idx); + spec->bit_offset += bit_offset; + + m = btf_members(t) + access_idx; + if (m->name_off) { + name = btf__name_by_offset(btf, m->name_off); + if (str_is_empty(name)) + return -EINVAL; + + acc->type_id = id; + acc->idx = access_idx; + acc->name = name; + spec->len++; + } + + id = m->type; + } else if (btf_is_array(t)) { + const struct btf_array *a = btf_array(t); + bool flex; + + t = skip_mods_and_typedefs(btf, a->type, &id); + if (!t) + return -EINVAL; + + flex = is_flex_arr(btf, acc - 1, a); + if (!flex && access_idx >= a->nelems) + return -EINVAL; + + spec->spec[spec->len].type_id = id; + spec->spec[spec->len].idx = access_idx; + spec->len++; + + sz = btf__resolve_size(btf, id); + if (sz < 0) + return sz; + spec->bit_offset += access_idx * sz * 8; + } else { + pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + prog_name, relo->type_id, spec_str, i, id, btf_kind_str(t)); + return -EINVAL; + } + } + + return 0; +} + +/* Check two types for compatibility for the purpose of field access + * relocation. 
const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +static int bpf_core_fields_are_compat(const struct btf *local_btf, + __u32 local_id, + const struct btf *targ_btf, + __u32 targ_id) +{ + const struct btf_type *local_type, *targ_type; + +recur: + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_is_composite(local_type) && btf_is_composite(targ_type)) + return 1; + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_PTR: + case BTF_KIND_FLOAT: + return 1; + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + case BTF_KIND_ENUM: { + const char *local_name, *targ_name; + size_t local_len, targ_len; + + local_name = btf__name_by_offset(local_btf, + local_type->name_off); + targ_name = btf__name_by_offset(targ_btf, targ_type->name_off); + local_len = bpf_core_essential_name_len(local_name); + targ_len = bpf_core_essential_name_len(targ_name); + /* one of them is anonymous or both w/ same flavor-less names */ + return local_len == 0 || targ_len == 0 || + (local_len == targ_len && + strncmp(local_name, targ_name, local_len) == 0); + } + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && + btf_int_offset(targ_type) == 0; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + default: + return 0; + } +} + +/* + * Given single high-level named field accessor in local type, find + * corresponding high-level accessor for a target type. Along the way, + * maintain low-level spec for target as well. Also keep updating target + * bit offset. + * + * Searching is performed through recursive exhaustive enumeration of all + * fields of a struct/union. If there are any anonymous (embedded) + * structs/unions, they are recursively searched as well. If field with + * desired name is found, check compatibility between local and target types, + * before returning result. + * + * 1 is returned, if field is found. + * 0 is returned if no compatible field is found. + * <0 is returned on error. 
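+ *
+ * While descending through anonymous (embedded) members, only the
+ * low-level raw_spec and the accumulated bit offset grow; the high-level
+ * spec gains exactly one accessor, for the named field, once it is
+ * matched and its type verified via bpf_core_fields_are_compat().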
+ */ +static int bpf_core_match_member(const struct btf *local_btf, + const struct bpf_core_accessor *local_acc, + const struct btf *targ_btf, + __u32 targ_id, + struct bpf_core_spec *spec, + __u32 *next_targ_id) +{ + const struct btf_type *local_type, *targ_type; + const struct btf_member *local_member, *m; + const char *local_name, *targ_name; + __u32 local_id; + int i, n, found; + + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!targ_type) + return -EINVAL; + if (!btf_is_composite(targ_type)) + return 0; + + local_id = local_acc->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_member = btf_members(local_type) + local_acc->idx; + local_name = btf__name_by_offset(local_btf, local_member->name_off); + + n = btf_vlen(targ_type); + m = btf_members(targ_type); + for (i = 0; i < n; i++, m++) { + __u32 bit_offset; + + bit_offset = btf_member_bit_offset(targ_type, i); + + /* too deep struct/union/array nesting */ + if (spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + /* speculate this member will be the good one */ + spec->bit_offset += bit_offset; + spec->raw_spec[spec->raw_len++] = i; + + targ_name = btf__name_by_offset(targ_btf, m->name_off); + if (str_is_empty(targ_name)) { + /* embedded struct/union, we need to go deeper */ + found = bpf_core_match_member(local_btf, local_acc, + targ_btf, m->type, + spec, next_targ_id); + if (found) /* either found or error */ + return found; + } else if (strcmp(local_name, targ_name) == 0) { + /* matching named field */ + struct bpf_core_accessor *targ_acc; + + targ_acc = &spec->spec[spec->len++]; + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + + *next_targ_id = m->type; + found = bpf_core_fields_are_compat(local_btf, + local_member->type, + targ_btf, m->type); + if (!found) + spec->len--; /* pop accessor */ + return found; + } + /* member turned out not to be what we looked for */ + spec->bit_offset -= bit_offset; + spec->raw_len--; + } + + return 0; +} + +/* + * Try to match local spec to a target type and, if successful, produce full + * target spec (high-level, low-level + bit offset). 
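+ *
+ * Returns 1 if the target matches, 0 if it does not, and a negative error
+ * code on failure. Type-based relocations delegate directly to
+ * bpf_core_types_match() or bpf_core_types_are_compat() and produce no
+ * field accessors in the target spec.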
+ */ +static int bpf_core_spec_match(struct bpf_core_spec *local_spec, + const struct btf *targ_btf, __u32 targ_id, + struct bpf_core_spec *targ_spec) +{ + const struct btf_type *targ_type; + const struct bpf_core_accessor *local_acc; + struct bpf_core_accessor *targ_acc; + int i, sz, matched; + __u32 name_off; + + memset(targ_spec, 0, sizeof(*targ_spec)); + targ_spec->btf = targ_btf; + targ_spec->root_type_id = targ_id; + targ_spec->relo_kind = local_spec->relo_kind; + + if (core_relo_is_type_based(local_spec->relo_kind)) { + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + } + + local_acc = &local_spec->spec[0]; + targ_acc = &targ_spec->spec[0]; + + if (core_relo_is_enumval_based(local_spec->relo_kind)) { + size_t local_essent_len, targ_essent_len; + const char *targ_name; + + /* has to resolve to an enum */ + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); + if (!btf_is_any_enum(targ_type)) + return 0; + + local_essent_len = bpf_core_essential_name_len(local_acc->name); + + for (i = 0; i < btf_vlen(targ_type); i++) { + if (btf_is_enum(targ_type)) + name_off = btf_enum(targ_type)[i].name_off; + else + name_off = btf_enum64(targ_type)[i].name_off; + + targ_name = btf__name_by_offset(targ_spec->btf, name_off); + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != local_essent_len) + continue; + if (strncmp(local_acc->name, targ_name, local_essent_len) == 0) { + targ_acc->type_id = targ_id; + targ_acc->idx = i; + targ_acc->name = targ_name; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + return 1; + } + } + return 0; + } + + if (!core_relo_is_field_based(local_spec->relo_kind)) + return -EINVAL; + + for (i = 0; i < local_spec->len; i++, local_acc++, targ_acc++) { + targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, + &targ_id); + if (!targ_type) + return -EINVAL; + + if (local_acc->name) { + matched = bpf_core_match_member(local_spec->btf, + local_acc, + targ_btf, targ_id, + targ_spec, &targ_id); + if (matched <= 0) + return matched; + } else { + /* for i=0, targ_id is already treated as array element + * type (because it's the original struct), for others + * we should find array element type first + */ + if (i > 0) { + const struct btf_array *a; + bool flex; + + if (!btf_is_array(targ_type)) + return 0; + + a = btf_array(targ_type); + flex = is_flex_arr(targ_btf, targ_acc - 1, a); + if (!flex && local_acc->idx >= a->nelems) + return 0; + if (!skip_mods_and_typedefs(targ_btf, a->type, + &targ_id)) + return -EINVAL; + } + + /* too deep struct/union/array nesting */ + if (targ_spec->raw_len == BPF_CORE_SPEC_MAX_LEN) + return -E2BIG; + + targ_acc->type_id = targ_id; + targ_acc->idx = local_acc->idx; + targ_acc->name = NULL; + targ_spec->len++; + targ_spec->raw_spec[targ_spec->raw_len] = targ_acc->idx; + targ_spec->raw_len++; + + sz = btf__resolve_size(targ_btf, targ_id); + if (sz < 0) + return sz; + targ_spec->bit_offset += local_acc->idx * sz * 8; + } + } + + return 1; +} + +static int bpf_core_calc_field_relo(const char *prog_name, + const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, __u32 *field_sz, __u32 *type_id, + bool *validate) +{ + const struct bpf_core_accessor *acc; + const struct btf_type *t; + __u32 byte_off, byte_sz, 
bit_off, bit_sz, field_type_id; + const struct btf_member *m; + const struct btf_type *mt; + bool bitfield; + __s64 sz; + + *field_sz = 0; + + if (relo->kind == BPF_CORE_FIELD_EXISTS) { + *val = spec ? 1 : 0; + return 0; + } + + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + + acc = &spec->spec[spec->len - 1]; + t = btf_type_by_id(spec->btf, acc->type_id); + + /* a[n] accessor needs special handling */ + if (!acc->name) { + if (relo->kind == BPF_CORE_FIELD_BYTE_OFFSET) { + *val = spec->bit_offset / 8; + /* remember field size for load/store mem size */ + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *field_sz = sz; + *type_id = acc->type_id; + } else if (relo->kind == BPF_CORE_FIELD_BYTE_SIZE) { + sz = btf__resolve_size(spec->btf, acc->type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + } else { + pr_warn("prog '%s': relo %d at insn #%d can't be applied to array access\n", + prog_name, relo->kind, relo->insn_off / 8); + return -EINVAL; + } + if (validate) + *validate = true; + return 0; + } + + m = btf_members(t) + acc->idx; + mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id); + bit_off = spec->bit_offset; + bit_sz = btf_member_bitfield_size(t, acc->idx); + + bitfield = bit_sz > 0; + if (bitfield) { + byte_sz = mt->size; + byte_off = bit_off / 8 / byte_sz * byte_sz; + /* figure out smallest int size necessary for bitfield load */ + while (bit_off + bit_sz - byte_off * 8 > byte_sz * 8) { + if (byte_sz >= 8) { + /* bitfield can't be read with 64-bit read */ + pr_warn("prog '%s': relo %d at insn #%d can't be satisfied for bitfield\n", + prog_name, relo->kind, relo->insn_off / 8); + return -E2BIG; + } + byte_sz *= 2; + byte_off = bit_off / 8 / byte_sz * byte_sz; + } + } else { + sz = btf__resolve_size(spec->btf, field_type_id); + if (sz < 0) + return -EINVAL; + byte_sz = sz; + byte_off = spec->bit_offset / 8; + bit_sz = byte_sz * 8; + } + + /* for bitfields, all the relocatable aspects are ambiguous and we + * might disagree with compiler, so turn off validation of expected + * value, except for signedness + */ + if (validate) + *validate = !bitfield; + + switch (relo->kind) { + case BPF_CORE_FIELD_BYTE_OFFSET: + *val = byte_off; + if (!bitfield) { + *field_sz = byte_sz; + *type_id = field_type_id; + } + break; + case BPF_CORE_FIELD_BYTE_SIZE: + *val = byte_sz; + break; + case BPF_CORE_FIELD_SIGNED: + *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || + (btf_int_encoding(mt) & BTF_INT_SIGNED); + if (validate) + *validate = true; /* signedness is never ambiguous */ + break; + case BPF_CORE_FIELD_LSHIFT_U64: +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *val = 64 - (bit_off + bit_sz - byte_off * 8); +#else + *val = (8 - byte_sz) * 8 + (bit_off - byte_off * 8); +#endif + break; + case BPF_CORE_FIELD_RSHIFT_U64: + *val = 64 - bit_sz; + if (validate) + *validate = true; /* right shift is never ambiguous */ + break; + case BPF_CORE_FIELD_EXISTS: + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val, bool *validate) +{ + __s64 sz; + + /* by default, always check expected value in bpf_insn */ + if (validate) + *validate = true; + + /* type-based relos return zero when target type is not found */ + if (!spec) { + *val = 0; + return 0; + } + + switch (relo->kind) { + case BPF_CORE_TYPE_ID_TARGET: + *val = spec->root_type_id; + /* type ID, embedded in bpf_insn, might change during linking, + * so 
enforcing it is pointless + */ + if (validate) + *validate = false; + break; + case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: + *val = 1; + break; + case BPF_CORE_TYPE_SIZE: + sz = btf__resolve_size(spec->btf, spec->root_type_id); + if (sz < 0) + return -EINVAL; + *val = sz; + break; + case BPF_CORE_TYPE_ID_LOCAL: + /* BPF_CORE_TYPE_ID_LOCAL is handled specially and shouldn't get here */ + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, + const struct bpf_core_spec *spec, + __u64 *val) +{ + const struct btf_type *t; + + switch (relo->kind) { + case BPF_CORE_ENUMVAL_EXISTS: + *val = spec ? 1 : 0; + break; + case BPF_CORE_ENUMVAL_VALUE: + if (!spec) + return -EUCLEAN; /* request instruction poisoning */ + t = btf_type_by_id(spec->btf, spec->spec[0].type_id); + if (btf_is_enum(t)) + *val = btf_enum(t)[spec->spec[0].idx].val; + else + *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +/* Calculate original and target relocation values, given local and target + * specs and relocation kind. These values are calculated for each candidate. + * If there are multiple candidates, resulting values should all be consistent + * with each other. Otherwise, libbpf will refuse to proceed due to ambiguity. + * If instruction has to be poisoned, *poison will be set to true. + */ +static int bpf_core_calc_relo(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct bpf_core_spec *local_spec, + const struct bpf_core_spec *targ_spec, + struct bpf_core_relo_res *res) +{ + int err = -EOPNOTSUPP; + + res->orig_val = 0; + res->new_val = 0; + res->poison = false; + res->validate = true; + res->fail_memsz_adjust = false; + res->orig_sz = res->new_sz = 0; + res->orig_type_id = res->new_type_id = 0; + + if (core_relo_is_field_based(relo->kind)) { + err = bpf_core_calc_field_relo(prog_name, relo, local_spec, + &res->orig_val, &res->orig_sz, + &res->orig_type_id, &res->validate); + err = err ?: bpf_core_calc_field_relo(prog_name, relo, targ_spec, + &res->new_val, &res->new_sz, + &res->new_type_id, NULL); + if (err) + goto done; + /* Validate if it's safe to adjust load/store memory size. + * Adjustments are performed only if original and new memory + * sizes differ. + */ + res->fail_memsz_adjust = false; + if (res->orig_sz != res->new_sz) { + const struct btf_type *orig_t, *new_t; + + orig_t = btf_type_by_id(local_spec->btf, res->orig_type_id); + new_t = btf_type_by_id(targ_spec->btf, res->new_type_id); + + /* There are two use cases in which it's safe to + * adjust load/store's mem size: + * - reading a 32-bit kernel pointer, while on BPF + * size pointers are always 64-bit; in this case + * it's safe to "downsize" instruction size due to + * pointer being treated as unsigned integer with + * zero-extended upper 32-bits; + * - reading unsigned integers, again due to + * zero-extension is preserving the value correctly. + * + * In all other cases it's incorrect to attempt to + * load/store field because read value will be + * incorrect, so we poison relocated instruction. 
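+ * The size adjustment itself, when deemed safe, is applied later in
+ * bpf_core_patch_insn() by rewriting the BPF_SIZE bits of the
+ * LDX/ST/STX opcode to match the target field size.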
+ */ + if (btf_is_ptr(orig_t) && btf_is_ptr(new_t)) + goto done; + if (btf_is_int(orig_t) && btf_is_int(new_t) && + btf_int_encoding(orig_t) != BTF_INT_SIGNED && + btf_int_encoding(new_t) != BTF_INT_SIGNED) + goto done; + + /* mark as invalid mem size adjustment, but this will + * only be checked for LDX/STX/ST insns + */ + res->fail_memsz_adjust = true; + } + } else if (core_relo_is_type_based(relo->kind)) { + err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val, &res->validate); + err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val, NULL); + } else if (core_relo_is_enumval_based(relo->kind)) { + err = bpf_core_calc_enumval_relo(relo, local_spec, &res->orig_val); + err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val); + } + +done: + if (err == -EUCLEAN) { + /* EUCLEAN is used to signal instruction poisoning request */ + res->poison = true; + err = 0; + } else if (err == -EOPNOTSUPP) { + /* EOPNOTSUPP means unknown/unsupported relocation */ + pr_warn("prog '%s': relo #%d: unrecognized CO-RE relocation %s (%d) at insn #%d\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), + relo->kind, relo->insn_off / 8); + } + + return err; +} + +/* + * Turn instruction for which CO_RE relocation failed into invalid one with + * distinct signature. + */ +static void bpf_core_poison_insn(const char *prog_name, int relo_idx, + int insn_idx, struct bpf_insn *insn) +{ + pr_debug("prog '%s': relo #%d: substituting insn #%d w/ invalid insn\n", + prog_name, relo_idx, insn_idx); + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with the following message: + * invalid func unknown#195896080 + */ + insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */ +} + +static int insn_bpf_size_to_bytes(struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_DW: return 8; + case BPF_W: return 4; + case BPF_H: return 2; + case BPF_B: return 1; + default: return -1; + } +} + +static int insn_bytes_to_bpf_size(__u32 sz) +{ + switch (sz) { + case 8: return BPF_DW; + case 4: return BPF_W; + case 2: return BPF_H; + case 1: return BPF_B; + default: return -1; + } +} + +/* + * Patch relocatable BPF instruction. + * + * Patched value is determined by relocation kind and target specification. + * For existence relocations target spec will be NULL if field/type is not found. + * Expected insn->imm value is determined using relocation kind and local + * spec, and is checked before patching instruction. If actual insn->imm value + * is wrong, bail out with error. + * + * Currently supported classes of BPF instruction are: + * 1. rX = (assignment with immediate operand); + * 2. rX += (arithmetic operations with immediate operand); + * 3. rX = (load with 64-bit immediate value); + * 4. rX = *(T *)(rY + ), where T is one of {u8, u16, u32, u64}; + * 5. *(T *)(rX + ) = rY, where T is one of {u8, u16, u32, u64}; + * 6. *(T *)(rX + ) = , where T is one of {u8, u16, u32, u64}. 
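+ *
+ * Depending on the instruction class, the relocated value is written into
+ * insn->imm (ALU/ALU64 with an immediate operand), insn->off (LDX/ST/STX,
+ * possibly together with the BPF_SIZE bits of the opcode when the field
+ * size changed), or the 64-bit immediate split across both halves of a
+ * ldimm64 instruction.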
+ */ +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res) +{ + __u64 orig_val, new_val; + __u8 class; + + class = BPF_CLASS(insn->code); + + if (res->poison) { +poison: + /* poison second part of ldimm64 to avoid confusing error from + * verifier about "unknown opcode 00" + */ + if (is_ldimm64_insn(insn)) + bpf_core_poison_insn(prog_name, relo_idx, insn_idx + 1, insn + 1); + bpf_core_poison_insn(prog_name, relo_idx, insn_idx, insn); + return 0; + } + + orig_val = res->orig_val; + new_val = res->new_val; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + if (res->validate && insn->imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, insn->imm, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + orig_val = insn->imm; + insn->imm = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)orig_val, (unsigned long long)new_val); + break; + case BPF_LDX: + case BPF_ST: + case BPF_STX: + if (res->validate && insn->off != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, + (unsigned long long)new_val); + return -EINVAL; + } + if (new_val > SHRT_MAX) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)new_val); + return -ERANGE; + } + if (res->fail_memsz_adjust) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. 
" + "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n", + prog_name, relo_idx, insn_idx); + goto poison; + } + + orig_val = insn->off; + insn->off = new_val; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, + (unsigned long long)new_val); + + if (res->new_sz != res->orig_sz) { + int insn_bytes_sz, insn_bpf_sz; + + insn_bytes_sz = insn_bpf_size_to_bytes(insn); + if (insn_bytes_sz != res->orig_sz) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n", + prog_name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz); + return -EINVAL; + } + + insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz); + if (insn_bpf_sz < 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n", + prog_name, relo_idx, insn_idx, res->new_sz); + return -EINVAL; + } + + insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n", + prog_name, relo_idx, insn_idx, res->orig_sz, res->new_sz); + } + break; + case BPF_LD: { + __u64 imm; + + if (!is_ldimm64_insn(insn) || + insn[0].src_reg != 0 || insn[0].off != 0 || + insn[1].code != 0 || insn[1].dst_reg != 0 || + insn[1].src_reg != 0 || insn[1].off != 0) { + pr_warn("prog '%s': relo #%d: insn #%d (LDIMM64) has unexpected form\n", + prog_name, relo_idx, insn_idx); + return -EINVAL; + } + + imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); + if (res->validate && imm != orig_val) { + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", + prog_name, relo_idx, + insn_idx, (unsigned long long)imm, + (unsigned long long)orig_val, (unsigned long long)new_val); + return -EINVAL; + } + + insn[0].imm = new_val; + insn[1].imm = new_val >> 32; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", + prog_name, relo_idx, insn_idx, + (unsigned long long)imm, (unsigned long long)new_val); + break; + } + default: + pr_warn("prog '%s': relo #%d: trying to relocate unrecognized insn #%d, code:0x%x, src:0x%x, dst:0x%x, off:0x%x, imm:0x%x\n", + prog_name, relo_idx, insn_idx, insn->code, + insn->src_reg, insn->dst_reg, insn->off, insn->imm); + return -EINVAL; + } + + return 0; +} + +/* Output spec definition in the format: + * [] () + => @, + * where is a C-syntax view of recorded field access, e.g.: x.a[3].b + */ +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) +{ + const struct btf_type *t; + const char *s; + __u32 type_id; + int i, len = 0; + +#define append_buf(fmt, args...) \ + ({ \ + int r; \ + r = snprintf(buf, buf_sz, fmt, ##args); \ + len += r; \ + if (r >= buf_sz) \ + r = buf_sz; \ + buf += r; \ + buf_sz -= r; \ + }) + + type_id = spec->root_type_id; + t = btf_type_by_id(spec->btf, type_id); + s = btf__name_by_offset(spec->btf, t->name_off); + + append_buf("<%s> [%u] %s %s", + core_relo_kind_str(spec->relo_kind), + type_id, btf_kind_str(t), str_is_empty(s) ? "" : s); + + if (core_relo_is_type_based(spec->relo_kind)) + return len; + + if (core_relo_is_enumval_based(spec->relo_kind)) { + t = skip_mods_and_typedefs(spec->btf, type_id, NULL); + if (btf_is_enum(t)) { + const struct btf_enum *e; + const char *fmt_str; + + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? 
"::%s = %d" : "::%s = %u"; + append_buf(fmt_str, s, e->val); + } else { + const struct btf_enum64 *e; + const char *fmt_str; + + e = btf_enum64(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; + append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); + } + return len; + } + + if (core_relo_is_field_based(spec->relo_kind)) { + for (i = 0; i < spec->len; i++) { + if (spec->spec[i].name) + append_buf(".%s", spec->spec[i].name); + else if (i > 0 || spec->spec[i].idx > 0) + append_buf("[%u]", spec->spec[i].idx); + } + + append_buf(" ("); + for (i = 0; i < spec->raw_len; i++) + append_buf("%s%d", i == 0 ? "" : ":", spec->raw_spec[i]); + + if (spec->bit_offset % 8) + append_buf(" @ offset %u.%u)", spec->bit_offset / 8, spec->bit_offset % 8); + else + append_buf(" @ offset %u)", spec->bit_offset / 8); + return len; + } + + return len; +#undef append_buf +} + +/* + * Calculate CO-RE relocation target result. + * + * The outline and important points of the algorithm: + * 1. For given local type, find corresponding candidate target types. + * Candidate type is a type with the same "essential" name, ignoring + * everything after last triple underscore (___). E.g., `sample`, + * `sample___flavor_one`, `sample___flavor_another_one`, are all candidates + * for each other. Names with triple underscore are referred to as + * "flavors" and are useful, among other things, to allow to + * specify/support incompatible variations of the same kernel struct, which + * might differ between different kernel versions and/or build + * configurations. + * + * N.B. Struct "flavors" could be generated by bpftool's BTF-to-C + * converter, when deduplicated BTF of a kernel still contains more than + * one different types with the same name. In that case, ___2, ___3, etc + * are appended starting from second name conflict. But start flavors are + * also useful to be defined "locally", in BPF program, to extract same + * data from incompatible changes between different kernel + * versions/configurations. For instance, to handle field renames between + * kernel versions, one can use two flavors of the struct name with the + * same common name and use conditional relocations to extract that field, + * depending on target kernel version. + * 2. For each candidate type, try to match local specification to this + * candidate target type. Matching involves finding corresponding + * high-level spec accessors, meaning that all named fields should match, + * as well as all array accesses should be within the actual bounds. Also, + * types should be compatible (see bpf_core_fields_are_compat for details). + * 3. It is supported and expected that there might be multiple flavors + * matching the spec. As long as all the specs resolve to the same set of + * offsets across all candidates, there is no error. If there is any + * ambiguity, CO-RE relocation will fail. This is necessary to accommodate + * imperfection of BTF deduplication, which can cause slight duplication of + * the same BTF type, if some directly or indirectly referenced (by + * pointer) type gets resolved to different actual types in different + * object files. If such a situation occurs, deduplicated BTF will end up + * with two (or more) structurally identical types, which differ only in + * types they refer to through pointer. This should be OK in most cases and + * is not an error. + * 4. 
Candidate types search is performed by linearly scanning through all + * types in target BTF. It is anticipated that this is overall more + * efficient memory-wise and not significantly worse (if not better) + * CPU-wise compared to prebuilding a map from all local type names to + * a list of candidate type names. It's also sped up by caching resolved + * list of matching candidates per each local "root" type ID, that has at + * least one bpf_core_relo associated with it. This list is shared + * between multiple relocations for the same type ID and is updated as some + * of the candidates are pruned due to structural incompatibility. + */ +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, + int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res) +{ + struct bpf_core_spec *local_spec = &specs_scratch[0]; + struct bpf_core_spec *cand_spec = &specs_scratch[1]; + struct bpf_core_spec *targ_spec = &specs_scratch[2]; + struct bpf_core_relo_res cand_res; + const struct btf_type *local_type; + const char *local_name; + __u32 local_id; + char spec_buf[256]; + int i, j, err; + + local_id = relo->type_id; + local_type = btf_type_by_id(local_btf, local_id); + local_name = btf__name_by_offset(local_btf, local_type->name_off); + if (!local_name) + return -EINVAL; + + err = bpf_core_parse_spec(prog_name, local_btf, relo, local_spec); + if (err) { + const char *spec_str; + + spec_str = btf__name_by_offset(local_btf, relo->access_str_off); + pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", + prog_name, relo_idx, local_id, btf_kind_str(local_type), + str_is_empty(local_name) ? "" : local_name, + spec_str ?: "", err); + return -EINVAL; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), local_spec); + pr_debug("prog '%s': relo #%d: %s\n", prog_name, relo_idx, spec_buf); + + /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ + if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { + /* bpf_insn's imm value could get out of sync during linking */ + memset(targ_res, 0, sizeof(*targ_res)); + targ_res->validate = false; + targ_res->poison = false; + targ_res->orig_val = local_spec->root_type_id; + targ_res->new_val = local_spec->root_type_id; + return 0; + } + + /* libbpf doesn't support candidate search for anonymous types */ + if (str_is_empty(local_name)) { + pr_warn("prog '%s': relo #%d: <%s> (%d) relocation doesn't support anonymous types\n", + prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); + return -EOPNOTSUPP; + } + + for (i = 0, j = 0; i < cands->len; i++) { + err = bpf_core_spec_match(local_spec, cands->cands[i].btf, + cands->cands[i].id, cand_spec); + if (err < 0) { + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_warn("prog '%s': relo #%d: error matching candidate #%d %s: %d\n ", + prog_name, relo_idx, i, spec_buf, err); + return err; + } + + bpf_core_format_spec(spec_buf, sizeof(spec_buf), cand_spec); + pr_debug("prog '%s': relo #%d: %s candidate #%d %s\n", prog_name, + relo_idx, err == 0 ? 
"non-matching" : "matching", i, spec_buf); + + if (err == 0) + continue; + + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); + if (err) + return err; + + if (j == 0) { + *targ_res = cand_res; + *targ_spec = *cand_spec; + } else if (cand_spec->bit_offset != targ_spec->bit_offset) { + /* if there are many field relo candidates, they + * should all resolve to the same bit offset + */ + pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", + prog_name, relo_idx, cand_spec->bit_offset, + targ_spec->bit_offset); + return -EINVAL; + } else if (cand_res.poison != targ_res->poison || + cand_res.new_val != targ_res->new_val) { + /* all candidates should result in the same relocation + * decision and value, otherwise it's dangerous to + * proceed due to ambiguity + */ + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", + prog_name, relo_idx, + cand_res.poison ? "failure" : "success", + (unsigned long long)cand_res.new_val, + targ_res->poison ? "failure" : "success", + (unsigned long long)targ_res->new_val); + return -EINVAL; + } + + cands->cands[j++] = cands->cands[i]; + } + + /* + * For BPF_CORE_FIELD_EXISTS relo or when used BPF program has field + * existence checks or kernel version/config checks, it's expected + * that we might not find any candidates. In this case, if field + * wasn't found in any candidate, the list of candidates shouldn't + * change at all, we'll just handle relocating appropriately, + * depending on relo's kind. + */ + if (j > 0) + cands->len = j; + + /* + * If no candidates were found, it might be both a programmer error, + * as well as expected case, depending whether instruction w/ + * relocation is guarded in some way that makes it unreachable (dead + * code) if relocation can't be resolved. This is handled in + * bpf_core_patch_insn() uniformly by replacing that instruction with + * BPF helper call insn (using invalid helper ID). If that instruction + * is indeed unreachable, then it will be ignored and eliminated by + * verifier. If it was an error, then verifier will complain and point + * to a specific instruction number in its log. 
+ */ + if (j == 0) { + pr_debug("prog '%s': relo #%d: no matching targets found\n", + prog_name, relo_idx); + + /* calculate single target relo result explicitly */ + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, targ_res); + if (err) + return err; + } + + return 0; +} + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". This function assumes that root types were + * already checked for name match. + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. 
+ * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } 
+ case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/third_party/libbpf/src/relo_core.h b/third_party/libbpf/src/relo_core.h new file mode 100644 index 0000000..1c0566d --- /dev/null +++ b/third_party/libbpf/src/relo_core.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __RELO_CORE_H +#define __RELO_CORE_H + +#include + +struct bpf_core_cand { + const struct btf *btf; + __u32 id; +}; + +/* dynamically sized list of type IDs and its associated struct btf */ +struct bpf_core_cand_list { + struct bpf_core_cand *cands; + int len; +}; + +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + +struct bpf_core_relo_res { + /* expected value in the instruction, unless validate == false */ + __u64 orig_val; + /* new value that needs to be patched up to */ + __u64 new_val; + /* relocation unsuccessful, poison instruction, but don't fail load */ + bool poison; + /* some relocations can't be validated against orig_val */ + bool validate; + /* for field byte offset relocations or the forms: + * *(T *)(rX + ) = rY + * rX = *(T *)(rY + ), + * we remember original and resolved field size to adjust direct + * memory loads of pointers and integers; this is necessary for 32-bit + * host kernel architectures, but also allows to automatically + * relocate fields that were resized from, e.g., u32 to u64, etc. 
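+ *
+ * Illustrative case (sizes hypothetical): if a field is a u32 in local
+ * BTF but a u64 in the target kernel, a direct load such as
+ *   rX = *(u32 *)(rY + <local off>)
+ * is patched to an 8-byte access at the relocated offset, based on the
+ * orig_sz/new_sz values recorded below.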
+ */ + bool fail_memsz_adjust; + __u32 orig_sz; + __u32 orig_type_id; + __u32 new_sz; + __u32 new_type_id; +}; + +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); + +size_t bpf_core_essential_name_len(const char *name); + +int bpf_core_calc_relo_insn(const char *prog_name, + const struct bpf_core_relo *relo, int relo_idx, + const struct btf *local_btf, + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch, + struct bpf_core_relo_res *targ_res); + +int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, + int insn_idx, const struct bpf_core_relo *relo, + int relo_idx, const struct bpf_core_relo_res *res); + +int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, + const struct bpf_core_relo *relo, + struct bpf_core_spec *spec); + +int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec); + +#endif diff --git a/third_party/libbpf/src/ringbuf.c b/third_party/libbpf/src/ringbuf.c new file mode 100644 index 0000000..0219936 --- /dev/null +++ b/third_party/libbpf/src/ringbuf.c @@ -0,0 +1,587 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Ring buffer operations. + * + * Copyright (C) 2020 Facebook, Inc. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf.h" +#include "libbpf_internal.h" +#include "bpf.h" + +struct ring { + ring_buffer_sample_fn sample_cb; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + +static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +{ + if (r->consumer_pos) { + munmap(r->consumer_pos, rb->page_size); + r->consumer_pos = NULL; + } + if (r->producer_pos) { + munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); + r->producer_pos = NULL; + } +} + +/* Add extra RINGBUF maps to this ring buffer manager */ +int ring_buffer__add(struct ring_buffer *rb, int map_fd, + ring_buffer_sample_fn sample_cb, void *ctx) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + struct epoll_event *e; + struct ring *r; + __u64 mmap_sz; + void *tmp; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_map_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + if (info.type != BPF_MAP_TYPE_RINGBUF) { + pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n", + map_fd); + return libbpf_err(-EINVAL); + } + + tmp = libbpf_reallocarray(rb->rings, 
rb->ring_cnt + 1, sizeof(*rb->rings)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->rings = tmp; + + tmp = libbpf_reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events)); + if (!tmp) + return libbpf_err(-ENOMEM); + rb->events = tmp; + + r = &rb->rings[rb->ring_cnt]; + memset(r, 0, sizeof(*r)); + + r->map_fd = map_fd; + r->sample_cb = sample_cb; + r->ctx = ctx; + r->mask = info.max_entries - 1; + + /* Map writable consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->consumer_pos = tmp; + + /* Map read-only producer page and data pages. We map twice as big + * data size to allow simple reading of samples that wrap around the + * end of a ring buffer. See kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); + return libbpf_err(-E2BIG); + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + r->producer_pos = tmp; + r->data = tmp + rb->page_size; + + e = &rb->events[rb->ring_cnt]; + memset(e, 0, sizeof(*e)); + + e->events = EPOLLIN; + e->data.fd = rb->ring_cnt; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { + err = -errno; + ringbuf_unmap_ring(rb, r); + pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", + map_fd, err); + return libbpf_err(err); + } + + rb->ring_cnt++; + return 0; +} + +void ring_buffer__free(struct ring_buffer *rb) +{ + int i; + + if (!rb) + return; + + for (i = 0; i < rb->ring_cnt; ++i) + ringbuf_unmap_ring(rb, &rb->rings[i]); + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb->events); + free(rb->rings); + free(rb); +} + +struct ring_buffer * +ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx, + const struct ring_buffer_opts *opts) +{ + struct ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = ring_buffer__add(rb, map_fd, sample_cb, ctx); + if (err) + goto err_out; + + return rb; + +err_out: + ring_buffer__free(rb); + return errno = -err, NULL; +} + +static inline int roundup_len(__u32 len) +{ + /* clear out top 2 bits (discard and busy, if set) */ + len <<= 2; + len >>= 2; + /* add length prefix */ + len += BPF_RINGBUF_HDR_SZ; + /* round up to 8 byte alignment */ + return (len + 7) / 8 * 8; +} + +static int64_t ringbuf_process_ring(struct ring *r) +{ + int *len_ptr, len, err; + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt = 0; + unsigned long cons_pos, prod_pos; + bool got_new_data; + void *sample; + + cons_pos = smp_load_acquire(r->consumer_pos); + do { + got_new_data = false; + prod_pos = smp_load_acquire(r->producer_pos); + while (cons_pos < prod_pos) { + len_ptr = r->data + (cons_pos & r->mask); + len = smp_load_acquire(len_ptr); + + /* sample 
not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + goto done; + + got_new_data = true; + cons_pos += roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err = r->sample_cb(r->ctx, sample, len); + if (err < 0) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + cons_pos); + return err; + } + cnt++; + } + + smp_store_release(r->consumer_pos, cons_pos); + } + } while (got_new_data); +done: + return cnt; +} + +/* Consume available ring buffer(s) data without event polling. + * Returns number of records consumed across all registered ring buffers (or + * INT_MAX, whichever is less), or negative number if any of the callbacks + * return error. + */ +int ring_buffer__consume(struct ring_buffer *rb) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = &rb->rings[i]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Poll for available data and consume records, if any are available. + * Returns number of records consumed (or INT_MAX, whichever is less), or + * negative number, if any of the registered callbacks returned error. + */ +int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) +{ + int i, cnt; + int64_t err, res = 0; + + cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + if (cnt < 0) + return libbpf_err(-errno); + + for (i = 0; i < cnt; i++) { + __u32 ring_id = rb->events[i].data.fd; + struct ring *ring = &rb->rings[ring_id]; + + err = ringbuf_process_ring(ring); + if (err < 0) + return libbpf_err(err); + res += err; + } + if (res > INT_MAX) + return INT_MAX; + return res; +} + +/* Get an fd that can be used to sleep until data is available in the ring(s) */ +int ring_buffer__epoll_fd(const struct ring_buffer *rb) +{ + return rb->epoll_fd; +} + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + __u64 mmap_sz; + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_map_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. 
We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("user ringbuf: ring buf size (%u) is too big\n", info.max_entries); + return -E2BIG; + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* The top two bits are used as special flags */ + if (size & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT)) + return errno = E2BIG, NULL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. 
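+ * For example, a 21-byte sample needs 21 + 8 (BPF_RINGBUF_HDR_SZ) = 29
+ * bytes, which rounds up to 32 bytes of reserved ring space.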
*/ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/third_party/libbpf/src/skel_internal.h b/third_party/libbpf/src/skel_internal.h new file mode 100644 index 0000000..1e82ab0 --- /dev/null +++ b/third_party/libbpf/src/skel_internal.h @@ -0,0 +1,374 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2021 Facebook */ +#ifndef __SKEL_INTERNAL_H +#define __SKEL_INTERNAL_H + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include "bpf.h" +#endif + +#ifndef __NR_bpf +# if defined(__mips__) && defined(_ABIO32) +# define __NR_bpf 4355 +# elif defined(__mips__) && defined(_ABIN32) +# define __NR_bpf 6319 +# elif defined(__mips__) && defined(_ABI64) +# define __NR_bpf 5315 +# endif +#endif + +/* This file is a base header for auto-generated *.lskel.h files. + * Its contents will change and may become part of auto-generation in the future. + * + * The layout of bpf_[map|prog]_desc and bpf_loader_ctx is feature dependent + * and will change from one version of libbpf to another and features + * requested during loader program generation. 
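+ *
+ * Such *.lskel.h files are typically produced with something like
+ * (file names illustrative):
+ *
+ *   bpftool gen skeleton -L myprog.bpf.o > myprog.lskel.h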
+ */ +struct bpf_map_desc { + /* output of the loader prog */ + int map_fd; + /* input for the loader prog */ + __u32 max_entries; + __aligned_u64 initial_value; +}; +struct bpf_prog_desc { + int prog_fd; +}; + +enum { + BPF_SKEL_KERNEL = (1ULL << 0), +}; + +struct bpf_loader_ctx { + __u32 sz; + __u32 flags; + __u32 log_level; + __u32 log_size; + __u64 log_buf; +}; + +struct bpf_load_and_run_opts { + struct bpf_loader_ctx *ctx; + const void *data; + const void *insns; + __u32 data_sz; + __u32 insns_sz; + const char *errstr; +}; + +long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); + +static inline int skel_sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, + unsigned int size) +{ +#ifdef __KERNEL__ + return kern_sys_bpf(cmd, attr, size); +#else + return syscall(__NR_bpf, cmd, attr, size); +#endif +} + +#ifdef __KERNEL__ +static inline int close(int fd) +{ + return close_fd(fd); +} + +static inline void *skel_alloc(size_t size) +{ + struct bpf_loader_ctx *ctx = kzalloc(size, GFP_KERNEL); + + if (!ctx) + return NULL; + ctx->flags |= BPF_SKEL_KERNEL; + return ctx; +} + +static inline void skel_free(const void *p) +{ + kfree(p); +} + +/* skel->bss/rodata maps are populated the following way: + * + * For kernel use: + * skel_prep_map_data() allocates kernel memory that kernel module can directly access. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform probe_read_kernel() from maps.rodata.initial_value. + * skel_finalize_map_data() sets skel->rodata to point to actual value in a bpf map and + * does maps.rodata.initial_value = ~0ULL to signal skel_free_map_data() that kvfree + * is not nessary. + * + * For user space: + * skel_prep_map_data() mmaps anon memory into skel->rodata that can be accessed directly. + * Generated lskel stores the pointer in skel->rodata and in skel->maps.rodata.initial_value. + * The loader program will perform copy_from_user() from maps.rodata.initial_value. + * skel_finalize_map_data() remaps bpf array map value from the kernel memory into + * skel->rodata address. + * + * The "bpftool gen skeleton -L" command generates lskel.h that is suitable for + * both kernel and user space. The generated loader program does + * either bpf_probe_read_kernel() or bpf_copy_from_user() from initial_value + * depending on bpf_loader_ctx->flags. + */ +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + if (addr != ~0ULL) + kvfree(p); + /* When addr == ~0ULL the 'p' points to + * ((struct bpf_array *)map)->value. See skel_finalize_map_data. + */ +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = kvmalloc(val_sz, GFP_KERNEL); + if (!addr) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + struct bpf_map *map; + void *addr = NULL; + + kvfree((void *) (long) *init_val); + *init_val = ~0ULL; + + /* At this point bpf_load_and_run() finished without error and + * 'fd' is a valid bpf map FD. All sanity checks below should succeed. 
+ */ + map = bpf_map_get(fd); + if (IS_ERR(map)) + return NULL; + if (map->map_type != BPF_MAP_TYPE_ARRAY) + goto out; + addr = ((struct bpf_array *)map)->value; + /* the addr stays valid, since FD is not closed */ +out: + bpf_map_put(map); + return addr; +} + +#else + +static inline void *skel_alloc(size_t size) +{ + return calloc(1, size); +} + +static inline void skel_free(void *p) +{ + free(p); +} + +static inline void skel_free_map_data(void *p, __u64 addr, size_t sz) +{ + munmap(p, sz); +} + +static inline void *skel_prep_map_data(const void *val, size_t mmap_sz, size_t val_sz) +{ + void *addr; + + addr = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (addr == (void *) -1) + return NULL; + memcpy(addr, val, val_sz); + return addr; +} + +static inline void *skel_finalize_map_data(__u64 *init_val, size_t mmap_sz, int flags, int fd) +{ + void *addr; + + addr = mmap((void *) (long) *init_val, mmap_sz, flags, MAP_SHARED | MAP_FIXED, fd, 0); + if (addr == (void *) -1) + return NULL; + return addr; +} +#endif + +static inline int skel_closenz(int fd) +{ + if (fd > 0) + return close(fd); + return -EINVAL; +} + +#ifndef offsetofend +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER))) +#endif + +static inline int skel_map_create(enum bpf_map_type map_type, + const char *map_name, + __u32 key_size, + __u32 value_size, + __u32 max_entries) +{ + const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + + attr.map_type = map_type; + strncpy(attr.map_name, map_name, sizeof(attr.map_name)); + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + + return skel_sys_bpf(BPF_MAP_CREATE, &attr, attr_sz); +} + +static inline int skel_map_update_elem(int fd, const void *key, + const void *value, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long) key; + attr.value = (long) value; + attr.flags = flags; + + return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + +static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.raw_tracepoint.name = (long) name; + attr.raw_tracepoint.prog_fd = prog_fd; + + return skel_sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); +} + +static inline int skel_link_create(int prog_fd, int target_fd, + enum bpf_attach_type attach_type) +{ + const size_t attr_sz = offsetofend(union bpf_attr, link_create.iter_info_len); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.link_create.prog_fd = prog_fd; + attr.link_create.target_fd = target_fd; + attr.link_create.attach_type = attach_type; + + return skel_sys_bpf(BPF_LINK_CREATE, &attr, attr_sz); +} + +#ifdef __KERNEL__ 
+#define set_err +#else +#define set_err err = -errno +#endif + +static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) +{ + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); + int map_fd = -1, prog_fd = -1, key = 0, err; + union bpf_attr attr; + + err = map_fd = skel_map_create(BPF_MAP_TYPE_ARRAY, "__loader.map", 4, opts->data_sz, 1); + if (map_fd < 0) { + opts->errstr = "failed to create loader map"; + set_err; + goto out; + } + + err = skel_map_update_elem(map_fd, &key, opts->data, 0); + if (err < 0) { + opts->errstr = "failed to update loader map"; + set_err; + goto out; + } + + memset(&attr, 0, prog_load_attr_sz); + attr.prog_type = BPF_PROG_TYPE_SYSCALL; + attr.insns = (long) opts->insns; + attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); + attr.license = (long) "Dual BSD/GPL"; + memcpy(attr.prog_name, "__loader.prog", sizeof("__loader.prog")); + attr.fd_array = (long) &map_fd; + attr.log_level = opts->ctx->log_level; + attr.log_size = opts->ctx->log_size; + attr.log_buf = opts->ctx->log_buf; + attr.prog_flags = BPF_F_SLEEPABLE; + err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, prog_load_attr_sz); + if (prog_fd < 0) { + opts->errstr = "failed to load loader prog"; + set_err; + goto out; + } + + memset(&attr, 0, test_run_attr_sz); + attr.test.prog_fd = prog_fd; + attr.test.ctx_in = (long) opts->ctx; + attr.test.ctx_size_in = opts->ctx->sz; + err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz); + if (err < 0 || (int)attr.test.retval < 0) { + opts->errstr = "failed to execute loader prog"; + if (err < 0) { + set_err; + } else { + err = (int)attr.test.retval; +#ifndef __KERNEL__ + errno = -err; +#endif + } + goto out; + } + err = 0; +out: + if (map_fd >= 0) + close(map_fd); + if (prog_fd >= 0) + close(prog_fd); + return err; +} + +#endif diff --git a/third_party/libbpf/src/str_error.c b/third_party/libbpf/src/str_error.c new file mode 100644 index 0000000..146da01 --- /dev/null +++ b/third_party/libbpf/src/str_error.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#undef _GNU_SOURCE +#include +#include +#include "str_error.h" + +/* make sure libbpf doesn't use kernel-only integer typedefs */ +#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. + */ +char *libbpf_strerror_r(int err, char *dst, int len) +{ + int ret = strerror_r(err < 0 ? 
-err : err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/third_party/libbpf/src/str_error.h b/third_party/libbpf/src/str_error.h new file mode 100644 index 0000000..a139334 --- /dev/null +++ b/third_party/libbpf/src/str_error.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __LIBBPF_STR_ERROR_H +#define __LIBBPF_STR_ERROR_H + +char *libbpf_strerror_r(int err, char *dst, int len); +#endif /* __LIBBPF_STR_ERROR_H */ diff --git a/third_party/libbpf/src/strset.c b/third_party/libbpf/src/strset.c new file mode 100644 index 0000000..2464bcb --- /dev/null +++ b/third_party/libbpf/src/strset.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include +#include +#include +#include +#include +#include "hashmap.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(long key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + key; + + return str_hash(str); +} + +static bool strset_equal_fn(long key1, long key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + key1; + const char *str2 = s->strs_data + key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, off, off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); + return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); + free(set); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. 
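+ *
+ * Illustrative usage (names hypothetical):
+ *   int off = strset__add_str(set, "foo");    // appends "foo" if new
+ *   int same = strset__find_str(set, "foo");  // returns the same offset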
+ */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, new_off, &old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. + */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(set->strs_hash, new_off, new_off, + HASHMAP_ADD, &old_off, NULL); + if (err == -EEXIST) + return old_off; /* duplicated string, return existing offset */ + if (err) + return err; + + set->strs_data_len += len; /* new unique string, adjust data length */ + return new_off; +} diff --git a/third_party/libbpf/src/strset.h b/third_party/libbpf/src/strset.h new file mode 100644 index 0000000..b6ddf77 --- /dev/null +++ b/third_party/libbpf/src/strset.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* Copyright (c) 2021 Facebook */ +#ifndef __LIBBPF_STRSET_H +#define __LIBBPF_STRSET_H + +#include +#include + +struct strset; + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz); +void strset__free(struct strset *set); + +const char *strset__data(const struct strset *set); +size_t strset__data_size(const struct strset *set); + +int strset__find_str(struct strset *set, const char *s); +int strset__add_str(struct strset *set, const char *s); + +#endif /* __LIBBPF_STRSET_H */ diff --git a/third_party/libbpf/src/usdt.bpf.h b/third_party/libbpf/src/usdt.bpf.h new file mode 100644 index 0000000..0bd4c13 --- /dev/null +++ b/third_party/libbpf/src/usdt.bpf.h @@ -0,0 +1,250 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef __USDT_BPF_H__ +#define __USDT_BPF_H__ + +#include +#include +#include + +/* Below types and maps are internal implementation details of libbpf's USDT + * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should + * be considered an unstable API as well and might be adjusted based on user + * feedback from using libbpf's USDT support in production. + */ + +/* User can override BPF_USDT_MAX_SPEC_CNT to change default size of internal + * map that keeps track of USDT argument specifications. This might be + * necessary if there are a lot of USDT attachments. 
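+ * For example, the limit can be raised by defining the macro before this
+ * header is included:
+ *
+ *   #define BPF_USDT_MAX_SPEC_CNT 1024
+ *   #include "usdt.bpf.h"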
+ */ +#ifndef BPF_USDT_MAX_SPEC_CNT +#define BPF_USDT_MAX_SPEC_CNT 256 +#endif +/* User can override BPF_USDT_MAX_IP_CNT to change default size of internal + * map that keeps track of IP (memory address) mapping to USDT argument + * specification. + * Note, if kernel supports BPF cookies, this map is not used and could be + * resized all the way to 1 to save a bit of memory. + */ +#ifndef BPF_USDT_MAX_IP_CNT +#define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) +#endif + +enum __bpf_usdt_arg_type { + BPF_USDT_ARG_CONST, + BPF_USDT_ARG_REG, + BPF_USDT_ARG_REG_DEREF, +}; + +struct __bpf_usdt_arg_spec { + /* u64 scalar interpreted depending on arg_type, see below */ + __u64 val_off; + /* arg location case, see bpf_udst_arg() for details */ + enum __bpf_usdt_arg_type arg_type; + /* offset of referenced register within struct pt_regs */ + short reg_off; + /* whether arg should be interpreted as signed value */ + bool arg_signed; + /* number of bits that need to be cleared and, optionally, + * sign-extended to cast arguments that are 1, 2, or 4 bytes + * long into final 8-byte u64/s64 value returned to user + */ + char arg_bitshift; +}; + +/* should match USDT_MAX_ARG_CNT in usdt.c exactly */ +#define BPF_USDT_MAX_ARG_CNT 12 +struct __bpf_usdt_spec { + struct __bpf_usdt_arg_spec args[BPF_USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, BPF_USDT_MAX_SPEC_CNT); + __type(key, int); + __type(value, struct __bpf_usdt_spec); +} __bpf_usdt_specs SEC(".maps") __weak; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, BPF_USDT_MAX_IP_CNT); + __type(key, long); + __type(value, __u32); +} __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; + +extern const _Bool LINUX_HAS_BPF_COOKIE __kconfig; + +static __always_inline +int __bpf_usdt_spec_id(struct pt_regs *ctx) +{ + if (!LINUX_HAS_BPF_COOKIE) { + long ip = PT_REGS_IP(ctx); + int *spec_id_ptr; + + spec_id_ptr = bpf_map_lookup_elem(&__bpf_usdt_ip_to_spec_id, &ip); + return spec_id_ptr ? *spec_id_ptr : -ESRCH; + } + + return bpf_get_attach_cookie(ctx); +} + +/* Return number of USDT arguments defined for currently traced USDT. */ +__weak __hidden +int bpf_usdt_arg_cnt(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + return spec->arg_cnt; +} + +/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res. + * Returns 0 on success; negative error, otherwise. + * On error *res is guaranteed to be set to zero. + */ +__weak __hidden +int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) +{ + struct __bpf_usdt_spec *spec; + struct __bpf_usdt_arg_spec *arg_spec; + unsigned long val; + int err, spec_id; + + *res = 0; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return -ESRCH; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return -ESRCH; + + if (arg_num >= BPF_USDT_MAX_ARG_CNT) + return -ENOENT; + barrier_var(arg_num); + if (arg_num >= spec->arg_cnt) + return -ENOENT; + + arg_spec = &spec->args[arg_num]; + switch (arg_spec->arg_type) { + case BPF_USDT_ARG_CONST: + /* Arg is just a constant ("-4@$-9" in USDT arg spec). + * value is recorded in arg_spec->val_off directly. 
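+ * For that example spec, val_off simply holds the constant -9 and no
+ * register or memory read is needed.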
+ */ + val = arg_spec->val_off; + break; + case BPF_USDT_ARG_REG: + /* Arg is in a register (e.g, "8@%rax" in USDT arg spec), + * so we read the contents of that register directly from + * struct pt_regs. To keep things simple user-space parts + * record offsetof(struct pt_regs, ) in arg_spec->reg_off. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + break; + case BPF_USDT_ARG_REG_DEREF: + /* Arg is in memory addressed by register, plus some offset + * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is + * identified like with BPF_USDT_ARG_REG case, and the offset + * is in arg_spec->val_off. We first fetch register contents + * from pt_regs, then do another user-space probe read to + * fetch argument value itself. + */ + err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off); + if (err) + return err; + err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off); + if (err) + return err; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + val >>= arg_spec->arg_bitshift; +#endif + break; + default: + return -EINVAL; + } + + /* cast arg from 1, 2, or 4 bytes to final 8 byte size clearing + * necessary upper arg_bitshift bits, with sign extension if argument + * is signed + */ + val <<= arg_spec->arg_bitshift; + if (arg_spec->arg_signed) + val = ((long)val) >> arg_spec->arg_bitshift; + else + val = val >> arg_spec->arg_bitshift; + *res = val; + return 0; +} + +/* Retrieve user-specified cookie value provided during attach as + * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie + * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself + * utilizing BPF cookies internally, so user can't use BPF cookie directly + * for USDT programs and has to use bpf_usdt_cookie() API instead. + */ +__weak __hidden +long bpf_usdt_cookie(struct pt_regs *ctx) +{ + struct __bpf_usdt_spec *spec; + int spec_id; + + spec_id = __bpf_usdt_spec_id(ctx); + if (spec_id < 0) + return 0; + + spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id); + if (!spec) + return 0; + + return spec->usdt_cookie; +} + +/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ +#define ___bpf_usdt_args0() ctx +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) +#define ___bpf_usdt_args11(x, args...) 
___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) + +/* + * BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for + * tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes. + * Original struct pt_regs * context is preserved as 'ctx' argument. + */ +#define BPF_USDT(name, args...) \ +name(struct pt_regs *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args); \ +typeof(name(0)) name(struct pt_regs *ctx) \ +{ \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + return ____##name(___bpf_usdt_args(args)); \ + _Pragma("GCC diagnostic pop") \ +} \ +static __always_inline typeof(name(0)) \ +____##name(struct pt_regs *ctx, ##args) + +#endif /* __USDT_BPF_H__ */ diff --git a/third_party/libbpf/src/usdt.c b/third_party/libbpf/src/usdt.c new file mode 100644 index 0000000..37455d0 --- /dev/null +++ b/third_party/libbpf/src/usdt.c @@ -0,0 +1,1558 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* s8 will be marked as poison while it's a reg of riscv */ +#if defined(__riscv) +#define rv_s8 s8 +#endif + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_common.h" +#include "libbpf_internal.h" +#include "hashmap.h" + +/* libbpf's USDT support consists of BPF-side state/code and user-space + * state/code working together in concert. BPF-side parts are defined in + * usdt.bpf.h header library. User-space state is encapsulated by struct + * usdt_manager and all the supporting code centered around usdt_manager. + * + * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map + * and IP-to-spec-ID map, which is auxiliary map necessary for kernels that + * don't support BPF cookie (see below). These two maps are implicitly + * embedded into user's end BPF object file when user's code included + * usdt.bpf.h. This means that libbpf doesn't do anything special to create + * these USDT support maps. They are created by normal libbpf logic of + * instantiating BPF maps when opening and loading BPF object. + * + * As such, libbpf is basically unaware of the need to do anything + * USDT-related until the very first call to bpf_program__attach_usdt(), which + * can be called by user explicitly or happen automatically during skeleton + * attach (or, equivalently, through generic bpf_program__attach() call). At + * this point, libbpf will instantiate and initialize struct usdt_manager and + * store it in bpf_object. USDT manager is per-BPF object construct, as each + * independent BPF object might or might not have USDT programs, and thus all + * the expected USDT-related state. There is no coordination between two + * bpf_object in parts of USDT attachment, they are oblivious of each other's + * existence and libbpf is just oblivious, dealing with bpf_object-specific + * USDT state. + * + * Quick crash course on USDTs. + * + * From user-space application's point of view, USDT is essentially just + * a slightly special function call that normally has zero overhead, unless it + * is being traced by some external entity (e.g, BPF-based tool). 
Here's how + * a typical application can trigger USDT probe: + * + * #include // provided by systemtap-sdt-devel package + * // folly also provide similar functionality in folly/tracing/StaticTracepoint.h + * + * STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y); + * + * USDT is identified by it's : pair of names. Each + * individual USDT has a fixed number of arguments (3 in the above example) + * and specifies values of each argument as if it was a function call. + * + * USDT call is actually not a function call, but is instead replaced by + * a single NOP instruction (thus zero overhead, effectively). But in addition + * to that, those USDT macros generate special SHT_NOTE ELF records in + * .note.stapsdt ELF section. Here's an example USDT definition as emitted by + * `readelf -n `: + * + * stapsdt 0x00000089 NT_STAPSDT (SystemTap probe descriptors) + * Provider: test + * Name: usdt12 + * Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e + * Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil + * + * In this case we have USDT test:usdt12 with 12 arguments. + * + * Location and base are offsets used to calculate absolute IP address of that + * NOP instruction that kernel can replace with an interrupt instruction to + * trigger instrumentation code (BPF program for all that we care about). + * + * Semaphore above is and optional feature. It records an address of a 2-byte + * refcount variable (normally in '.probes' ELF section) used for signaling if + * there is anything that is attached to USDT. This is useful for user + * applications if, for example, they need to prepare some arguments that are + * passed only to USDTs and preparation is expensive. By checking if USDT is + * "activated", an application can avoid paying those costs unnecessarily. + * Recent enough kernel has built-in support for automatically managing this + * refcount, which libbpf expects and relies on. If USDT is defined without + * associated semaphore, this value will be zero. See selftests for semaphore + * examples. + * + * Arguments is the most interesting part. This USDT specification string is + * providing information about all the USDT arguments and their locations. The + * part before @ sign defined byte size of the argument (1, 2, 4, or 8) and + * whether the argument is signed or unsigned (negative size means signed). + * The part after @ sign is assembly-like definition of argument location + * (see [0] for more details). Technically, assembler can provide some pretty + * advanced definitions, but libbpf is currently supporting three most common + * cases: + * 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@-9); + * 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer + * whose value is in register %rdx"; + * 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which + * specifies signed 32-bit integer stored at offset -1204 bytes from + * memory address stored in %rbp. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * + * During attachment, libbpf parses all the relevant USDT specifications and + * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side + * code through spec map. This allows BPF applications to quickly fetch the + * actual value at runtime using a simple BPF-side code. + * + * With basics out of the way, let's go over less immediately obvious aspects + * of supporting USDTs. 
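+ *
+ * For reference, the BPF-side counterpart of these basics is just a
+ * handler declared with the BPF_USDT() macro from usdt.bpf.h (all names
+ * below are hypothetical):
+ *
+ *   SEC("usdt/./my_app:my_usdt_provider:my_usdt_probe_name")
+ *   int BPF_USDT(handle_my_probe, int arg1, long arg2)
+ *   {
+ *           return 0;
+ *   }
+ *
+ * Such a program is attached automatically via its SEC() name or
+ * explicitly with bpf_program__attach_usdt().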
+ * + * First, there is no special USDT BPF program type. It is actually just + * a uprobe BPF program (which for kernel, at least currently, is just a kprobe + * program, so BPF_PROG_TYPE_KPROBE program type). With the only difference + * that uprobe is usually attached at the function entry, while USDT will + * normally be somewhere inside the function. But it should always be + * pointing to NOP instruction, which makes such uprobes the fastest uprobe + * kind. + * + * Second, it's important to realize that such STAP_PROBEn(provider, name, ...) + * macro invocations can end up being inlined many, many times, depending on + * specifics of each individual user application. So single conceptual USDT + * (identified by provider:name pair of identifiers) is, generally speaking, + * multiple uprobe locations (USDT call sites) in different places in user + * application. Further, again due to inlining, each USDT call site might end + * up having the same argument #N be located in a different place. In one call + * site it could be a constant, in another will end up in a register, and in + * yet another could be some other register or even somewhere on the stack. + * + * As such, "attaching to USDT" means (in general case) attaching the same + * uprobe BPF program to multiple target locations in user application, each + * potentially having a completely different USDT spec associated with it. + * To wire all this up together libbpf allocates a unique integer spec ID for + * each unique USDT spec. Spec IDs are allocated as sequential small integers + * so that they can be used as keys in an array BPF map (for performance reasons). + * Spec ID allocation and accounting is a big part of what usdt_manager is + * about. This state has to be maintained per-BPF object and coordinated + * between different USDT attachments within the same BPF object. + * + * Spec ID is the key in the spec BPF map, value is the actual USDT spec laid out + * as struct usdt_spec. Each invocation of BPF program at runtime needs to + * know its associated spec ID. It gets it either through BPF cookie, which + * libbpf sets to spec ID during attach time, or, if kernel is too old to + * support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such + * case. The latter means that some modes of operation can't be supported + * without BPF cookie. Such mode is attaching to shared library "generically", + * without specifying target process. In such case, it's impossible to + * calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode + * is not supported without BPF cookie support. + * + * Note that libbpf is using BPF cookie functionality for its own internal + * needs, so the user itself can't rely on the BPF cookie feature. To that end, libbpf + * provides conceptually equivalent USDT cookie support. It's still a u64 + * user-provided value that can be associated with USDT attachment. Note that + * this will be the same value for all USDT call sites within the same single + * *logical* USDT attachment. This makes sense because to the user, attaching to + * a USDT is a single BPF program triggered for a singular USDT probe. The fact + * that this is done at multiple actual locations is a mostly hidden + * implementation detail. This USDT cookie value can be fetched with + * the bpf_usdt_cookie(ctx) API provided by usdt.bpf.h. + * + * Lastly, while a single USDT can have tons of USDT call sites, it doesn't + * necessarily have that many different USDT specs.
It very well might be + * that 1000 USDT call sites only need 5 different USDT specs, because all the + * arguments are typically contained in a small set of registers or stack + * locations. As such, it's wasteful to allocate as many USDT spec IDs as + * there are USDT call sites. So libbpf tries to be frugal and performs + * on-the-fly deduplication during a single USDT attachment to only allocate + * the minimal required amount of unique USDT specs (and thus spec IDs). This + * is trivially achieved by using USDT spec string (Arguments string from USDT + * note) as a lookup key in a hashmap. USDT spec string uniquely defines + * everything about how to fetch USDT arguments, so two USDT call sites + * sharing USDT spec string can safely share the same USDT spec and spec ID. + * Note, this spec string deduplication is happening only during the same USDT + * attachment, so each USDT spec shares the same USDT cookie value. This is + * not generally true for other USDT attachments within the same BPF object, + * as even if USDT spec string is the same, USDT cookie value can be + * different. It was deemed excessive to try to deduplicate across independent + * USDT attachments by taking into account USDT spec string *and* USDT cookie + * value, which would complicated spec ID accounting significantly for little + * gain. + */ + +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + +struct usdt_target { + long abs_ip; + long rel_ip; + long sema_off; + struct usdt_spec spec; + const char *spec_str; +}; + +struct usdt_manager { + struct bpf_map *specs_map; + struct bpf_map *ip_to_spec_id_map; + + int *free_spec_ids; + size_t free_spec_cnt; + size_t next_free_spec_id; + + bool has_bpf_cookie; + bool has_sema_refcnt; +}; + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj) +{ + static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; + struct usdt_manager *man; + struct bpf_map *specs_map, *ip_to_spec_id_map; + + specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); + ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); + if (!specs_map || !ip_to_spec_id_map) { + pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); + return ERR_PTR(-ESRCH); + } + + man = calloc(1, sizeof(*man)); + if (!man) + return ERR_PTR(-ENOMEM); + + man->specs_map = specs_map; + man->ip_to_spec_id_map = ip_to_spec_id_map; + + /* Detect if BPF cookie is supported for kprobes. 
+ * We don't need IP-to-ID mapping if we can use BPF cookies. + * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") + */ + man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); + + /* Detect kernel support for automatic refcounting of USDT semaphore. + * If this is not supported, USDTs with semaphores will not be supported. + * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; + + return man; +} + +void usdt_manager_free(struct usdt_manager *man) +{ + if (IS_ERR_OR_NULL(man)) + return; + + free(man->free_spec_ids); + free(man); +} + +static int sanity_check_usdt_elf(Elf *elf, const char *path) +{ + GElf_Ehdr ehdr; + int endianness; + + if (elf_kind(elf) != ELF_K_ELF) { + pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); + return -EBADF; + } + + switch (gelf_getclass(elf)) { + case ELFCLASS64: + if (sizeof(void *) != 8) { + pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + case ELFCLASS32: + if (sizeof(void *) != 4) { + pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + default: + pr_warn("usdt: unsupported ELF class for '%s'\n", path); + return -EBADF; + } + + if (!gelf_getehdr(elf, &ehdr)) + return -EINVAL; + + if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { + pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", + path, ehdr.e_type); + return -EBADF; + } + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + endianness = ELFDATA2MSB; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (endianness != ehdr.e_ident[EI_DATA]) { + pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); + return -EBADF; + } + + return 0; +} + +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? 
-1 : 1; +} + +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) +{ + GElf_Phdr phdr; + size_t n; + int i, err; + struct elf_seg *seg; + void *tmp; + + *seg_cnt = 0; + + if (elf_getphdrnum(elf, &n)) { + err = -errno; + return err; + } + + for (i = 0; i < n; i++) { + if (!gelf_getphdr(elf, i, &phdr)) { + err = -errno; + return err; + } + + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, + (long)phdr.p_type, (long)phdr.p_flags); + if (phdr.p_type != PT_LOAD) + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) + return -ENOMEM; + + *segs = tmp; + seg = *segs + *seg_cnt; + (*seg_cnt)++; + + seg->start = phdr.p_vaddr; + seg->end = phdr.p_vaddr + phdr.p_memsz; + seg->offset = phdr.p_offset; + seg->is_exec = phdr.p_flags & PF_X; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); + return -ESRCH; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + return 0; +} + +static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +{ + char path[PATH_MAX], line[PATH_MAX], mode[16]; + size_t seg_start, seg_end, seg_off; + struct elf_seg *seg; + int tmp_pid, i, err; + FILE *f; + + *seg_cnt = 0; + + /* Handle containerized binaries only accessible from + * /proc//root/. They will be reported as just / in + * /proc//maps. + */ + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + goto proceed; + + if (!realpath(lib_path, path)) { + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", + lib_path, -errno); + libbpf_strlcpy(path, lib_path, sizeof(path)); + } + +proceed: + sprintf(line, "/proc/%d/maps", pid); + f = fopen(line, "re"); + if (!f) { + err = -errno; + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", + line, lib_path, err); + return err; + } + + /* We need to handle lines with no path at the end: + * + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + */ + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + void *tmp; + + /* to handle no path case (see above) we need to capture line + * without skipping any whitespaces. 
So we need to strip + * leading whitespaces manually here + */ + i = 0; + while (isblank(line[i])) + i++; + if (strcmp(line + i, path) != 0) + continue; + + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", + path, seg_start, seg_end, mode, seg_off); + + /* ignore non-executable sections for shared libs */ + if (mode[2] != 'x') + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + + *segs = tmp; + seg = *segs + *seg_cnt; + *seg_cnt += 1; + + seg->start = seg_start; + seg->end = seg_end; + seg->offset = seg_off; + seg->is_exec = true; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", + lib_path, path, pid); + err = -ESRCH; + goto err_out; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + err = 0; +err_out: + fclose(f); + return err; +} + +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long virtaddr) +{ + struct elf_seg *seg; + int i; + + /* for ELF binaries (both executables and shared libraries), we are + * given virtual address (absolute for executables, relative for + * libraries) which should match address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= virtaddr && virtaddr < seg->end) + return seg; + } + return NULL; +} + +static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long offset) +{ + struct elf_seg *seg; + int i; + + /* for VMA segments from /proc//maps file, provided "address" is + * actually a file offset, so should be fall within logical + * offset-based range of [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start)) + return seg; + } + return NULL; +} + +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *usdt_note); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); + +static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, + const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie, + struct usdt_target **out_targets, size_t *out_target_cnt) +{ + size_t off, name_off, desc_off, seg_cnt = 0, vma_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *vma_segs = NULL; + struct usdt_target *targets = NULL, *target; + long base_addr = 0; + Elf_Scn *notes_scn, *base_scn; + GElf_Shdr base_shdr, notes_shdr; + GElf_Ehdr ehdr; + GElf_Nhdr nhdr; + Elf_Data *data; + int err; + + *out_targets = NULL; + *out_target_cnt = 0; + + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, ¬es_shdr, ¬es_scn); + if (err) { + pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); + return err; + } + + if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { + pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); + return -EINVAL; + } + + err = parse_elf_segs(elf, path, &segs, &seg_cnt); + if (err) { + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); + goto err_out; + } + + /* .stapsdt.base ELF section is optional, but is used for prelink + * offset compensation (see a big comment further below) + */ + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) + base_addr = 
base_shdr.sh_addr; + + data = elf_getdata(notes_scn, 0); + off = 0; + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; + struct usdt_note note; + struct elf_seg *seg = NULL; + void *tmp; + + err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); + if (err) + goto err_out; + + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) + continue; + + /* We need to compensate "prelink effect". See [0] for details, + * relevant parts quoted here: + * + * Each SDT probe also expands into a non-allocated ELF note. You can + * find this by looking at SHT_NOTE sections and decoding the format; + * see below for details. Because the note is non-allocated, it means + * there is no runtime cost, and also preserved in both stripped files + * and .debug files. + * + * However, this means that prelink won't adjust the note's contents + * for address offsets. Instead, this is done via the .stapsdt.base + * section. This is a special section that is added to the text. We + * will only ever have one of these sections in a final link and it + * will only ever be one byte long. Nothing about this section itself + * matters, we just use it as a marker to detect prelink address + * adjustments. + * + * Each probe note records the link-time address of the .stapsdt.base + * section alongside the probe PC address. The decoder compares the + * base address stored in the note with the .stapsdt.base section's + * sh_addr. Initially these are the same, but the section header will + * be adjusted by prelink. So the decoder applies the difference to + * the probe PC address to get the correct prelinked PC address; the + * same adjustment is applied to the semaphore address, if any. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + */ + usdt_abs_ip = note.loc_addr; + if (base_addr) + usdt_abs_ip += base_addr - note.base_addr; + + /* When attaching uprobes (which is what USDTs basically are) + * kernel expects file offset to be specified, not a relative + * virtual address, so we need to translate virtual address to + * file offset, for both ET_EXEC and ET_DYN binaries. + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + /* translate from virtual address to file offset */ + usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset; + + if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) { + /* If we don't have BPF cookie support but need to + * attach to a shared library, we'll need to know and + * record absolute addresses of attach points due to + * the need to lookup USDT spec by absolute IP of + * triggered uprobe. Doing this resolution is only + * possible when we have a specific PID of the process + * that's using specified shared library. BPF cookie + * removes the absolute address limitation as we don't + * need to do this lookup (we just use BPF cookie as + * an index of USDT spec), so for newer kernels with + * BPF cookie support libbpf supports USDT attachment + * to shared libraries with no PID filter. 
+ */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* vma_segs are lazily initialized only if necessary */ + if (vma_seg_cnt == 0) { + err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_vma_seg(vma_segs, vma_seg_cnt, usdt_rel_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip; + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a file offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - seg->start + seg->offset; + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? 
"exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes.args references strings from ELF itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(vma_segs); + if (err < 0) + free(targets); + return err; +} + +struct bpf_link_usdt { + struct bpf_link link; + + struct usdt_manager *usdt_man; + + size_t spec_cnt; + int *spec_ids; + + size_t uprobe_cnt; + struct { + long abs_ip; + struct bpf_link *link; + } *uprobes; +}; + +static int bpf_link_usdt_detach(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + struct usdt_manager *man = usdt_link->usdt_man; + int i; + + for (i = 0; i < usdt_link->uprobe_cnt; i++) { + /* detach underlying uprobe link */ + bpf_link__destroy(usdt_link->uprobes[i].link); + /* there is no need to update specs map because it will be + * unconditionally overwritten on subsequent USDT attaches, + * but if BPF cookies are not used we need to remove entry + * from ip_to_spec_id map, otherwise we'll run into false + * conflicting IP errors + */ + if (!man->has_bpf_cookie) { + /* not much we can do about errors here */ + (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), + &usdt_link->uprobes[i].abs_ip); + } + } + + /* try to return the list of previously used spec IDs to usdt_manager + * for future reuse for subsequent USDT attaches + */ + if (!man->free_spec_ids) { + /* if there were no free spec IDs yet, just transfer our IDs */ + man->free_spec_ids = usdt_link->spec_ids; + man->free_spec_cnt = usdt_link->spec_cnt; + usdt_link->spec_ids = NULL; + } else { + /* otherwise concat IDs */ + size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; + int *new_free_ids; + + new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, + sizeof(*new_free_ids)); + /* If we couldn't resize free_spec_ids, we'll just leak + * a bunch of free IDs; this is very unlikely to happen and if + * system is so exhausted on memory, it's the least of user's + * concerns, probably. + * So just do our best here to return those IDs to usdt_manager. + * Another edge case when we can legitimately get NULL is when + * new_cnt is zero, which can happen in some edge cases, so we + * need to be careful about that. 
+ */ + if (new_free_ids || new_cnt == 0) { + memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, + usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); + man->free_spec_ids = new_free_ids; + man->free_spec_cnt = new_cnt; + } + } + + return 0; +} + +static void bpf_link_usdt_dealloc(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + + free(usdt_link->spec_ids); + free(usdt_link->uprobes); + free(usdt_link); +} + +static size_t specs_hash_fn(long key, void *ctx) +{ + return str_hash((char *)key); +} + +static bool specs_equal_fn(long key1, long key2, void *ctx) +{ + return strcmp((char *)key1, (char *)key2) == 0; +} + +static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, + struct bpf_link_usdt *link, struct usdt_target *target, + int *spec_id, bool *is_new) +{ + long tmp; + void *new_ids; + int err; + + /* check if we already allocated spec ID for this spec string */ + if (hashmap__find(specs_hash, target->spec_str, &tmp)) { + *spec_id = tmp; + *is_new = false; + return 0; + } + + /* otherwise it's a new ID that needs to be set up in specs map and + * returned back to usdt_manager when USDT link is detached + */ + new_ids = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!new_ids) + return -ENOMEM; + link->spec_ids = new_ids; + + /* get next free spec ID, giving preference to free list, if not empty */ + if (man->free_spec_cnt) { + *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->free_spec_cnt--; + } else { + /* don't allocate spec ID bigger than what fits in specs map */ + if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) + return -E2BIG; + + *spec_id = man->next_free_spec_id; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, *spec_id); + if (err) + return err; + + man->next_free_spec_id++; + } + + /* remember new spec ID in the link for later return back to free list on detach */ + link->spec_ids[link->spec_cnt] = *spec_id; + link->spec_cnt++; + *is_new = true; + return 0; +} + +struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + __u64 usdt_cookie) +{ + int i, fd, err, spec_map_fd, ip_map_fd; + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct hashmap *specs_hash = NULL; + struct bpf_link_usdt *link = NULL; + struct usdt_target *targets = NULL; + size_t target_cnt; + Elf *elf; + + spec_map_fd = bpf_map__fd(man->specs_map); + ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + return libbpf_err_ptr(err); + } + + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + err = -EBADF; + pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); + goto err_out; + } + + err = sanity_check_usdt_elf(elf, path); + if (err) + goto err_out; + + /* normalize PID filter */ + if (pid < 0) + pid = -1; + else if (pid == 0) + pid = getpid(); + + /* discover USDT in given binary, optionally limiting + * activations to a given PID, if pid > 0 + */ + err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + 
usdt_cookie, &targets, &target_cnt); + if (err <= 0) { + err = (err == 0) ? -ENOENT : err; + goto err_out; + } + + specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); + if (IS_ERR(specs_hash)) { + err = PTR_ERR(specs_hash); + goto err_out; + } + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } + + link->usdt_man = man; + link->link.detach = &bpf_link_usdt_detach; + link->link.dealloc = &bpf_link_usdt_dealloc; + + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < target_cnt; i++) { + struct usdt_target *target = &targets[i]; + struct bpf_link *uprobe_link; + bool is_new; + int spec_id; + + /* Spec ID can be either reused or newly allocated. If it is + * newly allocated, we'll need to fill out spec map, otherwise + * entire spec should be valid and can be just used by a new + * uprobe. We reuse spec when USDT arg spec is identical. We + * also never share specs between two different USDT + * attachments ("links"), so all the reused specs already + * share USDT cookie value implicitly. + */ + err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); + if (err) + goto err_out; + + if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { + err = -errno; + pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", + spec_id, usdt_provider, usdt_name, path, err); + goto err_out; + } + if (!man->has_bpf_cookie && + bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { + err = -errno; + if (err == -EEXIST) { + pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", + spec_id, usdt_provider, usdt_name, path); + } else { + pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", + target->abs_ip, spec_id, usdt_provider, usdt_name, + path, err); + } + goto err_out; + } + + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + + free(targets); + hashmap__free(specs_hash); + elf_end(elf); + close(fd); + + return &link->link; + +err_out: + if (link) + bpf_link__destroy(&link->link); + free(targets); + hashmap__free(specs_hash); + if (elf) + elf_end(elf); + close(fd); + return libbpf_err_ptr(err); +} + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. 
+ */ +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie) +{ + struct usdt_arg_spec *arg; + const char *s; + int arg_sz, len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + arg = &spec->args[spec->arg_cnt]; + len = parse_usdt_arg(s, spec->arg_cnt, arg, &arg_sz); + if (len < 0) + return len; + + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + spec->arg_cnt, s, arg_sz); + return -EINVAL; + } + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +/* Architecture-specific logic for parsing USDT argument location specs */ + +#if defined(__x86_64__) || defined(__i386__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *names[4]; + size_t pt_regs_off; + } reg_map[] = { +#ifdef __x86_64__ +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) +#else +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) +#endif + { {"rip", "eip", "", ""}, reg_off(rip, eip) }, + { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, + { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, + { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, + { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, + { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, + { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, + { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, + { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, +#undef reg_off +#ifdef __x86_64__ + { {"r8", "r8d", "r8w", "r8b"}, 
offsetof(struct pt_regs, r8) }, + { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, + { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, + { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, + { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, + { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) }, + { {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) }, + { {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) }, +#endif + }; + int i, j; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) { + if (strcmp(reg_name, reg_map[i].names[j]) == 0) + return reg_map[i].pt_regs_off; + } + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -4@-20(%rbp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case without offset, e.g., 8@(%rsp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -4@%eax */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ $%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@$71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__s390x__) + +/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. 
*/ + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + unsigned int reg; + int len; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", arg_sz, &off, ®, &len) == 3) { + /* Memory dereference case, e.g., -2@-28(%r15) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %%r%u %n", arg_sz, ®, &len) == 2) { + /* Register read case, e.g., -8@%r0 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + if (reg > 15) { + pr_warn("usdt: unrecognized register '%%r%u'\n", reg); + return -EINVAL; + } + arg->reg_off = offsetof(user_pt_regs, gprs[reg]); + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@71 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__aarch64__) + +static int calc_pt_regs_off(const char *reg_name) +{ + int reg_num; + + if (sscanf(reg_name, "x%d", ®_num) == 1) { + if (reg_num >= 0 && reg_num < 31) + return offsetof(struct user_pt_regs, regs[reg_num]); + } else if (strcmp(reg_name, "sp") == 0) { + return offsetof(struct user_pt_regs, sp); + } + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , %ld ] %n", arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[sp, 96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@x4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__riscv) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "ra", offsetof(struct user_regs_struct, ra) }, + { "sp", offsetof(struct user_regs_struct, sp) }, + { "gp", offsetof(struct user_regs_struct, gp) }, + { "tp", offsetof(struct user_regs_struct, tp) }, + { "a0", offsetof(struct user_regs_struct, a0) }, + { "a1", offsetof(struct user_regs_struct, a1) }, + { "a2", offsetof(struct user_regs_struct, a2) }, + { "a3", offsetof(struct user_regs_struct, a3) }, + { "a4", offsetof(struct user_regs_struct, a4) }, + { "a5", offsetof(struct user_regs_struct, 
a5) }, + { "a6", offsetof(struct user_regs_struct, a6) }, + { "a7", offsetof(struct user_regs_struct, a7) }, + { "s0", offsetof(struct user_regs_struct, s0) }, + { "s1", offsetof(struct user_regs_struct, s1) }, + { "s2", offsetof(struct user_regs_struct, s2) }, + { "s3", offsetof(struct user_regs_struct, s3) }, + { "s4", offsetof(struct user_regs_struct, s4) }, + { "s5", offsetof(struct user_regs_struct, s5) }, + { "s6", offsetof(struct user_regs_struct, s6) }, + { "s7", offsetof(struct user_regs_struct, s7) }, + { "s8", offsetof(struct user_regs_struct, rv_s8) }, + { "s9", offsetof(struct user_regs_struct, s9) }, + { "s10", offsetof(struct user_regs_struct, s10) }, + { "s11", offsetof(struct user_regs_struct, s11) }, + { "t0", offsetof(struct user_regs_struct, t0) }, + { "t1", offsetof(struct user_regs_struct, t1) }, + { "t2", offsetof(struct user_regs_struct, t2) }, + { "t3", offsetof(struct user_regs_struct, t3) }, + { "t4", offsetof(struct user_regs_struct, t4) }, + { "t5", offsetof(struct user_regs_struct, t5) }, + { "t6", offsetof(struct user_regs_struct, t6) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", arg_sz, &off, reg_name, &len) == 3) { + /* Memory dereference case, e.g., -8@-88(s0) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@a1 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#elif defined(__arm__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "r0", offsetof(struct pt_regs, uregs[0]) }, + { "r1", offsetof(struct pt_regs, uregs[1]) }, + { "r2", offsetof(struct pt_regs, uregs[2]) }, + { "r3", offsetof(struct pt_regs, uregs[3]) }, + { "r4", offsetof(struct pt_regs, uregs[4]) }, + { "r5", offsetof(struct pt_regs, uregs[5]) }, + { "r6", offsetof(struct pt_regs, uregs[6]) }, + { "r7", offsetof(struct pt_regs, uregs[7]) }, + { "r8", offsetof(struct pt_regs, uregs[8]) }, + { "r9", offsetof(struct pt_regs, uregs[9]) }, + { "r10", offsetof(struct pt_regs, uregs[10]) }, + { "fp", offsetof(struct pt_regs, uregs[11]) }, + { "ip", offsetof(struct pt_regs, uregs[12]) }, + { "sp", offsetof(struct pt_regs, uregs[13]) }, + { "lr", offsetof(struct pt_regs, uregs[14]) }, + { "pc", offsetof(struct pt_regs, uregs[15]) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int 
parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , #%ld ] %n", + arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[fp, #96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ #%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@#5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@r4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); + return -EINVAL; + } + + return len; +} + +#else + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +} + +#endif diff --git a/third_party/libbpf/src/zip.c b/third_party/libbpf/src/zip.c new file mode 100644 index 0000000..3f26d62 --- /dev/null +++ b/third_party/libbpf/src/zip.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Routines for dealing with .zip archives. + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf_internal.h" +#include "zip.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" + +/* Specification of ZIP file format can be found here: + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + * For a high level overview of the structure of a ZIP file see + * sections 4.3.1 - 4.3.6. + * + * Data structures appearing in ZIP files do not contain any + * padding and they might be misaligned. To allow us to safely + * operate on pointers to such structures and their members, we + * declare the types as packed. + */ + +#define END_OF_CD_RECORD_MAGIC 0x06054b50 + +/* See section 4.3.16 of the spec. */ +struct end_of_cd_record { + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ + __u32 magic; + + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. + * Zip archive might span multiple files (disks). + */ + __u16 this_disk; + + /* Number of the file containing the beginning of the central directory or + * 0xFFFF if ZIP64 archive. + */ + __u16 cd_disk; + + /* Number of central directory records on this disk or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records; + + /* Number of central directory records on all disks or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records_total; + + /* Size of the central directory record or 0xFFFFFFFF if ZIP64 archive. */ + __u32 cd_size; + + /* Offset of the central directory from the beginning of the archive or + * 0xFFFFFFFF if ZIP64 archive. 
+ */ + __u32 cd_offset; + + /* Length of comment data following end of central directory record. */ + __u16 comment_length; + + /* Up to 64k of arbitrary bytes. */ + /* uint8_t comment[comment_length] */ +} __attribute__((packed)); + +#define CD_FILE_HEADER_MAGIC 0x02014b50 +#define FLAG_ENCRYPTED (1 << 0) +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) + +/* See section 4.3.12 of the spec. */ +struct cd_file_header { + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ + __u32 magic; + __u16 version; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; + __u16 file_comment_length; + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ + __u16 disk; + __u16 internal_attributes; + __u32 external_attributes; + /* Offset from the start of the disk containing the local file header to the + * start of the local file header. + */ + __u32 offset; +} __attribute__((packed)); + +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 + +/* See section 4.3.7 of the spec. */ +struct local_file_header { + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ + __u32 magic; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; +} __attribute__((packed)); + +#pragma GCC diagnostic pop + +struct zip_archive { + void *data; + __u32 size; + __u32 cd_offset; + __u32 cd_records; +}; + +static void *check_access(struct zip_archive *archive, __u32 offset, __u32 size) +{ + if (offset + size > archive->size || offset > offset + size) + return NULL; + + return archive->data + offset; +} + +/* Returns 0 on success, -EINVAL on error and -ENOTSUP if the eocd indicates the + * archive uses features which are not supported. + */ +static int try_parse_end_of_cd(struct zip_archive *archive, __u32 offset) +{ + __u16 comment_length, cd_records; + struct end_of_cd_record *eocd; + __u32 cd_offset, cd_size; + + eocd = check_access(archive, offset, sizeof(*eocd)); + if (!eocd || eocd->magic != END_OF_CD_RECORD_MAGIC) + return -EINVAL; + + comment_length = eocd->comment_length; + if (offset + sizeof(*eocd) + comment_length != archive->size) + return -EINVAL; + + cd_records = eocd->cd_records; + if (eocd->this_disk != 0 || eocd->cd_disk != 0 || eocd->cd_records_total != cd_records) + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ + return -ENOTSUP; + + cd_offset = eocd->cd_offset; + cd_size = eocd->cd_size; + if (!check_access(archive, cd_offset, cd_size)) + return -EINVAL; + + archive->cd_offset = cd_offset; + archive->cd_records = cd_records; + return 0; +} + +static int find_cd(struct zip_archive *archive) +{ + int64_t limit, offset; + int rc = -EINVAL; + + if (archive->size <= sizeof(struct end_of_cd_record)) + return -EINVAL; + + /* Because the end of central directory ends with a variable length array of + * up to 0xFFFF bytes we can't know exactly where it starts and need to + * search for it at the end of the file, scanning the (limit, offset] range. 
+ */ + offset = archive->size - sizeof(struct end_of_cd_record); + limit = (int64_t)offset - (1 << 16); + + for (; offset >= 0 && offset > limit && rc != 0; offset--) { + rc = try_parse_end_of_cd(archive, offset); + if (rc == -ENOTSUP) + break; + } + return rc; +} + +struct zip_archive *zip_archive_open(const char *path) +{ + struct zip_archive *archive; + int err, fd; + off_t size; + void *data; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return ERR_PTR(-errno); + + size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1 || size > UINT32_MAX) { + close(fd); + return ERR_PTR(-EINVAL); + } + + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + err = -errno; + close(fd); + + if (data == MAP_FAILED) + return ERR_PTR(err); + + archive = malloc(sizeof(*archive)); + if (!archive) { + munmap(data, size); + return ERR_PTR(-ENOMEM); + }; + + archive->data = data; + archive->size = size; + + err = find_cd(archive); + if (err) { + munmap(data, size); + free(archive); + return ERR_PTR(err); + } + + return archive; +} + +void zip_archive_close(struct zip_archive *archive) +{ + munmap(archive->data, archive->size); + free(archive); +} + +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, + __u32 offset) +{ + struct local_file_header *lfh; + + lfh = check_access(archive, offset, sizeof(*lfh)); + if (!lfh || lfh->magic != LOCAL_FILE_HEADER_MAGIC) + return NULL; + + return lfh; +} + +static int get_entry_at_offset(struct zip_archive *archive, __u32 offset, struct zip_entry *out) +{ + struct local_file_header *lfh; + __u32 compressed_size; + const char *name; + void *data; + + lfh = local_file_header_at_offset(archive, offset); + if (!lfh) + return -EINVAL; + + offset += sizeof(*lfh); + if ((lfh->flags & FLAG_ENCRYPTED) || (lfh->flags & FLAG_HAS_DATA_DESCRIPTOR)) + return -EINVAL; + + name = check_access(archive, offset, lfh->file_name_length); + if (!name) + return -EINVAL; + + offset += lfh->file_name_length; + if (!check_access(archive, offset, lfh->extra_field_length)) + return -EINVAL; + + offset += lfh->extra_field_length; + compressed_size = lfh->compressed_size; + data = check_access(archive, offset, compressed_size); + if (!data) + return -EINVAL; + + out->compression = lfh->compression; + out->name_length = lfh->file_name_length; + out->name = name; + out->data = data; + out->data_length = compressed_size; + out->data_offset = offset; + + return 0; +} + +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, + struct zip_entry *out) +{ + size_t file_name_length = strlen(file_name); + __u32 i, offset = archive->cd_offset; + + for (i = 0; i < archive->cd_records; ++i) { + __u16 cdfh_name_length, cdfh_flags; + struct cd_file_header *cdfh; + const char *cdfh_name; + + cdfh = check_access(archive, offset, sizeof(*cdfh)); + if (!cdfh || cdfh->magic != CD_FILE_HEADER_MAGIC) + return -EINVAL; + + offset += sizeof(*cdfh); + cdfh_name_length = cdfh->file_name_length; + cdfh_name = check_access(archive, offset, cdfh_name_length); + if (!cdfh_name) + return -EINVAL; + + cdfh_flags = cdfh->flags; + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && + file_name_length == cdfh_name_length && + memcmp(file_name, archive->data + offset, file_name_length) == 0) { + return get_entry_at_offset(archive, cdfh->offset, out); + } + + offset += cdfh_name_length; + offset += cdfh->extra_field_length; + offset += cdfh->file_comment_length; + } + + return -ENOENT; +} diff --git 
a/third_party/libbpf/src/zip.h b/third_party/libbpf/src/zip.h new file mode 100644 index 0000000..1c1bb21 --- /dev/null +++ b/third_party/libbpf/src/zip.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LIBBPF_ZIP_H +#define __LIBBPF_ZIP_H + +#include <linux/types.h> + +/* Represents an open zip archive. + * Only basic ZIP files are supported, in particular the following are not + * supported: + * - encryption + * - streaming + * - multi-part ZIP files + * - ZIP64 + */ +struct zip_archive; + +/* Carries information on name, compression method, and data corresponding to a + * file in a zip archive. + */ +struct zip_entry { + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ + __u16 compression; + + /* Name of the file (not null-terminated). */ + const char *name; + /* Length of the file name. */ + __u16 name_length; + + /* Pointer to the file data. */ + const void *data; + /* Length of the file data. */ + __u32 data_length; + /* Offset of the file data within the archive. */ + __u32 data_offset; +}; + +/* Open a zip archive. On failure returns an ERR_PTR()-encoded error pointer + * (check with IS_ERR() from libbpf_internal.h), never NULL; see zip.c. + */ +struct zip_archive *zip_archive_open(const char *path); + +/* Close a zip archive and release resources. */ +void zip_archive_close(struct zip_archive *archive); + +/* Look up an entry corresponding to a file in given zip archive. + * Returns 0 on success, a negative error code otherwise. + */ +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); + +#endif
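To round this out, here is a rough sketch (not code from libbpf or this tree) of how a caller inside libbpf might use the internal API above to locate a stored, i.e. uncompressed, entry; this is presumably the pattern used when attaching uprobes to libraries shipped inside zip containers such as Android .apk files, though that use is an assumption here. The helper name, archive path, and entry name are made up; IS_ERR() and PTR_ERR() are the libbpf-internal helpers from libbpf_internal.h.

/* zip_usage_sketch.c -- hypothetical libbpf-internal caller of zip.h */
#include <errno.h>
#include <stdio.h>

#include "libbpf_internal.h"
#include "zip.h"

static int find_stored_entry(const char *archive_path, const char *entry_name)
{
	struct zip_archive *archive;
	struct zip_entry entry;
	int err;

	archive = zip_archive_open(archive_path);
	if (IS_ERR(archive))            /* zip_archive_open() reports errors via ERR_PTR() */
		return PTR_ERR(archive);

	err = zip_archive_find_entry(archive, entry_name, &entry);
	if (!err && entry.compression != 0)
		err = -EINVAL;          /* only stored (uncompressed) entries are usable in place */
	if (!err)
		printf("%.*s: %u bytes at archive offset %u\n",
		       (int)entry.name_length, entry.name,
		       entry.data_length, entry.data_offset);

	zip_archive_close(archive);
	return err;
}

/* e.g., find_stored_entry("/data/app/base.apk", "lib/arm64-v8a/libapp.so"); -- paths are hypothetical */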