diff --git a/src/15-javagc/.gitignore b/src/15-javagc/.gitignore new file mode 100644 index 0000000..b069296 --- /dev/null +++ b/src/15-javagc/.gitignore @@ -0,0 +1,8 @@ +.vscode +package.json +*.o +*.skel.json +*.skel.yaml +package.yaml +ecli +javagc diff --git a/src/15-javagc/Makefile b/src/15-javagc/Makefile new file mode 100644 index 0000000..fd10894 --- /dev/null +++ b/src/15-javagc/Makefile @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../../libbpf/src) +BPFTOOL_SRC := $(abspath ../../bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +LIBBLAZESYM_SRC := $(abspath ../../blazesym/) +LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a) +LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h) +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../../vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = javagc # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall + +CARGO ?= $(shell which cargo) +ifeq ($(strip $(CARGO)),) +BZS_APPS := +else +BZS_APPS := # profile +APPS += $(BZS_APPS) +# Required by libblazesym +ALL_LDFLAGS += -lrt -ldl -lpthread -lm +endif + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		      "$(1)" \
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		      "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS)
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/src/15-javagc/README.md b/src/15-javagc/README.md
index 436cfeb..97a4002 100644
--- a/src/15-javagc/README.md
+++ b/src/15-javagc/README.md
@@ -1,3 +1,35 @@
 # eBPF Tutorial by Example: Capturing User-Space Java GC Event Duration with USDT
+## Introduction to USDT
+TODO
+
+## Java GC
+
+TODO
+
+## Install Dependencies
+
+Building the example requires clang, libelf and zlib. Package names may vary across distributions.
+
+On Ubuntu/Debian, run:
+
+```shell
+sudo apt install clang libelf1 libelf-dev zlib1g-dev
+```
+
+On CentOS/Fedora, run:
+
+```shell
+sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel
+```
+
+## Compile and Run
+
+Compile and run the above code:
+
+TODO
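+
+A plausible build-and-run sketch while the walkthrough above is still TODO (the `-p` and `-t` flags come from the usage text in javagc.c below; picking the JVM PID with `pidof java` is only illustrative):
+
+```shell
+make
+sudo ./javagc -p $(pidof java) -t 100   # report GC phases longer than 100 us
+```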
+
+## Summary
+
+TODO
diff --git a/src/15-javagc/javagc.bpf.c b/src/15-javagc/javagc.bpf.c
new file mode 100644
index 0000000..35535d9
--- /dev/null
+++ b/src/15-javagc/javagc.bpf.c
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2022 Chen Tao */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/usdt.bpf.h>
+#include "javagc.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 100);
+	__type(key, uint32_t);
+	__type(value, struct data_t);
+} data_map SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__type(key, int);
+	__type(value, int);
+} perf_map SEC(".maps");
+
+__u32 time;
+
+static int gc_start(struct pt_regs *ctx)
+{
+	struct data_t data = {};
+
+	data.cpu = bpf_get_smp_processor_id();
+	data.pid = bpf_get_current_pid_tgid() >> 32;
+	data.ts = bpf_ktime_get_ns();
+	bpf_map_update_elem(&data_map, &data.pid, &data, 0);
+	return 0;
+}
+
+static int gc_end(struct pt_regs *ctx)
+{
+	struct data_t data = {};
+	struct data_t *p;
+	__u32 val;
+
+	data.cpu = bpf_get_smp_processor_id();
+	data.pid = bpf_get_current_pid_tgid() >> 32;
+	data.ts = bpf_ktime_get_ns();
+	p = bpf_map_lookup_elem(&data_map, &data.pid);
+	if (!p)
+		return 0;
+
+	val = data.ts - p->ts;
+	if (val > time) {
+		data.ts = val;
+		bpf_perf_event_output(ctx, &perf_map, BPF_F_CURRENT_CPU, &data, sizeof(data));
+	}
+	bpf_map_delete_elem(&data_map, &data.pid);
+	return 0;
+}
+
+SEC("usdt")
+int handle_gc_start(struct pt_regs *ctx)
+{
+	return gc_start(ctx);
+}
+
+SEC("usdt")
+int handle_gc_end(struct pt_regs *ctx)
+{
+	return gc_end(ctx);
+}
+
+SEC("usdt")
+int handle_mem_pool_gc_start(struct pt_regs *ctx)
+{
+	return gc_start(ctx);
+}
+
+SEC("usdt")
+int handle_mem_pool_gc_end(struct pt_regs *ctx)
+{
+	return gc_end(ctx);
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/src/15-javagc/javagc.c b/src/15-javagc/javagc.c
new file mode 100644
index 0000000..883ae70
--- /dev/null
+++ b/src/15-javagc/javagc.c
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2022 Chen Tao
+ * Based on ugc from BCC by Sasha Goldshtein
+ * Create: Wed Jun 29 16:00:19 2022
+ */
+#include <argp.h>
+#include <ctype.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <bpf/libbpf.h>
+#include "javagc.skel.h"
+#include "javagc.h"
+
+#define BINARY_PATH_SIZE (256)
+#define PERF_BUFFER_PAGES (32)
+#define PERF_POLL_TIMEOUT_MS (200)
+
+static struct env {
+	pid_t pid;
+	int time;
+	bool exiting;
+	bool verbose;
+} env = {
+	.pid = -1,
+	.time = 1000,
+	.exiting = false,
+	.verbose = false,
+};
+
+const char *argp_program_version = "javagc 0.1";
+const char *argp_program_bug_address =
+	"https://github.com/iovisor/bcc/tree/master/libbpf-tools";
+
+const char argp_program_doc[] =
+"Monitor javagc time cost.\n"
+"\n"
+"USAGE: javagc [--help] [-p PID] [-t GC time]\n"
+"\n"
+"EXAMPLES:\n"
+"javagc -p 185 # trace PID 185 only\n"
+"javagc -p 185 -t 100 # trace PID 185 java gc time beyond 100us\n";
+
+static const struct argp_option opts[] = {
+	{ "pid", 'p', "PID", 0, "Trace this PID only" },
+	{ "time", 't', "TIME", 0, "Java gc time" },
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	int err = 0;
+
+	switch (key) {
+	case 'h':
+		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+		break;
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'p':
+		errno = 0;
+		env.pid = strtol(arg, NULL, 10);
+		if (errno) {
+			err = errno;
+			fprintf(stderr, "invalid PID: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 't':
+		errno = 0;
+		env.time = 
strtol(arg, NULL, 10);
+		if (errno) {
+			err = errno;
+			fprintf(stderr, "invalid time: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return err;
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+
+	return vfprintf(stderr, format, args);
+}
+
+static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
+{
+	struct data_t *e = (struct data_t *)data;
+	struct tm *tm = NULL;
+	char ts[16];
+	time_t t;
+
+	time(&t);
+	tm = localtime(&t);
+	strftime(ts, sizeof(ts), "%H:%M:%S", tm);
+	printf("%-8s %-7d %-7d %-7lld\n", ts, e->cpu, e->pid, e->ts/1000);
+}
+
+static void handle_lost_events(void *ctx, int cpu, __u64 data_sz)
+{
+	printf("lost data\n");
+}
+
+static void sig_handler(int sig)
+{
+	env.exiting = true;
+}
+
+static int get_jvmso_path(char *path)
+{
+	char mode[16], line[128], buf[64];
+	size_t seg_start, seg_end, seg_off;
+	FILE *f;
+	int i = 0;
+
+	sprintf(buf, "/proc/%d/maps", env.pid);
+	f = fopen(buf, "r");
+	if (!f)
+		return -1;
+
+	while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n",
+			&seg_start, &seg_end, mode, &seg_off, line) == 5) {
+		i = 0;
+		while (isblank(line[i]))
+			i++;
+		if (strstr(line + i, "libjvm.so")) {
+			break;
+		}
+	}
+
+	strcpy(path, line + i);
+	fclose(f);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+	};
+	char binary_path[BINARY_PATH_SIZE] = {0};
+	struct javagc_bpf *skel = NULL;
+	int err;
+	struct perf_buffer *pb = NULL;
+
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	/*
+	 * libbpf can auto-load a .so from standard locations such as /usr/lib64
+	 * and /usr/lib, but libjvm.so is not there, so resolve its full path
+	 * from /proc/<pid>/maps instead.
+	 */
+	err = get_jvmso_path(binary_path);
+	if (err)
+		return err;
+
+	libbpf_set_print(libbpf_print_fn);
+
+	skel = javagc_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open BPF skeleton\n");
+		return 1;
+	}
+	skel->bss->time = env.time * 1000;
+
+	err = javagc_bpf__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+		goto cleanup;
+	}
+
+	skel->links.handle_mem_pool_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid,
+									binary_path, "hotspot", "mem__pool__gc__begin", NULL);
+	if (!skel->links.handle_mem_pool_gc_start) {
+		err = errno;
+		fprintf(stderr, "attach usdt mem__pool__gc__begin failed: %s\n", strerror(err));
+		goto cleanup;
+	}
+
+	skel->links.handle_mem_pool_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid,
+								      binary_path, "hotspot", "mem__pool__gc__end", NULL);
+	if (!skel->links.handle_mem_pool_gc_end) {
+		err = errno;
+		fprintf(stderr, "attach usdt mem__pool__gc__end failed: %s\n", strerror(err));
+		goto cleanup;
+	}
+
+	skel->links.handle_gc_start = bpf_program__attach_usdt(skel->progs.handle_gc_start, env.pid,
+							       binary_path, "hotspot", "gc__begin", NULL);
+	if (!skel->links.handle_gc_start) {
+		err = errno;
+		fprintf(stderr, "attach usdt gc__begin failed: %s\n", strerror(err));
+		goto cleanup;
+	}
+
+	skel->links.handle_gc_end = bpf_program__attach_usdt(skel->progs.handle_gc_end, env.pid,
+							     binary_path, "hotspot", "gc__end", NULL);
+	if (!skel->links.handle_gc_end) {
+		err = errno;
+		fprintf(stderr, "attach usdt gc__end failed: %s\n", strerror(err));
+		goto cleanup;
+	}
+
+	signal(SIGINT, sig_handler);
+	printf("Tracing javagc time... 
Hit Ctrl-C to end.\n"); + printf("%-8s %-7s %-7s %-7s\n", + "TIME", "CPU", "PID", "GC TIME"); + + pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_map), PERF_BUFFER_PAGES, + handle_event, handle_lost_events, NULL, NULL); + while (!env.exiting) { + err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS); + if (err < 0 && err != -EINTR) { + fprintf(stderr, "error polling perf buffer: %s\n", strerror(-err)); + goto cleanup; + } + /* reset err to return 0 if exiting */ + err = 0; + } + +cleanup: + perf_buffer__free(pb); + javagc_bpf__destroy(skel); + + return err != 0; +} diff --git a/src/15-javagc/javagc.h b/src/15-javagc/javagc.h new file mode 100644 index 0000000..878f7db --- /dev/null +++ b/src/15-javagc/javagc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2022 Chen Tao */ +#ifndef __JAVAGC_H +#define __JAVAGC_H + +struct data_t { + __u32 cpu; + __u32 pid; + __u64 ts; +}; + +#endif /* __JAVAGC_H */ diff --git a/src/16-memleak/.gitignore b/src/16-memleak/.gitignore new file mode 100644 index 0000000..3bbbd45 --- /dev/null +++ b/src/16-memleak/.gitignore @@ -0,0 +1,8 @@ +.vscode +package.json +*.o +*.skel.json +*.skel.yaml +package.yaml +ecli +memleak diff --git a/src/16-memleak/Makefile b/src/16-memleak/Makefile new file mode 100644 index 0000000..84ead7e --- /dev/null +++ b/src/16-memleak/Makefile @@ -0,0 +1,141 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../../libbpf/src) +BPFTOOL_SRC := $(abspath ../../bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +LIBBLAZESYM_SRC := $(abspath ../../blazesym/) +LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a) +LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h) +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../../vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = memleak # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall + +CARGO ?= $(shell which cargo) +ifeq ($(strip $(CARGO)),) +BZS_APPS := +else +BZS_APPS := # profile +APPS += $(BZS_APPS) +# Required by libblazesym +ALL_LDFLAGS += -lrt -ldl -lpthread -lm +endif + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		      "$(1)" \
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		      "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS)
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/src/16-memleak/README.md b/src/16-memleak/README.md
index 285b099..d1eec0d 100644
--- a/src/16-memleak/README.md
+++ b/src/16-memleak/README.md
@@ -18,53 +18,55 @@
 ```c
 struct {
-    __uint(type, BPF_MAP_TYPE_HASH);
-    __type(key, pid_t);
-    __type(value, u64);
-    __uint(max_entries, 10240);
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, pid_t);
+	__type(value, u64);
+	__uint(max_entries, 10240);
 } sizes SEC(".maps");
 
 struct {
-    __uint(type, BPF_MAP_TYPE_HASH);
-    __type(key, u64); /* address */
-    __type(value, struct alloc_info);
-    __uint(max_entries, ALLOCS_MAX_ENTRIES);
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u64); /* address */
+	__type(value, struct alloc_info);
+	__uint(max_entries, ALLOCS_MAX_ENTRIES);
 } allocs SEC(".maps");
 
 struct {
-    __uint(type, BPF_MAP_TYPE_HASH);
-    __type(key, u64); /* stack id */
-    __type(value, union combined_alloc_info);
-    __uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES);
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u64); /* stack id */
+	__type(value, union combined_alloc_info);
+	__uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES);
 } combined_allocs SEC(".maps");
 
 struct {
-    __uint(type, BPF_MAP_TYPE_HASH);
-    __type(key, u64);
-    __type(value, u64);
-    __uint(max_entries, 10240);
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, u64);
+	__type(value, u64);
+	__uint(max_entries, 10240);
 } memptrs SEC(".maps");
 
 struct {
-    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
-    __type(key, u32);
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__type(key, u32);
 } stack_traces SEC(".maps");
 
 struct alloc_info {
-    __u64 size;
-    __u64 timestamp_ns;
-    int stack_id;
+	__u64 size;
+	__u64 timestamp_ns;
+	int stack_id;
 };
 
 union combined_alloc_info {
-    struct {
-        __u64 total_size : 40;
-        __u64 number_of_allocs : 24;
-    };
-    __u64 bits;
+	struct {
+		__u64 total_size : 40;
+		__u64 number_of_allocs : 24;
+	};
+	__u64 bits;
 };
 ```
+
+This code defines the five BPF maps used by the memleak tool:
+
+sizes records the size of each allocation request made by the program;
+allocs tracks the details of each allocation request, including the requested size and stack information;
+combined_allocs is keyed by the unique stack identifier (stack id), and its value is a combined_alloc_info union recording the total allocated size and the number of allocations for that stack;
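+
+Because `total_size` and `number_of_allocs` share the single `__u64 bits` field, one atomic `__sync_fetch_and_add` on `bits` updates both counters at once. A small user-space illustration of the same packing (a standalone sketch, not part of the tool; the bit-field layout is assumed to follow the usual little-endian C ABI):
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+
+union combined_alloc_info {
+	struct {
+		uint64_t total_size : 40;       /* lower 40 bits: bytes outstanding */
+		uint64_t number_of_allocs : 24; /* upper 24 bits: allocation count */
+	};
+	uint64_t bits;
+};
+
+int main(void)
+{
+	union combined_alloc_info acc = { .bits = 0 };
+	/* one 4096-byte allocation: adds 4096 to total_size and 1 to number_of_allocs */
+	const union combined_alloc_info inc = { .total_size = 4096, .number_of_allocs = 1 };
+
+	acc.bits += inc.bits; /* in BPF this is __sync_fetch_and_add(&acc.bits, inc.bits) */
+	printf("total=%llu count=%llu\n",
+	       (unsigned long long)acc.total_size,
+	       (unsigned long long)acc.number_of_allocs);
+	return 0;
+}
+```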
@@ -76,29 +78,30 @@ union combined_alloc_info {
 ```c
 static int gen_alloc_enter(size_t size)
 {
-    if (size < min_size || size > max_size)
-        return 0;
+	if (size < min_size || size > max_size)
+		return 0;
 
-    if (sample_rate > 1) {
-        if (bpf_ktime_get_ns() % sample_rate != 0)
-            return 0;
-    }
+	if (sample_rate > 1) {
+		if (bpf_ktime_get_ns() % sample_rate != 0)
+			return 0;
+	}
 
-    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
-    bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY);
+	const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+	bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY);
 
-    if (trace_all)
-        bpf_printk("alloc entered, size = %lu\n", size);
+	if (trace_all)
+		bpf_printk("alloc entered, size = %lu\n", size);
 
-    return 0;
+	return 0;
 }
 
 SEC("uprobe")
 int BPF_KPROBE(malloc_enter, size_t size)
 {
-    return gen_alloc_enter(size);
+	return gen_alloc_enter(size);
 }
 ```
+
+This function handles the entry of a memory allocation request. It first checks whether the requested size is within the configured range; if not, it returns 0 and the event is ignored. If sampling is enabled (sample_rate > 1), the function samples allocation entry events: when the current timestamp is not a multiple of the sampling period, it likewise returns 0. Next, it obtains the current thread's PID and stores the PID together with the requested allocation size in the sizes map, so the allocation can be matched up and analyzed later. If trace mode (trace_all) is enabled, the function prints a log message via bpf_printk so the user can monitor allocations in real time.
+Finally, BPF_KPROBE(malloc_enter, size_t size) is defined; it is executed by a BPF uprobe whenever malloc is called and records the allocation size through gen_alloc_enter.
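+
+The same gen_alloc_enter helper backs entry probes for the other allocator functions that attach_uprobes in memleak.c below attaches to (calloc, realloc, posix_memalign and so on). A sketch of what the calloc entry probe looks like (illustrative, assuming the same helpers are in scope):
+
+```c
+SEC("uprobe")
+int BPF_KPROBE(calloc_enter, size_t nmemb, size_t size)
+{
+	/* calloc requests nmemb * size bytes in total */
+	return gen_alloc_enter(nmemb * size);
+}
+```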
@@ -106,59 +109,59 @@
 ```c
 static void update_statistics_add(u64 stack_id, u64 sz)
 {
-    union combined_alloc_info *existing_cinfo;
+	union combined_alloc_info *existing_cinfo;
 
-    existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo);
-    if (!existing_cinfo)
-        return;
+	existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo);
+	if (!existing_cinfo)
+		return;
 
-    const union combined_alloc_info incremental_cinfo = {
-        .total_size = sz,
-        .number_of_allocs = 1
-    };
+	const union combined_alloc_info incremental_cinfo = {
+		.total_size = sz,
+		.number_of_allocs = 1
+	};
 
-    __sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits);
+	__sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits);
 }
 
 static int gen_alloc_exit2(void *ctx, u64 address)
 {
-    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
-    struct alloc_info info;
+	const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+	struct alloc_info info;
 
-    const u64* size = bpf_map_lookup_elem(&sizes, &pid);
-    if (!size)
-        return 0; // missed alloc entry
+	const u64* size = bpf_map_lookup_elem(&sizes, &pid);
+	if (!size)
+		return 0; // missed alloc entry
 
-    __builtin_memset(&info, 0, sizeof(info));
+	__builtin_memset(&info, 0, sizeof(info));
 
-    info.size = *size;
-    bpf_map_delete_elem(&sizes, &pid);
+	info.size = *size;
+	bpf_map_delete_elem(&sizes, &pid);
 
-    if (address != 0) {
-        info.timestamp_ns = bpf_ktime_get_ns();
+	if (address != 0) {
+		info.timestamp_ns = bpf_ktime_get_ns();
 
-        info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
+		info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
 
-        bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
+		bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
 
-        update_statistics_add(info.stack_id, info.size);
-    }
+		update_statistics_add(info.stack_id, info.size);
+	}
 
-    if (trace_all) {
-        bpf_printk("alloc exited, size = %lu, result = %lx\n",
-            info.size, address);
-    }
+	if (trace_all) {
+		bpf_printk("alloc exited, size = %lu, result = %lx\n",
+			info.size, address);
+	}
 
-    return 0;
+	return 0;
 }
 
 static int gen_alloc_exit(struct pt_regs *ctx)
 {
-    return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
+	return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
 }
 
 SEC("uretprobe")
 int BPF_KRETPROBE(malloc_exit)
 {
-    return gen_alloc_exit(ctx);
+	return gen_alloc_exit(ctx);
 }
 ```
 
@@ -175,51 +178,53 @@ The main job of update_statistics_add is to update the memory allocation statistics
 In gen_alloc_exit, the ctx argument is passed to gen_alloc_exit2, and its return value becomes gen_alloc_exit's own return value. The PT_REGS_RC macro is used here to obtain the probed function's return value.
 Finally, BPF_KRETPROBE(malloc_exit) is a kretprobe-type program that runs when malloc returns; it calls gen_alloc_exit to track allocation and free requests.
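+
+User space periodically walks these maps to build its report; print_outstanding_combined_allocs in memleak.c below does exactly that. A condensed sketch of the iteration (illustrative; error handling omitted, fd stands for the combined_allocs map file descriptor, and union combined_alloc_info from memleak.h is assumed to be in scope):
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+#include <bpf/bpf.h>
+
+/* walk the combined_allocs map and print per-stack totals */
+static void dump_combined(int fd)
+{
+	uint64_t prev_key = 0, key = 0;
+	union combined_alloc_info info;
+
+	while (bpf_map_get_next_key(fd, &prev_key, &key) == 0) {
+		if (bpf_map_lookup_elem(fd, &key, &info) == 0)
+			printf("stack %llu: %llu bytes in %llu allocations\n",
+			       (unsigned long long)key,
+			       (unsigned long long)info.total_size,
+			       (unsigned long long)info.number_of_allocs);
+		prev_key = key;
+	}
+}
+```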
 ```c
 static void update_statistics_del(u64 stack_id, u64 sz)
 {
-    union combined_alloc_info *existing_cinfo;
+	union combined_alloc_info *existing_cinfo;
 
-    existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
-    if (!existing_cinfo) {
-        bpf_printk("failed to lookup combined allocs\n");
+	existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
+	if (!existing_cinfo) {
+		bpf_printk("failed to lookup combined allocs\n");
 
-        return;
-    }
+		return;
+	}
 
-    const union combined_alloc_info decremental_cinfo = {
-        .total_size = sz,
-        .number_of_allocs = 1
-    };
+	const union combined_alloc_info decremental_cinfo = {
+		.total_size = sz,
+		.number_of_allocs = 1
+	};
 
-    __sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
+	__sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
 }
 
 static int gen_free_enter(const void *address)
 {
-    const u64 addr = (u64)address;
+	const u64 addr = (u64)address;
 
-    const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
-    if (!info)
-        return 0;
+	const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
+	if (!info)
+		return 0;
 
-    bpf_map_delete_elem(&allocs, &addr);
-    update_statistics_del(info->stack_id, info->size);
+	bpf_map_delete_elem(&allocs, &addr);
+	update_statistics_del(info->stack_id, info->size);
 
-    if (trace_all) {
-        bpf_printk("free entered, address = %lx, size = %lu\n",
-            address, info->size);
-    }
+	if (trace_all) {
+		bpf_printk("free entered, address = %lx, size = %lu\n",
+			address, info->size);
+	}
 
-    return 0;
+	return 0;
 }
 
 SEC("uprobe")
 int BPF_KPROBE(free_enter, void *address)
 {
-    return gen_free_enter(address);
+	return gen_free_enter(address);
 }
 ```
+
+gen_free_enter receives an address as its argument. It first looks the address up in the allocs map; if nothing is found, the address was never recorded as allocated and the function returns 0. If the allocation info is found, it is removed from the allocs map with bpf_map_delete_elem.
+Next, update_statistics_del is called to update the allocation statistics, taking the stack id and the block size as arguments. It looks up the statistics for that stack id in the combined_allocs map; if no entry is found, it logs the lookup failure and returns. If the entry is found, an atomic operation subtracts the block's size and a count of 1 from the accumulated statistics, since the stack now has one outstanding allocation fewer and its total outstanding size has shrunk by that block's size.
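+
+To exercise the tool, you can trace a deliberately leaky program. A minimal hypothetical example (compile it yourself, e.g. `gcc -o leaker leaker.c`, then run `sudo ./memleak -p $(pidof leaker)`):
+
+```c
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(void)
+{
+	for (;;) {
+		/* allocate 1 KiB every second and never free it */
+		char *p = malloc(1024);
+		if (p)
+			memset(p, 0, 1024); /* touch the memory so it is really committed */
+		sleep(1);
+	}
+	return 0;
+}
+```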
 
@@ -253,4 +258,4 @@ Tracing outstanding memory allocs... Hit Ctrl-C to end
 
 memleak is a memory-leak monitoring tool that tracks the call stacks associated with memory allocation and free events. Over time, it can show memory that has remained unreleased for a long period.
 
-This code comes from https://github.com/iovisor/bcc/blob/master/libbpf-tools/memleak.bpf.c
+This code comes from <https://github.com/iovisor/bcc/blob/master/libbpf-tools/memleak.bpf.c>
diff --git a/src/16-memleak/core_fixes.bpf.h b/src/16-memleak/core_fixes.bpf.h
new file mode 100644
index 0000000..552c9fa
--- /dev/null
+++ b/src/16-memleak/core_fixes.bpf.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2021 Hengqi Chen */
+
+#ifndef __CORE_FIXES_BPF_H
+#define __CORE_FIXES_BPF_H
+
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+
+/**
+ * commit 2f064a59a1 ("sched: Change task_struct::state") changes
+ * the name of task_struct::state to task_struct::__state
+ * see:
+ *     https://github.com/torvalds/linux/commit/2f064a59a1
+ */
+struct task_struct___o {
+	volatile long int state;
+} __attribute__((preserve_access_index));
+
+struct task_struct___x {
+	unsigned int __state;
+} __attribute__((preserve_access_index));
+
+static __always_inline __s64 get_task_state(void *task)
+{
+	struct task_struct___x *t = task;
+
+	if (bpf_core_field_exists(t->__state))
+		return BPF_CORE_READ(t, __state);
+	return BPF_CORE_READ((struct task_struct___o *)task, state);
+}
+
+/**
+ * commit 309dca309fc3 ("block: store a block_device pointer in struct bio")
+ * adds a new member bi_bdev which is a pointer to struct block_device
+ * see:
+ *     https://github.com/torvalds/linux/commit/309dca309fc3
+ */
+struct bio___o {
+	struct gendisk *bi_disk;
+} __attribute__((preserve_access_index));
+
+struct bio___x {
+	struct block_device *bi_bdev;
+} __attribute__((preserve_access_index));
+
+static __always_inline struct gendisk *get_gendisk(void *bio)
+{
+	struct bio___x *b = bio;
+
+	if (bpf_core_field_exists(b->bi_bdev))
+		return BPF_CORE_READ(b, bi_bdev, bd_disk);
+	return BPF_CORE_READ((struct bio___o *)bio, bi_disk);
+}
+
+/**
+ * commit d5869fdc189f ("block: introduce block_rq_error tracepoint")
+ * adds a new tracepoint block_rq_error and it shares the same arguments
+ * with tracepoint block_rq_complete. As a result, the kernel BTF now has
+ * a `struct trace_event_raw_block_rq_completion` instead of
+ * `struct trace_event_raw_block_rq_complete`.
+ * see:
+ *     https://github.com/torvalds/linux/commit/d5869fdc189f
+ */
+struct trace_event_raw_block_rq_complete___x {
+	dev_t dev;
+	sector_t sector;
+	unsigned int nr_sector;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_block_rq_completion___x {
+	dev_t dev;
+	sector_t sector;
+	unsigned int nr_sector;
+} __attribute__((preserve_access_index));
+
+static __always_inline bool has_block_rq_completion()
+{
+	if (bpf_core_type_exists(struct trace_event_raw_block_rq_completion___x))
+		return true;
+	return false;
+}
+
+/**
+ * commit d152c682f03c ("block: add an explicit ->disk backpointer to the
+ * request_queue") and commit f3fa33acca9f ("block: remove the ->rq_disk
+ * field in struct request") make some changes to `struct request` and
+ * `struct request_queue`. Now, to get the `struct gendisk *` field in a CO-RE
+ * way, we need both `struct request` and `struct request_queue`.
+ * see:
+ *     https://github.com/torvalds/linux/commit/d152c682f03c
+ *     https://github.com/torvalds/linux/commit/f3fa33acca9f
+ */
+struct request_queue___x {
+	struct gendisk *disk;
+} __attribute__((preserve_access_index));
+
+struct request___x {
+	struct request_queue___x *q;
+	struct gendisk *rq_disk;
+} __attribute__((preserve_access_index));
+
+static __always_inline struct gendisk *get_disk(void *request)
+{
+	struct request___x *r = request;
+
+	if (bpf_core_field_exists(r->rq_disk))
+		return BPF_CORE_READ(r, rq_disk);
+	return BPF_CORE_READ(r, q, disk);
+}
+
+/**
+ * commit 6521f8917082("namei: prepare for idmapped mounts") add `struct
+ * user_namespace *mnt_userns` as vfs_create() and vfs_unlink() first argument.
+ * At the same time, struct renamedata {} add `struct user_namespace
+ * *old_mnt_userns` item. Now, to kprobe vfs_create()/vfs_unlink() in a CO-RE
+ * way, determine whether there is a `old_mnt_userns` field for `struct
+ * renamedata` to decide which input parameter of the vfs_create() to use as
+ * `dentry`.
+ * see:
+ *     https://github.com/torvalds/linux/commit/6521f8917082
+ */
+struct renamedata___x {
+	struct user_namespace *old_mnt_userns;
+} __attribute__((preserve_access_index));
+
+static __always_inline bool renamedata_has_old_mnt_userns_field(void)
+{
+	if (bpf_core_field_exists(struct renamedata___x, old_mnt_userns))
+		return true;
+	return false;
+}
+
+/**
+ * commit 3544de8ee6e4("mm, tracing: record slab name for kmem_cache_free()")
+ * replaces `trace_event_raw_kmem_free` with `trace_event_raw_kfree` and adds
+ * `tracepoint_kmem_cache_free` to enhance the information recorded for
+ * `kmem_cache_free`.
+ * see:
+ *     https://github.com/torvalds/linux/commit/3544de8ee6e4
+ */
+
+struct trace_event_raw_kmem_free___x {
+	const void *ptr;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_kfree___x {
+	const void *ptr;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_kmem_cache_free___x {
+	const void *ptr;
+} __attribute__((preserve_access_index));
+
+static __always_inline bool has_kfree()
+{
+	if (bpf_core_type_exists(struct trace_event_raw_kfree___x))
+		return true;
+	return false;
+}
+
+static __always_inline bool has_kmem_cache_free()
+{
+	if (bpf_core_type_exists(struct trace_event_raw_kmem_cache_free___x))
+		return true;
+	return false;
+}
+
+#endif /* __CORE_FIXES_BPF_H */
diff --git a/src/16-memleak/maps.bpf.h b/src/16-memleak/maps.bpf.h
new file mode 100644
index 0000000..51d1012
--- /dev/null
+++ b/src/16-memleak/maps.bpf.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2020 Anton Protopopov
+#ifndef __MAPS_BPF_H
+#define __MAPS_BPF_H
+
+#include <bpf/bpf_helpers.h>
+#include <asm-generic/errno.h>
+
+static __always_inline void *
+bpf_map_lookup_or_try_init(void *map, const void *key, const void *init)
+{
+	void *val;
+	long err;
+
+	val = bpf_map_lookup_elem(map, key);
+	if (val)
+		return val;
+
+	err = bpf_map_update_elem(map, key, init, BPF_NOEXIST);
+	if (err && err != -EEXIST)
+		return 0;
+
+	return bpf_map_lookup_elem(map, key);
+}
+
+#endif /* __MAPS_BPF_H */
diff --git a/src/16-memleak/memleak.bpf.c b/src/16-memleak/memleak.bpf.c
index ac35a55..aa213c8 100644
--- a/src/16-memleak/memleak.bpf.c
+++ b/src/16-memleak/memleak.bpf.c
@@ -337,7 +337,7 @@ int memleak__kfree(void *ctx)
 		ptr = BPF_CORE_READ(args, ptr);
 	}
 
-	return gen_free_enter((void *)ptr);
+	return gen_free_enter(ptr);
 }
 
 SEC("tracepoint/kmem/kmem_cache_alloc")
@@ -375,7 +375,7 @@ int memleak__kmem_cache_free(void *ctx)
 		ptr = BPF_CORE_READ(args, ptr);
 	}
 
-	return gen_free_enter((void *)ptr);
+	return gen_free_enter(ptr);
 }
 
 SEC("tracepoint/kmem/mm_page_alloc")
diff --git a/src/16-memleak/memleak.c b/src/16-memleak/memleak.c
new file mode 100644
index 0000000..1f28ebd
--- /dev/null
+++ b/src/16-memleak/memleak.c
@@ -0,0 +1,1068 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+//
+// Based on memleak(8) from BCC by Sasha Goldshtein and others.
+// 1-Mar-2023   JP Kobryn   Created this.
+#include <argp.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/eventfd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "memleak.h"
+#include "memleak.skel.h"
+#include "trace_helpers.h"
+
+#ifdef USE_BLAZESYM
+#include "blazesym.h"
+#endif
+
+static struct env {
+	int interval;
+	int nr_intervals;
+	pid_t pid;
+	bool trace_all;
+	bool show_allocs;
+	bool combined_only;
+	int min_age_ns;
+	uint64_t sample_rate;
+	int top_stacks;
+	size_t min_size;
+	size_t max_size;
+	char object[32];
+
+	bool wa_missing_free;
+	bool percpu;
+	int perf_max_stack_depth;
+	int stack_map_max_entries;
+	long page_size;
+	bool kernel_trace;
+	bool verbose;
+	char command[32];
+} env = {
+	.interval = 5, // posarg 1
+	.nr_intervals = -1, // posarg 2
+	.pid = -1, // -p --pid
+	.trace_all = false, // -t --trace
+	.show_allocs = false, // -a --show-allocs
+	.combined_only = false, // --combined-only
+	.min_age_ns = 500, // -o --older (arg * 1e6)
+	.wa_missing_free = false, // --wa-missing-free
+	.sample_rate = 1, // -s --sample-rate
+	.top_stacks = 10, // -T --top
+	.min_size = 0, // -z --min-size
+	.max_size = -1, // -Z --max-size
+	.object = {0}, // -O --obj
+	.percpu = false, // --percpu
+	.perf_max_stack_depth = 127,
+	.stack_map_max_entries = 10240,
+	.page_size = 1,
+	.kernel_trace = true,
+	.verbose = false,
+	.command = {0}, // -c --command
+};
+
+struct allocation_node {
+	uint64_t address;
+	size_t size;
+	struct allocation_node* next;
+};
+
+struct allocation {
+	uint64_t stack_id;
+	size_t size;
+	size_t count;
+	struct allocation_node* allocations;
+};
+
+#define __ATTACH_UPROBE(skel, sym_name, prog_name, is_retprobe) \
+	do { \
+		LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, \
+				.func_name = #sym_name, \
+				.retprobe = is_retprobe); \
+		skel->links.prog_name = bpf_program__attach_uprobe_opts( \
+				skel->progs.prog_name, \
+				env.pid, \
+				env.object, \
+				0, \
+				&uprobe_opts); \
+	} while (false)
+
+#define __CHECK_PROGRAM(skel, prog_name) \
+	do { \
+		if (!skel->links.prog_name) { \
+			perror("no program attached for " #prog_name); \
+			return -errno; \
+		} \
+	} while (false)
+
+#define __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, is_retprobe) \
+	do { \
+		__ATTACH_UPROBE(skel, sym_name, prog_name, is_retprobe); \
+		__CHECK_PROGRAM(skel, prog_name); \
+	} while (false)
+
+#define ATTACH_UPROBE(skel, sym_name, prog_name) __ATTACH_UPROBE(skel, sym_name, prog_name, false)
+#define ATTACH_URETPROBE(skel, sym_name, prog_name) __ATTACH_UPROBE(skel, sym_name, prog_name, true)
+
+#define ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name) __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, false)
+#define ATTACH_URETPROBE_CHECKED(skel, sym_name, prog_name) __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, true)
+
+static void sig_handler(int signo);
+
+static long argp_parse_long(int key, const char *arg, struct argp_state *state);
+static error_t argp_parse_arg(int key, char *arg, struct argp_state *state);
+
+static int libbpf_print_fn(enum 
libbpf_print_level level, const char *format, va_list args); + +static int event_init(int *fd); +static int event_wait(int fd, uint64_t expected_event); +static int event_notify(int fd, uint64_t event); + +static pid_t fork_sync_exec(const char *command, int fd); + +#ifdef USE_BLAZESYM +static void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const blazesym_csym *sym); +static void print_stack_frames_by_blazesym(); +#else +static void print_stack_frames_by_ksyms(); +static void print_stack_frames_by_syms_cache(); +#endif +static int print_stack_frames(struct allocation *allocs, size_t nr_allocs, int stack_traces_fd); + +static int alloc_size_compare(const void *a, const void *b); + +static int print_outstanding_allocs(int allocs_fd, int stack_traces_fd); +static int print_outstanding_combined_allocs(int combined_allocs_fd, int stack_traces_fd); + +static bool has_kernel_node_tracepoints(); +static void disable_kernel_node_tracepoints(struct memleak_bpf *skel); +static void disable_kernel_percpu_tracepoints(struct memleak_bpf *skel); +static void disable_kernel_tracepoints(struct memleak_bpf *skel); + +static int attach_uprobes(struct memleak_bpf *skel); + +const char *argp_program_version = "memleak 0.1"; +const char *argp_program_bug_address = + "https://github.com/iovisor/bcc/tree/master/libbpf-tools"; + +const char argp_args_doc[] = +"Trace outstanding memory allocations\n" +"\n" +"USAGE: memleak [-h] [-c COMMAND] [-p PID] [-t] [-n] [-a] [-o AGE_MS] [-C] [-F] [-s SAMPLE_RATE] [-T TOP_STACKS] [-z MIN_SIZE] [-Z MAX_SIZE] [-O OBJECT] [-P] [INTERVAL] [INTERVALS]\n" +"\n" +"EXAMPLES:\n" +"./memleak -p $(pidof allocs)\n" +" Trace allocations and display a summary of 'leaked' (outstanding)\n" +" allocations every 5 seconds\n" +"./memleak -p $(pidof allocs) -t\n" +" Trace allocations and display each individual allocator function call\n" +"./memleak -ap $(pidof allocs) 10\n" +" Trace allocations and display allocated addresses, sizes, and stacks\n" +" every 10 seconds for outstanding allocations\n" +"./memleak -c './allocs'\n" +" Run the specified command and trace its allocations\n" +"./memleak\n" +" Trace allocations in kernel mode and display a summary of outstanding\n" +" allocations every 5 seconds\n" +"./memleak -o 60000\n" +" Trace allocations in kernel mode and display a summary of outstanding\n" +" allocations that are at least one minute (60 seconds) old\n" +"./memleak -s 5\n" +" Trace roughly every 5th allocation, to reduce overhead\n" +""; + +static const struct argp_option argp_options[] = { + // name/longopt:str, key/shortopt:int, arg:str, flags:int, doc:str + {"pid", 'p', "PID", 0, "process ID to trace. 
if not specified, trace kernel allocs"}, + {"trace", 't', 0, 0, "print trace messages for each alloc/free call" }, + {"show-allocs", 'a', 0, 0, "show allocation addresses and sizes as well as call stacks"}, + {"older", 'o', "AGE_MS", 0, "prune allocations younger than this age in milliseconds"}, + {"command", 'c', "COMMAND", 0, "execute and trace the specified command"}, + {"combined-only", 'C', 0, 0, "show combined allocation statistics only"}, + {"wa-missing-free", 'F', 0, 0, "workaround to alleviate misjudgments when free is missing"}, + {"sample-rate", 's', "SAMPLE_RATE", 0, "sample every N-th allocation to decrease the overhead"}, + {"top", 'T', "TOP_STACKS", 0, "display only this many top allocating stacks (by size)"}, + {"min-size", 'z', "MIN_SIZE", 0, "capture only allocations larger than this size"}, + {"max-size", 'Z', "MAX_SIZE", 0, "capture only allocations smaller than this size"}, + {"obj", 'O', "OBJECT", 0, "attach to allocator functions in the specified object"}, + {"percpu", 'P', NULL, 0, "trace percpu allocations"}, + {}, +}; + +static volatile sig_atomic_t exiting; +static volatile sig_atomic_t child_exited; + +static struct sigaction sig_action = { + .sa_handler = sig_handler +}; + +static int child_exec_event_fd = -1; + +#ifdef USE_BLAZESYM +static blazesym *symbolizer; +static sym_src_cfg src_cfg; +#else +struct syms_cache *syms_cache; +struct ksyms *ksyms; +#endif +static void (*print_stack_frames_func)(); + +static uint64_t *stack; + +static struct allocation *allocs; + +static const char default_object[] = "libc.so.6"; + +int main(int argc, char *argv[]) +{ + int ret = 0; + struct memleak_bpf *skel = NULL; + + static const struct argp argp = { + .options = argp_options, + .parser = argp_parse_arg, + .doc = argp_args_doc, + }; + + // parse command line args to env settings + if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) { + fprintf(stderr, "failed to parse args\n"); + + goto cleanup; + } + + // install signal handler + if (sigaction(SIGINT, &sig_action, NULL) || sigaction(SIGCHLD, &sig_action, NULL)) { + perror("failed to set up signal handling"); + ret = -errno; + + goto cleanup; + } + + // post-processing and validation of env settings + if (env.min_size > env.max_size) { + fprintf(stderr, "min size (-z) can't be greater than max_size (-Z)\n"); + return 1; + } + + if (!strlen(env.object)) { + printf("using default object: %s\n", default_object); + strncpy(env.object, default_object, sizeof(env.object) - 1); + } + + env.page_size = sysconf(_SC_PAGE_SIZE); + printf("using page size: %ld\n", env.page_size); + + env.kernel_trace = env.pid < 0 && !strlen(env.command); + printf("tracing kernel: %s\n", env.kernel_trace ? 
"true" : "false"); + + // if specific userspace program was specified, + // create the child process and use an eventfd to synchronize the call to exec() + if (strlen(env.command)) { + if (env.pid >= 0) { + fprintf(stderr, "cannot specify both command and pid\n"); + ret = 1; + + goto cleanup; + } + + if (event_init(&child_exec_event_fd)) { + fprintf(stderr, "failed to init child event\n"); + + goto cleanup; + } + + const pid_t child_pid = fork_sync_exec(env.command, child_exec_event_fd); + if (child_pid < 0) { + perror("failed to spawn child process"); + ret = -errno; + + goto cleanup; + } + + env.pid = child_pid; + } + + // allocate space for storing a stack trace + stack = calloc(env.perf_max_stack_depth, sizeof(*stack)); + if (!stack) { + fprintf(stderr, "failed to allocate stack array\n"); + ret = -ENOMEM; + + goto cleanup; + } + +#ifdef USE_BLAZESYM + if (env.pid < 0) { + src_cfg.src_type = SRC_T_KERNEL; + src_cfg.params.kernel.kallsyms = NULL; + src_cfg.params.kernel.kernel_image = NULL; + } else { + src_cfg.src_type = SRC_T_PROCESS; + src_cfg.params.process.pid = env.pid; + } +#endif + + // allocate space for storing "allocation" structs + if (env.combined_only) + allocs = calloc(COMBINED_ALLOCS_MAX_ENTRIES, sizeof(*allocs)); + else + allocs = calloc(ALLOCS_MAX_ENTRIES, sizeof(*allocs)); + + if (!allocs) { + fprintf(stderr, "failed to allocate array\n"); + ret = -ENOMEM; + + goto cleanup; + } + + libbpf_set_print(libbpf_print_fn); + + skel = memleak_bpf__open(); + if (!skel) { + fprintf(stderr, "failed to open bpf object\n"); + ret = 1; + + goto cleanup; + } + + skel->rodata->min_size = env.min_size; + skel->rodata->max_size = env.max_size; + skel->rodata->page_size = env.page_size; + skel->rodata->sample_rate = env.sample_rate; + skel->rodata->trace_all = env.trace_all; + skel->rodata->stack_flags = env.kernel_trace ? 
0 : BPF_F_USER_STACK;
+	skel->rodata->wa_missing_free = env.wa_missing_free;
+
+	bpf_map__set_value_size(skel->maps.stack_traces,
+			env.perf_max_stack_depth * sizeof(unsigned long));
+	bpf_map__set_max_entries(skel->maps.stack_traces, env.stack_map_max_entries);
+
+	// disable kernel tracepoints based on settings or availability
+	if (env.kernel_trace) {
+		if (!has_kernel_node_tracepoints())
+			disable_kernel_node_tracepoints(skel);
+
+		if (!env.percpu)
+			disable_kernel_percpu_tracepoints(skel);
+	} else {
+		disable_kernel_tracepoints(skel);
+	}
+
+	ret = memleak_bpf__load(skel);
+	if (ret) {
+		fprintf(stderr, "failed to load bpf object\n");
+
+		goto cleanup;
+	}
+
+	const int allocs_fd = bpf_map__fd(skel->maps.allocs);
+	const int combined_allocs_fd = bpf_map__fd(skel->maps.combined_allocs);
+	const int stack_traces_fd = bpf_map__fd(skel->maps.stack_traces);
+
+	// if userspace oriented, attach uprobes
+	if (!env.kernel_trace) {
+		ret = attach_uprobes(skel);
+		if (ret) {
+			fprintf(stderr, "failed to attach uprobes\n");
+
+			goto cleanup;
+		}
+	}
+
+	ret = memleak_bpf__attach(skel);
+	if (ret) {
+		fprintf(stderr, "failed to attach bpf program(s)\n");
+
+		goto cleanup;
+	}
+
+	// if running a specific userspace program,
+	// notify the child process that it can exec its program
+	if (strlen(env.command)) {
+		ret = event_notify(child_exec_event_fd, 1);
+		if (ret) {
+			fprintf(stderr, "failed to notify child to perform exec\n");
+
+			goto cleanup;
+		}
+	}
+
+#ifdef USE_BLAZESYM
+	symbolizer = blazesym_new();
+	if (!symbolizer) {
+		fprintf(stderr, "Failed to load blazesym\n");
+		ret = -ENOMEM;
+
+		goto cleanup;
+	}
+	print_stack_frames_func = print_stack_frames_by_blazesym;
+#else
+	if (env.kernel_trace) {
+		ksyms = ksyms__load();
+		if (!ksyms) {
+			fprintf(stderr, "Failed to load ksyms\n");
+			ret = -ENOMEM;
+
+			goto cleanup;
+		}
+		print_stack_frames_func = print_stack_frames_by_ksyms;
+	} else {
+		syms_cache = syms_cache__new(0);
+		if (!syms_cache) {
+			fprintf(stderr, "Failed to create syms_cache\n");
+			ret = -ENOMEM;
+
+			goto cleanup;
+		}
+		print_stack_frames_func = print_stack_frames_by_syms_cache;
+	}
+#endif
+
+	printf("Tracing outstanding memory allocs... 
Hit Ctrl-C to end\n"); + + // main loop + while (!exiting && env.nr_intervals) { + env.nr_intervals--; + + sleep(env.interval); + + if (env.combined_only) + print_outstanding_combined_allocs(combined_allocs_fd, stack_traces_fd); + else + print_outstanding_allocs(allocs_fd, stack_traces_fd); + } + + // after loop ends, check for child process and cleanup accordingly + if (env.pid > 0 && strlen(env.command)) { + if (!child_exited) { + if (kill(env.pid, SIGTERM)) { + perror("failed to signal child process"); + ret = -errno; + + goto cleanup; + } + printf("signaled child process\n"); + } + + if (waitpid(env.pid, NULL, 0) < 0) { + perror("failed to reap child process"); + ret = -errno; + + goto cleanup; + } + printf("reaped child process\n"); + } + +cleanup: +#ifdef USE_BLAZESYM + blazesym_free(symbolizer); +#else + if (syms_cache) + syms_cache__free(syms_cache); + if (ksyms) + ksyms__free(ksyms); +#endif + memleak_bpf__destroy(skel); + + free(allocs); + free(stack); + + printf("done\n"); + + return ret; +} + +long argp_parse_long(int key, const char *arg, struct argp_state *state) +{ + errno = 0; + const long temp = strtol(arg, NULL, 10); + if (errno || temp <= 0) { + fprintf(stderr, "error arg:%c %s\n", (char)key, arg); + argp_usage(state); + } + + return temp; +} + +error_t argp_parse_arg(int key, char *arg, struct argp_state *state) +{ + static int pos_args = 0; + + switch (key) { + case 'p': + env.pid = atoi(arg); + break; + case 't': + env.trace_all = true; + break; + case 'a': + env.show_allocs = true; + break; + case 'o': + env.min_age_ns = 1e6 * atoi(arg); + break; + case 'c': + strncpy(env.command, arg, sizeof(env.command) - 1); + break; + case 'C': + env.combined_only = true; + break; + case 'F': + env.wa_missing_free = true; + break; + case 's': + env.sample_rate = argp_parse_long(key, arg, state); + break; + case 'T': + env.top_stacks = atoi(arg); + break; + case 'z': + env.min_size = argp_parse_long(key, arg, state); + break; + case 'Z': + env.max_size = argp_parse_long(key, arg, state); + break; + case 'O': + strncpy(env.object, arg, sizeof(env.object) - 1); + break; + case 'P': + env.percpu = true; + break; + case ARGP_KEY_ARG: + pos_args++; + + if (pos_args == 1) { + env.interval = argp_parse_long(key, arg, state); + } + else if (pos_args == 2) { + env.nr_intervals = argp_parse_long(key, arg, state); + } else { + fprintf(stderr, "Unrecognized positional argument: %s\n", arg); + argp_usage(state); + } + + break; + default: + return ARGP_ERR_UNKNOWN; + } + + return 0; +} + +int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + + return vfprintf(stderr, format, args); +} + +void sig_handler(int signo) +{ + if (signo == SIGCHLD) + child_exited = 1; + + exiting = 1; +} + +int event_init(int *fd) +{ + if (!fd) { + fprintf(stderr, "pointer to fd is null\n"); + + return 1; + } + + const int tmp_fd = eventfd(0, EFD_CLOEXEC); + if (tmp_fd < 0) { + perror("failed to create event fd"); + + return -errno; + } + + *fd = tmp_fd; + + return 0; +} + +int event_wait(int fd, uint64_t expected_event) +{ + uint64_t event = 0; + const ssize_t bytes = read(fd, &event, sizeof(event)); + if (bytes < 0) { + perror("failed to read from fd"); + + return -errno; + } else if (bytes != sizeof(event)) { + fprintf(stderr, "read unexpected size\n"); + + return 1; + } + + if (event != expected_event) { + fprintf(stderr, "read event %lu, expected %lu\n", event, expected_event); + + return 1; + } + + return 0; +} + +int 
event_notify(int fd, uint64_t event) +{ + const ssize_t bytes = write(fd, &event, sizeof(event)); + if (bytes < 0) { + perror("failed to write to fd"); + + return -errno; + } else if (bytes != sizeof(event)) { + fprintf(stderr, "attempted to write %zu bytes, wrote %zd bytes\n", sizeof(event), bytes); + + return 1; + } + + return 0; +} + +pid_t fork_sync_exec(const char *command, int fd) +{ + const pid_t pid = fork(); + + switch (pid) { + case -1: + perror("failed to create child process"); + break; + case 0: { + const uint64_t event = 1; + if (event_wait(fd, event)) { + fprintf(stderr, "failed to wait on event"); + exit(EXIT_FAILURE); + } + + printf("received go event. executing child command\n"); + + const int err = execl(command, command, NULL); + if (err) { + perror("failed to execute child command"); + return -1; + } + + break; + } + default: + printf("child created with pid: %d\n", pid); + + break; + } + + return pid; +} + +#if USE_BLAZESYM +void print_stack_frame_by_blazesym(size_t frame, uint64_t addr, const blazesym_csym *sym) +{ + if (!sym) + printf("\t%zu [<%016lx>] <%s>\n", frame, addr, "null sym"); + else if (sym->path && strlen(sym->path)) + printf("\t%zu [<%016lx>] %s+0x%lx %s:%ld\n", frame, addr, sym->symbol, addr - sym->start_address, sym->path, sym->line_no); + else + printf("\t%zu [<%016lx>] %s+0x%lx\n", frame, addr, sym->symbol, addr - sym->start_address); +} + +void print_stack_frames_by_blazesym() +{ + const blazesym_result *result = blazesym_symbolize(symbolizer, &src_cfg, 1, stack, env.perf_max_stack_depth); + + for (size_t j = 0; j < result->size; ++j) { + const uint64_t addr = stack[j]; + + if (addr == 0) + break; + + // no symbol found + if (!result || j >= result->size || result->entries[j].size == 0) { + print_stack_frame_by_blazesym(j, addr, NULL); + + continue; + } + + // single symbol found + if (result->entries[j].size == 1) { + const blazesym_csym *sym = &result->entries[j].syms[0]; + print_stack_frame_by_blazesym(j, addr, sym); + + continue; + } + + // multi symbol found + printf("\t%zu [<%016lx>] (%lu entries)\n", j, addr, result->entries[j].size); + + for (size_t k = 0; k < result->entries[j].size; ++k) { + const blazesym_csym *sym = &result->entries[j].syms[k]; + if (sym->path && strlen(sym->path)) + printf("\t\t%s@0x%lx %s:%ld\n", sym->symbol, sym->start_address, sym->path, sym->line_no); + else + printf("\t\t%s@0x%lx\n", sym->symbol, sym->start_address); + } + } + + blazesym_result_free(result); +} +#else +void print_stack_frames_by_ksyms() +{ + for (size_t i = 0; i < env.perf_max_stack_depth; ++i) { + const uint64_t addr = stack[i]; + + if (addr == 0) + break; + + const struct ksym *ksym = ksyms__map_addr(ksyms, addr); + if (ksym) + printf("\t%zu [<%016lx>] %s+0x%lx\n", i, addr, ksym->name, addr - ksym->addr); + else + printf("\t%zu [<%016lx>] <%s>\n", i, addr, "null sym"); + } +} + +void print_stack_frames_by_syms_cache() +{ + const struct syms *syms = syms_cache__get_syms(syms_cache, env.pid); + if (!syms) { + fprintf(stderr, "Failed to get syms\n"); + return; + } + + for (size_t i = 0; i < env.perf_max_stack_depth; ++i) { + const uint64_t addr = stack[i]; + + if (addr == 0) + break; + + char *dso_name; + uint64_t dso_offset; + const struct sym *sym = syms__map_addr_dso(syms, addr, &dso_name, &dso_offset); + if (sym) { + printf("\t%zu [<%016lx>] %s+0x%lx", i, addr, sym->name, sym->offset); + if (dso_name) + printf(" [%s]", dso_name); + printf("\n"); + } else { + printf("\t%zu [<%016lx>] <%s>\n", i, addr, "null sym"); + } + } +} +#endif + +int 
print_stack_frames(struct allocation *allocs, size_t nr_allocs, int stack_traces_fd) +{ + for (size_t i = 0; i < nr_allocs; ++i) { + const struct allocation *alloc = &allocs[i]; + + printf("%zu bytes in %zu allocations from stack\n", alloc->size, alloc->count); + + if (env.show_allocs) { + struct allocation_node* it = alloc->allocations; + while (it != NULL) { + printf("\taddr = %#lx size = %zu\n", it->address, it->size); + it = it->next; + } + } + + if (bpf_map_lookup_elem(stack_traces_fd, &alloc->stack_id, stack)) { + if (errno == ENOENT) + continue; + + perror("failed to lookup stack trace"); + + return -errno; + } + + (*print_stack_frames_func)(); + } + + return 0; +} + +int alloc_size_compare(const void *a, const void *b) +{ + const struct allocation *x = (struct allocation *)a; + const struct allocation *y = (struct allocation *)b; + + // descending order + + if (x->size > y->size) + return -1; + + if (x->size < y->size) + return 1; + + return 0; +} + +int print_outstanding_allocs(int allocs_fd, int stack_traces_fd) +{ + time_t t = time(NULL); + struct tm *tm = localtime(&t); + + size_t nr_allocs = 0; + + // for each struct alloc_info "alloc_info" in the bpf map "allocs" + for (uint64_t prev_key = 0, curr_key = 0;; prev_key = curr_key) { + struct alloc_info alloc_info = {}; + memset(&alloc_info, 0, sizeof(alloc_info)); + + if (bpf_map_get_next_key(allocs_fd, &prev_key, &curr_key)) { + if (errno == ENOENT) { + break; // no more keys, done + } + + perror("map get next key error"); + + return -errno; + } + + if (bpf_map_lookup_elem(allocs_fd, &curr_key, &alloc_info)) { + if (errno == ENOENT) + continue; + + perror("map lookup error"); + + return -errno; + } + + // filter by age + if (get_ktime_ns() - env.min_age_ns < alloc_info.timestamp_ns) { + continue; + } + + // filter invalid stacks + if (alloc_info.stack_id < 0) { + continue; + } + + // when the stack_id exists in the allocs array, + // increment size with alloc_info.size + bool stack_exists = false; + + for (size_t i = 0; !stack_exists && i < nr_allocs; ++i) { + struct allocation *alloc = &allocs[i]; + + if (alloc->stack_id == alloc_info.stack_id) { + alloc->size += alloc_info.size; + alloc->count++; + + if (env.show_allocs) { + struct allocation_node* node = malloc(sizeof(struct allocation_node)); + if (!node) { + perror("malloc failed"); + return -errno; + } + node->address = curr_key; + node->size = alloc_info.size; + node->next = alloc->allocations; + alloc->allocations = node; + } + + stack_exists = true; + break; + } + } + + if (stack_exists) + continue; + + // when the stack_id does not exist in the allocs array, + // create a new entry in the array + struct allocation alloc = { + .stack_id = alloc_info.stack_id, + .size = alloc_info.size, + .count = 1, + .allocations = NULL + }; + + if (env.show_allocs) { + struct allocation_node* node = malloc(sizeof(struct allocation_node)); + if (!node) { + perror("malloc failed"); + return -errno; + } + node->address = curr_key; + node->size = alloc_info.size; + node->next = NULL; + alloc.allocations = node; + } + + memcpy(&allocs[nr_allocs], &alloc, sizeof(alloc)); + nr_allocs++; + } + + // sort the allocs array in descending order + qsort(allocs, nr_allocs, sizeof(allocs[0]), alloc_size_compare); + + // get min of allocs we stored vs the top N requested stacks + size_t nr_allocs_to_show = nr_allocs < env.top_stacks ? 
+
+	printf("[%d:%d:%d] Top %zu stacks with outstanding allocations:\n",
+		tm->tm_hour, tm->tm_min, tm->tm_sec, nr_allocs_to_show);
+
+	print_stack_frames(allocs, nr_allocs_to_show, stack_traces_fd);
+
+	// Reset allocs list so that we don't accidentally reuse data the next time we call this function
+	for (size_t i = 0; i < nr_allocs; i++) {
+		allocs[i].stack_id = 0;
+		if (env.show_allocs) {
+			struct allocation_node *it = allocs[i].allocations;
+			while (it != NULL) {
+				struct allocation_node *this = it;
+				it = it->next;
+				free(this);
+			}
+			allocs[i].allocations = NULL;
+		}
+	}
+
+	return 0;
+}
+
+int print_outstanding_combined_allocs(int combined_allocs_fd, int stack_traces_fd)
+{
+	time_t t = time(NULL);
+	struct tm *tm = localtime(&t);
+
+	size_t nr_allocs = 0;
+
+	// for each stack_id "curr_key" and union combined_alloc_info "alloc"
+	// in bpf_map "combined_allocs"
+	for (uint64_t prev_key = 0, curr_key = 0;; prev_key = curr_key) {
+		union combined_alloc_info combined_alloc_info;
+		memset(&combined_alloc_info, 0, sizeof(combined_alloc_info));
+
+		if (bpf_map_get_next_key(combined_allocs_fd, &prev_key, &curr_key)) {
+			if (errno == ENOENT) {
+				break; // no more keys, done
+			}
+
+			perror("map get next key error");
+
+			return -errno;
+		}
+
+		if (bpf_map_lookup_elem(combined_allocs_fd, &curr_key, &combined_alloc_info)) {
+			if (errno == ENOENT)
+				continue;
+
+			perror("map lookup error");
+
+			return -errno;
+		}
+
+		const struct allocation alloc = {
+			.stack_id = curr_key,
+			.size = combined_alloc_info.total_size,
+			.count = combined_alloc_info.number_of_allocs,
+			.allocations = NULL
+		};
+
+		memcpy(&allocs[nr_allocs], &alloc, sizeof(alloc));
+		nr_allocs++;
+	}
+
+	qsort(allocs, nr_allocs, sizeof(allocs[0]), alloc_size_compare);
+
+	// get min of allocs we stored vs the top N requested stacks
+	nr_allocs = nr_allocs < env.top_stacks ? nr_allocs : env.top_stacks;
+
+	printf("[%d:%d:%d] Top %zu stacks with outstanding allocations:\n",
+		tm->tm_hour, tm->tm_min, tm->tm_sec, nr_allocs);
+
+	print_stack_frames(allocs, nr_allocs, stack_traces_fd);
+
+	return 0;
+}
+
+bool has_kernel_node_tracepoints()
+{
+	return tracepoint_exists("kmem", "kmalloc_node") &&
+		tracepoint_exists("kmem", "kmem_cache_alloc_node");
+}
+
+void disable_kernel_node_tracepoints(struct memleak_bpf *skel)
+{
+	bpf_program__set_autoload(skel->progs.memleak__kmalloc_node, false);
+	bpf_program__set_autoload(skel->progs.memleak__kmem_cache_alloc_node, false);
+}
+
+void disable_kernel_percpu_tracepoints(struct memleak_bpf *skel)
+{
+	bpf_program__set_autoload(skel->progs.memleak__percpu_alloc_percpu, false);
+	bpf_program__set_autoload(skel->progs.memleak__percpu_free_percpu, false);
+}
+
+void disable_kernel_tracepoints(struct memleak_bpf *skel)
+{
+	bpf_program__set_autoload(skel->progs.memleak__kmalloc, false);
+	bpf_program__set_autoload(skel->progs.memleak__kmalloc_node, false);
+	bpf_program__set_autoload(skel->progs.memleak__kfree, false);
+	bpf_program__set_autoload(skel->progs.memleak__kmem_cache_alloc, false);
+	bpf_program__set_autoload(skel->progs.memleak__kmem_cache_alloc_node, false);
+	bpf_program__set_autoload(skel->progs.memleak__kmem_cache_free, false);
+	bpf_program__set_autoload(skel->progs.memleak__mm_page_alloc, false);
+	bpf_program__set_autoload(skel->progs.memleak__mm_page_free, false);
+	bpf_program__set_autoload(skel->progs.memleak__percpu_alloc_percpu, false);
+	bpf_program__set_autoload(skel->progs.memleak__percpu_free_percpu, false);
+}
+
+int attach_uprobes(struct memleak_bpf *skel)
+{
+	ATTACH_UPROBE_CHECKED(skel, malloc, malloc_enter);
+	ATTACH_URETPROBE_CHECKED(skel, malloc, malloc_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, calloc, calloc_enter);
+	ATTACH_URETPROBE_CHECKED(skel, calloc, calloc_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, realloc, realloc_enter);
+	ATTACH_URETPROBE_CHECKED(skel, realloc, realloc_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, mmap, mmap_enter);
+	ATTACH_URETPROBE_CHECKED(skel, mmap, mmap_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, posix_memalign, posix_memalign_enter);
+	ATTACH_URETPROBE_CHECKED(skel, posix_memalign, posix_memalign_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, memalign, memalign_enter);
+	ATTACH_URETPROBE_CHECKED(skel, memalign, memalign_exit);
+
+	ATTACH_UPROBE_CHECKED(skel, free, free_enter);
+	ATTACH_UPROBE_CHECKED(skel, munmap, munmap_enter);
+
+	// the following probes are intentionally allowed to fail attachment
+
+	// deprecated in libc.so bionic
+	ATTACH_UPROBE(skel, valloc, valloc_enter);
+	ATTACH_URETPROBE(skel, valloc, valloc_exit);
+
+	// deprecated in libc.so bionic
+	ATTACH_UPROBE(skel, pvalloc, pvalloc_enter);
+	ATTACH_URETPROBE(skel, pvalloc, pvalloc_exit);
+
+	// added in C11
+	ATTACH_UPROBE(skel, aligned_alloc, aligned_alloc_enter);
+	ATTACH_URETPROBE(skel, aligned_alloc, aligned_alloc_exit);
+
+
+	return 0;
+}
diff --git a/src/16-memleak/trace_helpers.c b/src/16-memleak/trace_helpers.c
new file mode 100644
index 0000000..89c4835
--- /dev/null
+++ b/src/16-memleak/trace_helpers.c
@@ -0,0 +1,1202 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+// Copyright (c) 2020 Wenbo Zhang
+//
+// Based on ksyms improvements from Andrii Nakryiko; added more helpers.
+// 28-Feb-2020 Wenbo Zhang Created this.
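+//
+// This file collects the userspace helpers shared by the tools in this
+// tutorial: kernel symbolization from /proc/kallsyms (ksyms__*), userspace
+// ELF/vDSO symbolization (syms__*, syms_cache__*), /proc/partitions lookup,
+// log2/linear histogram printing, and runtime probes for kernel features
+// (fentry, BTF, tracepoints, the BPF ring buffer).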
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/resource.h>
+#include <time.h>
+#include <limits.h>
+#include <gelf.h>
+#include <bpf/bpf.h>
+#include <bpf/btf.h>
+#include <bpf/libbpf.h>
+#include "trace_helpers.h"
+#include "uprobe_helpers.h"
+
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+#define DISK_NAME_LEN	32
+
+#define MINORBITS	20
+#define MINORMASK	((1U << MINORBITS) - 1)
+
+#define MKDEV(ma, mi)	(((ma) << MINORBITS) | (mi))
+
+struct ksyms {
+	struct ksym *syms;
+	int syms_sz;
+	int syms_cap;
+	char *strs;
+	int strs_sz;
+	int strs_cap;
+};
+
+static int ksyms__add_symbol(struct ksyms *ksyms, const char *name, unsigned long addr)
+{
+	size_t new_cap, name_len = strlen(name) + 1;
+	struct ksym *ksym;
+	void *tmp;
+
+	if (ksyms->strs_sz + name_len > ksyms->strs_cap) {
+		new_cap = ksyms->strs_cap * 4 / 3;
+		if (new_cap < ksyms->strs_sz + name_len)
+			new_cap = ksyms->strs_sz + name_len;
+		if (new_cap < 1024)
+			new_cap = 1024;
+		tmp = realloc(ksyms->strs, new_cap);
+		if (!tmp)
+			return -1;
+		ksyms->strs = tmp;
+		ksyms->strs_cap = new_cap;
+	}
+	if (ksyms->syms_sz + 1 > ksyms->syms_cap) {
+		new_cap = ksyms->syms_cap * 4 / 3;
+		if (new_cap < 1024)
+			new_cap = 1024;
+		tmp = realloc(ksyms->syms, sizeof(*ksyms->syms) * new_cap);
+		if (!tmp)
+			return -1;
+		ksyms->syms = tmp;
+		ksyms->syms_cap = new_cap;
+	}
+
+	ksym = &ksyms->syms[ksyms->syms_sz];
+	/* while constructing, re-use pointer as just a plain offset */
+	ksym->name = (void *)(unsigned long)ksyms->strs_sz;
+	ksym->addr = addr;
+
+	memcpy(ksyms->strs + ksyms->strs_sz, name, name_len);
+	ksyms->strs_sz += name_len;
+	ksyms->syms_sz++;
+
+	return 0;
+}
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	const struct ksym *s1 = p1, *s2 = p2;
+
+	if (s1->addr == s2->addr)
+		return strcmp(s1->name, s2->name);
+	return s1->addr < s2->addr ? -1 : 1;
+}
+
+struct ksyms *ksyms__load(void)
+{
+	char sym_type, sym_name[256];
+	struct ksyms *ksyms;
+	unsigned long sym_addr;
+	int i, ret;
+	FILE *f;
+
+	f = fopen("/proc/kallsyms", "r");
+	if (!f)
+		return NULL;
+
+	ksyms = calloc(1, sizeof(*ksyms));
+	if (!ksyms)
+		goto err_out;
+
+	while (true) {
+		ret = fscanf(f, "%lx %c %s%*[^\n]\n",
+			     &sym_addr, &sym_type, sym_name);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 3)
+			goto err_out;
+		if (ksyms__add_symbol(ksyms, sym_name, sym_addr))
+			goto err_out;
+	}
+
+	/* now when strings are finalized, adjust pointers properly */
+	for (i = 0; i < ksyms->syms_sz; i++)
+		ksyms->syms[i].name += (unsigned long)ksyms->strs;
+
+	qsort(ksyms->syms, ksyms->syms_sz, sizeof(*ksyms->syms), ksym_cmp);
+
+	fclose(f);
+	return ksyms;
+
+err_out:
+	ksyms__free(ksyms);
+	fclose(f);
+	return NULL;
+}
+
+void ksyms__free(struct ksyms *ksyms)
+{
+	if (!ksyms)
+		return;
+
+	free(ksyms->syms);
+	free(ksyms->strs);
+	free(ksyms);
+}
+
+const struct ksym *ksyms__map_addr(const struct ksyms *ksyms,
+				   unsigned long addr)
+{
+	int start = 0, end = ksyms->syms_sz - 1, mid;
+	unsigned long sym_addr;
+
+	/* find largest sym_addr <= addr using binary search */
+	while (start < end) {
+		mid = start + (end - start + 1) / 2;
+		sym_addr = ksyms->syms[mid].addr;
+
+		if (sym_addr <= addr)
+			start = mid;
+		else
+			end = mid - 1;
+	}
+
+	if (start == end && ksyms->syms[start].addr <= addr)
+		return &ksyms->syms[start];
+	return NULL;
+}
+
+const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms,
+				     const char *name)
+{
+	int i;
+
+	for (i = 0; i < ksyms->syms_sz; i++) {
+		if (strcmp(ksyms->syms[i].name, name) == 0)
+			return &ksyms->syms[i];
+	}
+
+	return NULL;
+}
+
+struct load_range {
+	uint64_t start;
+	uint64_t end;
+	uint64_t file_off;
+};
+
+enum elf_type {
+	EXEC,
+	DYN,
+	PERF_MAP,
+	VDSO,
+	UNKNOWN,
+};
+
+struct dso {
+	char *name;
+	struct load_range *ranges;
+	int range_sz;
+	/* Dyn's first text section virtual addr at execution */
+	uint64_t sh_addr;
+	/* Dyn's first text section file offset */
+	uint64_t sh_offset;
+	enum elf_type type;
+
+	struct sym *syms;
+	int syms_sz;
+	int syms_cap;
+
+	/*
+	 * libbpf's struct btf is actually a pretty efficient
+	 * "set of strings" data structure, so we create an
+	 * empty one and use it to store symbol names.
+	 */
+	struct btf *btf;
+};
+
+struct map {
+	uint64_t start_addr;
+	uint64_t end_addr;
+	uint64_t file_off;
+	uint64_t dev_major;
+	uint64_t dev_minor;
+	uint64_t inode;
+};
+
+struct syms {
+	struct dso *dsos;
+	int dso_sz;
+};
+
+static bool is_file_backed(const char *mapname)
+{
+#define STARTS_WITH(mapname, prefix) \
+	(!strncmp(mapname, prefix, sizeof(prefix) - 1))
+
+	return mapname[0] && !(
+		STARTS_WITH(mapname, "//anon") ||
+		STARTS_WITH(mapname, "/dev/zero") ||
+		STARTS_WITH(mapname, "/anon_hugepage") ||
+		STARTS_WITH(mapname, "[stack") ||
+		STARTS_WITH(mapname, "/SYSV") ||
+		STARTS_WITH(mapname, "[heap]") ||
+		STARTS_WITH(mapname, "[vsyscall]"));
+}
+
+static bool is_perf_map(const char *path)
+{
+	return false;
+}
+
+static bool is_vdso(const char *path)
+{
+	return !strcmp(path, "[vdso]");
+}
+
+static int get_elf_type(const char *path)
+{
+	GElf_Ehdr hdr;
+	void *res;
+	Elf *e;
+	int fd;
+
+	if (is_vdso(path))
+		return -1;
+	e = open_elf(path, &fd);
+	if (!e)
+		return -1;
+	res = gelf_getehdr(e, &hdr);
+	close_elf(e, fd);
+	if (!res)
+		return -1;
+	return hdr.e_type;
+}
+
+static int get_elf_text_scn_info(const char *path, uint64_t *addr,
+				 uint64_t *offset)
+{
+	Elf_Scn *section = NULL;
+	int fd = -1, err = -1;
+	GElf_Shdr header;
+	size_t stridx;
+	Elf *e = NULL;
+	char *name;
+
+	e = open_elf(path, &fd);
+	if (!e)
+		goto err_out;
+	err = elf_getshdrstrndx(e, &stridx);
+	if (err < 0)
+		goto err_out;
+
+	err = -1;
+	while ((section = elf_nextscn(e, section)) != 0) {
+		if (!gelf_getshdr(section, &header))
+			continue;
+
+		name = elf_strptr(e, stridx, header.sh_name);
+		if (name && !strcmp(name, ".text")) {
+			*addr = (uint64_t)header.sh_addr;
+			*offset = (uint64_t)header.sh_offset;
+			err = 0;
+			break;
+		}
+	}
+
+err_out:
+	close_elf(e, fd);
+	return err;
+}
+
+static int syms__add_dso(struct syms *syms, struct map *map, const char *name)
+{
+	struct dso *dso = NULL;
+	int i, type;
+	void *tmp;
+
+	for (i = 0; i < syms->dso_sz; i++) {
+		if (!strcmp(syms->dsos[i].name, name)) {
+			dso = &syms->dsos[i];
+			break;
+		}
+	}
+
+	if (!dso) {
+		tmp = realloc(syms->dsos, (syms->dso_sz + 1) *
+			      sizeof(*syms->dsos));
+		if (!tmp)
+			return -1;
+		syms->dsos = tmp;
+		dso = &syms->dsos[syms->dso_sz++];
+		memset(dso, 0, sizeof(*dso));
+		dso->name = strdup(name);
+		dso->btf = btf__new_empty();
+	}
+
+	tmp = realloc(dso->ranges, (dso->range_sz + 1) * sizeof(*dso->ranges));
+	if (!tmp)
+		return -1;
+	dso->ranges = tmp;
+	dso->ranges[dso->range_sz].start = map->start_addr;
+	dso->ranges[dso->range_sz].end = map->end_addr;
+	dso->ranges[dso->range_sz].file_off = map->file_off;
+	dso->range_sz++;
+	type = get_elf_type(name);
+	if (type == ET_EXEC) {
+		dso->type = EXEC;
+	} else if (type == ET_DYN) {
+		dso->type = DYN;
+		if (get_elf_text_scn_info(name, &dso->sh_addr, &dso->sh_offset) < 0)
+			return -1;
+	} else if (is_perf_map(name)) {
+		dso->type = PERF_MAP;
+	} else if (is_vdso(name)) {
+		dso->type = VDSO;
+	} else {
+		dso->type = UNKNOWN;
+	}
+	return 0;
+}
+
+static struct dso *syms__find_dso(const struct syms *syms, unsigned long addr,
+				  uint64_t *offset)
+{
+	struct load_range *range;
+	struct dso *dso;
+	int i, j;
+
+	for (i = 0; i < syms->dso_sz; i++) {
+		dso = &syms->dsos[i];
+		for (j = 0; j < dso->range_sz; j++) {
+			range = &dso->ranges[j];
+			if (addr <= range->start || addr >= range->end)
+				continue;
+			if (dso->type == DYN || dso->type == VDSO) {
+				/* Offset within the mmap */
+				*offset = addr - range->start + range->file_off;
+				/* Offset within the ELF for dyn symbol lookup */
+				*offset += dso->sh_addr - dso->sh_offset;
+			} else {
+				*offset = addr;
+			}
+
+			return dso;
+		}
+	}
+
+	return NULL;
+}
+
+static int dso__load_sym_table_from_perf_map(struct dso *dso)
+{
+	return -1;
+}
+
+static int dso__add_sym(struct dso *dso, const char *name, uint64_t start,
+			uint64_t size)
+{
+	struct sym *sym;
+	size_t new_cap;
+	void *tmp;
+	int off;
+
+	off = btf__add_str(dso->btf, name);
+	if (off < 0)
+		return off;
+
+	if (dso->syms_sz + 1 > dso->syms_cap) {
+		new_cap = dso->syms_cap * 4 / 3;
+		if (new_cap < 1024)
+			new_cap = 1024;
+		tmp = realloc(dso->syms, sizeof(*dso->syms) * new_cap);
+		if (!tmp)
+			return -1;
+		dso->syms = tmp;
+		dso->syms_cap = new_cap;
+	}
+
+	sym = &dso->syms[dso->syms_sz++];
+	/* while constructing, re-use pointer as just a plain offset */
+	sym->name = (void*)(unsigned long)off;
+	sym->start = start;
+	sym->size = size;
+	sym->offset = 0;
+
+	return 0;
+}
+
+static int sym_cmp(const void *p1, const void *p2)
+{
+	const struct sym *s1 = p1, *s2 = p2;
+
+	if (s1->start == s2->start)
+		return strcmp(s1->name, s2->name);
+	return s1->start < s2->start ? -1 : 1;
+}
+
+static int dso__add_syms(struct dso *dso, Elf *e, Elf_Scn *section,
+			 size_t stridx, size_t symsize)
+{
+	Elf_Data *data = NULL;
+
+	while ((data = elf_getdata(section, data)) != 0) {
+		size_t i, symcount = data->d_size / symsize;
+
+		if (data->d_size % symsize)
+			return -1;
+
+		for (i = 0; i < symcount; ++i) {
+			const char *name;
+			GElf_Sym sym;
+
+			if (!gelf_getsym(data, (int)i, &sym))
+				continue;
+			if (!(name = elf_strptr(e, stridx, sym.st_name)))
+				continue;
+			if (name[0] == '\0')
+				continue;
+
+			if (sym.st_value == 0)
+				continue;
+
+			if (dso__add_sym(dso, name, sym.st_value, sym.st_size))
+				goto err_out;
+		}
+	}
+
+	return 0;
+
+err_out:
+	return -1;
+}
+
+static void dso__free_fields(struct dso *dso)
+{
+	if (!dso)
+		return;
+
+	free(dso->name);
+	free(dso->ranges);
+	free(dso->syms);
+	btf__free(dso->btf);
+}
+
+static int dso__load_sym_table_from_elf(struct dso *dso, int fd)
+{
+	Elf_Scn *section = NULL;
+	Elf *e;
+	int i;
+
+	e = fd > 0 ? open_elf_by_fd(fd) : open_elf(dso->name, &fd);
+	if (!e)
+		return -1;
+
+	while ((section = elf_nextscn(e, section)) != 0) {
+		GElf_Shdr header;
+
+		if (!gelf_getshdr(section, &header))
+			continue;
+
+		if (header.sh_type != SHT_SYMTAB &&
+		    header.sh_type != SHT_DYNSYM)
+			continue;
+
+		if (dso__add_syms(dso, e, section, header.sh_link,
+				  header.sh_entsize))
+			goto err_out;
+	}
+
+	/* now when strings are finalized, adjust pointers properly */
+	for (i = 0; i < dso->syms_sz; i++)
+		dso->syms[i].name =
+			btf__name_by_offset(dso->btf,
+					    (unsigned long)dso->syms[i].name);
+
+	qsort(dso->syms, dso->syms_sz, sizeof(*dso->syms), sym_cmp);
+
+	close_elf(e, fd);
+	return 0;
+
+err_out:
+	dso__free_fields(dso);
+	close_elf(e, fd);
+	return -1;
+}
+
+static int create_tmp_vdso_image(struct dso *dso)
+{
+	uint64_t start_addr, end_addr;
+	long pid = getpid();
+	char buf[PATH_MAX];
+	void *image = NULL;
+	char tmpfile[128];
+	int ret, fd = -1;
+	uint64_t sz;
+	char *name;
+	FILE *f;
+
+	snprintf(tmpfile, sizeof(tmpfile), "/proc/%ld/maps", pid);
+	f = fopen(tmpfile, "r");
+	if (!f)
+		return -1;
+
+	while (true) {
+		ret = fscanf(f, "%lx-%lx %*s %*x %*x:%*x %*u%[^\n]",
+			     &start_addr, &end_addr, buf);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 3)
+			goto err_out;
+
+		name = buf;
+		while (isspace(*name))
+			name++;
+		if (!is_file_backed(name))
+			continue;
+		if (is_vdso(name))
+			break;
+	}
+
+	sz = end_addr - start_addr;
+	image = malloc(sz);
+	if (!image)
+		goto err_out;
+	memcpy(image, (void *)start_addr, sz);
+
+	snprintf(tmpfile, sizeof(tmpfile),
+		 "/tmp/libbpf_%ld_vdso_image_XXXXXX", pid);
+	fd = mkostemp(tmpfile, O_CLOEXEC);
+	if (fd < 0) {
+		fprintf(stderr, "failed to create temp file: %s\n",
+			strerror(errno));
+		goto err_out;
+	}
+	/* Unlink the file to avoid leaking */
+	if (unlink(tmpfile) == -1)
+		fprintf(stderr, "failed to unlink %s: %s\n", tmpfile,
+			strerror(errno));
+	if (write(fd, image, sz) == -1) {
+		fprintf(stderr, "failed to write to vDSO image: %s\n",
+			strerror(errno));
+		close(fd);
+		fd = -1;
+		goto err_out;
+	}
+
+err_out:
+	fclose(f);
+	free(image);
+	return fd;
+}
+
+static int dso__load_sym_table_from_vdso_image(struct dso *dso)
+{
+	int fd = create_tmp_vdso_image(dso);
+
+	if (fd < 0)
+		return -1;
+	return dso__load_sym_table_from_elf(dso, fd);
+}
+
+static int dso__load_sym_table(struct dso *dso)
+{
+	if (dso->type == UNKNOWN)
+		return -1;
+	if (dso->type == PERF_MAP)
+		return dso__load_sym_table_from_perf_map(dso);
+	if (dso->type == EXEC || dso->type == DYN)
+		return dso__load_sym_table_from_elf(dso, 0);
+	if (dso->type == VDSO)
+		return dso__load_sym_table_from_vdso_image(dso);
+	return -1;
+}
+
+static struct sym *dso__find_sym(struct dso *dso, uint64_t offset)
+{
+	unsigned long sym_addr;
+	int start, end, mid;
+
+	if (!dso->syms && dso__load_sym_table(dso))
+		return NULL;
+
+	start = 0;
+	end = dso->syms_sz - 1;
+
+	/* find largest sym_addr <= addr using binary search */
+	while (start < end) {
+		mid = start + (end - start + 1) / 2;
+		sym_addr = dso->syms[mid].start;
+
+		if (sym_addr <= offset)
+			start = mid;
+		else
+			end = mid - 1;
+	}
+
+	if (start == end && dso->syms[start].start <= offset) {
+		(dso->syms[start]).offset = offset - dso->syms[start].start;
+		return &dso->syms[start];
+	}
+	return NULL;
+}
+
+struct syms *syms__load_file(const char *fname)
+{
+	char buf[PATH_MAX], perm[5];
+	struct syms *syms;
+	struct map map;
+	char *name;
+	FILE *f;
+	int ret;
+
+	f = fopen(fname, "r");
+	if (!f)
+		return NULL;
+
+	syms = calloc(1, sizeof(*syms));
+	if (!syms)
+		goto err_out;
+
+	while (true) {
+		ret = fscanf(f, "%lx-%lx %4s %lx %lx:%lx %lu%[^\n]",
+			     &map.start_addr, &map.end_addr, perm,
+			     &map.file_off, &map.dev_major,
+			     &map.dev_minor, &map.inode, buf);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 8) /* perf-<pid>.map */
+			goto err_out;
+
+		if (perm[2] != 'x')
+			continue;
+
+		name = buf;
+		while (isspace(*name))
+			name++;
+		if (!is_file_backed(name))
+			continue;
+
+		if (syms__add_dso(syms, &map, name))
+			goto err_out;
+	}
+
+	fclose(f);
+	return syms;
+
+err_out:
+	syms__free(syms);
+	fclose(f);
+	return NULL;
+}
+
+struct syms *syms__load_pid(pid_t tgid)
+{
+	char fname[128];
+
+	snprintf(fname, sizeof(fname), "/proc/%ld/maps", (long)tgid);
+	return syms__load_file(fname);
+}
+
+void syms__free(struct syms *syms)
+{
+	int i;
+
+	if (!syms)
+		return;
+
+	for (i = 0; i < syms->dso_sz; i++)
+		dso__free_fields(&syms->dsos[i]);
+	free(syms->dsos);
+	free(syms);
+}
+
+const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr)
+{
+	struct dso *dso;
+	uint64_t offset;
+
+	dso = syms__find_dso(syms, addr, &offset);
+	if (!dso)
+		return NULL;
+	return dso__find_sym(dso, offset);
+}
+
+const struct sym *syms__map_addr_dso(const struct syms *syms, unsigned long addr,
+				     char **dso_name, unsigned long *dso_offset)
+{
+	struct dso *dso;
+	uint64_t offset;
+
+	dso = syms__find_dso(syms, addr, &offset);
+	if (!dso)
+		return NULL;
+
+	*dso_name = dso->name;
+	*dso_offset = offset;
+
+	return dso__find_sym(dso, offset);
+}
+
+struct syms_cache {
+	struct {
+		struct syms *syms;
+		int tgid;
+	} *data;
+	int nr;
+};
+
+struct syms_cache *syms_cache__new(int nr)
+{
+	struct syms_cache *syms_cache;
+
+	syms_cache = calloc(1, sizeof(*syms_cache));
+	if (!syms_cache)
+		return NULL;
+	if (nr > 0)
+		syms_cache->data = calloc(nr, sizeof(*syms_cache->data));
+	return syms_cache;
+}
+
+void syms_cache__free(struct syms_cache *syms_cache)
+{
+	int i;
+
+	if (!syms_cache)
+		return;
+
+	for (i = 0; i < syms_cache->nr; i++)
+		syms__free(syms_cache->data[i].syms);
+	free(syms_cache->data);
+	free(syms_cache);
+}
+
+struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid)
+{
+	void *tmp;
+	int i;
+
+	for (i = 0; i < syms_cache->nr; i++) {
+		if (syms_cache->data[i].tgid == tgid)
+			return syms_cache->data[i].syms;
+	}
+
+	tmp = realloc(syms_cache->data, (syms_cache->nr + 1) *
+		      sizeof(*syms_cache->data));
+	if (!tmp)
+		return NULL;
+	syms_cache->data = tmp;
+	syms_cache->data[syms_cache->nr].syms = syms__load_pid(tgid);
+	syms_cache->data[syms_cache->nr].tgid = tgid;
+	return syms_cache->data[syms_cache->nr++].syms;
+}
+
+struct partitions {
+	struct partition *items;
+	int sz;
+};
+
+static int partitions__add_partition(struct partitions *partitions,
+				     const char *name, unsigned int dev)
+{
+	struct partition *partition;
+	void *tmp;
+
+	tmp = realloc(partitions->items, (partitions->sz + 1) *
+		      sizeof(*partitions->items));
+	if (!tmp)
+		return -1;
+	partitions->items = tmp;
+	partition = &partitions->items[partitions->sz];
+	partition->name = strdup(name);
+	partition->dev = dev;
+	partitions->sz++;
+
+	return 0;
+}
+
+struct partitions *partitions__load(void)
+{
+	char part_name[DISK_NAME_LEN];
+	unsigned int devmaj, devmin;
+	unsigned long long nop;
+	struct partitions *partitions;
+	char buf[64];
+	FILE *f;
+
+	f = fopen("/proc/partitions", "r");
+	if (!f)
+		return NULL;
+
+	partitions = calloc(1, sizeof(*partitions));
+	if (!partitions)
+		goto err_out;
+
+	while (fgets(buf, sizeof(buf), f) != NULL) {
+		/* skip heading */
+		if (buf[0] != ' ' || buf[0] == '\n')
+			continue;
+		if (sscanf(buf, "%u %u %llu %s", &devmaj, &devmin, &nop,
+			   part_name) != 4)
+			goto err_out;
+		if (partitions__add_partition(partitions, part_name,
+					      MKDEV(devmaj, devmin)))
+			goto err_out;
+	}
+
+	fclose(f);
+	return partitions;
+
+err_out:
+	partitions__free(partitions);
+	fclose(f);
+	return NULL;
+}
+
+void partitions__free(struct partitions *partitions)
+{
+	int i;
+
+	if (!partitions)
+		return;
+
+	for (i = 0; i < partitions->sz; i++)
+		free(partitions->items[i].name);
+	free(partitions->items);
+	free(partitions);
+}
+
+const struct partition *
+partitions__get_by_dev(const struct partitions *partitions, unsigned int dev)
+{
+	int i;
+
+	for (i = 0; i < partitions->sz; i++) {
+		if (partitions->items[i].dev == dev)
+			return &partitions->items[i];
+	}
+
+	return NULL;
+}
+
+const struct partition *
+partitions__get_by_name(const struct partitions *partitions, const char *name)
+{
+	int i;
+
+	for (i = 0; i < partitions->sz; i++) {
+		if (strcmp(partitions->items[i].name, name) == 0)
+			return &partitions->items[i];
+	}
+
+	return NULL;
+}
+
+static void print_stars(unsigned int val, unsigned int val_max, int width)
+{
+	int num_stars, num_spaces, i;
+	bool need_plus;
+
+	num_stars = min(val, val_max) * width / val_max;
+	num_spaces = width - num_stars;
+	need_plus = val > val_max;
+
+	for (i = 0; i < num_stars; i++)
+		printf("*");
+	for (i = 0; i < num_spaces; i++)
+		printf(" ");
+	if (need_plus)
+		printf("+");
+}
+
+void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type)
+{
+	int stars_max = 40, idx_max = -1;
+	unsigned int val, val_max = 0;
+	unsigned long long low, high;
+	int stars, width, i;
+
+	for (i = 0; i < vals_size; i++) {
+		val = vals[i];
+		if (val > 0)
+			idx_max = i;
+		if (val > val_max)
+			val_max = val;
+	}
+
+	if (idx_max < 0)
+		return;
+
+	printf("%*s%-*s : count    distribution\n", idx_max <= 32 ? 5 : 15, "",
+	       idx_max <= 32 ? 19 : 29, val_type);
+
+	if (idx_max <= 32)
+		stars = stars_max;
+	else
+		stars = stars_max / 2;
+
+	for (i = 0; i <= idx_max; i++) {
+		low = (1ULL << (i + 1)) >> 1;
+		high = (1ULL << (i + 1)) - 1;
+		if (low == high)
+			low -= 1;
+		val = vals[i];
+		width = idx_max <= 32 ? 10 : 20;
+		printf("%*lld -> %-*lld : %-8d |", width, low, width, high, val);
+		print_stars(val, val_max, stars);
+		printf("|\n");
+	}
+}
+
+void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base,
+		       unsigned int step, const char *val_type)
+{
+	int i, stars_max = 40, idx_min = -1, idx_max = -1;
+	unsigned int val, val_max = 0;
+
+	for (i = 0; i < vals_size; i++) {
+		val = vals[i];
+		if (val > 0) {
+			idx_max = i;
+			if (idx_min < 0)
+				idx_min = i;
+		}
+		if (val > val_max)
+			val_max = val;
+	}
+
+	if (idx_max < 0)
+		return;
+
+	printf("     %-13s : count     distribution\n", val_type);
+	for (i = idx_min; i <= idx_max; i++) {
+		val = vals[i];
+		if (!val)
+			continue;
+		printf("        %-10d : %-8d |", base + i * step, val);
+		print_stars(val, val_max, stars_max);
+		printf("|\n");
+	}
+}
+
+unsigned long long get_ktime_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
+}
+
+bool is_kernel_module(const char *name)
+{
+	bool found = false;
+	char buf[64];
+	FILE *f;
+
+	f = fopen("/proc/modules", "r");
+	if (!f)
+		return false;
+
+	while (fgets(buf, sizeof(buf), f) != NULL) {
+		if (sscanf(buf, "%s %*s\n", buf) != 1)
+			break;
+		if (!strcmp(buf, name)) {
+			found = true;
+			break;
+		}
+	}
+
+	fclose(f);
+	return found;
+}
+
+static bool fentry_try_attach(int id)
+{
+	int prog_fd, attach_fd;
+	char error[4096];
+	struct bpf_insn insns[] = {
+		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
+		{ .code = BPF_JMP | BPF_EXIT },
+	};
+	LIBBPF_OPTS(bpf_prog_load_opts, opts,
+		    .expected_attach_type = BPF_TRACE_FENTRY,
+		    .attach_btf_id = id,
+		    .log_buf = error,
+		    .log_size = sizeof(error),
+	);
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", "GPL", insns,
+				sizeof(insns) / sizeof(struct bpf_insn), &opts);
+	if (prog_fd < 0)
+		return false;
+
+	attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd);
+	if (attach_fd >= 0)
+		close(attach_fd);
+
+	close(prog_fd);
+	return attach_fd >= 0;
+}
+
+bool fentry_can_attach(const char *name, const char *mod)
+{
+	struct btf *btf, *vmlinux_btf, *module_btf = NULL;
+	int err, id;
+
+	vmlinux_btf = btf__load_vmlinux_btf();
+	err = libbpf_get_error(vmlinux_btf);
+	if (err)
+		return false;
+
+	btf = vmlinux_btf;
+
+	if (mod) {
+		module_btf = btf__load_module_btf(mod, vmlinux_btf);
+		err = libbpf_get_error(module_btf);
+		if (!err)
+			btf = module_btf;
+	}
+
+	id = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
+
+	btf__free(module_btf);
+	btf__free(vmlinux_btf);
+	return id > 0 && fentry_try_attach(id);
+}
+
+bool kprobe_exists(const char *name)
+{
+	char addr_range[256];
+	char sym_name[256];
+	FILE *f;
+	int ret;
+
+	f = fopen("/sys/kernel/debug/kprobes/blacklist", "r");
+	if (!f)
+		goto avail_filter;
+
+	while (true) {
+		ret = fscanf(f, "%s %s%*[^\n]\n", addr_range, sym_name);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 2) {
+			fprintf(stderr, "failed to read symbol from kprobe blacklist\n");
+			break;
+		}
+		if (!strcmp(name, sym_name)) {
+			fclose(f);
+			return false;
+		}
+	}
+	fclose(f);
+
+avail_filter:
+	f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r");
+	if (!f)
+		goto slow_path;
+
+	while (true) {
+		ret = fscanf(f, "%s%*[^\n]\n", sym_name);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 1) {
+			fprintf(stderr, "failed to read symbol from available_filter_functions\n");
+			break;
+		}
+		if (!strcmp(name, sym_name)) {
+			fclose(f);
+			return true;
+		}
+	}
+
+	fclose(f);
+	return false;
+
+slow_path:
+	f = fopen("/proc/kallsyms", "r");
+	if (!f)
+		return false;
+
+	while (true) {
+		ret = fscanf(f, "%*x %*c %s%*[^\n]\n", sym_name);
+		if (ret == EOF && feof(f))
+			break;
+		if (ret != 1) {
+			fprintf(stderr, "failed to read symbol from kallsyms\n");
+			break;
+		}
+		if (!strcmp(name, sym_name)) {
+			fclose(f);
+			return true;
+		}
+	}
+
+	fclose(f);
+	return false;
+}
+
+bool tracepoint_exists(const char *category, const char *event)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), "/sys/kernel/debug/tracing/events/%s/%s/format", category, event);
+	if (!access(path, F_OK))
+		return true;
+	return false;
+}
+
+bool vmlinux_btf_exists(void)
+{
+	struct btf *btf;
+	int err;
+
+	btf = btf__load_vmlinux_btf();
+	err = libbpf_get_error(btf);
+	if (err)
+		return false;
+
+	btf__free(btf);
+	return true;
+}
+
+bool module_btf_exists(const char *mod)
+{
+	char sysfs_mod[80];
+
+	if (mod) {
+		snprintf(sysfs_mod, sizeof(sysfs_mod), "/sys/kernel/btf/%s", mod);
+		if (!access(sysfs_mod, R_OK))
+			return true;
+	}
+	return false;
+}
+
+bool probe_tp_btf(const char *name)
+{
+	LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_TRACE_RAW_TP);
+	struct bpf_insn insns[] = {
+		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
+		{ .code = BPF_JMP | BPF_EXIT },
+	};
+	int fd, insn_cnt = sizeof(insns) / sizeof(struct bpf_insn);
+
+	opts.attach_btf_id = libbpf_find_vmlinux_btf_id(name, BPF_TRACE_RAW_TP);
+	fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", insns, insn_cnt, &opts);
+	if (fd >= 0)
+		close(fd);
+	return fd >= 0;
+}
+
+bool probe_ringbuf()
+{
+	int map_fd;
+
+	map_fd = bpf_map_create(BPF_MAP_TYPE_RINGBUF, NULL, 0, 0, getpagesize(), NULL);
+	if (map_fd < 0)
+		return false;
+
+	close(map_fd);
+	return true;
+}
diff --git a/src/16-memleak/trace_helpers.h b/src/16-memleak/trace_helpers.h
new file mode 100644
index 0000000..171bc4e
--- /dev/null
+++ b/src/16-memleak/trace_helpers.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __TRACE_HELPERS_H
+#define __TRACE_HELPERS_H
+
+#include <stdbool.h>
+
+#define NSEC_PER_SEC	1000000000ULL
+
+struct ksym {
+	const char *name;
+	unsigned long addr;
+};
+
+struct ksyms;
+
+struct ksyms *ksyms__load(void);
+void ksyms__free(struct ksyms *ksyms);
+const struct ksym *ksyms__map_addr(const struct ksyms *ksyms,
+				   unsigned long addr);
+const struct ksym *ksyms__get_symbol(const struct ksyms *ksyms,
+				     const char *name);
+
+struct sym {
+	const char *name;
+	unsigned long start;
+	unsigned long size;
+	unsigned long offset;
+};
+
+struct syms;
+
+struct syms *syms__load_pid(int tgid);
+struct syms *syms__load_file(const char *fname);
+void syms__free(struct syms *syms);
+const struct sym *syms__map_addr(const struct syms *syms, unsigned long addr);
+const struct sym *syms__map_addr_dso(const struct syms *syms, unsigned long addr,
+				     char **dso_name, unsigned long *dso_offset);
+
+struct syms_cache;
+
+struct syms_cache *syms_cache__new(int nr);
+struct syms *syms_cache__get_syms(struct syms_cache *syms_cache, int tgid);
+void syms_cache__free(struct syms_cache *syms_cache);
+
+struct partition {
+	char *name;
+	unsigned int dev;
+};
+
+struct partitions;
+
+struct partitions *partitions__load(void);
+void partitions__free(struct partitions *partitions);
+const struct partition *
+partitions__get_by_dev(const struct partitions *partitions, unsigned int dev);
+const struct partition *
+partitions__get_by_name(const struct partitions *partitions, const char *name);
+
+void print_log2_hist(unsigned int *vals, int vals_size, const char *val_type);
+void print_linear_hist(unsigned int *vals, int vals_size, unsigned int base,
+		       unsigned int step, const char *val_type);
+
+unsigned long long get_ktime_ns(void);
+
+bool is_kernel_module(const char *name);
+
+/*
+ * When attempting to use kprobe/kretprobe, check out the newer fentry/fexit
+ * probes first, as they provide better performance and usability. In some
+ * situations, however, we have to fall back to kprobe/kretprobe. This helper
+ * detects fentry/fexit support for the specified kernel function, which may
+ * be missing for two reasons:
+ *
+ * 1. A gap between kernel versions: kernel BTF is exposed starting with
+ *    kernel 5.4, but fentry/fexit is only supported starting with 5.5.
+ * 2. Whether the kernel supports module BTF or not.
+ *
+ * *name* is the name of a kernel function to be attached to, which can be
+ * from vmlinux or a kernel module.
+ * *mod* is a hint that indicates the *name* may reside in module BTF;
+ * if NULL, it means *name* belongs to vmlinux.
+ */
+bool fentry_can_attach(const char *name, const char *mod);
+
+/*
+ * The name of a kernel function to be attached to may change between
+ * kernel releases. This helper is used to confirm whether the target kernel
+ * uses a certain function name before attaching.
+ *
+ * It works by scanning
+ * /sys/kernel/debug/tracing/available_filter_functions
+ * If this file does not exist, it falls back to parsing /proc/kallsyms,
+ * which is slower.
+ */
+bool kprobe_exists(const char *name);
+bool tracepoint_exists(const char *category, const char *event);
+
+bool vmlinux_btf_exists(void);
+bool module_btf_exists(const char *mod);
+
+bool probe_tp_btf(const char *name);
+bool probe_ringbuf();
+
+#endif /* __TRACE_HELPERS_H */