diff --git a/src/16-memleak/README.md b/src/16-memleak/README.md
index d20cbe1..285b099 100644
--- a/src/16-memleak/README.md
+++ b/src/16-memleak/README.md
@@ -16,12 +16,241 @@
 ## Writing the eBPF Program
 
-TODO
+```c
+/* alloc_info and combined_alloc_info are defined in memleak.h; they are
+ * shown first here because the maps below reference them */
+struct alloc_info {
+    __u64 size;
+    __u64 timestamp_ns;
+    int stack_id;
+};
+
+union combined_alloc_info {
+    struct {
+        __u64 total_size : 40;
+        __u64 number_of_allocs : 24;
+    };
+    __u64 bits;
+};
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, pid_t);
+    __type(value, u64);
+    __uint(max_entries, 10240);
+} sizes SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64); /* address */
+    __type(value, struct alloc_info);
+    __uint(max_entries, ALLOCS_MAX_ENTRIES);
+} allocs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64); /* stack id */
+    __type(value, union combined_alloc_info);
+    __uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES);
+} combined_allocs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64);
+    __type(value, u64);
+    __uint(max_entries, 10240);
+} memptrs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+    __type(key, u32);
+} stack_traces SEC(".maps");
+```
+
+This code defines the five BPF maps used by the memleak tool:
+
++ sizes records the size of the allocation request currently in flight for each process, keyed by PID;
++ allocs tracks every outstanding allocation, keyed by the returned address; the value is a struct alloc_info holding the size, the timestamp and the stack id of the allocation;
++ combined_allocs is keyed by stack id; its value is a union combined_alloc_info that accumulates the total outstanding size and the number of outstanding allocations attributed to that call stack;
++ memptrs carries the memptr out-parameter of posix_memalign() from the entry probe to the return probe (keyed by PID), so the address libc writes there can be read back and recorded like any other allocation;
++ stack_traces is a BPF_MAP_TYPE_STACK_TRACE map: bpf_get_stackid() stores the captured call stack here and returns the stack id used as its key, which lets allocation and free events be attributed to the code path that triggered them.
+
+combined_alloc_info is a union of an anonymous struct and a single __u64 member named bits. The struct packs two bit-fields, total_size (40 bits) and number_of_allocs (24 bits), which record the total outstanding bytes and the number of outstanding allocations for one stack. Restricting the field widths lets both counters share a single 64-bit word, and because they overlay bits, both counters can be read and updated through one operation on bits rather than being maintained separately.
+
+```c
+static int gen_alloc_enter(size_t size)
+{
+    if (size < min_size || size > max_size)
+        return 0;
+
+    if (sample_rate > 1) {
+        if (bpf_ktime_get_ns() % sample_rate != 0)
+            return 0;
+    }
+
+    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY);
+
+    if (trace_all)
+        bpf_printk("alloc entered, size = %lu\n", size);
+
+    return 0;
+}
+
+SEC("uprobe")
+int BPF_KPROBE(malloc_enter, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+```
+
+gen_alloc_enter handles the entry of an allocation request. It first checks that the requested size lies within the configured [min_size, max_size] range and returns 0 (the event is ignored) if it does not. If sampling is enabled (sample_rate > 1), the event is also skipped unless the current nanosecond timestamp is a multiple of sample_rate, which keeps roughly one event in every sample_rate. The function then extracts the PID from bpf_get_current_pid_tgid() and stores the requested size in the sizes map under that PID, so the matching return probe can pick it up. If trace_all is enabled, it also logs the event with bpf_printk so allocations can be watched in real time.
+
+Finally, BPF_KPROBE(malloc_enter, size_t size) is attached as a uprobe on malloc(); whenever malloc is called it runs gen_alloc_enter to record the requested size.
+
+```c
+static void update_statistics_add(u64 stack_id, u64 sz)
+{
+    union combined_alloc_info *existing_cinfo;
+
+    existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo);
+    if (!existing_cinfo)
+        return;
+
+    const union combined_alloc_info incremental_cinfo = {
+        .total_size = sz,
+        .number_of_allocs = 1
+    };
+
+    __sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits);
+}
+
+static int gen_alloc_exit2(void *ctx, u64 address)
+{
+    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    struct alloc_info info;
+
+    const u64* size = bpf_map_lookup_elem(&sizes, &pid);
+    if (!size)
+        return 0; // missed alloc entry
+
+    __builtin_memset(&info, 0, sizeof(info));
+
+    info.size = *size;
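+    /* the recorded size has been consumed, so drop the per-PID entry
+     * from the sizes map before looking at the returned address */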
+    bpf_map_delete_elem(&sizes, &pid);
+
+    if (address != 0) {
+        info.timestamp_ns = bpf_ktime_get_ns();
+
+        info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
+
+        bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
+
+        update_statistics_add(info.stack_id, info.size);
+    }
+
+    if (trace_all) {
+        bpf_printk("alloc exited, size = %lu, result = %lx\n",
+                info.size, address);
+    }
+
+    return 0;
+}
+
+static int gen_alloc_exit(struct pt_regs *ctx)
+{
+    return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(malloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+```
+
+gen_alloc_exit2 runs when an allocation call returns; it records the outcome of the allocation and updates the relevant maps. It first obtains the PID by shifting the value of bpf_get_current_pid_tgid() right by 32 bits, then uses bpf_map_lookup_elem to fetch the size recorded for that PID in the sizes map. If no entry is found, the entry probe was missed and the function returns 0. Otherwise it zeroes info with __builtin_memset, copies the size into info.size and removes the now-consumed entry from sizes with bpf_map_delete_elem.
+
+If address is non-zero the allocation succeeded. In that case the function stores the current timestamp from bpf_ktime_get_ns in info.timestamp_ns, captures the call stack with bpf_get_stackid and stores the returned id in info.stack_id, records the allocation by mapping address to info in the allocs map, and then calls update_statistics_add to update the per-stack statistics in combined_allocs.
+
+Finally, if trace_all is set, the event is printed with bpf_printk for debugging.
+
+update_statistics_add maintains the per-stack allocation statistics; its parameters are the stack id of the allocating call stack and the size of the allocation. It uses bpf_map_lookup_or_try_init to fetch the combined_alloc_info entry for that stack id from combined_allocs, inserting a zero-initialized entry (initial_cinfo) if none exists yet. It then builds an incremental_cinfo with total_size set to the allocation size and number_of_allocs set to 1, and adds it to the existing entry.
+
+The update is done with __sync_fetch_and_add on existing_cinfo->bits. Because both counters are packed into one 64-bit word, a single atomic add advances the total size and the allocation count together, and the counts stay correct even when several threads call update_statistics_add concurrently.
+
+gen_alloc_exit simply forwards ctx to gen_alloc_exit2 and returns its result, using the PT_REGS_RC macro to extract the function's return value (the allocated address) from the register context.
+
+The last definition, BPF_KRETPROBE(malloc_exit), is attached as a uretprobe that fires when malloc() returns; it calls gen_alloc_exit to finish recording the allocation.
+
+```c
+static void update_statistics_del(u64 stack_id, u64 sz)
+{
+    union combined_alloc_info *existing_cinfo;
+
+    existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
+    if (!existing_cinfo) {
+        bpf_printk("failed to lookup combined allocs\n");
+
+        return;
+    }
+
+    const union combined_alloc_info decremental_cinfo = {
+        .total_size = sz,
+        .number_of_allocs = 1
+    };
+
+    __sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
+}
+
+static int gen_free_enter(const void *address)
+{
+    const u64 addr = (u64)address;
+
+    const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
+    if (!info)
+        return 0;
+
+    bpf_map_delete_elem(&allocs, &addr);
+    update_statistics_del(info->stack_id, info->size);
+
+    if (trace_all) {
+        bpf_printk("free entered, address = %lx, size = %lu\n",
+                address, info->size);
+    }
+
+    return 0;
+}
+
+SEC("uprobe")
+int BPF_KPROBE(free_enter, void *address)
+{
+    return gen_free_enter(address);
+}
+```
+
+gen_free_enter receives the address being freed and looks it up in the allocs map. If no entry is found, the address was never recorded as an allocation (or it was filtered out at allocation time) and the function returns 0. If an entry is found, it is deleted from allocs with bpf_map_delete_elem and update_statistics_del is called with the stack id and size stored in it.
+
+update_statistics_del reverses what update_statistics_add did. It looks up the combined_alloc_info entry for the given stack id in combined_allocs; if the lookup fails it logs a message and returns. Otherwise it atomically subtracts a decremental_cinfo with total_size = sz and number_of_allocs = 1 from the entry's bits, so the stack's outstanding byte count shrinks by the size of the freed block and its outstanding allocation count drops by one.
+
+The last definition, BPF_KPROBE(free_enter, void *address), is attached as a uprobe on free(); it receives the address of the block being released and calls gen_free_enter to process the release.
 
 ## Compile and Run
 
-TODO
+```console
+$ git clone https://github.com/iovisor/bcc.git --recurse-submodules
+$ cd bcc/libbpf-tools/
+$ make memleak
+```
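+
+If you want a quick target to verify the tool before attaching it to a real workload, a tiny program that leaks on purpose works well. The sketch below is only an illustration; the file name leaker.c and the fixed 16-byte allocation are arbitrary choices, not part of the upstream tool:
+
+```c
+/* leaker.c: allocate a small block every second and never free it */
+#include <stdlib.h>
+#include <unistd.h>
+
+int main(void)
+{
+    for (;;) {
+        void *p = malloc(16); /* deliberately leaked */
+        (void)p;              /* keep the compiler quiet about the unused result */
+        sleep(1);
+    }
+    return 0;
+}
+```
+
+Compile it with gcc -o leaker leaker.c, run it in another terminal, and point memleak at its PID (the libbpf-tools version accepts -p <pid> to restrict tracing to a single process); the reported outstanding allocations from main() should then grow steadily.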
+
+```console
+$ sudo ./memleak
+using default object: libc.so.6
+using page size: 4096
+tracing kernel: true
+Tracing outstanding memory allocs...  Hit Ctrl-C to end
+[17:17:27] Top 10 stacks with outstanding allocations:
+1236992 bytes in 302 allocations from stack
+        0 []
+        1 []
+        2 []
+        3 []
+        4 []
+        5 []
+        6 []
+...
+```
 
 ## Summary
 
-TODO
+memleak is a memory-leak monitoring tool: it traces memory allocation and free events together with the call stacks that triggered them. Over time it reports the allocations that have never been released, which is where leaks show up.
+
+The code in this lesson comes from https://github.com/iovisor/bcc/blob/master/libbpf-tools/memleak.bpf.c.
diff --git a/src/16-memleak/memleak.bpf.c b/src/16-memleak/memleak.bpf.c
new file mode 100644
index 0000000..ac35a55
--- /dev/null
+++ b/src/16-memleak/memleak.bpf.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+#include "maps.bpf.h"
+#include "memleak.h"
+#include "core_fixes.bpf.h"
+
+const volatile size_t min_size = 0;
+const volatile size_t max_size = -1;
+const volatile size_t page_size = 4096;
+const volatile __u64 sample_rate = 1;
+const volatile bool trace_all = false;
+const volatile __u64 stack_flags = 0;
+const volatile bool wa_missing_free = false;
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, pid_t);
+    __type(value, u64);
+    __uint(max_entries, 10240);
+} sizes SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64); /* address */
+    __type(value, struct alloc_info);
+    __uint(max_entries, ALLOCS_MAX_ENTRIES);
+} allocs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64); /* stack id */
+    __type(value, union combined_alloc_info);
+    __uint(max_entries, COMBINED_ALLOCS_MAX_ENTRIES);
+} combined_allocs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __type(key, u64);
+    __type(value, u64);
+    __uint(max_entries, 10240);
+} memptrs SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+    __type(key, u32);
+} stack_traces SEC(".maps");
+
+static union combined_alloc_info initial_cinfo;
+
+static void update_statistics_add(u64 stack_id, u64 sz)
+{
+    union combined_alloc_info *existing_cinfo;
+
+    existing_cinfo = bpf_map_lookup_or_try_init(&combined_allocs, &stack_id, &initial_cinfo);
+    if (!existing_cinfo)
+        return;
+
+    const union combined_alloc_info incremental_cinfo = {
+        .total_size = sz,
+        .number_of_allocs = 1
+    };
+
+    __sync_fetch_and_add(&existing_cinfo->bits, incremental_cinfo.bits);
+}
+
+static void update_statistics_del(u64 stack_id, u64 sz)
+{
+    union combined_alloc_info *existing_cinfo;
+
+    existing_cinfo = bpf_map_lookup_elem(&combined_allocs, &stack_id);
+    if (!existing_cinfo) {
+        bpf_printk("failed to lookup combined allocs\n");
+
+        return;
+    }
+
+    const union combined_alloc_info decremental_cinfo = {
+        .total_size = sz,
+        .number_of_allocs = 1
+    };
+
+    __sync_fetch_and_sub(&existing_cinfo->bits, decremental_cinfo.bits);
+}
+
+static int gen_alloc_enter(size_t size)
+{
+    if (size < min_size || size > max_size)
+        return 0;
+
+    if (sample_rate > 1) {
+        if (bpf_ktime_get_ns() % sample_rate != 0)
+            return 0;
+    }
+
+    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_map_update_elem(&sizes, &pid, &size, BPF_ANY);
+
+    if (trace_all)
+        bpf_printk("alloc entered, size = %lu\n", size);
+
+    return 0;
+}
+
+static int gen_alloc_exit2(void *ctx, u64 address)
+{
+    const pid_t pid = bpf_get_current_pid_tgid() >> 32;
+    struct alloc_info info;
+
+    const u64* size = bpf_map_lookup_elem(&sizes, &pid);
+    if (!size)
+        return 0; // missed alloc entry
+
+    __builtin_memset(&info, 0, sizeof(info));
+
+    info.size = *size;
+    bpf_map_delete_elem(&sizes, &pid);
+
+    if (address != 0) {
+        info.timestamp_ns = bpf_ktime_get_ns();
+
+        info.stack_id = bpf_get_stackid(ctx, &stack_traces, stack_flags);
+
+        bpf_map_update_elem(&allocs, &address, &info, BPF_ANY);
+
+        update_statistics_add(info.stack_id, info.size);
+    }
+
+    if (trace_all) {
+        bpf_printk("alloc exited, size = %lu, result = %lx\n",
+                info.size, address);
+    }
+
+    return 0;
+}
+
+static int gen_alloc_exit(struct pt_regs *ctx)
+{
+    return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
+}
+
+static int gen_free_enter(const void *address)
+{
+    const u64 addr = (u64)address;
+
+    const struct alloc_info *info = bpf_map_lookup_elem(&allocs, &addr);
+    if (!info)
+        return 0;
+
+    bpf_map_delete_elem(&allocs, &addr);
+    update_statistics_del(info->stack_id, info->size);
+
+    if (trace_all) {
+        bpf_printk("free entered, address = %lx, size = %lu\n",
+                address, info->size);
+    }
+
+    return 0;
+}
+
+SEC("uprobe")
+int BPF_KPROBE(malloc_enter, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(malloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(free_enter, void *address)
+{
+    return gen_free_enter(address);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(calloc_enter, size_t nmemb, size_t size)
+{
+    return gen_alloc_enter(nmemb * size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(calloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(realloc_enter, void *ptr, size_t size)
+{
+    gen_free_enter(ptr);
+
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(realloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(mmap_enter, void *address, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(mmap_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(munmap_enter, void *address)
+{
+    return gen_free_enter(address);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(posix_memalign_enter, void **memptr, size_t alignment, size_t size)
+{
+    const u64 memptr64 = (u64)(size_t)memptr;
+    const u64 pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_map_update_elem(&memptrs, &pid, &memptr64, BPF_ANY);
+
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(posix_memalign_exit)
+{
+    const u64 pid = bpf_get_current_pid_tgid() >> 32;
+    u64 *memptr64;
+    void *addr;
+
+    memptr64 = bpf_map_lookup_elem(&memptrs, &pid);
+    if (!memptr64)
+        return 0;
+
+    bpf_map_delete_elem(&memptrs, &pid);
+
+    if (bpf_probe_read_user(&addr, sizeof(void*), (void*)(size_t)*memptr64))
+        return 0;
+
+    const u64 addr64 = (u64)(size_t)addr;
+
+    return gen_alloc_exit2(ctx, addr64);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(aligned_alloc_enter, size_t alignment, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(aligned_alloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(valloc_enter, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(valloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(memalign_enter, size_t alignment, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(memalign_exit)
+{
+    return gen_alloc_exit(ctx);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(pvalloc_enter, size_t size)
+{
+    return gen_alloc_enter(size);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(pvalloc_exit)
+{
+    return gen_alloc_exit(ctx);
+}
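+
+/*
+ * The tracepoint handlers below cover kernel-side allocations (kmalloc,
+ * kmem_cache_alloc, page and per-CPU allocations). When the
+ * wa_missing_free workaround is enabled, an address that reappears in a
+ * new allocation is first handled as if it had been freed, so entries
+ * left behind by missed free events do not accumulate in the allocs map.
+ */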
+
+SEC("tracepoint/kmem/kmalloc")
+int memleak__kmalloc(struct trace_event_raw_kmem_alloc *ctx)
+{
+    if (wa_missing_free)
+        gen_free_enter(ctx->ptr);
+
+    gen_alloc_enter(ctx->bytes_alloc);
+
+    return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
+}
+
+SEC("tracepoint/kmem/kmalloc_node")
+int memleak__kmalloc_node(struct trace_event_raw_kmem_alloc_node *ctx)
+{
+    if (wa_missing_free)
+        gen_free_enter(ctx->ptr);
+
+    gen_alloc_enter(ctx->bytes_alloc);
+
+    return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
+}
+
+SEC("tracepoint/kmem/kfree")
+int memleak__kfree(void *ctx)
+{
+    const void *ptr;
+
+    if (has_kfree()) {
+        struct trace_event_raw_kfree___x *args = ctx;
+        ptr = BPF_CORE_READ(args, ptr);
+    } else {
+        struct trace_event_raw_kmem_free___x *args = ctx;
+        ptr = BPF_CORE_READ(args, ptr);
+    }
+
+    return gen_free_enter((void *)ptr);
+}
+
+SEC("tracepoint/kmem/kmem_cache_alloc")
+int memleak__kmem_cache_alloc(struct trace_event_raw_kmem_alloc *ctx)
+{
+    if (wa_missing_free)
+        gen_free_enter(ctx->ptr);
+
+    gen_alloc_enter(ctx->bytes_alloc);
+
+    return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
+}
+
+SEC("tracepoint/kmem/kmem_cache_alloc_node")
+int memleak__kmem_cache_alloc_node(struct trace_event_raw_kmem_alloc_node *ctx)
+{
+    if (wa_missing_free)
+        gen_free_enter(ctx->ptr);
+
+    gen_alloc_enter(ctx->bytes_alloc);
+
+    return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
+}
+
+SEC("tracepoint/kmem/kmem_cache_free")
+int memleak__kmem_cache_free(void *ctx)
+{
+    const void *ptr;
+
+    if (has_kmem_cache_free()) {
+        struct trace_event_raw_kmem_cache_free___x *args = ctx;
+        ptr = BPF_CORE_READ(args, ptr);
+    } else {
+        struct trace_event_raw_kmem_free___x *args = ctx;
+        ptr = BPF_CORE_READ(args, ptr);
+    }
+
+    return gen_free_enter((void *)ptr);
+}
+
+SEC("tracepoint/kmem/mm_page_alloc")
+int memleak__mm_page_alloc(struct trace_event_raw_mm_page_alloc *ctx)
+{
+    gen_alloc_enter(page_size << ctx->order);
+
+    return gen_alloc_exit2(ctx, ctx->pfn);
+}
+
+SEC("tracepoint/kmem/mm_page_free")
+int memleak__mm_page_free(struct trace_event_raw_mm_page_free *ctx)
+{
+    return gen_free_enter((void *)ctx->pfn);
+}
+
+SEC("tracepoint/percpu/percpu_alloc_percpu")
+int memleak__percpu_alloc_percpu(struct trace_event_raw_percpu_alloc_percpu *ctx)
+{
+    gen_alloc_enter(ctx->bytes_alloc);
+
+    return gen_alloc_exit2(ctx, (u64)(ctx->ptr));
+}
+
+SEC("tracepoint/percpu/percpu_free_percpu")
+int memleak__percpu_free_percpu(struct trace_event_raw_percpu_free_percpu *ctx)
+{
+    return gen_free_enter(ctx->ptr);
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/src/16-memleak/memleak.h b/src/16-memleak/memleak.h
new file mode 100644
index 0000000..18ed1e1
--- /dev/null
+++ b/src/16-memleak/memleak.h
@@ -0,0 +1,21 @@
+#ifndef __MEMLEAK_H
+#define __MEMLEAK_H
+
+#define ALLOCS_MAX_ENTRIES 1000000
+#define COMBINED_ALLOCS_MAX_ENTRIES 10240
+
+struct alloc_info {
+    __u64 size;
+    __u64 timestamp_ns;
+    int stack_id;
+};
+
+union combined_alloc_info {
+    struct {
+        __u64 total_size : 40;
+        __u64 number_of_allocs : 24;
+    };
+    __u64 bits;
+};
+
+#endif /* __MEMLEAK_H */