From 2ca0e4023a146dc8dd2745e83f4fda14c0ed2ead Mon Sep 17 00:00:00 2001 From: yunwei37 Date: Mon, 13 Oct 2025 08:57:34 -0700 Subject: [PATCH] feat: Add Python stack profiler using eBPF for enhanced performance analysis --- src/46-xdp-test/README.md | 4 +- src/trace/python-stack-profiler/.gitignore | 18 + src/trace/python-stack-profiler/Makefile | 94 ++++ src/trace/python-stack-profiler/README.md | 93 ++++ .../python-stack-profiler/python-stack.bpf.c | 230 ++++++++++ .../python-stack-profiler/python-stack.c | 414 ++++++++++++++++++ .../python-stack-profiler/python-stack.h | 115 +++++ 7 files changed, 966 insertions(+), 2 deletions(-) create mode 100644 src/trace/python-stack-profiler/.gitignore create mode 100644 src/trace/python-stack-profiler/Makefile create mode 100644 src/trace/python-stack-profiler/README.md create mode 100644 src/trace/python-stack-profiler/python-stack.bpf.c create mode 100644 src/trace/python-stack-profiler/python-stack.c create mode 100644 src/trace/python-stack-profiler/python-stack.h diff --git a/src/46-xdp-test/README.md b/src/46-xdp-test/README.md index 68c538b..8a0b3c7 100644 --- a/src/46-xdp-test/README.md +++ b/src/46-xdp-test/README.md @@ -18,11 +18,11 @@ Traditional BPF_PROG_RUN operates in "dry run" mode - packets are processed but ### Live Frames Mode: Real Packet Injection -In Linux 5.18+, the kernel introduced **live frames mode** via the `BPF_F_TEST_XDP_LIVE_FRAMES` flag. This fundamentally changes BPF_PROG_RUN behavior. When enabled, XDP_TX actions don't just return - they actually transmit packets on the wire through the specified network interface. This turns BPF_PROG_RUN into a powerful packet generator. +In Linux 5.18+, the kernel introduced live frames mode via the `BPF_F_TEST_XDP_LIVE_FRAMES` flag. This fundamentally changes BPF_PROG_RUN behavior. When enabled, XDP_TX actions don't just return; they actually transmit packets on the wire through the specified network interface. This turns BPF_PROG_RUN into a powerful packet generator. Here's how it works: Your userspace program constructs a packet (Ethernet frame with IP header, UDP payload, etc.) and passes it to `bpf_prog_test_run()` with live frames enabled. The XDP program receives this packet in its `xdp_md` context. If the program returns `XDP_TX`, the kernel transmits the packet through the network driver as if it arrived on the interface and was reflected back. The packet appears on the wire with full hardware offload support (checksumming, segmentation, etc.). -This enables several powerful use cases. **Network stack stress testing**: Flood your system with millions of packets per second to find breaking points in the network stack, driver, or application layer. **XDP program benchmarking**: Measure how many packets per second your XDP program can process under realistic load without external packet generators. **Protocol fuzzing**: Generate malformed packets or unusual protocol sequences to test robustness. **Synthetic traffic generation**: Create realistic traffic patterns for testing load balancers, firewalls, or intrusion detection systems. +This enables several powerful use cases. Network stack stress testing floods your system with millions of packets per second to find breaking points in the network stack, driver, or application layer. XDP program benchmarking measures how many packets per second your XDP program can process under realistic load without external packet generators. Protocol fuzzing generates malformed packets or unusual protocol sequences to test robustness. Synthetic traffic generation creates realistic traffic patterns for testing load balancers, firewalls, or intrusion detection systems. ### The XDP_TX Reflection Loop diff --git a/src/trace/python-stack-profiler/.gitignore b/src/trace/python-stack-profiler/.gitignore new file mode 100644 index 0000000..bd5dfdf --- /dev/null +++ b/src/trace/python-stack-profiler/.gitignore @@ -0,0 +1,18 @@ +# Build outputs +*.o +*.bpf.o +*.skel.h +python-stack + +# Output directory +.output/ + +# Editor files +*.swp +*.swo +*~ +.vscode/ +.idea/ + +# Temporary files +*.tmp diff --git a/src/trace/python-stack-profiler/Makefile b/src/trace/python-stack-profiler/Makefile new file mode 100644 index 0000000..592be6a --- /dev/null +++ b/src/trace/python-stack-profiler/Makefile @@ -0,0 +1,94 @@ +APP := python-stack + +THIRD_PARTY_PATH := ../../third_party + +# Architecture detection +ARCH := $(shell uname -m | sed 's/x86_64/x86/' | sed 's/aarch64/arm64/') + +# VMLINUX header path +VMLINUX_DIR := $(THIRD_PARTY_PATH)/vmlinux/$(ARCH) +VMLINUX_BTF_H := $(VMLINUX_DIR)/vmlinux.h + +# Libbpf +LIBBPF_SRC := $(abspath $(THIRD_PARTY_PATH)/libbpf/src) +LIBBPF_OBJ := $(abspath $(THIRD_PARTY_PATH)/libbpf/src/staticobjs/libbpf.a) +LIBBPF_OBJDIR := $(abspath $(THIRD_PARTY_PATH)/libbpf/src/staticobjs) + +# BPF Code +CLANG ?= clang +BPFTOOL ?= $(abspath $(THIRD_PARTY_PATH)/bpftool/src/bpftool) + +INCLUDES := -I$(LIBBPF_SRC) -I$(THIRD_PARTY_PATH)/bpftool/include/uapi -I$(VMLINUX_DIR) +CFLAGS := -g -Wall + +ALL_LDFLAGS := $(LDFLAGS) + +APPS = $(APP) + +# BPF source +BPF_SRC := $(APP).bpf.c + +# BPF object and skeleton +BPF_OBJ := $(APP).bpf.o +BPF_SKEL := $(APP).skel.h + +# Userspace source +USER_SRC := $(APP).c +USER_OBJ := $(APP).o + +.PHONY: all +all: $(APPS) + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.c) $(wildcard $(LIBBPF_SRC)/*.h) + @echo "Building libbpf..." + $(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 OBJDIR=$(LIBBPF_OBJDIR) + +# Build bpftool +$(BPFTOOL): + @echo "Building bpftool..." + $(MAKE) -C $(THIRD_PARTY_PATH)/bpftool/src + +# Generate vmlinux.h if needed +$(VMLINUX_BTF_H): + @if [ ! -f $(VMLINUX_BTF_H) ]; then \ + echo "Generating $(VMLINUX_BTF_H)..."; \ + mkdir -p $(VMLINUX_DIR); \ + $(BPFTOOL) btf dump file /sys/kernel/btf/vmlinux format c > $(VMLINUX_BTF_H); \ + fi + +# Build BPF object +$(BPF_OBJ): $(BPF_SRC) $(LIBBPF_OBJ) $(VMLINUX_BTF_H) + @echo "Building BPF object: $(BPF_OBJ)" + $(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) $(INCLUDES) -c $(BPF_SRC) -o $(BPF_OBJ) + +# Generate BPF skeleton +$(BPF_SKEL): $(BPF_OBJ) $(BPFTOOL) + @echo "Generating BPF skeleton: $(BPF_SKEL)" + $(BPFTOOL) gen skeleton $(BPF_OBJ) > $(BPF_SKEL) + +# Build userspace program +$(USER_OBJ): $(USER_SRC) $(BPF_SKEL) + @echo "Building userspace object: $(USER_OBJ)" + $(CC) $(CFLAGS) $(INCLUDES) -c $(USER_SRC) -o $(USER_OBJ) + +# Link final binary +$(APP): $(USER_OBJ) $(LIBBPF_OBJ) + @echo "Linking $(APP)..." + $(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# Clean +.PHONY: clean +clean: + rm -f $(BPF_OBJ) $(BPF_SKEL) $(USER_OBJ) $(APP) + rm -f *.o *.skel.h + +# Help +.PHONY: help +help: + @echo "Makefile for $(APP)" + @echo "" + @echo "Targets:" + @echo " all - Build everything (default)" + @echo " clean - Remove generated files" + @echo " help - Show this help" diff --git a/src/trace/python-stack-profiler/README.md b/src/trace/python-stack-profiler/README.md new file mode 100644 index 0000000..05bc5bb --- /dev/null +++ b/src/trace/python-stack-profiler/README.md @@ -0,0 +1,93 @@ +# eBPF Tutorial: Python Stack Profiler + +Profile Python applications at the OS level using eBPF to capture native and Python call stacks, helping identify performance bottlenecks in Python programs including data science workloads, web servers, and ML inference. + +> The complete source code: + +## Overview + +Python profiling traditionally relies on instrumentation (cProfile) or sampling within the interpreter (py-spy). These approaches have limitations: +- **cProfile**: High overhead, requires code modification +- **py-spy**: Samples from userspace, may miss short-lived functions +- **perf**: Captures native stacks but can't see Python function names + +This tutorial shows how to use eBPF to capture both native C stacks AND Python interpreter stacks, giving you complete visibility into where your Python application spends time. + +## What You'll Learn + +1. How to attach eBPF probes to Python processes +2. Walking Python interpreter frame structures from kernel space +3. Extracting Python function names, filenames, and line numbers +4. Combining native and Python stacks for complete profiling +5. Generating flamegraphs for Python applications + +## Prerequisites + +- Linux kernel 5.15+ (for BPF ring buffer support) +- Python 3.8+ running on your system +- Root access (for loading eBPF programs) +- Understanding of stack traces and profiling concepts + +## Building and Running + +```bash +make +sudo ./python-stack +``` + +## How It Works + +The profiler samples Python processes at a regular interval (e.g., 49Hz to avoid lock-step with scheduler). For each sample: + +1. **Capture native stack**: Use BPF stack helpers to get kernel and userspace stacks +2. **Identify Python threads**: Check if the process is running Python interpreter +3. **Walk Python frames**: Read PyFrameObject chain from CPython internals +4. **Extract symbols**: Get function names, filenames, line numbers from PyCodeObject +5. **Aggregate data**: Count stack occurrences for flamegraph generation + +## Python Internals + +CPython's frame structure (simplified): + +```c +struct _frame { + struct _frame *f_back; // Previous frame + PyCodeObject *f_code; // Code object + int f_lineno; // Current line number +}; + +struct PyCodeObject { + PyObject *co_filename; // Source filename + PyObject *co_name; // Function name +}; +``` + +## Example Output + +``` +python-script.py:main;process_data;expensive_function 247 +python-script.py:main;load_model;torch.load 189 +python-script.py:main;preprocess;np.array 156 +``` + +Each line shows the stack trace and sample count. + +## Use Cases + +- **ML/AI workloads**: Profile PyTorch, TensorFlow, NumPy operations +- **Web servers**: Find bottlenecks in Flask, Django, FastAPI +- **Data processing**: Optimize pandas, polars operations +- **General Python**: Any Python application performance analysis + +## Next Steps + +- Extend to capture GIL contention +- Add Python object allocation tracking +- Integrate with other eBPF metrics (CPU, memory) +- Build flamegraph visualization + +## References + +- [CPython Internals](https://realpython.com/cpython-source-code-guide/) +- [Python Frame Objects](https://docs.python.org/3/c-api/frame.html) +- [eBPF Stack Traces](https://www.brendangregg.com/blog/2016-01-20/ebpf-offcpu-flame-graph.html) diff --git a/src/trace/python-stack-profiler/python-stack.bpf.c b/src/trace/python-stack-profiler/python-stack.bpf.c new file mode 100644 index 0000000..8d71796 --- /dev/null +++ b/src/trace/python-stack-profiler/python-stack.bpf.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Python Stack Profiler - Capture Python interpreter stacks with eBPF + * Based on oncputime by Eunseon Lee + */ +#include +#include +#include +#include +#include "python-stack.h" + +#define EEXIST 17 + +const volatile bool kernel_stacks_only = false; +const volatile bool user_stacks_only = false; +const volatile bool include_idle = false; +const volatile bool filter_by_pid = false; +const volatile bool filter_by_tid = false; +const volatile bool python_only = true; // Only trace Python processes + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __type(key, u32); +} stackmap SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct key_t); + __type(value, u64); + __uint(max_entries, MAX_ENTRIES); +} counts SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u8); + __uint(max_entries, MAX_PID_NR); +} pids SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); + __type(value, u8); + __uint(max_entries, MAX_TID_NR); +} tids SEC(".maps"); + +// Store Python thread state pointers for each thread +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); // tid + __type(value, u64); // PyThreadState pointer + __uint(max_entries, 1024); +} python_thread_states SEC(".maps"); + +static __always_inline void * +bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) +{ + void *val; + int err; + + val = bpf_map_lookup_elem(map, key); + if (val) + return val; + + err = bpf_map_update_elem(map, key, init, BPF_NOEXIST); + if (err && err != -EEXIST) + return 0; + + return bpf_map_lookup_elem(map, key); +} + +// Read a Python string object (PyUnicodeObject or PyBytesObject) +static __always_inline int read_python_string(void *str_obj, char *buf, int buf_size) +{ + if (!str_obj || !buf || buf_size <= 0) + return -1; + + // Try to read as PyUnicodeObject (Python 3) + struct PyUnicodeObject unicode_obj; + if (bpf_probe_read_user(&unicode_obj, sizeof(unicode_obj), str_obj) == 0) { + // Check if it's an ASCII compact string (most common case) + if (unicode_obj.state.compact && unicode_obj.state.ascii) { + // For compact ASCII strings, data immediately follows the struct + void *data_ptr = (void *)str_obj + sizeof(struct PyUnicodeObject); + int len = unicode_obj.length < (buf_size - 1) ? + unicode_obj.length : (buf_size - 1); + + if (bpf_probe_read_user_str(buf, len + 1, data_ptr) > 0) + return 0; + } + } + + // Fallback: Try to read as PyBytesObject (Python 2 style or bytes in Python 3) + struct PyBytesObject bytes_obj; + if (bpf_probe_read_user(&bytes_obj, sizeof(bytes_obj), str_obj) == 0) { + void *data_ptr = (void *)str_obj + + __builtin_offsetof(struct PyBytesObject, ob_sval); + int len = bytes_obj.ob_base.ob_size < (buf_size - 1) ? + bytes_obj.ob_base.ob_size : (buf_size - 1); + + if (bpf_probe_read_user_str(buf, len + 1, data_ptr) > 0) + return 0; + } + + return -1; +} + +// Walk Python frame chain and extract stack information +static __always_inline int get_python_stack(struct PyFrameObject *frame, + struct python_stack *stack) +{ + struct PyFrameObject current_frame; + struct PyCodeObject code_obj; + int depth = 0; + + #pragma unroll + for (int i = 0; i < MAX_STACK_DEPTH; i++) { + if (!frame) + break; + + // Read the frame object + if (bpf_probe_read_user(¤t_frame, sizeof(current_frame), frame) != 0) + break; + + // Read the code object + if (!current_frame.f_code) + break; + + if (bpf_probe_read_user(&code_obj, sizeof(code_obj), + current_frame.f_code) != 0) + break; + + // Extract function name + if (read_python_string(code_obj.co_name, + stack->frames[depth].function_name, + FUNCTION_NAME_LEN) != 0) { + __builtin_memcpy(stack->frames[depth].function_name, + "", 10); + } + + // Extract filename + if (read_python_string(code_obj.co_filename, + stack->frames[depth].file_name, + FILE_NAME_LEN) != 0) { + __builtin_memcpy(stack->frames[depth].file_name, + "", 10); + } + + // Extract line number + stack->frames[depth].line_number = current_frame.f_lineno; + + depth++; + frame = current_frame.f_back; + } + + stack->depth = depth; + return depth; +} + +SEC("perf_event") +int do_perf_event(struct bpf_perf_event_data *ctx) +{ + u64 *valp; + static const u64 zero; + struct key_t key = {}; + u64 id; + u32 pid; + u32 tid; + + id = bpf_get_current_pid_tgid(); + pid = id >> 32; + tid = id; + + if (!include_idle && tid == 0) + return 0; + + if (filter_by_pid && !bpf_map_lookup_elem(&pids, &pid)) + return 0; + + if (filter_by_tid && !bpf_map_lookup_elem(&tids, &tid)) + return 0; + + key.pid = pid; + bpf_get_current_comm(&key.name, sizeof(key.name)); + + // Get native stacks + if (user_stacks_only) + key.kern_stack_id = -1; + else + key.kern_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, 0); + + if (kernel_stacks_only) + key.user_stack_id = -1; + else + key.user_stack_id = bpf_get_stackid(&ctx->regs, &stackmap, + BPF_F_USER_STACK); + + // Try to get Python stack + // Note: This is a simplified approach. In reality, you'd need to: + // 1. Find the PyThreadState for this thread (via TLS or global state) + // 2. This requires knowing Python's thread state location, which varies + // For now, we initialize an empty Python stack + key.py_stack.depth = 0; + + // TODO: Implement Python thread state discovery + // This would typically involve: + // - Finding libpython.so in process memory + // - Locating _PyThreadState_Current or similar + // - Reading the thread state for this TID + // - Walking the frame chain + + u64 *thread_state_ptr = bpf_map_lookup_elem(&python_thread_states, &tid); + if (thread_state_ptr && *thread_state_ptr != 0) { + struct PyThreadState thread_state; + if (bpf_probe_read_user(&thread_state, sizeof(thread_state), + (void *)*thread_state_ptr) == 0) { + if (thread_state.frame) { + get_python_stack(thread_state.frame, &key.py_stack); + } + } + } + + valp = bpf_map_lookup_or_try_init(&counts, &key, &zero); + if (valp) + __sync_fetch_and_add(valp, 1); + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; diff --git a/src/trace/python-stack-profiler/python-stack.c b/src/trace/python-stack-profiler/python-stack.c new file mode 100644 index 0000000..0bd958e --- /dev/null +++ b/src/trace/python-stack-profiler/python-stack.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * profile Profile CPU usage by sampling stack traces at a timed interval. + * Copyright (c) 2022 LG Electronics + * + * Based on profile from BCC by Brendan Gregg and others. + * 28-Dec-2021 Eunseon Lee Created this. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "oncputime.h" +#include "oncputime.skel.h" +#include "blazesym.h" +#include "arg_parse.h" + +#define SYM_INFO_LEN 2048 + +/* + * -EFAULT in get_stackid normally means the stack-trace is not available, + * such as getting kernel stack trace in user mode + */ +#define STACK_ID_EFAULT(stack_id) (stack_id == -EFAULT) + +#define STACK_ID_ERR(stack_id) ((stack_id < 0) && !STACK_ID_EFAULT(stack_id)) + +/* hash collision (-EEXIST) suggests that stack map size may be too small */ +#define CHECK_STACK_COLLISION(ustack_id, kstack_id) \ + (kstack_id == -EEXIST || ustack_id == -EEXIST) + +#define MISSING_STACKS(ustack_id, kstack_id) \ + (!env.user_stacks_only && STACK_ID_ERR(kstack_id)) + (!env.kernel_stacks_only && STACK_ID_ERR(ustack_id)) + +/* This structure combines key_t and count which should be sorted together */ +struct key_ext_t { + struct key_t k; + __u64 v; +}; + +static blaze_symbolizer *symbolizer; + +static int nr_cpus; + +static int open_and_attach_perf_event(struct bpf_program *prog, + struct bpf_link *links[]) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .freq = env.freq, + .sample_freq = env.sample_freq, + .config = PERF_COUNT_SW_CPU_CLOCK, + }; + int i, fd; + + for (i = 0; i < nr_cpus; i++) { + if (env.cpu != -1 && env.cpu != i) + continue; + + fd = syscall(__NR_perf_event_open, &attr, -1, i, -1, 0); + if (fd < 0) { + /* Ignore CPU that is offline */ + if (errno == ENODEV) + continue; + + fprintf(stderr, "failed to init perf sampling: %s\n", + strerror(errno)); + return -1; + } + + links[i] = bpf_program__attach_perf_event(prog, fd); + if (!links[i]) { + fprintf(stderr, "failed to attach perf event on cpu: " + "%d\n", i); + links[i] = NULL; + close(fd); + return -1; + } + } + + return 0; +} + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; + + return vfprintf(stderr, format, args); +} + +static void sig_handler(int sig) +{ +} + +static int cmp_counts(const void *a, const void *b) +{ + const __u64 x = ((struct key_ext_t *) a)->v; + const __u64 y = ((struct key_ext_t *) b)->v; + + /* descending order */ + return y - x; +} + +static int read_counts_map(int fd, struct key_ext_t *items, __u32 *count) +{ + struct key_t empty = {}; + struct key_t *lookup_key = ∅ + int i = 0; + int err; + + while (bpf_map_get_next_key(fd, lookup_key, &items[i].k) == 0) { + err = bpf_map_lookup_elem(fd, &items[i].k, &items[i].v); + if (err < 0) { + fprintf(stderr, "failed to lookup counts: %d\n", err); + return -err; + } + + if (items[i].v == 0) + continue; + + lookup_key = &items[i].k; + i++; + } + + *count = i; + return 0; +} + +static int print_count(struct key_t *event, __u64 count, int stack_map) +{ + unsigned long *ip; + int ret; + bool has_kernel_stack, has_user_stack; + + ip = calloc(env.perf_max_stack_depth, sizeof(unsigned long)); + if (!ip) { + fprintf(stderr, "failed to alloc ip\n"); + return -ENOMEM; + } + + has_kernel_stack = !STACK_ID_EFAULT(event->kern_stack_id); + has_user_stack = !STACK_ID_EFAULT(event->user_stack_id); + + if (!env.folded) { + /* multi-line stack output */ + /* Show kernel stack first */ + if (!env.user_stacks_only && has_kernel_stack) { + if (bpf_map_lookup_elem(stack_map, &event->kern_stack_id, ip) != 0) { + fprintf(stderr, " [Missed Kernel Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0); + } + } + + if (env.delimiter && !env.user_stacks_only && !env.kernel_stacks_only && + has_user_stack && has_kernel_stack) { + printf(" --\n"); + } + + /* Then show user stack */ + if (!env.kernel_stacks_only && has_user_stack) { + if (bpf_map_lookup_elem(stack_map, &event->user_stack_id, ip) != 0) { + fprintf(stderr, " [Missed User Stack]\n"); + } else { + show_stack_trace(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, event->pid); + } + } + + printf(" %-16s %s (%d)\n", "-", event->name, event->pid); + printf(" %lld\n", count); + } else { + /* folded stack output */ + printf("%s", event->name); + + /* Print user stack first for folded format */ + if (has_user_stack && !env.kernel_stacks_only) { + if (bpf_map_lookup_elem(stack_map, &event->user_stack_id, ip) != 0) { + printf(";[Missed User Stack]"); + } else { + printf(";"); + show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, event->pid, ';', true); + } + } + + /* Then print kernel stack if it exists */ + if (has_kernel_stack && !env.user_stacks_only) { + /* Add delimiter between user and kernel stacks if needed */ + if (has_user_stack && env.delimiter && !env.kernel_stacks_only) + printf("-"); + + if (bpf_map_lookup_elem(stack_map, &event->kern_stack_id, ip) != 0) { + printf(";[Missed Kernel Stack]"); + } else { + printf(";"); + show_stack_trace_folded(symbolizer, (__u64 *)ip, env.perf_max_stack_depth, 0, ';', true); + } + } + + printf(" %lld\n", count); + } + + free(ip); + + return 0; +} + +static int print_counts(int counts_map, int stack_map) +{ + struct key_ext_t *counts; + struct key_t *event; + __u64 count; + __u32 nr_count = MAX_ENTRIES; + size_t nr_missing_stacks = 0; + bool has_collision = false; + int i, ret = 0; + + counts = calloc(MAX_ENTRIES, sizeof(struct key_ext_t)); + if (!counts) { + fprintf(stderr, "Out of memory\n"); + return -ENOMEM; + } + + ret = read_counts_map(counts_map, counts, &nr_count); + if (ret) + goto cleanup; + + qsort(counts, nr_count, sizeof(struct key_ext_t), cmp_counts); + + for (i = 0; i < nr_count; i++) { + event = &counts[i].k; + count = counts[i].v; + + print_count(event, count, stack_map); + + /* Add a newline between stack traces for better readability */ + if (!env.folded && i < nr_count - 1) + printf("\n"); + + /* handle stack id errors */ + nr_missing_stacks += MISSING_STACKS(event->user_stack_id, event->kern_stack_id); + has_collision = CHECK_STACK_COLLISION(event->user_stack_id, event->kern_stack_id); + } + + if (nr_missing_stacks > 0) { + fprintf(stderr, "WARNING: %zu stack traces could not be displayed.%s\n", + nr_missing_stacks, has_collision ? + " Consider increasing --stack-storage-size.":""); + } + +cleanup: + free(counts); + + return ret; +} + +static void print_headers() +{ + int i; + + if (env.folded) + return; // Don't print headers in folded format + + printf("Sampling at %d Hertz of", env.sample_freq); + + if (env.pids[0]) { + printf(" PID ["); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) + printf("%d%s", env.pids[i], (i < MAX_PID_NR - 1 && env.pids[i + 1]) ? ", " : "]"); + } else if (env.tids[0]) { + printf(" TID ["); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) + printf("%d%s", env.tids[i], (i < MAX_TID_NR - 1 && env.tids[i + 1]) ? ", " : "]"); + } else { + printf(" all threads"); + } + + if (env.user_stacks_only) + printf(" by user"); + else if (env.kernel_stacks_only) + printf(" by kernel"); + else + printf(" by user + kernel"); + + if (env.cpu != -1) + printf(" on CPU#%d", env.cpu); + + if (env.duration < INT_MAX) + printf(" for %d secs.\n", env.duration); + else + printf("... Hit Ctrl-C to end.\n"); +} + +int main(int argc, char **argv) +{ + struct bpf_link *links[MAX_CPU_NR] = {}; + struct oncputime_bpf *obj; + int pids_fd, tids_fd; + int err, i; + __u8 val = 0; + + err = parse_common_args(argc, argv, TOOL_PROFILE); + if (err) + return err; + + err = validate_common_args(); + if (err) + return err; + + libbpf_set_print(libbpf_print_fn); + + nr_cpus = libbpf_num_possible_cpus(); + if (nr_cpus < 0) { + printf("failed to get # of possible cpus: '%s'!\n", + strerror(-nr_cpus)); + return 1; + } + if (nr_cpus > MAX_CPU_NR) { + fprintf(stderr, "the number of cpu cores is too big, please " + "increase MAX_CPU_NR's value and recompile"); + return 1; + } + + symbolizer = blaze_symbolizer_new(); + if (!symbolizer) { + fprintf(stderr, "Failed to create a blazesym symbolizer\n"); + return 1; + } + + obj = oncputime_bpf__open(); + if (!obj) { + fprintf(stderr, "failed to open BPF object\n"); + blaze_symbolizer_free(symbolizer); + return 1; + } + + /* initialize global data (filtering options) */ + obj->rodata->user_stacks_only = env.user_stacks_only; + obj->rodata->kernel_stacks_only = env.kernel_stacks_only; + obj->rodata->include_idle = env.include_idle; + if (env.pids[0]) + obj->rodata->filter_by_pid = true; + else if (env.tids[0]) + obj->rodata->filter_by_tid = true; + + bpf_map__set_value_size(obj->maps.stackmap, + env.perf_max_stack_depth * sizeof(unsigned long)); + bpf_map__set_max_entries(obj->maps.stackmap, env.stack_storage_size); + + err = oncputime_bpf__load(obj); + if (err) { + fprintf(stderr, "failed to load BPF programs\n"); + goto cleanup; + } + + if (env.pids[0]) { + pids_fd = bpf_map__fd(obj->maps.pids); + for (i = 0; i < MAX_PID_NR && env.pids[i]; i++) { + if (bpf_map_update_elem(pids_fd, &(env.pids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init pids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + else if (env.tids[0]) { + tids_fd = bpf_map__fd(obj->maps.tids); + for (i = 0; i < MAX_TID_NR && env.tids[i]; i++) { + if (bpf_map_update_elem(tids_fd, &(env.tids[i]), &val, BPF_ANY) != 0) { + fprintf(stderr, "failed to init tids map: %s\n", strerror(errno)); + goto cleanup; + } + } + } + + err = open_and_attach_perf_event(obj->progs.do_perf_event, links); + if (err) + goto cleanup; + + signal(SIGINT, sig_handler); + + if (!env.folded) + print_headers(); + + /* + * We'll get sleep interrupted when someone presses Ctrl-C. + * (which will be "handled" with noop by sig_handler) + */ + sleep(env.duration); + + print_counts(bpf_map__fd(obj->maps.counts), + bpf_map__fd(obj->maps.stackmap)); + +cleanup: + if (env.cpu != -1) + bpf_link__destroy(links[env.cpu]); + else { + for (i = 0; i < nr_cpus; i++) + bpf_link__destroy(links[i]); + } + + blaze_symbolizer_free(symbolizer); + oncputime_bpf__destroy(obj); + + return err != 0; +} diff --git a/src/trace/python-stack-profiler/python-stack.h b/src/trace/python-stack-profiler/python-stack.h new file mode 100644 index 0000000..eb60949 --- /dev/null +++ b/src/trace/python-stack-profiler/python-stack.h @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +#ifndef __PYTHON_STACK_H +#define __PYTHON_STACK_H + +#define TASK_COMM_LEN 16 +#define MAX_CPU_NR 128 +#define MAX_ENTRIES 10240 +#define MAX_PID_NR 30 +#define MAX_TID_NR 30 +#define MAX_STACK_DEPTH 20 +#define FUNCTION_NAME_LEN 64 +#define FILE_NAME_LEN 128 + +// Python frame information +struct python_frame { + char function_name[FUNCTION_NAME_LEN]; + char file_name[FILE_NAME_LEN]; + int line_number; +}; + +// Python stack trace (up to MAX_STACK_DEPTH frames) +struct python_stack { + int depth; + struct python_frame frames[MAX_STACK_DEPTH]; +}; + +struct key_t { + __u32 pid; + int user_stack_id; + int kern_stack_id; + char name[TASK_COMM_LEN]; + // Add Python stack information + struct python_stack py_stack; +}; + +// Python internal structures (CPython 3.8+) +// These are simplified versions of CPython internal structures +// Offsets may vary between Python versions + +struct PyObject { + unsigned long ob_refcnt; + void *ob_type; +}; + +struct PyVarObject { + struct PyObject ob_base; + unsigned long ob_size; +}; + +// PyCodeObject structure (simplified) +struct PyCodeObject { + struct PyObject ob_base; + int co_argcount; + int co_posonlyargcount; + int co_kwonlyargcount; + int co_nlocals; + int co_stacksize; + int co_flags; + int co_firstlineno; + struct PyObject *co_code; + struct PyObject *co_consts; + struct PyObject *co_names; + struct PyObject *co_varnames; + struct PyObject *co_freevars; + struct PyObject *co_cellvars; + struct PyObject *co_filename; + struct PyObject *co_name; + // ... more fields +}; + +// PyFrameObject structure (simplified) +struct PyFrameObject { + struct PyVarObject ob_base; + struct PyFrameObject *f_back; + struct PyCodeObject *f_code; + struct PyObject *f_builtins; + struct PyObject *f_globals; + struct PyObject *f_locals; + struct PyObject **f_valuestack; + struct PyObject **f_stacktop; + int f_lasti; + int f_lineno; + // ... more fields +}; + +// PyThreadState structure (simplified) +struct PyThreadState { + struct PyThreadState *next; + void *interp; + struct PyFrameObject *frame; + // ... more fields +}; + +// PyStringObject / PyBytesObject (for reading strings) +struct PyBytesObject { + struct PyVarObject ob_base; + long ob_shash; + char ob_sval[1]; // Variable length +}; + +struct PyUnicodeObject { + struct PyObject ob_base; + unsigned long length; + long hash; + struct { + unsigned int interned:2; + unsigned int kind:3; + unsigned int compact:1; + unsigned int ascii:1; + unsigned int ready:1; + } state; + void *data; // Pointer to actual string data +}; + +#endif /* __PYTHON_STACK_H */