add cuda tracer example

2026-02-03 10:14:44 +08:00 · 2025-05-24 17:37:36 +08:00
parent 4cf80067a0
commit 516f19f27c
10 changed files with 2013 additions and 0 deletions
--- a/src/47-cuda-events/.config
+++ b/src/47-cuda-events/.config
@@ -0,0 +1 @@
+level=Advance
--- a/src/47-cuda-events/.gitignore
+++ b/src/47-cuda-events/.gitignore
@@ -0,0 +1,9 @@
+.vscode
+package.json
+*.o
+*.skel.json
+*.skel.yaml
+package.yaml
+ecli
+bootstrap
+cuda_events
--- a/src/47-cuda-events/Makefile
+++ b/src/47-cuda-events/Makefile
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+# --- Toolchain and build directory ---
+CLANG ?= clang
+OUTPUT := .output
+# --- Third-party sources under ../third_party ---
+LIBBPF_SRC := $(abspath ../third_party/libbpf/src)
+BPFTOOL_SRC := $(abspath ../third_party/bpftool/src)
+LIBBLAZESYM_SRC := $(abspath ../third_party/blazesym/)
+# --- Artifacts produced under $(OUTPUT) ---
+LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
+LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym.a)
+LIBBLAZESYM_HEADER := $(abspath $(OUTPUT)/blazesym.h)
+BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
+BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+# Map `uname -m` onto the arch names used for vmlinux.h and __TARGET_ARCH_*.
+ARCH ?= $(shell uname -m | sed -e 's/x86_64/x86/' -e 's/arm.*/arm/' \
+			 -e 's/aarch64/arm64/' -e 's/ppc64le/powerpc/' \
+			 -e 's/mips.*/mips/' -e 's/riscv64/riscv/' \
+			 -e 's/loongarch64/loongarch/')
+VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h
+# Prefer the libbpf API headers and Linux UAPI headers shipped with libbpf
+# over system-wide headers, which could be missing or outdated.
+INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
+CFLAGS := -g -Wall
+ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
+
+# Apps built by `make all`; each one needs <app>.bpf.c plus <app>.c.
+APPS = cuda_events
+CARGO ?= $(shell which cargo)
+ifeq ($(strip $(CARGO)),)
+BZS_APPS :=
+else
+BZS_APPS := # profile
+APPS += $(BZS_APPS)
+# System libraries libblazesym needs at link time (only added when cargo exists)
+ALL_LDFLAGS += -lrt -ldl -lpthread -lm
+endif
+
+# Ask the host clang for its default system include directories and pass them
+# to the BPF compile explicitly. When compiling with `-target bpf`, some
+# architecture-specific directories would otherwise be "missing" on certain
+# architectures/distros, so headers such as asm/types.h, asm/byteorder.h,
+# asm/socket.h, asm/sockios.h or sys/cdefs.h might not be found.
+#
+# '-idirafter' is used so these directories do not interfere with include
+# mechanics except where the build would have failed anyway.
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+# V=1 echoes every command; otherwise recipes are silenced with '@' and
+# $(call msg,TAG,target[,extra]) prints a short "  TAG      target" line.
+ifneq ($(V),1)
+	Q = @
+	msg = @printf '  %-8s %s%s\n' "$(1)"				\
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+else
+	Q =
+	msg =
+endif
+
+# Assign $(2) to variable $(1) unless the user already supplied that variable
+# through the environment or on the make command line.
+define allow-override
+  $(if $(filter environment command,$(origin $(1))),,$(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+# `all` is the default goal; `clean` removes $(OUTPUT) and the app binaries.
+.PHONY: all clean
+all: $(APPS)
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS)
+
+# Output directories; other rules list them as order-only prerequisites.
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build the libbpf sources as a static library installed into $(OUTPUT)
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1		      \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@)		      \
+		    INCLUDEDIR= LIBDIR= UAPIDIR=			      \
+		    install
+
+# Build the bootstrap bpftool used for `gen object` / `gen skeleton`
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) -C $(BPFTOOL_SRC) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
+
+# Build blazesym with cargo (double-colon rule, no prerequisites: always re-run)
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+# Copy the archive and the blazesym.h next to it into $(OUTPUT)
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $< $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(<D)/blazesym.h $@
+
+# Build BPF code: clang emits <name>.tmp.bpf.o, then `bpftool gen object`
+# produces the final <name>.bpf.o. All local headers are prerequisites.
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard *.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) $(INCLUDES)	      \
+		     $(CLANG_BPF_SYS_INCLUDES) -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate the BPF skeleton header from the final BPF object (bpftool writes it to stdout)
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code; each app object also depends on its generated skeleton.
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+# Local headers are prerequisites too, so editing one recompiles the object.
+$(OUTPUT)/%.o: %.c $(wildcard *.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+# blazesym apps: objects depend on blazesym.h, binaries on libblazesym.a
+$(BZS_APPS:%=$(OUTPUT)/%.o): $(LIBBLAZESYM_HEADER)
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Link each application from its object file and the static libbpf
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# Delete a target whose recipe failed so a partial file never looks up to date
+.DELETE_ON_ERROR:
+
+# Keep intermediate targets (.skel.h, .bpf.o, etc.) instead of auto-deleting them
+.SECONDARY:
--- a/src/47-cuda-events/README.md
+++ b/src/47-cuda-events/README.md
@@ -0,0 +1,143 @@
+# Tracing CUDA Events with eBPF
+
+This tutorial demonstrates how to use eBPF to trace CUDA runtime API calls using uprobes. This allows you to monitor CUDA applications and gain insights into memory operations, kernel launches, stream operations, and device management.
+
+## Overview
+
+CUDA (Compute Unified Device Architecture) is NVIDIA's parallel computing platform and API model. When developing or troubleshooting CUDA applications, it's often useful to trace CUDA runtime API calls to understand:
+
+- Memory allocation patterns (cudaMalloc, cudaFree)
+- Data transfer between host and device (cudaMemcpy)
+- Kernel execution (cudaLaunchKernel)
+- Stream and event usage (cudaStreamCreate, cudaEventRecord)
+- Device management (cudaGetDevice, cudaSetDevice)
+
+eBPF's uprobes feature allows us to attach tracing points to user-space functions in shared libraries like NVIDIA's CUDA Runtime API library (`libcudart.so`), making it an excellent tool for this purpose.
+
+## Prerequisites
+
+- Linux kernel 4.18+ with eBPF support (5.8+ for the libbpf-based tracer, which uses a BPF ring buffer)
+- NVIDIA CUDA Toolkit installed
+- bpftrace installed (for the bpftrace script approach)
+- libbpf development libraries (for the libbpf-based approach)
+
+## Approach 1: Using bpftrace (Easier)
+
+The `cuda_events.bt` script uses bpftrace's uprobe functionality to trace important CUDA API calls.
+
+### Locating the CUDA Runtime Library
+
+First, locate your CUDA runtime library:
+
+```bash
+# Common locations:
+ls -l /usr/local/cuda/lib64/libcudart.so*
+ls -l /usr/lib/x86_64-linux-gnu/libcudart.so*
+```
+
+Update the library path in the script if it's different from the default `/usr/local/cuda/lib64/libcudart.so`. You'll need to modify every probe definition in the script.
+
+### Running the Script
+
+```bash
+sudo bpftrace cuda_events.bt
+```
+
+In another terminal, run your CUDA application, and you'll see the traced CUDA API calls.
+
+### Output Format
+
+The script provides detailed output with the following columns:
+
+- `TIME(ms)`: Timestamp in milliseconds since tracing started
+- `PROCESS`: Name of the process making the CUDA call
+- `PID`: Process ID
+- `EVENT`: CUDA function name
+- `DETAILS`: Call-specific information (sizes, pointers, return codes)
+
+### Example Output
+
+```
+TIME(ms)   PROCESS         PID        EVENT                DETAILS
+1234       my_cuda_app     12345      cudaMalloc           size=1048576 bytes
+1235       my_cuda_app     12345      cudaMalloc           returned=0 (success)
+1236       my_cuda_app     12345      cudaMemcpy           size=1048576 bytes, kind=1
+1237       my_cuda_app     12345      cudaMemcpy           returned=0 (success)
+1240       my_cuda_app     12345      cudaLaunchKernel     function=0x7f8b3c4d2a00
+1241       my_cuda_app     12345      cudaLaunchKernel     returned=0 (success)
+```
+
+## What We're Tracing
+
+The script traces the following CUDA functions:
+
+### Memory Management
+- `cudaMalloc`: Allocates memory on the GPU
+- `cudaFree`: Frees memory on the GPU
+- `cudaMemcpy`: Copies data between host and device memory
+
+### Kernel Execution
+- `cudaLaunchKernel`: Launches a CUDA kernel
+
+### Stream Operations
+- `cudaStreamCreate`: Creates a CUDA stream
+- `cudaStreamSynchronize`: Waits for all operations in a stream to complete
+
+### Device Management
+- `cudaGetDevice`: Gets the current CUDA device
+- `cudaSetDevice`: Sets the current CUDA device
+
+### Event Management
+- `cudaEventCreate`: Creates a CUDA event
+- `cudaEventRecord`: Records an event in a stream
+- `cudaEventSynchronize`: Waits for an event to complete
+
+## Test Application
+
+The `cuda_events_test.c` file provides a simple CUDA application that performs vector addition. You can compile and run it to generate CUDA API calls for testing:
+
+```bash
+nvcc -o cuda_events_test cuda_events_test.c
+```
+
+Then run the bpftrace script in one terminal:
+
+```bash
+sudo bpftrace cuda_events.bt
+```
+
+And the test application in another:
+
+```bash
+./cuda_events_test
+```
+
+## Limitations
+
+- The script only traces the main CUDA Runtime API functions. It doesn't trace CUDA driver API calls or CUDA library functions.
+- The path to `libcudart.so` needs to be updated manually if it's different from the default.
+- To capture CUDA driver API functions, you would need to add additional probes for functions in `libcuda.so`.
+
+## Troubleshooting
+
+If you encounter issues:
+
+1. **Library Path**: Ensure the path to `libcudart.so` in the script is correct for your system
+2. **Permission Issues**: Make sure you're running with sudo
+3. **Missing Symbols**: Some CUDA library versions might have different function signatures or optimized symbols
+
+## Conclusion
+
+eBPF and uprobes provide a powerful way to trace CUDA applications without modifying source code or recompiling. This non-intrusive approach allows developers to debug CUDA applications and analyze GPU utilization patterns easily.
+
+By tracing CUDA API calls, you can:
+- Debug memory leaks in CUDA applications
+- Understand data transfer patterns between CPU and GPU
+- Profile kernel execution patterns
+- Verify proper event and stream synchronization
+
+## Further Reading
+
+- [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/)
+- [bpftrace Reference Guide](https://github.com/iovisor/bpftrace/blob/master/docs/reference_guide.md)
+- [Using uprobes with BPF](https://www.brendangregg.com/blog/2016-10-12/linux-bcc-nodejs-uprobes.html)
--- a/src/47-cuda-events/README.zh.md
+++ b/src/47-cuda-events/README.zh.md
@@ -0,0 +1,628 @@
+# eBPF 入门开发实践教程十一：在 eBPF 中使用 libbpf 开发用户态程序并跟踪 exec() 和 exit() 系统调用
+
+eBPF (Extended Berkeley Packet Filter) 是 Linux 内核上的一个强大的网络和性能分析工具。它允许开发者在内核运行时动态加载、更新和运行用户定义的代码。
+
+在本教程中，我们将了解内核态和用户态的 eBPF 程序是如何协同工作的。我们还将学习如何使用原生的 libbpf 开发用户态程序，将 eBPF 应用打包为可执行文件，实现跨内核版本分发。
+
+## libbpf 库，以及为什么需要使用它
+
+libbpf 是一个 C 语言库，伴随内核版本分发，用于辅助 eBPF 程序的加载和运行。它提供了用于与 eBPF 系统交互的一组 C API，使开发者能够更轻松地编写用户态程序来加载和管理 eBPF 程序。这些用户态程序通常用于分析、监控或优化系统性能。
+
+使用 libbpf 库有以下优势：
+
+- 它简化了 eBPF 程序的加载、更新和运行过程。
+- 它提供了一组易于使用的 API，使开发者能够专注于编写核心逻辑，而不是处理底层细节。
+- 它能够确保与内核中的 eBPF 子系统的兼容性，降低了维护成本。
+
+同时，libbpf 和 BTF（BPF Type Format）都是 eBPF 生态系统的重要组成部分。它们各自在实现跨内核版本兼容方面发挥着关键作用。BTF（BPF Type Format）是一种元数据格式，用于描述 eBPF 程序中的类型信息。BTF 的主要目的是提供一种结构化的方式，以描述内核中的数据结构，以便 eBPF 程序可以更轻松地访问和操作它们。
+
+BTF 在实现跨内核版本兼容方面的关键作用如下：
+
+- BTF 允许 eBPF 程序访问内核数据结构的详细类型信息，而无需对特定内核版本进行硬编码。这使得 eBPF 程序可以适应不同版本的内核，从而实现跨内核版本兼容。
+- 通过使用 BPF CO-RE（Compile Once, Run Everywhere）技术，eBPF 程序在编译时记录对内核数据结构的访问（重定位信息），libbpf 在加载时再根据目标内核的 BTF 解析实际的类型和字段偏移，因此同一份编译产物可以在不同内核版本上运行。
+
+结合 libbpf 和 BTF，eBPF 程序可以在各种不同版本的内核上运行，而无需为每个内核版本单独编译。这极大地提高了 eBPF 生态系统的可移植性和兼容性，降低了开发和维护的难度。
+
+## 什么是 bootstrap
+
+Bootstrap 是一个使用 libbpf 的完整应用，它利用 eBPF 程序来跟踪内核中的 exec() 系统调用（通过 SEC("tp/sched/sched_process_exec") handle_exec BPF 程序），这主要对应于新进程的创建（不包括 fork() 部分）。此外，它还跟踪进程的 exit() 系统调用（通过 SEC("tp/sched/sched_process_exit") handle_exit BPF 程序），以了解每个进程何时退出。
+
+这两个 BPF 程序共同工作，允许捕获关于新进程的有趣信息，例如二进制文件的文件名，以及测量进程的生命周期，并在进程结束时收集有趣的统计信息，例如退出代码或消耗的资源量等。这是深入了解内核内部并观察事物如何真正运作的良好起点。
+
+Bootstrap 还使用 argp API（libc 的一部分）进行命令行参数解析，使得用户可以通过命令行选项配置应用行为。这种方式提供了灵活性，让用户能够根据实际需求自定义程序行为。虽然这些功能使用 eunomia-bpf 工具也可以实现，但是这里我们使用 libbpf 可以在用户态提供更高的可扩展性，不过也带来了不少额外的复杂度。
+
+## Bootstrap
+
+Bootstrap 分为两个部分：内核态和用户态。内核态部分是一个 eBPF 程序，它跟踪 exec() 和 exit() 系统调用。用户态部分是一个 C 语言程序，它使用 libbpf 库来加载和运行内核态程序，并处理从内核态程序收集的数据。
+
+### 内核态 eBPF 程序 bootstrap.bpf.c
+
+```c
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright (c) 2020 Facebook */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bootstrap.h"
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, pid_t);
+    __type(value, u64);
+} exec_start SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} rb SEC(".maps");
+
+const volatile unsigned long long min_duration_ns = 0;
+
+SEC("tp/sched/sched_process_exec")
+int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
+{
+    struct task_struct *task;
+    unsigned fname_off;
+    struct event *e;
+    pid_t pid;
+    u64 ts;
+
+    /* remember time exec() was executed for this PID */
+    pid = bpf_get_current_pid_tgid() >> 32;
+    ts = bpf_ktime_get_ns();
+    bpf_map_update_elem(&exec_start, &pid, &ts, BPF_ANY);
+
+    /* don't emit exec events when minimum duration is specified */
+    if (min_duration_ns)
+        return 0;
+
+    /* reserve sample from BPF ringbuf */
+    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    /* fill out the sample with data */
+    task = (struct task_struct *)bpf_get_current_task();
+
+    e->exit_event = false;
+    e->pid = pid;
+    e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+    fname_off = ctx->__data_loc_filename & 0xFFFF;
+    bpf_probe_read_str(&e->filename, sizeof(e->filename), (void *)ctx + fname_off);
+
+    /* successfully submit it to user-space for post-processing */
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+}
+
+SEC("tp/sched/sched_process_exit")
+int handle_exit(struct trace_event_raw_sched_process_template* ctx)
+{
+    struct task_struct *task;
+    struct event *e;
+    pid_t pid, tid;
+    u64 id, ts, *start_ts, duration_ns = 0;
+    
+    /* get PID and TID of exiting thread/process */
+    id = bpf_get_current_pid_tgid();
+    pid = id >> 32;
+    tid = (u32)id;
+
+    /* ignore thread exits */
+    if (pid != tid)
+        return 0;
+
+    /* if we recorded start of the process, calculate lifetime duration */
+    start_ts = bpf_map_lookup_elem(&exec_start, &pid);
+    if (start_ts)
+        duration_ns = bpf_ktime_get_ns() - *start_ts;
+    else if (min_duration_ns)
+        return 0;
+    bpf_map_delete_elem(&exec_start, &pid);
+
+    /* if process didn't live long enough, return early */
+    if (min_duration_ns && duration_ns < min_duration_ns)
+        return 0;
+
+    /* reserve sample from BPF ringbuf */
+    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    /* fill out the sample with data */
+    task = (struct task_struct *)bpf_get_current_task();
+
+    e->exit_event = true;
+    e->duration_ns = duration_ns;
+    e->pid = pid;
+    e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+    e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+    /* send data to user-space for post-processing */
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+}
+```
+
+这段代码是一个内核态 eBPF 程序（bootstrap.bpf.c），主要用于跟踪 exec() 和 exit() 系统调用。它通过 eBPF 程序捕获进程的创建和退出事件，并将相关信息发送到用户态程序进行处理。下面是对代码的详细解释。
+
+首先，我们引入所需的头文件，定义 eBPF 程序的许可证以及两个 eBPF maps：exec_start 和 rb。exec_start 是一个哈希类型的 eBPF map，用于存储进程开始执行时的时间戳。rb 是一个环形缓冲区类型的 eBPF map，用于存储捕获的事件数据，并将其发送到用户态程序。
+
+```c
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bootstrap.h"
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, pid_t);
+    __type(value, u64);
+} exec_start SEC(".maps");
+
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} rb SEC(".maps");
+
+const volatile unsigned long long min_duration_ns = 0;
+```
+
+接下来，我们定义了一个名为 handle_exec 的 eBPF 程序，它会在进程执行 exec() 系统调用时触发。首先，我们从当前进程中获取 PID，记录进程开始执行的时间戳，然后将其存储在 exec_start map 中。
+
+```c
+SEC("tp/sched/sched_process_exec")
+int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
+{
+    // ...
+    pid = bpf_get_current_pid_tgid() >> 32;
+    ts = bpf_ktime_get_ns();
+    bpf_map_update_elem(&exec_start, &pid, &ts, BPF_ANY);
+
+    // ...
+}
+```
+
+然后，我们从环形缓冲区 map rb 中预留一个事件结构，并填充相关数据，如进程 ID、父进程 ID、进程名等。之后，我们将这些数据发送到用户态程序进行处理。
+
+```c
+    // reserve sample from BPF ringbuf
+    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    // fill out the sample with data
+    task = (struct task_struct *)bpf_get_current_task();
+
+    e->exit_event = false;
+    e->pid = pid;
+    e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+    fname_off = ctx->__data_loc_filename & 0xFFFF;
+    bpf_probe_read_str(&e->filename, sizeof(e->filename), (void *)ctx + fname_off);
+
+    // successfully submit it to user-space for post-processing
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+```
+
+最后，我们定义了一个名为 handle_exit 的 eBPF 程序，它会在进程执行 exit() 系统调用时触发。首先，我们从当前进程中获取 PID 和 TID（线程 ID）。如果 PID 和 TID 不相等，说明这是一个线程退出，我们将忽略此事件。
+
+```c
+SEC("tp/sched/sched_process_exit")
+int handle_exit(struct trace_event_raw_sched_process_template* ctx)
+{
+    // ...
+    id = bpf_get_current_pid_tgid();
+    pid = id >> 32;
+    tid = (u32)id;
+
+    /* ignore thread exits */
+    if (pid != tid)
+        return 0;
+
+    // ...
+}
+```
+
+接着，我们查找之前存储在 exec_start map 中的进程开始执行的时间戳。如果找到了时间戳，我们将计算进程的生命周期（持续时间），然后从 exec_start map 中删除该记录。如果未找到时间戳且指定了最小持续时间，则直接返回。
+
+```c
+    // if we recorded start of the process, calculate lifetime duration
+    start_ts = bpf_map_lookup_elem(&exec_start, &pid);
+    if (start_ts)
+        duration_ns = bpf_ktime_get_ns() - *start_ts;
+    else if (min_duration_ns)
+        return 0;
+    bpf_map_delete_elem(&exec_start, &pid);
+
+    // if process didn't live long enough, return early
+    if (min_duration_ns && duration_ns < min_duration_ns)
+        return 0;
+```
+
+然后，我们从环形缓冲区 map rb 中预留一个事件结构，并填充相关数据，如进程 ID、父进程 ID、进程名、进程持续时间等。最后，我们将这些数据发送到用户态程序进行处理。
+
+```c
+    /* reserve sample from BPF ringbuf */
+    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    /* fill out the sample with data */
+    task = (struct task_struct *)bpf_get_current_task();
+
+    e->exit_event = true;
+    e->duration_ns = duration_ns;
+    e->pid = pid;
+    e->ppid = BPF_CORE_READ(task, real_parent, tgid);
+    e->exit_code = (BPF_CORE_READ(task, exit_code) >> 8) & 0xff;
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+
+    /* send data to user-space for post-processing */
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+}
+```
+
+这样，当进程执行 exec() 或 exit() 系统调用时，我们的 eBPF 程序会捕获相应的事件，并将详细信息发送到用户态程序进行后续处理。这使得我们可以轻松地监控进程的创建和退出，并获取有关进程的详细信息。
+
+除此之外，在 bootstrap.h 中，我们还定义了和用户态交互的数据结构：
+
+```c
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2020 Facebook */
+#ifndef __BOOTSTRAP_H
+#define __BOOTSTRAP_H
+
+#define TASK_COMM_LEN 16
+#define MAX_FILENAME_LEN 127
+
+struct event {
+    int pid;
+    int ppid;
+    unsigned exit_code;
+    unsigned long long duration_ns;
+    char comm[TASK_COMM_LEN];
+    char filename[MAX_FILENAME_LEN];
+    bool exit_event;
+};
+
+#endif /* __BOOTSTRAP_H */
+```
+
+### 用户态，bootstrap.c
+
+```c
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2020 Facebook */
+#include <argp.h>
+#include <signal.h>
+#include <stdio.h>
+#include <time.h>
+#include <sys/resource.h>
+#include <bpf/libbpf.h>
+#include "bootstrap.h"
+#include "bootstrap.skel.h"
+
+static struct env {
+    bool verbose;
+    long min_duration_ms;
+} env;
+
+const char *argp_program_version = "bootstrap 0.0";
+const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
+const char argp_program_doc[] =
+"BPF bootstrap demo application.\n"
+"\n"
+"It traces process start and exits and shows associated \n"
+"information (filename, process duration, PID and PPID, etc).\n"
+"\n"
+"USAGE: ./bootstrap [-d <min-duration-ms>] [-v]\n";
+
+static const struct argp_option opts[] = {
+    { "verbose", 'v', NULL, 0, "Verbose debug output" },
+    { "duration", 'd', "DURATION-MS", 0, "Minimum process duration (ms) to report" },
+    {},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+    switch (key) {
+    case 'v':
+        env.verbose = true;
+        break;
+    case 'd':
+        errno = 0;
+        env.min_duration_ms = strtol(arg, NULL, 10);
+        if (errno || env.min_duration_ms <= 0) {
+            fprintf(stderr, "Invalid duration: %s\n", arg);
+            argp_usage(state);
+        }
+        break;
+    case ARGP_KEY_ARG:
+        argp_usage(state);
+        break;
+    default:
+        return ARGP_ERR_UNKNOWN;
+    }
+    return 0;
+}
+
+static const struct argp argp = {
+    .options = opts,
+    .parser = parse_arg,
+    .doc = argp_program_doc,
+};
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+    if (level == LIBBPF_DEBUG && !env.verbose)
+        return 0;
+    return vfprintf(stderr, format, args);
+}
+
+static volatile bool exiting = false;
+
+static void sig_handler(int sig)
+{
+    exiting = true;
+}
+
+static int handle_event(void *ctx, void *data, size_t data_sz)
+{
+    const struct event *e = data;
+    struct tm *tm;
+    char ts[32];
+    time_t t;
+
+    time(&t);
+    tm = localtime(&t);
+    strftime(ts, sizeof(ts), "%H:%M:%S", tm);
+
+    if (e->exit_event) {
+        printf("%-8s %-5s %-16s %-7d %-7d [%u]",
+               ts, "EXIT", e->comm, e->pid, e->ppid, e->exit_code);
+        if (e->duration_ns)
+            printf(" (%llums)", e->duration_ns / 1000000);
+        printf("\n");
+    } else {
+        printf("%-8s %-5s %-16s %-7d %-7d %s\n",
+               ts, "EXEC", e->comm, e->pid, e->ppid, e->filename);
+    }
+
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    struct ring_buffer *rb = NULL;
+    struct bootstrap_bpf *skel;
+    int err;
+
+    /* Parse command line arguments */
+    err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+    if (err)
+        return err;
+
+    /* Set up libbpf errors and debug info callback */
+    libbpf_set_print(libbpf_print_fn);
+
+    /* Cleaner handling of Ctrl-C */
+    signal(SIGINT, sig_handler);
+    signal(SIGTERM, sig_handler);
+
+    /* Load and verify BPF application */
+    skel = bootstrap_bpf__open();
+    if (!skel) {
+        fprintf(stderr, "Failed to open and load BPF skeleton\n");
+        return 1;
+    }
+
+    /* Parameterize BPF code with minimum duration parameter */
+    skel->rodata->min_duration_ns = env.min_duration_ms * 1000000ULL;
+
+    /* Load & verify BPF programs */
+    err = bootstrap_bpf__load(skel);
+    if (err) {
+        fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+        goto cleanup;
+    }
+
+    /* Attach tracepoints */
+    err = bootstrap_bpf__attach(skel);
+    if (err) {
+        fprintf(stderr, "Failed to attach BPF skeleton\n");
+        goto cleanup;
+    }
+
+    /* Set up ring buffer polling */
+    rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
+    if (!rb) {
+        err = -1;
+        fprintf(stderr, "Failed to create ring buffer\n");
+        goto cleanup;
+    }
+
+    /* Process events */
+    printf("%-8s %-5s %-16s %-7s %-7s %s\n",
+           "TIME", "EVENT", "COMM", "PID", "PPID", "FILENAME/EXIT CODE");
+    while (!exiting) {
+        err = ring_buffer__poll(rb, 100 /* timeout, ms */);
+        /* Ctrl-C will cause -EINTR */
+        if (err == -EINTR) {
+            err = 0;
+            break;
+        }
+        if (err < 0) {
+            printf("Error polling perf buffer: %d\n", err);
+            break;
+        }
+    }
+
+cleanup:
+    /* Clean up */
+    ring_buffer__free(rb);
+    bootstrap_bpf__destroy(skel);
+
+    return err < 0 ? -err : 0;
+}
+```
+
+这个用户态程序主要用于加载、验证、附加 eBPF 程序，以及接收 eBPF 程序收集的事件数据，并将其打印出来。我们将分析一些关键部分。
+
+首先，我们定义了一个 env 结构，用于存储命令行参数：
+
+```c
+static struct env {
+    bool verbose;
+    long min_duration_ms;
+} env;
+```
+
+接下来，我们使用 argp 库来解析命令行参数：
+
+```c
+static const struct argp_option opts[] = {
+    { "verbose", 'v', NULL, 0, "Verbose debug output" },
+    { "duration", 'd', "DURATION-MS", 0, "Minimum process duration (ms) to report" },
+    {},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+    // ...
+}
+
+static const struct argp argp = {
+    .options = opts,
+    .parser = parse_arg,
+    .doc = argp_program_doc,
+};
+```
+
+main() 函数中，首先解析命令行参数，然后设置 libbpf 的打印回调函数 libbpf_print_fn，以便在需要时输出调试信息：
+
+```c
+err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+if (err)
+    return err;
+
+libbpf_set_print(libbpf_print_fn);
+```
+
+接下来，我们打开 eBPF 脚手架（skeleton）文件，将最小持续时间参数传递给 eBPF 程序，并加载和附加 eBPF 程序：
+
+```c
+skel = bootstrap_bpf__open();
+if (!skel) {
+    fprintf(stderr, "Failed to open and load BPF skeleton\n");
+    return 1;
+}
+
+skel->rodata->min_duration_ns = env.min_duration_ms * 1000000ULL;
+
+err = bootstrap_bpf__load(skel);
+if (err) {
+    fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+    goto cleanup;
+}
+
+err = bootstrap_bpf__attach(skel);
+if (err) {
+    fprintf(stderr, "Failed to attach BPF skeleton\n");
+    goto cleanup;
+}
+```
+
+然后，我们创建一个环形缓冲区（ring buffer），用于接收 eBPF 程序发送的事件数据：
+
+```c
+rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
+if (!rb) {
+    err = -1;
+    fprintf(stderr, "Failed to create ring buffer\n");
+    goto cleanup;
+}
+```
+
+handle_event() 函数会处理从 eBPF 程序收到的事件。根据事件类型（进程执行或退出），它会提取并打印事件信息，如时间戳、进程名、进程 ID、父进程 ID、文件名或退出代码等。
+
+最后，我们使用 ring_buffer__poll() 函数轮询环形缓冲区，处理收到的事件数据：
+
+```c
+while (!exiting) {
+    err = ring_buffer__poll(rb, 100 /* timeout, ms */);
+    // ...
+}
+```
+
+当程序收到 SIGINT 或 SIGTERM 信号时，轮询循环结束，程序进入清理流程：释放环形缓冲区，销毁（卸载）eBPF 程序，然后退出：
+
+```c
+cleanup:
+ /* Clean up */
+ ring_buffer__free(rb);
+ bootstrap_bpf__destroy(skel);
+
+ return err < 0 ? -err : 0;
+}
+```
+
+## 安装依赖
+
+构建示例需要 clang、libelf 和 zlib。包名在不同的发行版中可能会有所不同。
+
+在 Ubuntu/Debian 上，你需要执行以下命令：
+
+```shell
+sudo apt install clang libelf1 libelf-dev zlib1g-dev
+```
+
+在 CentOS/Fedora 上，你需要执行以下命令：
+
+```shell
+sudo dnf install clang elfutils-libelf elfutils-libelf-devel zlib-devel
+```
+
+## 编译运行
+
+编译运行上述代码：
+
+```console
+$ git submodule update --init --recursive
+$ make
+  BPF      .output/bootstrap.bpf.o
+  GEN-SKEL .output/bootstrap.skel.h
+  CC       .output/bootstrap.o
+  BINARY   bootstrap
+$ sudo ./bootstrap 
+[sudo] password for yunwei: 
+TIME     EVENT COMM             PID     PPID    FILENAME/EXIT CODE
+03:16:41 EXEC  sh               110688  80168   /bin/sh
+03:16:41 EXEC  which            110689  110688  /usr/bin/which
+03:16:41 EXIT  which            110689  110688  [0] (0ms)
+03:16:41 EXIT  sh               110688  80168   [0] (0ms)
+03:16:41 EXEC  sh               110690  80168   /bin/sh
+03:16:41 EXEC  ps               110691  110690  /usr/bin/ps
+03:16:41 EXIT  ps               110691  110690  [0] (49ms)
+03:16:41 EXIT  sh               110690  80168   [0] (51ms)
+```
+
+## 总结
+
+通过这个实例，我们了解了如何将 eBPF 程序与用户态程序结合使用。这种结合为开发者提供了一个强大的工具集，可以实现跨内核和用户空间的高效数据收集和处理。通过使用 eBPF 和 libbpf，您可以构建更高效、可扩展和安全的监控和性能分析工具。
+
+如果您希望学习更多关于 eBPF 的知识和实践，可以访问我们的教程代码仓库 <https://github.com/eunomia-bpf/bpf-developer-tutorial> 或网站 <https://eunomia.dev/zh/tutorials/> 以获取更多示例和完整的教程。
--- a/src/47-cuda-events/cuda_events.bpf.c
+++ b/src/47-cuda-events/cuda_events.bpf.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "cuda_events.h"
+
+/* License string placed in the "license" section of the BPF object. */
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+/*
+ * Ring buffer map through which every submit_*() helper in this file hands a
+ * struct event to user space. For a ring buffer map, max_entries is the
+ * buffer size in bytes (256 KiB here).
+ */
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} rb SEC(".maps");
+
+/*
+ * Fallback location of the CUDA runtime library; a build can override it by
+ * defining CUDA_LIB_PATH beforehand. Nothing in this file references the macro.
+ */
+#ifndef CUDA_LIB_PATH
+#define CUDA_LIB_PATH "/usr/local/cuda/lib64/libcudart.so"
+#endif
+
+/*
+ * Emit an event that carries only the common fields (pid, comm, type and the
+ * entry/return flag). Always returns 0; the event is silently dropped when
+ * the ring buffer has no room.
+ */
+static inline int submit_event(enum cuda_event_type type, bool is_return)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    /* Tag the record first, then describe the calling task. */
+    evt->type = type;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    /* Hand the record over to user space. */
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a CUDA_EVENT_MALLOC record. On entry (is_return == false) the
+ * requested size is stored; on return the status code is stored instead.
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_malloc_event(size_t size, bool is_return, int ret_val)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    evt->type = CUDA_EVENT_MALLOC;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    if (!is_return)
+        evt->mem.size = size;
+    else
+        evt->ret_val = ret_val;
+
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a CUDA_EVENT_FREE record. On entry the pointer being freed is stored;
+ * on return the status code is stored instead.
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_free_event(void *ptr, bool is_return, int ret_val)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    evt->type = CUDA_EVENT_FREE;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    if (!is_return)
+        evt->free_data.ptr = ptr;
+    else
+        evt->ret_val = ret_val;
+
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a CUDA_EVENT_MEMCPY record. On entry the byte count and the copy kind
+ * are stored; on return the status code is stored instead.
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_memcpy_event(size_t size, int kind, bool is_return, int ret_val)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    evt->type = CUDA_EVENT_MEMCPY;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    if (!is_return) {
+        evt->memcpy_data.size = size;
+        evt->memcpy_data.kind = kind;
+    } else {
+        evt->ret_val = ret_val;
+    }
+
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a CUDA_EVENT_LAUNCH_KERNEL record. On entry the kernel function
+ * pointer is stored; on return the status code is stored instead.
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_launch_event(void *func, bool is_return, int ret_val)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    evt->type = CUDA_EVENT_LAUNCH_KERNEL;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    if (!is_return)
+        evt->launch.func = func;
+    else
+        evt->ret_val = ret_val;
+
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a device-management record of the given type. The device id is only
+ * recorded on entry to CUDA_EVENT_SET_DEVICE; on return the status code is
+ * stored. Other entry events carry no payload.
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_device_event(enum cuda_event_type type, int device, bool is_return, int ret_val)
+{
+    struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(struct event), 0);
+
+    if (evt == NULL)
+        return 0;
+
+    evt->type = type;
+    evt->is_return = is_return;
+    evt->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
+
+    if (!is_return) {
+        if (type == CUDA_EVENT_SET_DEVICE)
+            evt->device.device = device;
+    } else {
+        evt->ret_val = ret_val;
+    }
+
+    bpf_ringbuf_submit(evt, 0);
+    return 0;
+}
+
+/*
+ * Emit a stream/event record of the given type.
+ *
+ * type      - which CUDA API the record describes
+ * handle    - stream or event handle seen on entry (may be NULL)
+ * is_return - true when called from a return probe
+ * ret_val   - status code, only meaningful when is_return is true
+ *
+ * Always returns 0; the event is dropped when the ring buffer is full.
+ */
+static inline int submit_handle_event(enum cuda_event_type type, void *handle, bool is_return, int ret_val)
+{
+    struct event *e;
+
+    e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
+    if (!e)
+        return 0;
+
+    e->pid = bpf_get_current_pid_tgid() >> 32;
+    bpf_get_current_comm(&e->comm, sizeof(e->comm));
+    e->type = type;
+    e->is_return = is_return;
+
+    if (is_return) {
+        e->ret_val = ret_val;
+    } else {
+        /*
+         * Store the handle unconditionally. A NULL handle is a legitimate
+         * value (e.g. the default stream), and a reserved ring buffer record
+         * is not zeroed, so skipping the store would make user space print
+         * whatever bytes a previous record left behind.
+         */
+        e->handle.handle = handle;
+    }
+
+    bpf_ringbuf_submit(e, 0);
+    return 0;
+}
+
+/* Uprobe handlers for CUDA functions */
+
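+/* A uprobe section name of the form
+ *   u[ret]probe/binary:function[+offset]
+ * lets libbpf auto-attach the program. The handlers below use the bare
+ * "uprobe"/"uretprobe" form instead, so the user-space loader attaches them
+ * explicitly to the CUDA runtime library selected at run time.
+ */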
+
+/*
+ * Memory allocation/free operations.
+ * The loader attaches these to cudaMalloc and cudaFree. Entry probes record
+ * the requested size / the pointer being freed; return probes record the
+ * status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_malloc_enter, void **ptr, size_t size)
+{
+    return submit_malloc_event(size, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_malloc_exit, int ret)
+{
+    return submit_malloc_event(0, true, ret);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(cuda_free_enter, void *ptr)
+{
+    return submit_free_event(ptr, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_free_exit, int ret)
+{
+    return submit_free_event(0, true, ret);
+}
+
+/*
+ * Memory copy.
+ * The loader attaches these to cudaMemcpy. The entry probe records the third
+ * and fourth arguments (byte count and copy kind); the return probe records
+ * the status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_memcpy_enter, void *dst, const void *src, size_t size, int kind)
+{
+    return submit_memcpy_event(size, kind, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_memcpy_exit, int ret)
+{
+    return submit_memcpy_event(0, 0, true, ret);
+}
+
+/*
+ * Kernel launch.
+ * The loader attaches these to cudaLaunchKernel. Only the first argument
+ * (the kernel function pointer) is captured on entry; the return probe
+ * records the status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_launch_kernel_enter, const void *func)
+{
+    return submit_launch_event((void*)func, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_launch_kernel_exit, int ret)
+{
+    return submit_launch_event(0, true, ret);
+}
+
+/*
+ * Stream operations.
+ * The loader attaches these to cudaStreamCreate and cudaStreamSynchronize.
+ * The create entry probe captures no arguments; the synchronize entry probe
+ * records the stream handle. Return probes record the status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_stream_create_enter)
+{
+    return submit_handle_event(CUDA_EVENT_STREAM_CREATE, NULL, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_stream_create_exit, int ret)
+{
+    return submit_handle_event(CUDA_EVENT_STREAM_CREATE, NULL, true, ret);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(cuda_stream_sync_enter, void *stream)
+{
+    return submit_handle_event(CUDA_EVENT_STREAM_SYNC, stream, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_stream_sync_exit, int ret)
+{
+    return submit_handle_event(CUDA_EVENT_STREAM_SYNC, NULL, true, ret);
+}
+
+/*
+ * Device management.
+ * The loader attaches these to cudaGetDevice and cudaSetDevice. Only the
+ * cudaSetDevice entry probe carries a payload (the device id); return probes
+ * record the status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_get_device_enter)
+{
+    return submit_device_event(CUDA_EVENT_GET_DEVICE, 0, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_get_device_exit, int ret)
+{
+    return submit_device_event(CUDA_EVENT_GET_DEVICE, 0, true, ret);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(cuda_set_device_enter, int device)
+{
+    return submit_device_event(CUDA_EVENT_SET_DEVICE, device, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_set_device_exit, int ret)
+{
+    return submit_device_event(CUDA_EVENT_SET_DEVICE, 0, true, ret);
+}
+
+/*
+ * Event operations.
+ * The loader attaches these to cudaEventCreate, cudaEventRecord and
+ * cudaEventSynchronize. The create entry probe captures no arguments; the
+ * record/synchronize entry probes record the event handle (first argument).
+ * Return probes record the status code.
+ */
+SEC("uprobe")
+int BPF_KPROBE(cuda_event_create_enter)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_CREATE, NULL, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_event_create_exit, int ret)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_CREATE, NULL, true, ret);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(cuda_event_record_enter, void *event)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_RECORD, event, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_event_record_exit, int ret)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_RECORD, NULL, true, ret);
+}
+
+SEC("uprobe")
+int BPF_KPROBE(cuda_event_sync_enter, void *event)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_SYNC, event, false, 0);
+}
+
+SEC("uretprobe")
+int BPF_KRETPROBE(cuda_event_sync_exit, int ret)
+{
+    return submit_handle_event(CUDA_EVENT_EVENT_SYNC, NULL, true, ret);
+}
+}
--- a/src/47-cuda-events/cuda_events.bt
+++ b/src/47-cuda-events/cuda_events.bt
@@ -0,0 +1,197 @@
+#!/usr/bin/env bpftrace
+
+/*
+ * cuda_events.bt  Trace CUDA events using bpftrace and uprobes.
+ *
+ * This script traces key CUDA API functions to provide visibility into:
+ * - Memory operations (cudaMalloc, cudaFree, cudaMemcpy)
+ * - Kernel launches (cudaLaunchKernel)
+ * - Stream operations (cudaStreamCreate, cudaStreamSynchronize)
+ * - Device management (cudaGetDevice, cudaSetDevice)
+ * 
+ * USAGE: sudo ./cuda_events.bt
+ *
+ * This requires:
+ * - bpftrace
+ * - CUDA toolkit installed (with libcudart.so)
+ *
+ * Note: every probe below is attached to
+ * "/usr/local/cuda-12.6/lib64/libcudart.so". If that file doesn't exist on
+ * your system, search and replace all occurrences with your CUDA runtime
+ * library path.
+ */
+
+// Startup banner. The informational lines are printed first so that the
+// column header ends up directly above the first event row.
+BEGIN
+{
+    printf("Tracing CUDA events... Hit Ctrl-C to end.\n");
+    printf("Using CUDA library: /usr/local/cuda-12.6/lib64/libcudart.so\n");
+    printf("If this path is incorrect, please edit the script and update all probe definitions.\n");
+    printf("%-12s %-16s %-12s %-20s %s\n", "TIME(ms)", "PROCESS", "PID", "EVENT", "DETAILS");
+}
+
+// cudaMalloc: the entry probe prints the requested size (second argument),
+// the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaMalloc
+{
+    printf("%-12u %-16s %-12d %-20s size=%ld bytes\n",
+           elapsed/1000000, comm, pid, "cudaMalloc", arg1);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaMalloc
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaMalloc", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaFree: the entry probe prints the pointer being released (first
+// argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaFree
+{
+    $ptr = arg0;
+    printf("%-12u %-16s %-12d %-20s ptr=0x%lx\n",
+           elapsed/1000000, comm, pid, "cudaFree", $ptr);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaFree
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaFree", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// Memory copy.
+// cudaMemcpy(dst, src, count, kind): bpftrace numbers arguments from arg0,
+// so the byte count is arg2 and the copy kind is arg3.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaMemcpy
+{
+    $size = arg2;
+    $kind = arg3;
+    printf("%-12u %-16s %-12d %-20s size=%ld bytes, kind=%d\n", 
+           elapsed/1000000, comm, pid, "cudaMemcpy", $size, $kind);
+}
+
+// Status code returned by cudaMemcpy (0 means success).
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaMemcpy
+{
+    $ret = retval;
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n", 
+           elapsed/1000000, comm, pid, "cudaMemcpy", $ret, 
+           $ret == 0 ? "success" : "error");
+}
+
+// cudaLaunchKernel: the entry probe prints the kernel function pointer
+// (first argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaLaunchKernel
+{
+    $func = arg0;
+    printf("%-12u %-16s %-12d %-20s function=0x%lx\n",
+           elapsed/1000000, comm, pid, "cudaLaunchKernel", $func);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaLaunchKernel
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaLaunchKernel", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// Stream creation: the entry probe prints no details, the return probe
+// prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaStreamCreate
+{
+    printf("%-12u %-16s %-12d %-20s\n",
+           elapsed/1000000, comm, pid, "cudaStreamCreate");
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaStreamCreate
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaStreamCreate", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// Stream synchronization: the entry probe prints the stream handle (first
+// argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaStreamSynchronize
+{
+    $stream = arg0;
+    printf("%-12u %-16s %-12d %-20s stream=0x%lx\n",
+           elapsed/1000000, comm, pid, "cudaStreamSynchronize", $stream);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaStreamSynchronize
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaStreamSynchronize", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaGetDevice: the entry probe prints no details, the return probe prints
+// the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaGetDevice
+{
+    printf("%-12u %-16s %-12d %-20s\n",
+           elapsed/1000000, comm, pid, "cudaGetDevice");
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaGetDevice
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaGetDevice", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaSetDevice: the entry probe prints the requested device id (first
+// argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaSetDevice
+{
+    $device = arg0;
+    printf("%-12u %-16s %-12d %-20s device=%d\n",
+           elapsed/1000000, comm, pid, "cudaSetDevice", $device);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaSetDevice
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaSetDevice", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaEventCreate: the entry probe prints no details, the return probe
+// prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventCreate
+{
+    printf("%-12u %-16s %-12d %-20s\n",
+           elapsed/1000000, comm, pid, "cudaEventCreate");
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventCreate
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaEventCreate", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaEventRecord: the entry probe prints the event handle (first
+// argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventRecord
+{
+    $event = arg0;
+    printf("%-12u %-16s %-12d %-20s event=0x%lx\n",
+           elapsed/1000000, comm, pid, "cudaEventRecord", $event);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventRecord
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaEventRecord", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// cudaEventSynchronize: the entry probe prints the event handle (first
+// argument), the return probe prints the status code.
+uprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventSynchronize
+{
+    $event = arg0;
+    printf("%-12u %-16s %-12d %-20s event=0x%lx\n",
+           elapsed/1000000, comm, pid, "cudaEventSynchronize", $event);
+}
+
+uretprobe:/usr/local/cuda-12.6/lib64/libcudart.so:cudaEventSynchronize
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaEventSynchronize", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// Runs once when tracing stops; prints a closing message.
+END
+{
+    printf("Tracing complete.\n");
+} 
--- a/src/47-cuda-events/cuda_events.c
+++ b/src/47-cuda-events/cuda_events.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2023 */
+#include <argp.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "cuda_events.h"
+#include "cuda_events.skel.h"
+
+/* Run-time settings filled in by parse_arg() from the command line. */
+static struct env {
+    bool verbose;             /* forward libbpf debug messages to stderr */
+    bool print_timestamp;     /* prefix each output row with the wall-clock time */
+    char *cuda_library_path;  /* library to attach to; NULL selects the built-in default */
+    bool include_returns;     /* also print events produced by return probes */
+} env = {
+    .print_timestamp = true,
+    .include_returns = true,
+    .cuda_library_path = NULL,
+};
+
+/* Program metadata picked up by argp for --version and --help output. */
+const char *argp_program_version = "cuda_events 0.1";
+const char *argp_program_bug_address = "<your-email@example.com>";
+/* Help text; the usage line lists every option declared in opts[]. */
+const char argp_program_doc[] =
+"CUDA events tracing tool using eBPF.\n"
+"\n"
+"It traces CUDA API calls and shows associated information\n"
+"such as memory allocations, kernel launches, data transfers, etc.\n"
+"\n"
+"USAGE: ./cuda_events [-v] [--no-timestamp] [--no-returns] [--cuda-path PATH]\n";
+
+/*
+ * Command-line options. The second field of each entry is the key that
+ * parse_arg() receives; the empty entry terminates the table.
+ */
+static const struct argp_option opts[] = {
+    { "verbose", 'v', NULL, 0, "Verbose debug output" },
+    { "no-timestamp", 't', NULL, 0, "Don't print timestamps" },
+    { "no-returns", 'r', NULL, 0, "Don't show function returns" },
+    { "cuda-path", 'p', "CUDA_PATH", 0, "Path to CUDA runtime library" },
+    {},
+};
+
+/*
+ * argp callback: record one parsed option in env.
+ * Positional arguments are not accepted and trigger the usage message.
+ * Returns 0 when the key was handled, ARGP_ERR_UNKNOWN otherwise.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+    if (key == 'v')
+        env.verbose = true;
+    else if (key == 't')
+        env.print_timestamp = false;
+    else if (key == 'r')
+        env.include_returns = false;
+    else if (key == 'p')
+        env.cuda_library_path = arg;
+    else if (key == ARGP_KEY_ARG)
+        argp_usage(state);
+    else
+        return ARGP_ERR_UNKNOWN;
+
+    return 0;
+}
+
+/* Parser description passed to argp_parse(): option table, callback and help text. */
+static const struct argp argp = {
+    .options = opts,
+    .parser = parse_arg,
+    .doc = argp_program_doc,
+};
+
+/*
+ * libbpf log callback: debug-level messages are discarded unless verbose
+ * mode is on; everything else is written to stderr.
+ */
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+    bool suppressed = !env.verbose && level == LIBBPF_DEBUG;
+
+    return suppressed ? 0 : vfprintf(stderr, format, args);
+}
+
+/* Raised by the signal handler; the polling loop in main() stops once it is set. */
+static volatile bool exiting = false;
+
+/* Signal handler: does nothing except raise the exit flag. */
+static void sig_handler(int sig)
+{
+    (void)sig;
+    exiting = true;
+}
+
+/*
+ * Map an event type to the name of the CUDA API it represents.
+ * Values without an entry yield "Unknown".
+ */
+static const char *event_type_str(enum cuda_event_type type)
+{
+    static const char *const api_names[] = {
+        [CUDA_EVENT_MALLOC]        = "cudaMalloc",
+        [CUDA_EVENT_FREE]          = "cudaFree",
+        [CUDA_EVENT_MEMCPY]        = "cudaMemcpy",
+        [CUDA_EVENT_LAUNCH_KERNEL] = "cudaLaunchKernel",
+        [CUDA_EVENT_STREAM_CREATE] = "cudaStreamCreate",
+        [CUDA_EVENT_STREAM_SYNC]   = "cudaStreamSynchronize",
+        [CUDA_EVENT_GET_DEVICE]    = "cudaGetDevice",
+        [CUDA_EVENT_SET_DEVICE]    = "cudaSetDevice",
+        [CUDA_EVENT_EVENT_CREATE]  = "cudaEventCreate",
+        [CUDA_EVENT_EVENT_RECORD]  = "cudaEventRecord",
+        [CUDA_EVENT_EVENT_SYNC]    = "cudaEventSynchronize",
+    };
+    /* A negative value wraps to a huge index and fails the bounds check. */
+    size_t idx = (size_t)type;
+
+    if (idx < sizeof(api_names) / sizeof(api_names[0]) && api_names[idx])
+        return api_names[idx];
+    return "Unknown";
+}
+
+/*
+ * Return a short name for a CUDA runtime status code (cudaError_t).
+ *
+ * cudaError_t values are sparse (e.g. 35, 100, 700, 999), so each case below
+ * uses the numeric value the CUDA runtime assigns rather than a running
+ * index. Labels are the enumerator names with the "cudaError" prefix
+ * dropped; codes 0-9 keep the labels this tool already printed.
+ * Codes that are not listed yield "Unknown".
+ */
+static const char *cuda_error_str(int error)
+{
+    switch (error) {
+    case 0:   return "Success";
+    case 1:   return "InvalidValue";
+    case 2:   return "OutOfMemory";
+    case 3:   return "NotInitialized";
+    case 4:   return "Deinitialized";
+    case 5:   return "ProfilerDisabled";
+    case 6:   return "ProfilerNotInitialized";
+    case 7:   return "ProfilerAlreadyStarted";
+    case 8:   return "ProfilerAlreadyStopped";
+    case 9:   return "InvalidConfiguration";
+    case 12:  return "InvalidPitchValue";
+    case 13:  return "InvalidSymbol";
+    case 16:  return "InvalidHostPointer";
+    case 17:  return "InvalidDevicePointer";
+    case 18:  return "InvalidTexture";
+    case 19:  return "InvalidTextureBinding";
+    case 20:  return "InvalidChannelDescriptor";
+    case 21:  return "InvalidMemcpyDirection";
+    case 22:  return "AddressOfConstant";
+    case 23:  return "TextureFetchFailed";
+    case 24:  return "TextureNotBound";
+    case 25:  return "SynchronizationError";
+    case 26:  return "InvalidFilterSetting";
+    case 27:  return "InvalidNormSetting";
+    case 28:  return "MixedDeviceExecution";
+    case 31:  return "NotYetImplemented";
+    case 32:  return "MemoryValueTooLarge";
+    case 34:  return "StubLibrary";
+    case 35:  return "InsufficientDriver";
+    case 36:  return "CallRequiresNewerDriver";
+    case 37:  return "InvalidSurface";
+    case 43:  return "DuplicateVariableName";
+    case 44:  return "DuplicateTextureName";
+    case 45:  return "DuplicateSurfaceName";
+    case 46:  return "DevicesUnavailable";
+    case 49:  return "IncompatibleDriverContext";
+    case 52:  return "MissingConfiguration";
+    case 53:  return "PriorLaunchFailure";
+    case 65:  return "LaunchMaxDepthExceeded";
+    case 66:  return "LaunchFileScopedTex";
+    case 67:  return "LaunchFileScopedSurf";
+    case 68:  return "SyncDepthExceeded";
+    case 69:  return "LaunchPendingCountExceeded";
+    case 98:  return "InvalidDeviceFunction";
+    case 100: return "NoDevice";
+    case 101: return "InvalidDevice";
+    case 102: return "DeviceNotLicensed";
+    case 103: return "SoftwareValidityNotEstablished";
+    case 127: return "StartupFailure";
+    case 200: return "InvalidKernelImage";
+    case 201: return "DeviceUninitialized";
+    case 205: return "MapBufferObjectFailed";
+    case 206: return "UnmapBufferObjectFailed";
+    case 207: return "ArrayIsMapped";
+    case 208: return "AlreadyMapped";
+    case 209: return "NoKernelImageForDevice";
+    case 210: return "AlreadyAcquired";
+    case 211: return "NotMapped";
+    case 212: return "NotMappedAsArray";
+    case 213: return "NotMappedAsPointer";
+    case 214: return "ECCUncorrectable";
+    case 215: return "UnsupportedLimit";
+    case 216: return "DeviceAlreadyInUse";
+    case 217: return "PeerAccessUnsupported";
+    case 218: return "InvalidPtx";
+    case 219: return "InvalidGraphicsContext";
+    case 220: return "NvlinkUncorrectable";
+    case 221: return "JitCompilerNotFound";
+    case 222: return "UnsupportedPtxVersion";
+    case 223: return "JitCompilationDisabled";
+    case 224: return "UnsupportedExecAffinity";
+    case 300: return "InvalidSource";
+    case 301: return "FileNotFound";
+    case 302: return "SharedObjectSymbolNotFound";
+    case 303: return "SharedObjectInitFailed";
+    case 304: return "OperatingSystem";
+    case 400: return "InvalidResourceHandle";
+    case 401: return "IllegalState";
+    case 500: return "SymbolNotFound";
+    case 600: return "NotReady";
+    case 700: return "IllegalAddress";
+    case 701: return "LaunchOutOfResources";
+    case 702: return "LaunchTimeout";
+    case 703: return "LaunchIncompatibleTexturing";
+    case 704: return "PeerAccessAlreadyEnabled";
+    case 705: return "PeerAccessNotEnabled";
+    case 708: return "SetOnActiveProcess";
+    case 709: return "ContextIsDestroyed";
+    case 710: return "Assert";
+    case 711: return "TooManyPeers";
+    case 712: return "HostMemoryAlreadyRegistered";
+    case 713: return "HostMemoryNotRegistered";
+    case 714: return "HardwareStackError";
+    case 715: return "IllegalInstruction";
+    case 716: return "MisalignedAddress";
+    case 717: return "InvalidAddressSpace";
+    case 718: return "InvalidPc";
+    case 719: return "LaunchFailure";
+    case 720: return "CooperativeLaunchTooLarge";
+    case 800: return "NotPermitted";
+    case 801: return "NotSupported";
+    case 802: return "SystemNotReady";
+    case 803: return "SystemDriverMismatch";
+    case 804: return "CompatNotSupportedOnDevice";
+    case 900: return "StreamCaptureUnsupported";
+    case 901: return "StreamCaptureInvalidated";
+    case 902: return "StreamCaptureMerge";
+    case 903: return "StreamCaptureUnmatched";
+    case 904: return "StreamCaptureUnjoined";
+    case 905: return "StreamCaptureIsolation";
+    case 906: return "StreamCaptureImplicit";
+    case 907: return "CapturedEvent";
+    case 908: return "StreamCaptureWrongThread";
+    case 909: return "Timeout";
+    case 910: return "GraphExecUpdateFailure";
+    case 911: return "ExternalDevice";
+    case 912: return "InvalidClusterSize";
+    case 999: return "Unknown";
+    default:  return "Unknown";
+    }
+}
+
+/*
+ * Format the event-specific part of an output row into details.
+ *
+ * e       - event received from the BPF program
+ * details - destination buffer
+ * len     - size of the destination buffer in bytes
+ *
+ * Return-probe events are always rendered as "returned=<status name>".
+ * Entry events are rendered from the payload that belongs to their type;
+ * types without a payload produce an empty string.
+ */
+static void get_event_details(const struct event *e, char *details, size_t len)
+{
+    /* Every return event has the same shape, whatever its type. */
+    if (e->is_return) {
+        snprintf(details, len, "returned=%s", cuda_error_str(e->ret_val));
+        return;
+    }
+
+    switch (e->type) {
+    case CUDA_EVENT_MALLOC:
+        snprintf(details, len, "size=%zu bytes", e->mem.size);
+        break;
+
+    case CUDA_EVENT_FREE:
+        snprintf(details, len, "ptr=%p", e->free_data.ptr);
+        break;
+
+    case CUDA_EVENT_MEMCPY:
+        snprintf(details, len, "size=%zu bytes, kind=%d",
+                 e->memcpy_data.size, e->memcpy_data.kind);
+        break;
+
+    case CUDA_EVENT_LAUNCH_KERNEL:
+        snprintf(details, len, "func=%p", e->launch.func);
+        break;
+
+    case CUDA_EVENT_SET_DEVICE:
+        snprintf(details, len, "device=%d", e->device.device);
+        break;
+
+    case CUDA_EVENT_STREAM_SYNC:
+    case CUDA_EVENT_EVENT_RECORD:
+    case CUDA_EVENT_EVENT_SYNC:
+        snprintf(details, len, "handle=%p", e->handle.handle);
+        break;
+
+    default:
+        /*
+         * No payload to show. Terminating the buffer directly avoids the
+         * zero-length format string that -Wall warns about.
+         */
+        if (len > 0)
+            details[0] = '\0';
+        break;
+    }
+}
+
+/*
+ * Ring buffer callback: print one output row for the received event.
+ * Return-probe events are skipped when env.include_returns is off, and the
+ * timestamp column is only printed when env.print_timestamp is on.
+ * Always returns 0.
+ */
+static int handle_event(void *ctx, void *data, size_t data_sz)
+{
+    const struct event *evt = data;
+    char details[MAX_DETAILS_LEN];
+    char ts[32];
+    time_t now;
+
+    /* Skip return probes if requested */
+    if (!env.include_returns && evt->is_return)
+        return 0;
+
+    /* Wall-clock time at which the event is processed, as HH:MM:SS */
+    now = time(NULL);
+    strftime(ts, sizeof(ts), "%H:%M:%S", localtime(&now));
+
+    get_event_details(evt, details, sizeof(details));
+
+    if (env.print_timestamp)
+        printf("%-8s ", ts);
+
+    printf("%-16s %-7d %-20s %s%s\n",
+           evt->comm, evt->pid,
+           event_type_str(evt->type),
+           evt->is_return ? "ret: " : "",
+           details);
+
+    return 0;
+}
+
+/* One traced CUDA API function together with the BPF programs attached to it. */
+struct cuda_api_func {
+    const char *name;               /* symbol name looked up in the CUDA library */
+    struct bpf_program *prog_entry; /* program passed as the entry probe */
+    struct bpf_program *prog_exit;  /* program passed as the exit probe */
+};
+
+/*
+ * Attach the entry and/or return probe for one CUDA API function.
+ *
+ * skel       - loaded skeleton (not used here)
+ * lib_path   - path of the library that contains func_name
+ * func_name  - symbol to probe
+ * prog_entry - program to run on function entry, or NULL to skip
+ * prog_exit  - program to run on function return, or NULL to skip
+ *
+ * Probes are attached for all processes (pid -1).
+ * Returns 0 on success or a negative errno-style code on failure.
+ */
+static int attach_cuda_func(struct cuda_events_bpf *skel, const char *lib_path, 
+                           const char *func_name, struct bpf_program *prog_entry,
+                           struct bpf_program *prog_exit)
+{
+    LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts);
+    struct bpf_link *link;
+    int err;
+
+    /* Skip attaching if program is NULL (might have been filtered out) */
+    if (!prog_entry && !prog_exit)
+        return 0;
+
+    uprobe_opts.func_name = func_name;
+
+    /* Attach entry uprobe */
+    if (prog_entry) {
+        uprobe_opts.retprobe = false;
+        link = bpf_program__attach_uprobe_opts(prog_entry, -1, lib_path, 0, &uprobe_opts);
+        if (!link) {
+            /* Capture errno first; never report success for a failed attach. */
+            err = errno ? -errno : -1;
+            fprintf(stderr, "Failed to attach entry uprobe for %s: %d\n", func_name, err);
+            return err;
+        }
+    }
+
+    /* Attach exit uprobe */
+    if (prog_exit) {
+        /* Without retprobe the program would fire on entry, not on return. */
+        uprobe_opts.retprobe = true;
+        link = bpf_program__attach_uprobe_opts(prog_exit, -1, lib_path, 0, &uprobe_opts);
+        if (!link) {
+            err = errno ? -errno : -1;
+            fprintf(stderr, "Failed to attach exit uprobe for %s: %d\n", func_name, err);
+            return err;
+        }
+    }
+
+    /* NOTE(review): the links are not stored or destroyed here; presumably
+     * they are meant to live until the process exits - confirm. */
+    return 0;
+}
+
+/*
+ * Entry point: parse options, load the BPF programs, attach them to the CUDA
+ * runtime library and print events until SIGINT/SIGTERM arrives.
+ *
+ * Returns 0 on success and a positive code on failure.
+ */
+int main(int argc, char **argv)
+{
+    struct ring_buffer *rb = NULL;
+    struct cuda_events_bpf *skel;
+    int err;
+
+    /* Default CUDA library path if not specified */
+    const char *cuda_lib_path = "/usr/local/cuda/lib64/libcudart.so";
+
+    /* Parse command line arguments */
+    err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+    if (err)
+        return err;
+
+    /* Override CUDA library path if specified on command line */
+    if (env.cuda_library_path)
+        cuda_lib_path = env.cuda_library_path;
+
+    /* Set up libbpf errors and debug info callback */
+    libbpf_set_print(libbpf_print_fn);
+
+    /* Cleaner handling of Ctrl-C */
+    signal(SIGINT, sig_handler);
+    signal(SIGTERM, sig_handler);
+
+    /* Open BPF application (loading is a separate step below) */
+    skel = cuda_events_bpf__open();
+    if (!skel) {
+        fprintf(stderr, "Failed to open BPF skeleton\n");
+        return 1;
+    }
+
+    /* Load & verify BPF programs */
+    err = cuda_events_bpf__load(skel);
+    if (err) {
+        fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+        goto cleanup;
+    }
+
+    /* Define CUDA functions to trace and their corresponding programs */
+    struct cuda_api_func cuda_funcs[] = {
+        {"cudaMalloc", skel->progs.cuda_malloc_enter, skel->progs.cuda_malloc_exit},
+        {"cudaFree", skel->progs.cuda_free_enter, skel->progs.cuda_free_exit},
+        {"cudaMemcpy", skel->progs.cuda_memcpy_enter, skel->progs.cuda_memcpy_exit},
+        {"cudaLaunchKernel", skel->progs.cuda_launch_kernel_enter, skel->progs.cuda_launch_kernel_exit},
+        {"cudaStreamCreate", skel->progs.cuda_stream_create_enter, skel->progs.cuda_stream_create_exit},
+        {"cudaStreamSynchronize", skel->progs.cuda_stream_sync_enter, skel->progs.cuda_stream_sync_exit},
+        {"cudaGetDevice", skel->progs.cuda_get_device_enter, skel->progs.cuda_get_device_exit},
+        {"cudaSetDevice", skel->progs.cuda_set_device_enter, skel->progs.cuda_set_device_exit},
+        {"cudaEventCreate", skel->progs.cuda_event_create_enter, skel->progs.cuda_event_create_exit},
+        {"cudaEventRecord", skel->progs.cuda_event_record_enter, skel->progs.cuda_event_record_exit},
+        {"cudaEventSynchronize", skel->progs.cuda_event_sync_enter, skel->progs.cuda_event_sync_exit},
+    };
+
+    /* Print CUDA library path being used */
+    printf("Using CUDA library: %s\n", cuda_lib_path);
+
+    /*
+     * Skeleton-level attach. The CUDA uprobes themselves are attached
+     * explicitly in the loop below, because the target library path is only
+     * known at run time.
+     */
+    err = cuda_events_bpf__attach(skel);
+    if (err) {
+        fprintf(stderr, "Failed to attach BPF skeleton\n");
+        goto cleanup;
+    }
+
+    /* Attach to CUDA functions */
+    for (size_t i = 0; i < sizeof(cuda_funcs) / sizeof(cuda_funcs[0]); i++) {
+        err = attach_cuda_func(skel, cuda_lib_path, cuda_funcs[i].name, 
+                              cuda_funcs[i].prog_entry, cuda_funcs[i].prog_exit);
+        if (err) {
+            fprintf(stderr, "Failed to attach to %s\n", cuda_funcs[i].name);
+            goto cleanup;
+        }
+    }
+
+    /* Set up ring buffer polling */
+    rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
+    if (!rb) {
+        /* Keep the real cause when errno has one; fall back to -1 otherwise. */
+        err = errno ? -errno : -1;
+        fprintf(stderr, "Failed to create ring buffer\n");
+        goto cleanup;
+    }
+
+    /* Print the column header */
+    if (env.print_timestamp) {
+        printf("%-8s ", "TIME");
+    }
+    printf("%-16s %-7s %-20s %s\n",
+           "PROCESS", "PID", "EVENT", "DETAILS");
+
+    /* Process events until a signal asks us to stop */
+    while (!exiting) {
+        err = ring_buffer__poll(rb, 100 /* timeout, ms */);
+        /* Ctrl-C will cause -EINTR */
+        if (err == -EINTR) {
+            err = 0;
+            break;
+        }
+        if (err < 0) {
+            /* Diagnostics go to stderr so they don't mix with the event rows. */
+            fprintf(stderr, "Error polling ring buffer: %d\n", err);
+            break;
+        }
+    }
+
+cleanup:
+    /* Clean up */
+    ring_buffer__free(rb);
+    cuda_events_bpf__destroy(skel);
+
+    return err < 0 ? -err : 0;
+}
--- a/src/47-cuda-events/cuda_events.h
+++ b/src/47-cuda-events/cuda_events.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __CUDA_EVENTS_H
+#define __CUDA_EVENTS_H
+
+/* Size of the comm field in struct event. */
+#define TASK_COMM_LEN 16
+/* NOTE(review): not referenced by any code visible in this change - confirm before removing. */
+#define MAX_FUNC_NAME_LEN 32
+/* Size of the details field in struct event and of the loader's formatting buffer. */
+#define MAX_DETAILS_LEN 64
+
+/* Identifies which CUDA API call an event describes. */
+enum cuda_event_type {
+    CUDA_EVENT_MALLOC        = 0,
+    CUDA_EVENT_FREE          = 1,
+    CUDA_EVENT_MEMCPY        = 2,
+    CUDA_EVENT_LAUNCH_KERNEL = 3,
+    CUDA_EVENT_STREAM_CREATE = 4,
+    CUDA_EVENT_STREAM_SYNC   = 5,
+    CUDA_EVENT_GET_DEVICE    = 6,
+    CUDA_EVENT_SET_DEVICE    = 7,
+    CUDA_EVENT_EVENT_CREATE  = 8,
+    CUDA_EVENT_EVENT_RECORD  = 9,
+    CUDA_EVENT_EVENT_SYNC    = 10
+};
+
+/*
+ * Record passed from the BPF programs to user space through the ring buffer.
+ * Both sides include this header, so the layout must stay identical for them.
+ * Only the union member matching `type` is filled in, and only for entry
+ * events; return events carry ret_val instead.
+ */
+struct event {
+    /* Common fields */
+    int pid;                  /* Process ID */
+    char comm[TASK_COMM_LEN]; /* Process name */
+    enum cuda_event_type type;/* Type of CUDA event */
+    
+    /* Event-specific data */
+    union {
+        struct {
+            size_t size;      /* Size for malloc/memcpy */
+        } mem;
+        
+        struct {
+            void *ptr;        /* Pointer for free */
+        } free_data;
+        
+        struct {
+            size_t size;      /* Size for memcpy */
+            int kind;         /* Kind of memcpy */
+        } memcpy_data;
+        
+        struct {
+            void *func;       /* Function pointer for kernel launch */
+        } launch;
+        
+        struct {
+            int device;       /* Device ID for set_device */
+        } device;
+        
+        struct {
+            void *handle;     /* Handle for stream/event operations */
+        } handle;
+    };
+    
+    /* Return value (for return probes) */
+    int ret_val;
+    bool is_return;           /* True if this is from a return probe */
+    
+    /* NOTE(review): no BPF helper in this change writes this field - confirm it is still needed. */
+    char details[MAX_DETAILS_LEN]; /* Additional details as string */
+};
+
+#endif /* __CUDA_EVENTS_H */ 
--- a/src/47-cuda-events/cuda_malloc.bt
+++ b/src/47-cuda-events/cuda_malloc.bt
@@ -0,0 +1,30 @@
+#!/usr/bin/env bpftrace
+
+// Startup banner. It names the binary the probes below are actually attached
+// to, and prints the column header last so it sits directly above the rows.
+BEGIN
+{
+    printf("Tracing CUDA events... Hit Ctrl-C to end.\n");
+    printf("Using target binary: /root/yunwei37/cuda-exp/basic07\n");
+    printf("If this path is incorrect, please edit the script and update all probe definitions.\n");
+    printf("%-12s %-16s %-12s %-20s %s\n", "TIME(ms)", "PROCESS", "PID", "EVENT", "DETAILS");
+}
+
+// cudaMalloc inside the target binary: the entry probe prints the requested
+// size (second argument), the return probe prints the status code.
+uprobe:/root/yunwei37/cuda-exp/basic07:cudaMalloc
+{
+    printf("%-12u %-16s %-12d %-20s size=%ld bytes\n",
+           elapsed/1000000, comm, pid, "cudaMalloc", arg1);
+}
+
+uretprobe:/root/yunwei37/cuda-exp/basic07:cudaMalloc
+{
+    printf("%-12u %-16s %-12d %-20s returned=%d (%s)\n",
+           elapsed/1000000, comm, pid, "cudaMalloc", retval,
+           retval == 0 ? "success" : "error");
+}
+
+// Runs once when tracing stops; prints a closing message.
+END
+{
+    printf("Tracing complete.\n");
+}