add-cuda-event-doc

officeyutong
2025-05-24 18:02:46 +08:00
parent 516f19f27c
commit 0a7d2a40c7
5 changed files with 550 additions and 137 deletions

View File

@@ -7,3 +7,4 @@ package.yaml
ecli
bootstrap
cuda_events
basic02

View File

@@ -21,9 +21,20 @@ VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
CFLAGS := -g -Wall
CFLAGS := -g -Wall -D_GNU_SOURCE
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
# Default CUDA library path
CUDA_LIB_PATH ?= /usr/local/cuda/lib64/libcudart.so
# NVIDIA CUDA Compiler
NVCC ?= nvcc
# Auto-detect CUDA architecture if possible
CUDA_DETECT_SCRIPT := $(OUTPUT)/detect_cuda_arch.sh
CUDA_ARCH_FLAGS ?= $(shell if [ -f $(CUDA_DETECT_SCRIPT) ]; then bash $(CUDA_DETECT_SCRIPT); else echo "-arch=sm_61"; fi)
NVCC_FLAGS = -O3 $(CUDA_ARCH_FLAGS)
APPS = cuda_events # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
CARGO ?= $(shell which cargo)
@@ -69,7 +80,7 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
.PHONY: all
all: $(APPS)
all: cuda_events basic02
.PHONY: clean
clean:
@@ -80,6 +91,21 @@ $(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@
# Create a script to detect CUDA architecture
$(CUDA_DETECT_SCRIPT): | $(OUTPUT)
$(call msg,SCRIPT,$@)
$(Q)echo '#!/bin/bash' > $@
$(Q)echo 'CUDA_VERSION=$$($(NVCC) --version | grep "release" | sed "s/.*release //" | sed "s/,.*//")' >> $@
$(Q)echo 'if [ -x "$$(command -v nvidia-smi)" ]; then' >> $@
$(Q)echo ' CUDA_ARCH=$$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | sed -e "s/\.//")' >> $@
$(Q)echo ' if [ -n "$$CUDA_ARCH" ]; then' >> $@
$(Q)echo ' echo "-arch=sm_$$CUDA_ARCH"' >> $@
$(Q)echo ' exit 0' >> $@
$(Q)echo ' fi' >> $@
$(Q)echo 'fi' >> $@
$(Q)echo 'echo "-arch=sm_61"' >> $@
$(Q)chmod +x $@
# Build libbpf
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
$(call msg,LIB,$@)
@@ -93,17 +119,11 @@ $(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB, $@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
$(call msg,LIB,$@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
# Build CUDA example
basic02: basic02.cu $(CUDA_DETECT_SCRIPT)
$(call msg,NVCC,$@)
$(Q)$(NVCC) $(NVCC_FLAGS) -o $@ $<
$(Q)@echo "Compiling for architecture: $(CUDA_ARCH_FLAGS)"
# Build BPF code
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)

View File

@@ -1,143 +1,469 @@
# eBPF Tutorial: Tracing CUDA GPU Operations
Have you ever wondered what's happening under the hood when your CUDA application is running? GPU operations can be challenging to debug and profile because they happen in a separate device with its own memory space. In this tutorial, we'll build a powerful eBPF-based tracing tool that lets you peek into CUDA API calls in real time.
## Introduction to CUDA and GPU Tracing
CUDA (Compute Unified Device Architecture) is NVIDIA's parallel computing platform and programming model that enables developers to use NVIDIA GPUs for general-purpose processing. When you run a CUDA application, several things happen behind the scenes (a short code sketch follows this list):
1. The host (CPU) allocates memory on the device (GPU)
2. Data is transferred from host to device memory
3. GPU kernels (functions) are launched to process the data
4. Results are transferred back from device to host
5. Device memory is freed
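To make these steps concrete, here is a minimal host-side sketch of that lifecycle against the CUDA runtime API. Error checks are omitted, `my_kernel` is a placeholder, and since the `<<<...>>>` launch syntax requires nvcc, step 3 appears only as a comment:
```c
#include <cuda_runtime.h>

/* Minimal sketch of the five-step CUDA lifecycle (error checks omitted). */
void lifecycle_sketch(const float *h_in, float *h_out, size_t n) {
    float *d_buf;

    cudaMalloc((void **)&d_buf, n * sizeof(float));            /* 1. allocate  */
    cudaMemcpy(d_buf, h_in, n * sizeof(float),
               cudaMemcpyHostToDevice);                        /* 2. copy in   */
    /* my_kernel<<<blocks, threads>>>(d_buf, n);                  3. launch    */
    cudaDeviceSynchronize();                                   /* wait for GPU */
    cudaMemcpy(h_out, d_buf, n * sizeof(float),
               cudaMemcpyDeviceToHost);                        /* 4. copy out  */
    cudaFree(d_buf);                                           /* 5. free      */
}
```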
Each of these operations involves CUDA API calls like `cudaMalloc`, `cudaMemcpy`, and `cudaLaunchKernel`. Tracing these calls can provide valuable insights for debugging and performance optimization, but this isn't straightforward. GPU operations happen asynchronously, and traditional debugging tools often can't access GPU internals.
This is where eBPF comes to the rescue! By using uprobes, we can intercept CUDA API calls in the user-space CUDA runtime library (`libcudart.so`) before they reach the GPU. This gives us visibility into:
- Memory allocation sizes and patterns
- Data transfer directions and sizes
- Kernel launch parameters
- Error codes and failures
- Timing of operations
## Key CUDA Functions We Trace
Our tracer monitors several critical CUDA functions that represent the main operations in GPU computing. Understanding these functions helps you interpret the tracing results and diagnose issues in your CUDA applications:
### Memory Management
- **`cudaMalloc`**: Allocates memory on the GPU device. By tracing this, we can see how much memory is being requested, when, and whether it succeeds. Memory allocation failures are a common source of problems in CUDA applications.
```c
cudaError_t cudaMalloc(void** devPtr, size_t size);
```
- **`cudaFree`**: Releases previously allocated memory on the GPU. Tracing this helps identify memory leaks (allocated memory that's never freed) and double-free errors.
```c
cudaError_t cudaFree(void* devPtr);
```
### Data Transfer
- **`cudaMemcpy`**: Copies data between host (CPU) and device (GPU) memory, or between different locations in device memory. The direction parameter (`kind`) tells us whether data is moving to the GPU, from the GPU, or within the GPU.
```c
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind);
```
The `kind` parameter can be:
- `cudaMemcpyHostToDevice` (1): Copying from CPU to GPU
- `cudaMemcpyDeviceToHost` (2): Copying from GPU to CPU
- `cudaMemcpyDeviceToDevice` (3): Copying within GPU memory
### Kernel Execution
- **`cudaLaunchKernel`**: Launches a GPU kernel (function) to run on the device. This is where the actual parallel computation happens. Tracing this shows when kernels are launched and whether they succeed.
```c
cudaError_t cudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim,
                             void** args, size_t sharedMem, cudaStream_t stream);
```
### Streams and Synchronization
CUDA uses streams for managing concurrency and asynchronous operations:
- **`cudaStreamCreate`**: Creates a new stream. Operations issued to a single stream execute in order, but separate streams may run concurrently with one another.
```c
cudaError_t cudaStreamCreate(cudaStream_t* pStream);
```
- **`cudaStreamSynchronize`**: Waits for all operations in a stream to complete. This is a key synchronization point that can reveal performance bottlenecks.
```c
cudaError_t cudaStreamSynchronize(cudaStream_t stream);
```
### Events
CUDA events are used for timing and synchronization:
- **`cudaEventCreate`**: Creates an event object for timing operations.
```c
cudaError_t cudaEventCreate(cudaEvent_t* event);
```
- **`cudaEventRecord`**: Records an event in a stream, which can be used for timing or synchronization.
```c
cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
```
- **`cudaEventSynchronize`**: Waits for an event to complete, which is another synchronization point.
```c
cudaError_t cudaEventSynchronize(cudaEvent_t event);
```
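Putting the three event calls together, here is a small sketch of the classic event-timing pattern (`my_kernel` is a placeholder; error checks omitted):
```c
/* Sketch: timing a GPU operation with CUDA events (error checks omitted;
 * my_kernel is a placeholder). */
float time_kernel_ms(void) {
    cudaEvent_t start, stop;
    float elapsed_ms = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);          /* record into the default stream */
    /* my_kernel<<<blocks, threads>>>(...); */
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop);         /* block until "stop" completes */
    cudaEventElapsedTime(&elapsed_ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return elapsed_ms;
}
```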
### Device Management
- **`cudaGetDevice`**: Gets the current device being used.
```c
cudaError_t cudaGetDevice(int* device);
```
- **`cudaSetDevice`**: Sets the device to be used for subsequent GPU operations.
```c
cudaError_t cudaSetDevice(int device);
```
By tracing these functions, we gain complete visibility into the lifecycle of GPU operations, from device selection and memory allocation to data transfer, kernel execution, and synchronization. This enables us to identify bottlenecks, diagnose errors, and understand the behavior of CUDA applications.
## Architecture Overview
Our CUDA events tracer consists of three main components:
1. **Header File (`cuda_events.h`)**: Defines data structures for communication between kernel and user space
2. **eBPF Program (`cuda_events.bpf.c`)**: Implements kernel-side hooks for CUDA functions using uprobes
3. **User-Space Application (`cuda_events.c`)**: Loads the eBPF program, processes events, and displays them to the user
The tool uses eBPF uprobes to attach to CUDA API functions in the CUDA runtime library. When a CUDA function is called, the eBPF program captures the parameters and results, sending them to user space through a ring buffer.
## Key Data Structures
The central data structure for our tracer is the `struct event` defined in `cuda_events.h`:
```c
struct event {
    /* Common fields */
    int pid;                   /* Process ID */
    char comm[TASK_COMM_LEN];  /* Process name */
    enum cuda_event_type type; /* Type of CUDA event */

    /* Event-specific data (union to save space) */
    union {
        struct { size_t size; } mem;                   /* For malloc */
        struct { void *ptr; } free_data;               /* For free */
        struct { size_t size; int kind; } memcpy_data; /* For memcpy */
        struct { void *func; } launch;                 /* For kernel launch */
        struct { int device; } device;                 /* For device operations */
        struct { void *handle; } handle;               /* For stream/event operations */
    };

    bool is_return;                /* True if this is from a return probe */
    int ret_val;                   /* Return value (for return probes) */
    char details[MAX_DETAILS_LEN]; /* Additional details as string */
};
```
This structure is designed to efficiently capture information about different types of CUDA operations. The `union` is a clever space-saving technique since each event only needs one type of data at a time. For example, a memory allocation event needs to store the size, while a free event needs to store a pointer.
The `cuda_event_type` enum helps us categorize different CUDA operations:
```c
enum cuda_event_type {
    CUDA_EVENT_MALLOC = 0,
    CUDA_EVENT_FREE,
    CUDA_EVENT_MEMCPY,
    CUDA_EVENT_LAUNCH_KERNEL,
    CUDA_EVENT_STREAM_CREATE,
    CUDA_EVENT_STREAM_SYNC,
    CUDA_EVENT_GET_DEVICE,
    CUDA_EVENT_SET_DEVICE,
    CUDA_EVENT_EVENT_CREATE,
    CUDA_EVENT_EVENT_RECORD,
    CUDA_EVENT_EVENT_SYNC
};
```
This enum covers the main CUDA operations we want to trace, from memory management to kernel launches and synchronization.
## The eBPF Program Implementation
Let's dive into the eBPF program (`cuda_events.bpf.c`) that hooks into CUDA functions. The full code is available in the repository, but here are the key parts:
First, we create a ring buffer to communicate with user space:
```c
struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 256 * 1024);
} rb SEC(".maps");
```
The ring buffer is a crucial component for our tracer. It acts as a high-performance queue where the eBPF program can submit events, and the user-space application can retrieve them. We set a generous size of 256KB to handle bursts of events without losing data.
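On the user-space side, draining this buffer takes only a few lines of libbpf; a sketch, assuming the generated skeleton exposes the map as `skel->maps.rb` and `handle_event` is the callback shown later:
```c
/* Sketch: connect the ring buffer to a callback (error paths trimmed;
 * assumes the skeleton exposes the map as skel->maps.rb). */
struct ring_buffer *rb = ring_buffer__new(bpf_map__fd(skel->maps.rb),
                                          handle_event, NULL, NULL);
if (!rb) {
    fprintf(stderr, "Failed to create ring buffer\n");
    return 1;
}
```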
For each CUDA operation, we implement a helper function to collect relevant data. Let's look at the `submit_malloc_event` function as an example:
```c
static inline int submit_malloc_event(size_t size, bool is_return, int ret_val) {
    struct event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
    if (!e)
        return 0;

    /* Fill common fields */
    e->pid = bpf_get_current_pid_tgid() >> 32;
    bpf_get_current_comm(&e->comm, sizeof(e->comm));
    e->type = CUDA_EVENT_MALLOC;
    e->is_return = is_return;

    /* Fill event-specific data */
    if (is_return) {
        e->ret_val = ret_val;
    } else {
        e->mem.size = size;
    }

    bpf_ringbuf_submit(e, 0);
    return 0;
}
```
This function first reserves space in the ring buffer for our event. Then it fills in common fields like the process ID and name. For a malloc event, we store either the requested size (on function entry) or the return value (on function exit). Finally, we submit the event to the ring buffer.
The actual probes are attached to CUDA functions using `SEC()` annotations. For `cudaMalloc`, we have:
```c
SEC("uprobe")
int BPF_KPROBE(cuda_malloc_enter, void **ptr, size_t size) {
    return submit_malloc_event(size, false, 0);
}

SEC("uretprobe")
int BPF_KRETPROBE(cuda_malloc_exit, int ret) {
    return submit_malloc_event(0, true, ret);
}
```
The first function is called when `cudaMalloc` is entered, capturing the requested size. The second is called when `cudaMalloc` returns, capturing the error code. This pattern is repeated for each CUDA function we want to trace.
One interesting case is `cudaMemcpy`, which transfers data between host and device:
```c
SEC("uprobe")
int BPF_KPROBE(cuda_memcpy_enter, void *dst, const void *src, size_t size, int kind) {
    return submit_memcpy_event(size, kind, false, 0);
}
```
Here, we capture not just the size but also the "kind" parameter, which indicates the direction of the transfer (host-to-device, device-to-host, or device-to-device). This gives us valuable information about data movement patterns.
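For friendlier output, user space can translate the numeric `kind` into a label. A hypothetical helper (the tool's actual output prints the raw number, as shown later):
```c
/* Hypothetical helper: translate cudaMemcpyKind values into labels.
 * Values follow the cudaMemcpyKind enum in the CUDA runtime headers. */
static const char *memcpy_kind_str(int kind) {
    switch (kind) {
    case 0:  return "HostToHost";
    case 1:  return "HostToDevice";
    case 2:  return "DeviceToHost";
    case 3:  return "DeviceToDevice";
    case 4:  return "Default";
    default: return "Unknown";
    }
}
```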
## User-Space Application Details
The user-space application (`cuda_events.c`) is responsible for loading the eBPF program, processing events from the ring buffer, and displaying them in a user-friendly format.
First, the program parses command-line arguments to configure its behavior:
```c
static struct env {
    bool verbose;
    bool print_timestamp;
    char *cuda_library_path;
    bool include_returns;
    int target_pid;
} env = {
    .print_timestamp = true,
    .include_returns = true,
    .cuda_library_path = NULL,
    .target_pid = -1,
};
```
This structure stores configuration options like whether to print timestamps or include return probes. The default values provide a sensible starting point.
The program uses `libbpf` to load and attach the eBPF program to CUDA functions:
```c
int attach_cuda_func(struct cuda_events_bpf *skel, const char *lib_path,
                     const char *func_name, struct bpf_program *prog_entry,
                     struct bpf_program *prog_exit) {
    /* Attach entry uprobe */
    if (prog_entry) {
        uprobe_opts.func_name = func_name;
        struct bpf_link *link = bpf_program__attach_uprobe_opts(prog_entry,
                env.target_pid, lib_path, 0, &uprobe_opts);
        /* Error handling... */
    }

    /* Attach exit uprobe */
    if (prog_exit) {
        /* Similar for return probe... */
    }
}
```
This function takes a function name (like "cudaMalloc") and the corresponding eBPF programs for entry and exit. It then attaches these programs as uprobes to the specified library.
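The main program drives this from a table that pairs each CUDA function name with its entry and exit programs from the skeleton. A sketch of what such a table can look like (the field names, and any program names beyond `cuda_malloc_enter`/`cuda_malloc_exit`, are illustrative rather than taken from the source):
```c
/* Illustrative sketch of the attach table; the real definition lives in
 * cuda_events.c and may differ in naming. */
struct cuda_func_desc {
    const char *name;               /* e.g. "cudaMalloc" */
    struct bpf_program *prog_entry; /* entry uprobe program, or NULL */
    struct bpf_program *prog_exit;  /* return uprobe program, or NULL */
};

/* Rows are filled in after the skeleton is opened and loaded, e.g.:
 * { "cudaMalloc", skel->progs.cuda_malloc_enter, skel->progs.cuda_malloc_exit } */
```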
One of the most important functions is `handle_event`, which processes events from the ring buffer:
```c
static int handle_event(void *ctx, void *data, size_t data_sz) {
    const struct event *e = data;
    struct tm *tm;
    char ts[32];
    char details[MAX_DETAILS_LEN];
    time_t t;

    /* Skip return probes if requested */
    if (e->is_return && !env.include_returns)
        return 0;

    time(&t);
    tm = localtime(&t);
    strftime(ts, sizeof(ts), "%H:%M:%S", tm);

    get_event_details(e, details, sizeof(details));

    if (env.print_timestamp) {
        printf("%-8s ", ts);
    }

    printf("%-16s %-7d %-20s %8s %s\n",
           e->comm, e->pid,
           event_type_str(e->type),
           e->is_return ? "[EXIT]" : "[ENTER]",
           details);

    return 0;
}
```
This function formats and displays event information, including timestamps, process details, event type, and specific parameters or return values.
The `get_event_details` function converts raw event data into human-readable form:
```c
static void get_event_details(const struct event *e, char *details, size_t len) {
    switch (e->type) {
    case CUDA_EVENT_MALLOC:
        if (!e->is_return)
            snprintf(details, len, "size=%zu bytes", e->mem.size);
        else
            snprintf(details, len, "returned=%s", cuda_error_str(e->ret_val));
        break;
    /* Similar cases for other event types... */
    }
}
```
This function handles each event type differently. For example, a malloc event shows the requested size on entry and the error code on exit.
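As one example of the elided cases, a memcpy event could be rendered like this (a sketch consistent with the struct shown earlier and the output format shown below, not the verbatim source):
```c
case CUDA_EVENT_MEMCPY:
    if (!e->is_return)
        snprintf(details, len, "size=%zu bytes, kind=%d",
                 e->memcpy_data.size, e->memcpy_data.kind);
    else
        snprintf(details, len, "returned=%s", cuda_error_str(e->ret_val));
    break;
```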
The main event loop is remarkably simple:
```c
while (!exiting) {
    err = ring_buffer__poll(rb, 100 /* timeout, ms */);
    /* Error handling... */
}
```
This polls the ring buffer for events, calling `handle_event` for each one. The 100ms timeout ensures the program remains responsive to signals like Ctrl+C.
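The `exiting` flag is set from a signal handler, following the usual libbpf tool pattern; a sketch:
```c
/* Sketch: the shutdown pattern behind the poll loop. */
static volatile sig_atomic_t exiting = 0;

static void sig_handler(int sig) {
    exiting = 1;
}

/* In main(), before entering the loop: */
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
```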
## CUDA Error Handling and Reporting
An important aspect of our tracer is translating CUDA error codes into human-readable messages. CUDA has over 100 different error codes, from simple ones like "out of memory" to complex ones like "unsupported PTX version."
Our tool includes a comprehensive `cuda_error_str` function that maps these numeric codes to string descriptions:
```c
static const char *cuda_error_str(int error) {
    switch (error) {
    case 0: return "Success";
    case 1: return "InvalidValue";
    case 2: return "OutOfMemory";
    /* Many more error codes... */
    default: return "Unknown";
    }
}
```
This makes the output much more useful for debugging. Instead of seeing "error 2", you'll see "OutOfMemory", which immediately tells you what went wrong.
## Compilation and Execution
Building the tracer is straightforward with the provided Makefile:
```bash
# Build both the tracer and the example
make
```
This creates two binaries:
- `cuda_events`: The eBPF-based CUDA tracing tool
- `basic02`: A simple CUDA example application
The build system is smart enough to detect your GPU architecture using `nvidia-smi` and compile the CUDA code with the appropriate flags.
Running the tracer is just as easy:
```bash
# Start the tracing tool
sudo ./cuda_events -p ./basic02
# In another terminal, run the CUDA example
./basic02
```
You can also trace a specific process by PID:
```bash
# Run the CUDA example
./basic02 &
PID=$!
# Start the tracing tool with PID filtering
sudo ./cuda_events -p ./basic02 -d $PID
```
The example output shows detailed information about each CUDA operation:
```
Using CUDA library: ./basic02
TIME PROCESS PID EVENT TYPE DETAILS
17:35:41 basic02 12345 cudaMalloc [ENTER] size=4000 bytes
17:35:41 basic02 12345 cudaMalloc [EXIT] returned=Success
17:35:41 basic02 12345 cudaMalloc [ENTER] size=4000 bytes
17:35:41 basic02 12345 cudaMalloc [EXIT] returned=Success
17:35:41 basic02 12345 cudaMemcpy [ENTER] size=4000 bytes, kind=1
17:35:41 basic02 12345 cudaMemcpy [EXIT] returned=Success
17:35:41 basic02 12345 cudaLaunchKernel [ENTER] func=0x7f1234567890
17:35:41 basic02 12345 cudaLaunchKernel [EXIT] returned=Success
17:35:41 basic02 12345 cudaMemcpy [ENTER] size=4000 bytes, kind=2
17:35:41 basic02 12345 cudaMemcpy [EXIT] returned=Success
17:35:41 basic02 12345 cudaFree [ENTER] ptr=0x7f1234568000
17:35:41 basic02 12345 cudaFree [EXIT] returned=Success
17:35:41 basic02 12345 cudaFree [ENTER] ptr=0x7f1234569000
17:35:41 basic02 12345 cudaFree [EXIT] returned=Success
```
This output shows the typical flow of a CUDA application:
1. Allocate memory on the device
2. Copy data from host to device (kind=1)
3. Launch a kernel to process the data
4. Copy results back from device to host (kind=2)
5. Free device memory
## Command Line Options
The `cuda_events` tool supports these options:
- `-v`: Enable verbose output for debugging
- `-t`: Don't print timestamps
- `-r`: Don't show function returns (only show function entries)
- `-p PATH`: Specify the path to the CUDA runtime library or application
- `-d PID`: Trace only the specified process ID
## Learning Objectives
Through this tutorial, you'll learn:
1. How CUDA applications interact with GPUs through the CUDA runtime API
2. How to use eBPF uprobes to trace user-space libraries
3. How to design efficient data structures for kernel-to-user communication
4. How to process and display traced events in a user-friendly format
5. How to filter events by process ID for focused debugging
## Next Steps
Once you're comfortable with this basic CUDA tracing tool, you could extend it to:
1. Add support for more CUDA API functions
2. Add timing information to analyze performance bottlenecks (a sketch follows this list)
3. Implement correlation between related operations (e.g., matching mallocs with frees)
4. Create visualizations of CUDA operations for easier analysis
5. Add support for other GPU frameworks like OpenCL or ROCm
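As a starting point for extension 2 above, latency can be measured on the eBPF side by pairing each entry with its return; a sketch, assuming the tutorial's build environment (`vmlinux.h` plus the libbpf BPF headers):
```c
/* Sketch: per-thread latency for cudaMalloc. The entry probe records a
 * timestamp; the return probe computes the delta. */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10240);
    __type(key, u32);   /* thread id */
    __type(value, u64); /* entry time, ns */
} start_ts SEC(".maps");

SEC("uprobe")
int BPF_KPROBE(cuda_malloc_enter_timed, void **ptr, size_t size) {
    u32 tid = (u32)bpf_get_current_pid_tgid();
    u64 ts = bpf_ktime_get_ns();

    bpf_map_update_elem(&start_ts, &tid, &ts, BPF_ANY);
    return 0;
}

SEC("uretprobe")
int BPF_KRETPROBE(cuda_malloc_exit_timed, int ret) {
    u32 tid = (u32)bpf_get_current_pid_tgid();
    u64 *tsp = bpf_map_lookup_elem(&start_ts, &tid);

    if (!tsp)
        return 0;
    u64 delta_ns = bpf_ktime_get_ns() - *tsp;
    bpf_map_delete_elem(&start_ts, &tid);
    /* delta_ns could be added to struct event and shipped via the ring buffer */
    return 0;
}
```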
## References
- CUDA Programming Guide: https://docs.nvidia.com/cuda/cuda-c-programming-guide/
- NVIDIA CUDA Runtime API: https://docs.nvidia.com/cuda/cuda-runtime-api/
- libbpf Documentation: https://libbpf.readthedocs.io/
- Linux uprobes Documentation: https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
If you'd like to dive deeper into eBPF, check out our tutorial repository at https://github.com/eunomia-bpf/bpf-developer-tutorial or visit our website at https://eunomia.dev/tutorials/.

View File

@@ -0,0 +1,63 @@
#include <stdio.h>
#include <cuda_runtime.h>

// Define a simple PTX inline assembly function that multiplies a number by 2
__device__ int multiplyByTwo(int x) {
    int result;
    asm("mul.lo.s32 %0, %1, 2;" : "=r"(result) : "r"(x));
    return result;
}

// CUDA kernel using PTX inline assembly
__global__ void vectorMultiplyByTwoPTX(int* input, int* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        output[idx] = multiplyByTwo(input[idx]);
    }
}

// Host function to initialize data and launch kernel
void vectorMultiplyByTwo(int* h_input, int* h_output, int n) {
    int *d_input, *d_output;

    // Allocate device memory
    cudaMalloc(&d_input, n * sizeof(int));
    cudaMalloc(&d_output, n * sizeof(int));

    // Copy input data to device
    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    // Launch kernel
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    vectorMultiplyByTwoPTX<<<numBlocks, blockSize>>>(d_input, d_output, n);

    // Copy result back to host
    cudaMemcpy(h_output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_input);
    cudaFree(d_output);
}

int main() {
    const int n = 1000;
    int h_input[n];
    int h_output[n];

    // Initialize input data
    for (int i = 0; i < n; i++) {
        h_input[i] = i;
    }

    // Perform vector multiplication
    vectorMultiplyByTwo(h_input, h_output, n);

    // Verify results
    printf("First 10 results:\n");
    for (int i = 0; i < 10; i++) {
        printf("%d * 2 = %d\n", h_input[i], h_output[i]);
    }

    return 0;
}

View File

@@ -3,6 +3,7 @@
#include <argp.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h> /* For atoi() */
#include <time.h>
#include <sys/resource.h>
#include <stdbool.h>
@@ -16,10 +17,12 @@ static struct env {
bool print_timestamp;
char *cuda_library_path;
bool include_returns;
int target_pid; /* New field for target PID */
} env = {
.print_timestamp = true,
.include_returns = true,
.cuda_library_path = NULL,
.target_pid = -1, /* Default to -1 (all PIDs) */
};
const char *argp_program_version = "cuda_events 0.1";
@@ -30,13 +33,14 @@ const char argp_program_doc[] =
"It traces CUDA API calls and shows associated information\n"
"such as memory allocations, kernel launches, data transfers, etc.\n"
"\n"
"USAGE: ./cuda_events [-v] [--no-timestamp] [--cuda-path PATH]\n";
"USAGE: ./cuda_events [-v] [--no-timestamp] [--cuda-path PATH] [--pid PID]\n";
static const struct argp_option opts[] = {
{ "verbose", 'v', NULL, 0, "Verbose debug output" },
{ "no-timestamp", 't', NULL, 0, "Don't print timestamps" },
{ "no-returns", 'r', NULL, 0, "Don't show function returns" },
{ "cuda-path", 'p', "CUDA_PATH", 0, "Path to CUDA runtime library" },
{ "pid", 'd', "PID", 0, "Trace only the specified PID" },
{},
};
@@ -55,6 +59,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
case 'p':
env.cuda_library_path = arg;
break;
case 'd':
env.target_pid = atoi(arg);
break;
case ARGP_KEY_ARG:
argp_usage(state);
break;
@@ -310,10 +317,10 @@ static int handle_event(void *ctx, void *data, size_t data_sz)
printf("%-8s ", ts);
}
printf("%-16s %-7d %-20s %s%s\n",
printf("%-16s %-7d %-20s %8s %s\n",
e->comm, e->pid,
event_type_str(e->type),
e->is_return ? "ret: " : "",
e->is_return ? "[EXIT]" : "[ENTER]",
details);
return 0;
@@ -341,20 +348,21 @@ static int attach_cuda_func(struct cuda_events_bpf *skel, const char *lib_path,
/* Attach entry uprobe */
if (prog_entry) {
uprobe_opts.func_name = func_name;
struct bpf_link *link = bpf_program__attach_uprobe_opts(prog_entry, -1, lib_path, 0, &uprobe_opts);
struct bpf_link *link = bpf_program__attach_uprobe_opts(prog_entry, env.target_pid, lib_path, 0, &uprobe_opts);
if (!link) {
fprintf(stderr, "Failed to attach entry uprobe for %s: %d\n", func_name, err);
return err;
fprintf(stderr, "Failed to attach entry uprobe for %s\n", func_name);
return -1;
}
}
/* Attach exit uprobe */
if (prog_exit) {
uprobe_opts.func_name = func_name;
struct bpf_link *link = bpf_program__attach_uprobe_opts(prog_exit, -1, lib_path, 0, &uprobe_opts);
uprobe_opts.retprobe = true; /* This is a return probe */
struct bpf_link *link = bpf_program__attach_uprobe_opts(prog_exit, env.target_pid, lib_path, 0, &uprobe_opts);
if (!link) {
fprintf(stderr, "Failed to attach exit uprobe for %s: %d\n", func_name, err);
return err;
fprintf(stderr, "Failed to attach exit uprobe for %s\n", func_name);
return -1;
}
}
@@ -417,13 +425,8 @@ int main(int argc, char **argv)
/* Print CUDA library path being used */
printf("Using CUDA library: %s\n", cuda_lib_path);
/* Attach tracepoints */
err = cuda_events_bpf__attach(skel);
if (err) {
fprintf(stderr, "Failed to attach BPF skeleton\n");
goto cleanup;
}
if (env.target_pid > 0)
    printf("Filtering for PID: %d\n", env.target_pid);
/* Attach to CUDA functions */
for (size_t i = 0; i < sizeof(cuda_funcs) / sizeof(cuda_funcs[0]); i++) {
@@ -447,8 +450,8 @@ int main(int argc, char **argv)
if (env.print_timestamp) {
printf("%-8s ", "TIME");
}
printf("%-16s %-7s %-20s %s\n",
"PROCESS", "PID", "EVENT", "DETAILS");
printf("%-16s %-7s %-20s %8s %s\n",
"PROCESS", "PID", "EVENT", "TYPE", "DETAILS");
while (!exiting) {
err = ring_buffer__poll(rb, 100 /* timeout, ms */);