Enhance Flamegraph Documentation and GPU Profiling Scripts

- Added an example flamegraph for Qwen3 LLM inference, highlighting key insights and performance bottlenecks.
- Updated README.md to include detailed explanations of CPU and GPU profiling results, emphasizing the correlation between CPU stacks and GPU kernels.
- Modified gpuperf.py to ensure absolute paths are used for output files, improving reliability across different working directories.
- Enhanced merge_gpu_cpu_trace.py to strip ANSI escape sequences from CPU stack traces, ensuring cleaner output for analysis.
- Introduced a new SVG file for the Qwen3 flamegraph, providing a visual representation of profiling data with interactive features.
2026-02-12 22:56:28 +08:00 · 2025-10-28 13:23:16 -07:00
parent ad583766a8
commit 5afd7fd348
4 changed files with 908 additions and 178 deletions
--- a/src/xpu/flamegraph/gpuperf.py
+++ b/src/xpu/flamegraph/gpuperf.py
@@ -59,7 +59,8 @@ class GPUPerf:
        if not cpu_output_file:
            cpu_output_file = f"cpu_profile_{pid if pid else 'cuda'}.txt"

-        self.profiler_output = cpu_output_file
+        # Convert to absolute path to handle working directory changes
+        self.profiler_output = str(Path(cpu_output_file).absolute())

        # Find CUDA runtime library if not specified
        if not cuda_lib_path:
@@ -131,14 +132,15 @@ class GPUPerf:
        trace_file = None
        if do_gpu_profiling:
            if output_trace:
-                trace_file = output_trace
+                # Convert to absolute path to handle target process changing directories
+                trace_file = str(Path(output_trace).absolute())
            else:
                # Create temporary file for trace output
                fd, trace_file = tempfile.mkstemp(suffix=".txt", prefix="gpuperf_trace_")
                os.close(fd)
                self.temp_trace_file = trace_file
                atexit.register(self.cleanup_temp_files)
-        
+
        # Set up environment variables
        env = os.environ.copy()
        env['CUDA_INJECTION64_PATH'] = str(self.injection_lib)