Enhance Flamegraph Documentation and GPU Profiling Scripts

- Added an example flamegraph for Qwen3 LLM inference, highlighting key insights and performance bottlenecks.
- Updated README.md to include detailed explanations of CPU and GPU profiling results, emphasizing the correlation between CPU stacks and GPU kernels.
- Modified gpuperf.py to ensure absolute paths are used for output files, improving reliability across different working directories.
- Enhanced merge_gpu_cpu_trace.py to strip ANSI escape sequences from CPU stack traces, ensuring cleaner output for analysis.
- Introduced a new SVG file for the Qwen3 flamegraph, providing a visual representation of profiling data with interactive features.
This commit is contained in:
Littlefisher
2025-10-28 13:23:16 -07:00
parent ad583766a8
commit 5afd7fd348
4 changed files with 908 additions and 178 deletions

View File

@@ -59,7 +59,8 @@ class GPUPerf:
if not cpu_output_file:
cpu_output_file = f"cpu_profile_{pid if pid else 'cuda'}.txt"
-self.profiler_output = cpu_output_file
+# Convert to absolute path to handle working directory changes
+self.profiler_output = str(Path(cpu_output_file).absolute())
# Find CUDA runtime library if not specified
if not cuda_lib_path:
@@ -131,14 +132,15 @@ class GPUPerf:
trace_file = None
if do_gpu_profiling:
if output_trace:
-trace_file = output_trace
+# Convert to absolute path to handle target process changing directories
+trace_file = str(Path(output_trace).absolute())
else:
# Create temporary file for trace output
fd, trace_file = tempfile.mkstemp(suffix=".txt", prefix="gpuperf_trace_")
os.close(fd)
self.temp_trace_file = trace_file
atexit.register(self.cleanup_temp_files)
# Set up environment variables
env = os.environ.copy()
env['CUDA_INJECTION64_PATH'] = str(self.injection_lib)