fix: Ensure CUPTI flushes events before stopping CPU profiler; update timestamp handling to use microseconds

2026-06-30 00:46:20 +08:00 · 2025-10-27 20:05:45 -07:00
parent 05ca05aa7c
commit f5ee20e333
2 changed files with 118 additions and 42 deletions
--- a/src/xpu/flamegraph/gpuperf.py
+++ b/src/xpu/flamegraph/gpuperf.py
@@ -184,6 +184,10 @@ class GPUPerf:
            print(f"Error running command: {e}", file=sys.stderr)
            return_code = 1
        finally:
            # Give CUPTI time to flush remaining buffered events
            # CUPTI may continue recording events after target exits
            time.sleep(0.5)
            # Stop CPU profiler if running
            self.stop_cpu_profiler()
--- a/src/xpu/flamegraph/merge_gpu_cpu_trace.py
+++ b/src/xpu/flamegraph/merge_gpu_cpu_trace.py
@@ -15,26 +15,26 @@ from collections import defaultdict
 class GPUKernelEvent:
-    """Represents a GPU kernel execution event"""
+    """Represents a GPU kernel execution event - timestamps kept in microseconds"""
-    def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int):
+    def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
        self.name = name
-        self.start_ns = start_ns
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
-        self.end_ns = end_ns
+        self.end_us = end_us
        self.correlation_id = correlation_id
    def __repr__(self):
-        return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"GPUKernel({self.name}, {self.start_us}-{self.end_us} us, corr={self.correlation_id})"
 class CudaLaunchEvent:
-    """Represents a cudaLaunchKernel runtime API call"""
+    """Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
-    def __init__(self, start_ns: int, end_ns: int, correlation_id: int):
+    def __init__(self, start_us: float, end_us: float, correlation_id: int):
-        self.start_ns = start_ns
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
-        self.end_ns = end_ns
+        self.end_us = end_us
        self.correlation_id = correlation_id
    def __repr__(self):
-        return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"CudaLaunch({self.start_us}-{self.end_us} us, corr={self.correlation_id})"
 class CPUStack:
@@ -138,11 +138,11 @@ class TraceMerger:
                duration_us = event.get('dur', 0)
                if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                    start_ns = int(start_us * 1000)
+                    # Keep timestamps in microseconds (native GPU format)
-                    end_ns = int((start_us + duration_us) * 1000)
+                    end_us = start_us + duration_us
                    self.cuda_launches[correlation_id] = CudaLaunchEvent(
-                        start_ns, end_ns, correlation_id
+                        start_us, end_us, correlation_id
                    )
                    launch_count += 1
@@ -153,13 +153,13 @@ class TraceMerger:
                duration_us = event.get('dur', 0)
                if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                    start_ns = int(start_us * 1000)
+                    # Keep timestamps in microseconds (native GPU format)
-                    end_ns = int((start_us + duration_us) * 1000)
+                    end_us = start_us + duration_us
                    self.gpu_kernels.append(GPUKernelEvent(
                        kernel_name,
-                        start_ns,
+                        start_us,
-                        end_ns,
+                        end_us,
                        correlation_id
                    ))
                    kernel_count += 1
@@ -170,40 +170,112 @@ class TraceMerger:
        print(f"Parsed {kernel_count} GPU kernel events")
        print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
    def calculate_clock_offset(self):
        """
        Estimate the offset between the CPU and GPU clocks, in microseconds.

        CPU stacks carry nanosecond timestamps (``timestamp_ns``) while CUDA
        launch events carry microsecond timestamps (``start_us``). CPU and GPU
        use different time bases, so they must be aligned before matching.

        Strategy: pair up to 100 events from each side, compute
        ``cpu_us - gpu_us`` for every pair, and take the median so that a few
        outliers do not skew the result. When both sides have more than 100
        events, the change in offset between the start and the end of the
        trace is also printed as a drift diagnostic.

        Returns:
            The median offset (CPU - GPU) in microseconds, or 0.0 when there
            are no CPU stacks or no CUDA launches to compare.
        """
        if not self.cpu_stacks or not self.cuda_launches:
            return 0.0

        # Sample up to 100 events from each side to calculate the offset.
        sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches))

        # NOTE(review): the CPU sample is the first `sample_size` entries in
        # list order (then sorted), whereas the GPU sample is the earliest
        # launches overall. This presumably relies on cpu_stacks already being
        # roughly chronological - confirm against the parser.
        sorted_cpu = sorted(self.cpu_stacks[:sample_size], key=lambda x: x.timestamp_ns)
        sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size]

        # Per-pair offsets in microseconds (CPU ns -> us before subtracting).
        offsets = sorted(
            cpu.timestamp_ns / 1000.0 - gpu.start_us
            for cpu, gpu in zip(sorted_cpu, sorted_gpu)
        )

        # Use the median to be robust against outliers.
        median_offset = offsets[len(offsets) // 2]

        # Calculate drift across the entire trace to warn about correlation issues.
        if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100:
            # Compare the offset at the very start and very end of the trace.
            cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns)
            cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns)
            gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us)
            gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us)

            offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
            offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
            drift = offset_end - offset_start  # microseconds
            drift_ms = drift / 1000

            cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000  # ms

            print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
            print(f"Clock drift: {drift_ms:.3f} ms over {cpu_duration:.1f} ms trace duration")

            if abs(drift) > 1000:  # More than 1ms drift
                if cpu_duration > 0:
                    # Both operands are in ms so the printed rate really is ms/ms.
                    print(f"WARNING: Significant clock drift detected ({drift_ms / cpu_duration:.3f} ms/ms)")
                else:
                    # All CPU samples share one timestamp: a rate is undefined,
                    # so warn without dividing by zero.
                    print("WARNING: Significant clock drift detected")
                print("         This may cause timestamp correlation issues")
        else:
            print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")

        return median_offset
    def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
        """
        Find GPU kernel that matches the CPU stack trace.
        Strategy:
-        1. Find cudaLaunchKernel runtime call within timestamp tolerance
+        1. Convert CPU nanosecond timestamp to microseconds
-        2. Use correlation ID to find actual GPU kernel execution
+        2. Apply clock offset to align CPU and GPU time bases
        3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
        4. Use correlation ID to find actual GPU kernel execution
        """
        import bisect
-        # Find cudaLaunchKernel runtime event that matches timestamp
+        # Convert CPU timestamp from nanoseconds to microseconds
-        best_launch = None
+        cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0
        min_time_diff = self.timestamp_tolerance_ns
-        for launch in self.cuda_launches.values():
+        # Apply clock offset to align CPU and GPU timestamps
-            # Check if CPU stack timestamp is close to launch time
+        cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us
            time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
-            if time_diff < min_time_diff:
+        tolerance_us = self.timestamp_tolerance_ns / 1000.0
                min_time_diff = time_diff
                best_launch = launch
-        if not best_launch:
+        # Binary search to find nearest GPU launch timestamp
        idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
        # Check surrounding launches (idx-1, idx, idx+1) for best match
        candidates = []
        for i in [idx - 1, idx, idx + 1]:
            if 0 <= i < len(self.sorted_launches):
                launch = self.sorted_launches[i]
                time_diff = abs(cpu_timestamp_aligned - launch.start_us)
                if time_diff < tolerance_us:
                    candidates.append((time_diff, launch))
        if not candidates:
            return None
-        # Find GPU kernel with matching correlation ID
+        # Get launch with smallest time difference
-        for kernel in self.gpu_kernels:
+        candidates.sort(key=lambda x: x[0])
-            if kernel.correlation_id == best_launch.correlation_id:
+        best_launch = candidates[0][1]
                return kernel
-        return None
+        # Find GPU kernel with matching correlation ID (using pre-built map)
        if not hasattr(self, 'corr_to_kernel'):
            self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
        return self.corr_to_kernel.get(best_launch.correlation_id)
    def merge_traces(self):
        """Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
        print("Correlating CPU stacks with GPU kernels...")
        # Calculate clock offset between CPU and GPU timestamps
        self.clock_offset_us = self.calculate_clock_offset()
        # Pre-sort GPU launches by timestamp for efficient binary search
        self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
        self.launch_timestamps = [l.start_us for l in self.sorted_launches]
        matched_count = 0
        unmatched_count = 0
@@ -218,18 +290,18 @@ class TraceMerger:
                # Add GPU kernel to the top of the stack
                merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
                matched_count += 1
            else:
                # Mark as unmatched launch (may happen if kernel hasn't executed yet)
                merged_stack.append("[GPU_Launch_Pending]")
                unmatched_count += 1
-            # Create folded stack string
+                # Create folded stack string - only add matched stacks
            if merged_stack:
                stack_str = ';'.join(merged_stack)
                self.merged_stacks[stack_str] += 1
            else:
                # Skip unmatched launches - don't add to merged output
                unmatched_count += 1
        print(f"Matched {matched_count} CPU stacks with GPU kernels")
-        print(f"Unmatched: {unmatched_count}")
+        if unmatched_count > 0:
            print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
            print(f"         This may indicate profiler timing mismatch or clock drift")
        print(f"Total unique stacks: {len(self.merged_stacks)}")
    def write_folded_output(self, output_file: str):
@@ -261,7 +333,7 @@ class TraceMerger:
            print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
            print(f"CUDA launch events: {len(self.cuda_launches)}")
-            total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
+            total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
            print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
            # Show kernel breakdown