fix: Ensure CUPTI flushes events before stopping CPU profiler; update timestamp handling to use microseconds

This commit is contained in:
Littlefisher
2025-10-27 20:05:45 -07:00
parent 05ca05aa7c
commit f5ee20e333
2 changed files with 118 additions and 42 deletions

View File

@@ -184,6 +184,10 @@ class GPUPerf:
print(f"Error running command: {e}", file=sys.stderr)
return_code = 1
finally:
# Give CUPTI time to flush remaining buffered events
# CUPTI may continue recording events after target exits
time.sleep(0.5)
# Stop CPU profiler if running
self.stop_cpu_profiler()

View File

@@ -15,26 +15,26 @@ from collections import defaultdict
class GPUKernelEvent:
"""Represents a GPU kernel execution event"""
def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int):
"""Represents a GPU kernel execution event - timestamps kept in microseconds"""
def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
self.name = name
self.start_ns = start_ns
self.end_ns = end_ns
self.start_us = start_us # Keep in microseconds (native GPU format)
self.end_us = end_us
self.correlation_id = correlation_id
def __repr__(self):
return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
return f"GPUKernel({self.name}, {self.start_us}-{self.end_us} us, corr={self.correlation_id})"
class CudaLaunchEvent:
"""Represents a cudaLaunchKernel runtime API call"""
def __init__(self, start_ns: int, end_ns: int, correlation_id: int):
self.start_ns = start_ns
self.end_ns = end_ns
"""Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
def __init__(self, start_us: float, end_us: float, correlation_id: int):
self.start_us = start_us # Keep in microseconds (native GPU format)
self.end_us = end_us
self.correlation_id = correlation_id
def __repr__(self):
return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
return f"CudaLaunch({self.start_us}-{self.end_us} us, corr={self.correlation_id})"
class CPUStack:
@@ -138,11 +138,11 @@ class TraceMerger:
duration_us = event.get('dur', 0)
if start_us > 0 and duration_us > 0 and correlation_id > 0:
start_ns = int(start_us * 1000)
end_ns = int((start_us + duration_us) * 1000)
# Keep timestamps in microseconds (native GPU format)
end_us = start_us + duration_us
self.cuda_launches[correlation_id] = CudaLaunchEvent(
start_ns, end_ns, correlation_id
start_us, end_us, correlation_id
)
launch_count += 1
@@ -153,13 +153,13 @@ class TraceMerger:
duration_us = event.get('dur', 0)
if start_us > 0 and duration_us > 0 and correlation_id > 0:
start_ns = int(start_us * 1000)
end_ns = int((start_us + duration_us) * 1000)
# Keep timestamps in microseconds (native GPU format)
end_us = start_us + duration_us
self.gpu_kernels.append(GPUKernelEvent(
kernel_name,
start_ns,
end_ns,
start_us,
end_us,
correlation_id
))
kernel_count += 1
@@ -170,40 +170,112 @@ class TraceMerger:
print(f"Parsed {kernel_count} GPU kernel events")
print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
def calculate_clock_offset(self):
    """
    Calculate the offset between CPU and GPU clocks, in microseconds.

    CPU and GPU use different time bases, so we need to align them.
    Strategy: Use the median offset from the first few events to be robust
    against outliers. Also report drift to help diagnose correlation issues.

    Returns:
        float: median (CPU - GPU) offset in microseconds, or 0.0 when there
        are no CPU stacks or no CUDA launch events to compare against.
    """
    if not self.cpu_stacks or not self.cuda_launches:
        return 0.0
    # Sample first 100 events from each to calculate offset
    sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches))
    sorted_cpu = sorted(self.cpu_stacks[:sample_size], key=lambda x: x.timestamp_ns)
    sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size]
    # Pairwise (CPU - GPU) deltas; CPU timestamps are converted from
    # nanoseconds to microseconds to match the GPU's native unit.
    offsets = []
    for cpu, gpu in zip(sorted_cpu, sorted_gpu):
        cpu_us = cpu.timestamp_ns / 1000.0
        offset = cpu_us - gpu.start_us
        offsets.append(offset)
    # Use median to be robust against outliers
    offsets.sort()
    median_offset = offsets[len(offsets) // 2]
    # Calculate drift across entire trace to warn about correlation issues
    if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100:
        # Sample at start and end of the trace
        cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns)
        cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns)
        gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us)
        gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us)
        offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
        offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
        drift = offset_end - offset_start  # microseconds
        cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000  # ms
        print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
        print(f"Clock drift: {drift / 1000:.3f} ms over {cpu_duration:.1f} ms trace duration")
        if abs(drift) > 1000:  # More than 1ms drift
            # BUG FIX: drift is in microseconds while cpu_duration is in
            # milliseconds; the previous message printed a us/ms value but
            # labelled it "ms/ms". Convert drift to ms before dividing, and
            # guard against a zero-length trace to avoid ZeroDivisionError.
            if cpu_duration > 0:
                drift_rate_ms_per_ms = (drift / 1000.0) / cpu_duration
                print(f"WARNING: Significant clock drift detected ({drift_rate_ms_per_ms:.3f} ms/ms)")
            else:
                print("WARNING: Significant clock drift detected")
            print(f"  This may cause timestamp correlation issues")
    else:
        print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
    return median_offset
def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
"""
Find GPU kernel that matches the CPU stack trace.
Strategy:
1. Find cudaLaunchKernel runtime call within timestamp tolerance
2. Use correlation ID to find actual GPU kernel execution
1. Convert CPU nanosecond timestamp to microseconds
2. Apply clock offset to align CPU and GPU time bases
3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
4. Use correlation ID to find actual GPU kernel execution
"""
import bisect
# Find cudaLaunchKernel runtime event that matches timestamp
best_launch = None
min_time_diff = self.timestamp_tolerance_ns
# Convert CPU timestamp from nanoseconds to microseconds
cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0
for launch in self.cuda_launches.values():
# Check if CPU stack timestamp is close to launch time
time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
# Apply clock offset to align CPU and GPU timestamps
cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us
if time_diff < min_time_diff:
min_time_diff = time_diff
best_launch = launch
tolerance_us = self.timestamp_tolerance_ns / 1000.0
if not best_launch:
# Binary search to find nearest GPU launch timestamp
idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
# Check surrounding launches (idx-1, idx, idx+1) for best match
candidates = []
for i in [idx - 1, idx, idx + 1]:
if 0 <= i < len(self.sorted_launches):
launch = self.sorted_launches[i]
time_diff = abs(cpu_timestamp_aligned - launch.start_us)
if time_diff < tolerance_us:
candidates.append((time_diff, launch))
if not candidates:
return None
# Find GPU kernel with matching correlation ID
for kernel in self.gpu_kernels:
if kernel.correlation_id == best_launch.correlation_id:
return kernel
# Get launch with smallest time difference
candidates.sort(key=lambda x: x[0])
best_launch = candidates[0][1]
return None
# Find GPU kernel with matching correlation ID (using pre-built map)
if not hasattr(self, 'corr_to_kernel'):
self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
return self.corr_to_kernel.get(best_launch.correlation_id)
def merge_traces(self):
"""Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
print("Correlating CPU stacks with GPU kernels...")
# Calculate clock offset between CPU and GPU timestamps
self.clock_offset_us = self.calculate_clock_offset()
# Pre-sort GPU launches by timestamp for efficient binary search
self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
self.launch_timestamps = [l.start_us for l in self.sorted_launches]
matched_count = 0
unmatched_count = 0
@@ -218,18 +290,18 @@ class TraceMerger:
# Add GPU kernel to the top of the stack
merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
matched_count += 1
else:
# Mark as unmatched launch (may happen if kernel hasn't executed yet)
merged_stack.append("[GPU_Launch_Pending]")
unmatched_count += 1
# Create folded stack string
if merged_stack:
# Create folded stack string - only add matched stacks
stack_str = ';'.join(merged_stack)
self.merged_stacks[stack_str] += 1
else:
# Skip unmatched launches - don't add to merged output
unmatched_count += 1
print(f"Matched {matched_count} CPU stacks with GPU kernels")
print(f"Unmatched: {unmatched_count}")
if unmatched_count > 0:
print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
print(f" This may indicate profiler timing mismatch or clock drift")
print(f"Total unique stacks: {len(self.merged_stacks)}")
def write_folded_output(self, output_file: str):
@@ -261,7 +333,7 @@ class TraceMerger:
print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
print(f"CUDA launch events: {len(self.cuda_launches)}")
total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
# Show kernel breakdown