fix: Ensure CUPTI flushes events before stopping CPU profiler; update timestamp handling to use microseconds

This commit is contained in:
Littlefisher
2025-10-27 20:05:45 -07:00
parent 05ca05aa7c
commit f5ee20e333
2 changed files with 118 additions and 42 deletions

View File

@@ -184,6 +184,10 @@ class GPUPerf:
print(f"Error running command: {e}", file=sys.stderr) print(f"Error running command: {e}", file=sys.stderr)
return_code = 1 return_code = 1
finally: finally:
# Give CUPTI time to flush remaining buffered events
# CUPTI may continue recording events after target exits
time.sleep(0.5)
# Stop CPU profiler if running # Stop CPU profiler if running
self.stop_cpu_profiler() self.stop_cpu_profiler()

View File

@@ -15,26 +15,26 @@ from collections import defaultdict
class GPUKernelEvent: class GPUKernelEvent:
"""Represents a GPU kernel execution event""" """Represents a GPU kernel execution event - timestamps kept in microseconds"""
def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int): def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
self.name = name self.name = name
self.start_ns = start_ns self.start_us = start_us # Keep in microseconds (native GPU format)
self.end_ns = end_ns self.end_us = end_us
self.correlation_id = correlation_id self.correlation_id = correlation_id
def __repr__(self): def __repr__(self):
return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})" return f"GPUKernel({self.name}, {self.start_us}-{self.end_us} us, corr={self.correlation_id})"
class CudaLaunchEvent: class CudaLaunchEvent:
"""Represents a cudaLaunchKernel runtime API call""" """Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
def __init__(self, start_ns: int, end_ns: int, correlation_id: int): def __init__(self, start_us: float, end_us: float, correlation_id: int):
self.start_ns = start_ns self.start_us = start_us # Keep in microseconds (native GPU format)
self.end_ns = end_ns self.end_us = end_us
self.correlation_id = correlation_id self.correlation_id = correlation_id
def __repr__(self): def __repr__(self):
return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})" return f"CudaLaunch({self.start_us}-{self.end_us} us, corr={self.correlation_id})"
class CPUStack: class CPUStack:
@@ -138,11 +138,11 @@ class TraceMerger:
duration_us = event.get('dur', 0) duration_us = event.get('dur', 0)
if start_us > 0 and duration_us > 0 and correlation_id > 0: if start_us > 0 and duration_us > 0 and correlation_id > 0:
start_ns = int(start_us * 1000) # Keep timestamps in microseconds (native GPU format)
end_ns = int((start_us + duration_us) * 1000) end_us = start_us + duration_us
self.cuda_launches[correlation_id] = CudaLaunchEvent( self.cuda_launches[correlation_id] = CudaLaunchEvent(
start_ns, end_ns, correlation_id start_us, end_us, correlation_id
) )
launch_count += 1 launch_count += 1
@@ -153,13 +153,13 @@ class TraceMerger:
duration_us = event.get('dur', 0) duration_us = event.get('dur', 0)
if start_us > 0 and duration_us > 0 and correlation_id > 0: if start_us > 0 and duration_us > 0 and correlation_id > 0:
start_ns = int(start_us * 1000) # Keep timestamps in microseconds (native GPU format)
end_ns = int((start_us + duration_us) * 1000) end_us = start_us + duration_us
self.gpu_kernels.append(GPUKernelEvent( self.gpu_kernels.append(GPUKernelEvent(
kernel_name, kernel_name,
start_ns, start_us,
end_ns, end_us,
correlation_id correlation_id
)) ))
kernel_count += 1 kernel_count += 1
@@ -170,40 +170,112 @@ class TraceMerger:
print(f"Parsed {kernel_count} GPU kernel events") print(f"Parsed {kernel_count} GPU kernel events")
print(f"Parsed {launch_count} cudaLaunchKernel runtime events") print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
def calculate_clock_offset(self):
    """
    Estimate the offset between the CPU and GPU clocks, in microseconds.

    CPU and GPU use different time bases, so they must be aligned before
    timestamps can be compared. CPU stack timestamps are stored in
    nanoseconds (``timestamp_ns``); launch timestamps are stored in
    microseconds (``start_us``).

    Strategy: pair the earliest CPU stacks with the earliest CUDA launches
    (up to 100 pairs, both sides in chronological order) and take the
    median of ``cpu_us - gpu_us`` so a few outliers do not skew the result.
    When both traces hold more than 100 events, the offset of the earliest
    pair is also compared with the offset of the latest pair and reported
    as drift to help diagnose correlation issues.

    Returns:
        The median offset (CPU - GPU) in microseconds, or 0.0 when either
        trace is empty.
    """
    if not self.cpu_stacks or not self.cuda_launches:
        return 0.0

    # Sort BOTH sides fully before sampling. Slicing cpu_stacks first and
    # sorting afterwards would pair the earliest launches with whatever
    # stacks happen to sit at the front of the list.
    cpu_sorted = sorted(self.cpu_stacks, key=lambda stack: stack.timestamp_ns)
    gpu_sorted = sorted(self.cuda_launches.values(), key=lambda launch: launch.start_us)

    sample_size = min(100, len(cpu_sorted), len(gpu_sorted))
    offsets = sorted(
        cpu.timestamp_ns / 1000.0 - gpu.start_us
        for cpu, gpu in zip(cpu_sorted[:sample_size], gpu_sorted[:sample_size])
    )
    # Upper median for even-sized samples; robust against outliers.
    median_offset = offsets[len(offsets) // 2]

    # Calculate drift across the entire trace to warn about correlation issues
    if len(cpu_sorted) > 100 and len(gpu_sorted) > 100:
        cpu_first, cpu_last = cpu_sorted[0], cpu_sorted[-1]
        gpu_first, gpu_last = gpu_sorted[0], gpu_sorted[-1]

        offset_start_us = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
        offset_end_us = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
        drift_us = offset_end_us - offset_start_us

        cpu_duration_ms = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000

        print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
        print(f"Clock drift: {drift_us / 1000:.3f} ms over {cpu_duration_ms:.1f} ms trace duration")

        if abs(drift_us) > 1000:  # More than 1 ms drift
            if cpu_duration_ms > 0:
                # Convert drift to ms so the printed rate really is ms/ms.
                drift_rate = (drift_us / 1000) / cpu_duration_ms
                print(f"WARNING: Significant clock drift detected ({drift_rate:.3f} ms/ms)")
            else:
                # All CPU samples share one timestamp: no rate can be derived.
                print("WARNING: Significant clock drift detected")
            print(" This may cause timestamp correlation issues")
    else:
        print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")

    return median_offset
def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]: def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
""" """
Find GPU kernel that matches the CPU stack trace. Find GPU kernel that matches the CPU stack trace.
Strategy: Strategy:
1. Find cudaLaunchKernel runtime call within timestamp tolerance 1. Convert CPU nanosecond timestamp to microseconds
2. Use correlation ID to find actual GPU kernel execution 2. Apply clock offset to align CPU and GPU time bases
3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
4. Use correlation ID to find actual GPU kernel execution
""" """
import bisect
# Find cudaLaunchKernel runtime event that matches timestamp # Convert CPU timestamp from nanoseconds to microseconds
best_launch = None cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0
min_time_diff = self.timestamp_tolerance_ns
for launch in self.cuda_launches.values(): # Apply clock offset to align CPU and GPU timestamps
# Check if CPU stack timestamp is close to launch time cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us
time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
if time_diff < min_time_diff: tolerance_us = self.timestamp_tolerance_ns / 1000.0
min_time_diff = time_diff
best_launch = launch
if not best_launch: # Binary search to find nearest GPU launch timestamp
idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
# Check surrounding launches (idx-1, idx, idx+1) for best match
candidates = []
for i in [idx - 1, idx, idx + 1]:
if 0 <= i < len(self.sorted_launches):
launch = self.sorted_launches[i]
time_diff = abs(cpu_timestamp_aligned - launch.start_us)
if time_diff < tolerance_us:
candidates.append((time_diff, launch))
if not candidates:
return None return None
# Find GPU kernel with matching correlation ID # Get launch with smallest time difference
for kernel in self.gpu_kernels: candidates.sort(key=lambda x: x[0])
if kernel.correlation_id == best_launch.correlation_id: best_launch = candidates[0][1]
return kernel
return None # Find GPU kernel with matching correlation ID (using pre-built map)
if not hasattr(self, 'corr_to_kernel'):
self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
return self.corr_to_kernel.get(best_launch.correlation_id)
def merge_traces(self): def merge_traces(self):
"""Correlate CPU stacks with GPU kernels using correlation IDs and timestamps""" """Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
print("Correlating CPU stacks with GPU kernels...") print("Correlating CPU stacks with GPU kernels...")
# Calculate clock offset between CPU and GPU timestamps
self.clock_offset_us = self.calculate_clock_offset()
# Pre-sort GPU launches by timestamp for efficient binary search
self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
self.launch_timestamps = [l.start_us for l in self.sorted_launches]
matched_count = 0 matched_count = 0
unmatched_count = 0 unmatched_count = 0
@@ -218,18 +290,18 @@ class TraceMerger:
# Add GPU kernel to the top of the stack # Add GPU kernel to the top of the stack
merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}") merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
matched_count += 1 matched_count += 1
else:
# Mark as unmatched launch (may happen if kernel hasn't executed yet)
merged_stack.append("[GPU_Launch_Pending]")
unmatched_count += 1
# Create folded stack string # Create folded stack string - only add matched stacks
if merged_stack:
stack_str = ';'.join(merged_stack) stack_str = ';'.join(merged_stack)
self.merged_stacks[stack_str] += 1 self.merged_stacks[stack_str] += 1
else:
# Skip unmatched launches - don't add to merged output
unmatched_count += 1
print(f"Matched {matched_count} CPU stacks with GPU kernels") print(f"Matched {matched_count} CPU stacks with GPU kernels")
print(f"Unmatched: {unmatched_count}") if unmatched_count > 0:
print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
print(f" This may indicate profiler timing mismatch or clock drift")
print(f"Total unique stacks: {len(self.merged_stacks)}") print(f"Total unique stacks: {len(self.merged_stacks)}")
def write_folded_output(self, output_file: str): def write_folded_output(self, output_file: str):
@@ -261,7 +333,7 @@ class TraceMerger:
print(f"\nGPU kernels executed: {len(self.gpu_kernels)}") print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
print(f"CUDA launch events: {len(self.cuda_launches)}") print(f"CUDA launch events: {len(self.cuda_launches)}")
total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000 total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
print(f"Total kernel execution time: {total_kernel_time:.2f} ms") print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
# Show kernel breakdown # Show kernel breakdown