mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-05-05 16:44:44 +08:00
fix: Ensure CUPTI flushes events before stopping CPU profiler; update timestamp handling to use microseconds
This commit is contained in:
@@ -184,6 +184,10 @@ class GPUPerf:
|
|||||||
print(f"Error running command: {e}", file=sys.stderr)
|
print(f"Error running command: {e}", file=sys.stderr)
|
||||||
return_code = 1
|
return_code = 1
|
||||||
finally:
|
finally:
|
||||||
|
# Give CUPTI time to flush remaining buffered events
|
||||||
|
# CUPTI may continue recording events after target exits
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
# Stop CPU profiler if running
|
# Stop CPU profiler if running
|
||||||
self.stop_cpu_profiler()
|
self.stop_cpu_profiler()
|
||||||
|
|
||||||
|
|||||||
@@ -15,26 +15,26 @@ from collections import defaultdict
|
|||||||
|
|
||||||
|
|
||||||
class GPUKernelEvent:
|
class GPUKernelEvent:
|
||||||
"""Represents a GPU kernel execution event"""
|
"""Represents a GPU kernel execution event - timestamps kept in microseconds"""
|
||||||
def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int):
|
def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.start_ns = start_ns
|
self.start_us = start_us # Keep in microseconds (native GPU format)
|
||||||
self.end_ns = end_ns
|
self.end_us = end_us
|
||||||
self.correlation_id = correlation_id
|
self.correlation_id = correlation_id
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
|
return f"GPUKernel({self.name}, {self.start_us}-{self.end_us} us, corr={self.correlation_id})"
|
||||||
|
|
||||||
|
|
||||||
class CudaLaunchEvent:
|
class CudaLaunchEvent:
|
||||||
"""Represents a cudaLaunchKernel runtime API call"""
|
"""Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
|
||||||
def __init__(self, start_ns: int, end_ns: int, correlation_id: int):
|
def __init__(self, start_us: float, end_us: float, correlation_id: int):
|
||||||
self.start_ns = start_ns
|
self.start_us = start_us # Keep in microseconds (native GPU format)
|
||||||
self.end_ns = end_ns
|
self.end_us = end_us
|
||||||
self.correlation_id = correlation_id
|
self.correlation_id = correlation_id
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
|
return f"CudaLaunch({self.start_us}-{self.end_us} us, corr={self.correlation_id})"
|
||||||
|
|
||||||
|
|
||||||
class CPUStack:
|
class CPUStack:
|
||||||
@@ -138,11 +138,11 @@ class TraceMerger:
|
|||||||
duration_us = event.get('dur', 0)
|
duration_us = event.get('dur', 0)
|
||||||
|
|
||||||
if start_us > 0 and duration_us > 0 and correlation_id > 0:
|
if start_us > 0 and duration_us > 0 and correlation_id > 0:
|
||||||
start_ns = int(start_us * 1000)
|
# Keep timestamps in microseconds (native GPU format)
|
||||||
end_ns = int((start_us + duration_us) * 1000)
|
end_us = start_us + duration_us
|
||||||
|
|
||||||
self.cuda_launches[correlation_id] = CudaLaunchEvent(
|
self.cuda_launches[correlation_id] = CudaLaunchEvent(
|
||||||
start_ns, end_ns, correlation_id
|
start_us, end_us, correlation_id
|
||||||
)
|
)
|
||||||
launch_count += 1
|
launch_count += 1
|
||||||
|
|
||||||
@@ -153,13 +153,13 @@ class TraceMerger:
|
|||||||
duration_us = event.get('dur', 0)
|
duration_us = event.get('dur', 0)
|
||||||
|
|
||||||
if start_us > 0 and duration_us > 0 and correlation_id > 0:
|
if start_us > 0 and duration_us > 0 and correlation_id > 0:
|
||||||
start_ns = int(start_us * 1000)
|
# Keep timestamps in microseconds (native GPU format)
|
||||||
end_ns = int((start_us + duration_us) * 1000)
|
end_us = start_us + duration_us
|
||||||
|
|
||||||
self.gpu_kernels.append(GPUKernelEvent(
|
self.gpu_kernels.append(GPUKernelEvent(
|
||||||
kernel_name,
|
kernel_name,
|
||||||
start_ns,
|
start_us,
|
||||||
end_ns,
|
end_us,
|
||||||
correlation_id
|
correlation_id
|
||||||
))
|
))
|
||||||
kernel_count += 1
|
kernel_count += 1
|
||||||
@@ -170,40 +170,112 @@ class TraceMerger:
|
|||||||
print(f"Parsed {kernel_count} GPU kernel events")
|
print(f"Parsed {kernel_count} GPU kernel events")
|
||||||
print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
|
print(f"Parsed {launch_count} cudaLaunchKernel runtime events")
|
||||||
|
|
||||||
|
def calculate_clock_offset(self):
|
||||||
|
"""
|
||||||
|
Calculate the offset between CPU and GPU clocks.
|
||||||
|
CPU and GPU use different time bases, so we need to align them.
|
||||||
|
|
||||||
|
Strategy: Use the median offset from the first few events to be robust against outliers.
|
||||||
|
Also report drift to help diagnose correlation issues.
|
||||||
|
"""
|
||||||
|
if not self.cpu_stacks or not self.cuda_launches:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
# Sample first 100 events from each to calculate offset
|
||||||
|
sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches))
|
||||||
|
|
||||||
|
sorted_cpu = sorted(self.cpu_stacks[:sample_size], key=lambda x: x.timestamp_ns)
|
||||||
|
sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size]
|
||||||
|
|
||||||
|
offsets = []
|
||||||
|
for cpu, gpu in zip(sorted_cpu, sorted_gpu):
|
||||||
|
cpu_us = cpu.timestamp_ns / 1000.0
|
||||||
|
offset = cpu_us - gpu.start_us
|
||||||
|
offsets.append(offset)
|
||||||
|
|
||||||
|
# Use median to be robust against outliers
|
||||||
|
offsets.sort()
|
||||||
|
median_offset = offsets[len(offsets) // 2]
|
||||||
|
|
||||||
|
# Calculate drift across entire trace to warn about correlation issues
|
||||||
|
if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100:
|
||||||
|
# Sample at start and end
|
||||||
|
cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns)
|
||||||
|
cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns)
|
||||||
|
gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us)
|
||||||
|
gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us)
|
||||||
|
|
||||||
|
offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us
|
||||||
|
offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us
|
||||||
|
drift = offset_end - offset_start
|
||||||
|
|
||||||
|
cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000 # ms
|
||||||
|
|
||||||
|
print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
|
||||||
|
print(f"Clock drift: {drift / 1000:.3f} ms over {cpu_duration:.1f} ms trace duration")
|
||||||
|
if abs(drift) > 1000: # More than 1ms drift
|
||||||
|
print(f"WARNING: Significant clock drift detected ({drift / cpu_duration:.3f} ms/ms)")
|
||||||
|
print(f" This may cause timestamp correlation issues")
|
||||||
|
else:
|
||||||
|
print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)")
|
||||||
|
|
||||||
|
return median_offset
|
||||||
|
|
||||||
def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
|
def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]:
|
||||||
"""
|
"""
|
||||||
Find GPU kernel that matches the CPU stack trace.
|
Find GPU kernel that matches the CPU stack trace.
|
||||||
Strategy:
|
Strategy:
|
||||||
1. Find cudaLaunchKernel runtime call within timestamp tolerance
|
1. Convert CPU nanosecond timestamp to microseconds
|
||||||
2. Use correlation ID to find actual GPU kernel execution
|
2. Apply clock offset to align CPU and GPU time bases
|
||||||
|
3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance
|
||||||
|
4. Use correlation ID to find actual GPU kernel execution
|
||||||
"""
|
"""
|
||||||
|
import bisect
|
||||||
|
|
||||||
# Find cudaLaunchKernel runtime event that matches timestamp
|
# Convert CPU timestamp from nanoseconds to microseconds
|
||||||
best_launch = None
|
cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0
|
||||||
min_time_diff = self.timestamp_tolerance_ns
|
|
||||||
|
|
||||||
for launch in self.cuda_launches.values():
|
# Apply clock offset to align CPU and GPU timestamps
|
||||||
# Check if CPU stack timestamp is close to launch time
|
cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us
|
||||||
time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
|
|
||||||
|
|
||||||
if time_diff < min_time_diff:
|
tolerance_us = self.timestamp_tolerance_ns / 1000.0
|
||||||
min_time_diff = time_diff
|
|
||||||
best_launch = launch
|
|
||||||
|
|
||||||
if not best_launch:
|
# Binary search to find nearest GPU launch timestamp
|
||||||
|
idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
|
||||||
|
|
||||||
|
# Check surrounding launches (idx-1, idx, idx+1) for best match
|
||||||
|
candidates = []
|
||||||
|
for i in [idx - 1, idx, idx + 1]:
|
||||||
|
if 0 <= i < len(self.sorted_launches):
|
||||||
|
launch = self.sorted_launches[i]
|
||||||
|
time_diff = abs(cpu_timestamp_aligned - launch.start_us)
|
||||||
|
if time_diff < tolerance_us:
|
||||||
|
candidates.append((time_diff, launch))
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Find GPU kernel with matching correlation ID
|
# Get launch with smallest time difference
|
||||||
for kernel in self.gpu_kernels:
|
candidates.sort(key=lambda x: x[0])
|
||||||
if kernel.correlation_id == best_launch.correlation_id:
|
best_launch = candidates[0][1]
|
||||||
return kernel
|
|
||||||
|
|
||||||
return None
|
# Find GPU kernel with matching correlation ID (using pre-built map)
|
||||||
|
if not hasattr(self, 'corr_to_kernel'):
|
||||||
|
self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
|
||||||
|
|
||||||
|
return self.corr_to_kernel.get(best_launch.correlation_id)
|
||||||
|
|
||||||
def merge_traces(self):
|
def merge_traces(self):
|
||||||
"""Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
|
"""Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
|
||||||
print("Correlating CPU stacks with GPU kernels...")
|
print("Correlating CPU stacks with GPU kernels...")
|
||||||
|
|
||||||
|
# Calculate clock offset between CPU and GPU timestamps
|
||||||
|
self.clock_offset_us = self.calculate_clock_offset()
|
||||||
|
|
||||||
|
# Pre-sort GPU launches by timestamp for efficient binary search
|
||||||
|
self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
|
||||||
|
self.launch_timestamps = [l.start_us for l in self.sorted_launches]
|
||||||
|
|
||||||
matched_count = 0
|
matched_count = 0
|
||||||
unmatched_count = 0
|
unmatched_count = 0
|
||||||
|
|
||||||
@@ -218,18 +290,18 @@ class TraceMerger:
|
|||||||
# Add GPU kernel to the top of the stack
|
# Add GPU kernel to the top of the stack
|
||||||
merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
|
merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
|
||||||
matched_count += 1
|
matched_count += 1
|
||||||
else:
|
|
||||||
# Mark as unmatched launch (may happen if kernel hasn't executed yet)
|
|
||||||
merged_stack.append("[GPU_Launch_Pending]")
|
|
||||||
unmatched_count += 1
|
|
||||||
|
|
||||||
# Create folded stack string
|
# Create folded stack string - only add matched stacks
|
||||||
if merged_stack:
|
|
||||||
stack_str = ';'.join(merged_stack)
|
stack_str = ';'.join(merged_stack)
|
||||||
self.merged_stacks[stack_str] += 1
|
self.merged_stacks[stack_str] += 1
|
||||||
|
else:
|
||||||
|
# Skip unmatched launches - don't add to merged output
|
||||||
|
unmatched_count += 1
|
||||||
|
|
||||||
print(f"Matched {matched_count} CPU stacks with GPU kernels")
|
print(f"Matched {matched_count} CPU stacks with GPU kernels")
|
||||||
print(f"Unmatched: {unmatched_count}")
|
if unmatched_count > 0:
|
||||||
|
print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
|
||||||
|
print(f" This may indicate profiler timing mismatch or clock drift")
|
||||||
print(f"Total unique stacks: {len(self.merged_stacks)}")
|
print(f"Total unique stacks: {len(self.merged_stacks)}")
|
||||||
|
|
||||||
def write_folded_output(self, output_file: str):
|
def write_folded_output(self, output_file: str):
|
||||||
@@ -261,7 +333,7 @@ class TraceMerger:
|
|||||||
print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
|
print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
|
||||||
print(f"CUDA launch events: {len(self.cuda_launches)}")
|
print(f"CUDA launch events: {len(self.cuda_launches)}")
|
||||||
|
|
||||||
total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
|
total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
|
||||||
print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
|
print(f"Total kernel execution time: {total_kernel_time:.2f} ms")
|
||||||
|
|
||||||
# Show kernel breakdown
|
# Show kernel breakdown
|
||||||
|
|||||||
Reference in New Issue
Block a user