diff --git a/src/xpu/flamegraph/gpuperf.py b/src/xpu/flamegraph/gpuperf.py
index d848f7a..dede7d2 100755
--- a/src/xpu/flamegraph/gpuperf.py
+++ b/src/xpu/flamegraph/gpuperf.py
@@ -184,6 +184,10 @@ class GPUPerf:
             print(f"Error running command: {e}", file=sys.stderr)
             return_code = 1
         finally:
+            # Give CUPTI time to flush remaining buffered events
+            # CUPTI may continue recording events after target exits
+            time.sleep(0.5)
+
             # Stop CPU profiler if running
             self.stop_cpu_profiler()

diff --git a/src/xpu/flamegraph/merge_gpu_cpu_trace.py b/src/xpu/flamegraph/merge_gpu_cpu_trace.py
index 39557d2..ddcbeb4 100755
--- a/src/xpu/flamegraph/merge_gpu_cpu_trace.py
+++ b/src/xpu/flamegraph/merge_gpu_cpu_trace.py
@@ -15,26 +15,26 @@ from collections import defaultdict


 class GPUKernelEvent:
-    """Represents a GPU kernel execution event"""
-    def __init__(self, name: str, start_ns: int, end_ns: int, correlation_id: int):
+    """Represents a GPU kernel execution event - timestamps kept in microseconds"""
+    def __init__(self, name: str, start_us: float, end_us: float, correlation_id: int):
         self.name = name
-        self.start_ns = start_ns
-        self.end_ns = end_ns
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
+        self.end_us = end_us
         self.correlation_id = correlation_id

     def __repr__(self):
-        return f"GPUKernel({self.name}, {self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"GPUKernel({self.name}, {self.start_us}-{self.end_us} us, corr={self.correlation_id})"


 class CudaLaunchEvent:
-    """Represents a cudaLaunchKernel runtime API call"""
-    def __init__(self, start_ns: int, end_ns: int, correlation_id: int):
-        self.start_ns = start_ns
-        self.end_ns = end_ns
+    """Represents a cudaLaunchKernel runtime API call - timestamps kept in microseconds"""
+    def __init__(self, start_us: float, end_us: float, correlation_id: int):
+        self.start_us = start_us  # Keep in microseconds (native GPU format)
+        self.end_us = end_us
         self.correlation_id = correlation_id

     def __repr__(self):
-        return f"CudaLaunch({self.start_ns}-{self.end_ns}, corr={self.correlation_id})"
+        return f"CudaLaunch({self.start_us}-{self.end_us} us, corr={self.correlation_id})"


 class CPUStack:
@@ -138,11 +138,11 @@ class TraceMerger:
                 duration_us = event.get('dur', 0)

                 if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                    start_ns = int(start_us * 1000)
-                    end_ns = int((start_us + duration_us) * 1000)
+                    # Keep timestamps in microseconds (native GPU format)
+                    end_us = start_us + duration_us
                     self.cuda_launches[correlation_id] = CudaLaunchEvent(
-                        start_ns, end_ns, correlation_id
+                        start_us, end_us, correlation_id
                     )
                     launch_count += 1
@@ -153,13 +153,13 @@ class TraceMerger:
                 duration_us = event.get('dur', 0)

                 if start_us > 0 and duration_us > 0 and correlation_id > 0:
-                    start_ns = int(start_us * 1000)
-                    end_ns = int((start_us + duration_us) * 1000)
+                    # Keep timestamps in microseconds (native GPU format)
+                    end_us = start_us + duration_us
                     self.gpu_kernels.append(GPUKernelEvent(
                         kernel_name,
-                        start_ns,
-                        end_ns,
+                        start_us,
+                        end_us,
                         correlation_id
                     ))
                     kernel_count += 1
@@ -170,40 +170,112 @@ class TraceMerger:
         print(f"Parsed {kernel_count} GPU kernel events")
         print(f"Parsed {launch_count} cudaLaunchKernel runtime events")

+    def calculate_clock_offset(self):
+        """
+        Calculate the offset between CPU and GPU clocks.
+        CPU and GPU use different time bases, so we need to align them.
+
+        Strategy: Use the median offset from the first few events to be robust against outliers.
+        Also report drift to help diagnose correlation issues.
+ """ + if not self.cpu_stacks or not self.cuda_launches: + return 0.0 + + # Sample first 100 events from each to calculate offset + sample_size = min(100, len(self.cpu_stacks), len(self.cuda_launches)) + + sorted_cpu = sorted(self.cpu_stacks[:sample_size], key=lambda x: x.timestamp_ns) + sorted_gpu = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)[:sample_size] + + offsets = [] + for cpu, gpu in zip(sorted_cpu, sorted_gpu): + cpu_us = cpu.timestamp_ns / 1000.0 + offset = cpu_us - gpu.start_us + offsets.append(offset) + + # Use median to be robust against outliers + offsets.sort() + median_offset = offsets[len(offsets) // 2] + + # Calculate drift across entire trace to warn about correlation issues + if len(self.cpu_stacks) > 100 and len(self.cuda_launches) > 100: + # Sample at start and end + cpu_first = min(self.cpu_stacks, key=lambda x: x.timestamp_ns) + cpu_last = max(self.cpu_stacks, key=lambda x: x.timestamp_ns) + gpu_first = min(self.cuda_launches.values(), key=lambda x: x.start_us) + gpu_last = max(self.cuda_launches.values(), key=lambda x: x.start_us) + + offset_start = cpu_first.timestamp_ns / 1000.0 - gpu_first.start_us + offset_end = cpu_last.timestamp_ns / 1000.0 - gpu_last.start_us + drift = offset_end - offset_start + + cpu_duration = (cpu_last.timestamp_ns - cpu_first.timestamp_ns) / 1_000_000 # ms + + print(f"Clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)") + print(f"Clock drift: {drift / 1000:.3f} ms over {cpu_duration:.1f} ms trace duration") + if abs(drift) > 1000: # More than 1ms drift + print(f"WARNING: Significant clock drift detected ({drift / cpu_duration:.3f} ms/ms)") + print(f" This may cause timestamp correlation issues") + else: + print(f"Calculated clock offset: {median_offset / 1000:.3f} ms (CPU - GPU)") + + return median_offset + def find_matching_kernel(self, cpu_stack: CPUStack) -> Optional[GPUKernelEvent]: """ Find GPU kernel that matches the CPU stack trace. Strategy: - 1. Find cudaLaunchKernel runtime call within timestamp tolerance - 2. Use correlation ID to find actual GPU kernel execution + 1. Convert CPU nanosecond timestamp to microseconds + 2. Apply clock offset to align CPU and GPU time bases + 3. Use binary search to find cudaLaunchKernel runtime call within timestamp tolerance + 4. 
+        4. Use correlation ID to find actual GPU kernel execution
         """
+        import bisect

-        # Find cudaLaunchKernel runtime event that matches timestamp
-        best_launch = None
-        min_time_diff = self.timestamp_tolerance_ns
+        # Convert CPU timestamp from nanoseconds to microseconds
+        cpu_timestamp_us = cpu_stack.timestamp_ns / 1000.0

-        for launch in self.cuda_launches.values():
-            # Check if CPU stack timestamp is close to launch time
-            time_diff = abs(cpu_stack.timestamp_ns - launch.start_ns)
+        # Apply clock offset to align CPU and GPU timestamps
+        cpu_timestamp_aligned = cpu_timestamp_us - self.clock_offset_us

-            if time_diff < min_time_diff:
-                min_time_diff = time_diff
-                best_launch = launch
+        tolerance_us = self.timestamp_tolerance_ns / 1000.0

-        if not best_launch:
+        # Binary search to find nearest GPU launch timestamp
+        idx = bisect.bisect_left(self.launch_timestamps, cpu_timestamp_aligned)
+
+        # Check surrounding launches (idx-1, idx, idx+1) for best match
+        candidates = []
+        for i in [idx - 1, idx, idx + 1]:
+            if 0 <= i < len(self.sorted_launches):
+                launch = self.sorted_launches[i]
+                time_diff = abs(cpu_timestamp_aligned - launch.start_us)
+                if time_diff < tolerance_us:
+                    candidates.append((time_diff, launch))
+
+        if not candidates:
             return None

-        # Find GPU kernel with matching correlation ID
-        for kernel in self.gpu_kernels:
-            if kernel.correlation_id == best_launch.correlation_id:
-                return kernel
+        # Get launch with smallest time difference
+        candidates.sort(key=lambda x: x[0])
+        best_launch = candidates[0][1]

-        return None
+        # Find GPU kernel with matching correlation ID (using pre-built map)
+        if not hasattr(self, 'corr_to_kernel'):
+            self.corr_to_kernel = {k.correlation_id: k for k in self.gpu_kernels}
+
+        return self.corr_to_kernel.get(best_launch.correlation_id)

     def merge_traces(self):
         """Correlate CPU stacks with GPU kernels using correlation IDs and timestamps"""
         print("Correlating CPU stacks with GPU kernels...")

+        # Calculate clock offset between CPU and GPU timestamps
+        self.clock_offset_us = self.calculate_clock_offset()
+
+        # Pre-sort GPU launches by timestamp for efficient binary search
+        self.sorted_launches = sorted(self.cuda_launches.values(), key=lambda x: x.start_us)
+        self.launch_timestamps = [l.start_us for l in self.sorted_launches]
+
         matched_count = 0
         unmatched_count = 0
@@ -218,18 +290,18 @@ class TraceMerger:
                 # Add GPU kernel to the top of the stack
                 merged_stack.append(f"[GPU_Kernel]{gpu_kernel.name}")
                 matched_count += 1
-            else:
-                # Mark as unmatched launch (may happen if kernel hasn't executed yet)
-                merged_stack.append("[GPU_Launch_Pending]")
-                unmatched_count += 1

-            # Create folded stack string
-            if merged_stack:
+                # Create folded stack string - only add matched stacks
                 stack_str = ';'.join(merged_stack)
                 self.merged_stacks[stack_str] += 1
+            else:
+                # Skip unmatched launches - don't add to merged output
+                unmatched_count += 1

         print(f"Matched {matched_count} CPU stacks with GPU kernels")
-        print(f"Unmatched: {unmatched_count}")
+        if unmatched_count > 0:
+            print(f"WARNING: {unmatched_count} CPU stacks could not be correlated with GPU kernels")
+            print(f"         This may indicate profiler timing mismatch or clock drift")
         print(f"Total unique stacks: {len(self.merged_stacks)}")

     def write_folded_output(self, output_file: str):
@@ -261,7 +333,7 @@ class TraceMerger:
         print(f"\nGPU kernels executed: {len(self.gpu_kernels)}")
         print(f"CUDA launch events: {len(self.cuda_launches)}")

-        total_kernel_time = sum(k.end_ns - k.start_ns for k in self.gpu_kernels) / 1_000_000
+        total_kernel_time = sum(k.end_us - k.start_us for k in self.gpu_kernels) / 1_000
         print(f"Total kernel execution time: {total_kernel_time:.2f} ms")

         # Show kernel breakdown
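
For reference, a minimal standalone sketch of the correlation scheme the new find_matching_kernel path uses: align CPU nanosecond timestamps to the GPU microsecond time base with a median offset, then bisect into the sorted launch times and resolve the kernel by correlation ID. The sample timestamps, tolerance, and kernel names below are made up for illustration and are not part of the patch.

    # Illustrative sketch only; values and names are synthetic, not from the patch.
    import bisect

    cpu_stacks_ns = [1_000_500_000, 1_002_500_000, 1_007_500_000]   # CPU samples (ns)
    launches_us = [(500.0, 101), (2_500.0, 102), (7_600.0, 103)]    # (start_us, correlation_id)
    kernels = {101: "vecAdd", 102: "gemm", 103: "softmax"}          # correlation_id -> kernel name

    # Median offset between the two clocks, in microseconds (CPU - GPU)
    offsets = sorted(ns / 1000.0 - us for ns, (us, _) in zip(cpu_stacks_ns, launches_us))
    offset_us = offsets[len(offsets) // 2]

    launch_times = [us for us, _ in launches_us]
    tolerance_us = 1000.0  # illustrative tolerance

    for ns in cpu_stacks_ns:
        aligned = ns / 1000.0 - offset_us
        idx = bisect.bisect_left(launch_times, aligned)
        best = None
        # Only the launches adjacent to the insertion point can be the closest match
        for i in (idx - 1, idx, idx + 1):
            if 0 <= i < len(launches_us):
                diff = abs(aligned - launch_times[i])
                if diff < tolerance_us and (best is None or diff < best[0]):
                    best = (diff, launches_us[i][1])
        print(ns, "->", kernels.get(best[1]) if best else "unmatched")

Checking idx-1 through idx+1 around the bisect insertion point is sufficient because only the nearest launch on either side of the aligned timestamp can be the closest candidate within the tolerance window.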