#!/usr/bin/env python3
"""gpuperf - GPU and CPU Performance Profiler.

Runs a target command under CUPTI-based GPU tracing and an optional CPU
profiler that hooks cudaLaunchKernel via a uprobe, converts the raw trace to
Chrome Trace Format, and can merge the CPU and GPU traces into a single
folded stack file for flamegraph generation.
"""

import os
import sys
import argparse
import subprocess
import tempfile
import atexit
import time
import json

from pathlib import Path

from cupti_trace_parser import CuptiTraceParser
from merge_gpu_cpu_trace import TraceMerger
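
# Example invocations (illustrative; flags and default file names come from the
# argparse setup in main() below, the target command is a placeholder):
#   ./gpuperf.py -o gpu_results.txt -c gpu_results.json -p cpu_results.txt python3 infer.py
#   ./gpuperf.py convert -i gpu_results.txt -o gpu_results.json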


class GPUPerf:
    def __init__(self):
        self.script_dir = Path(__file__).parent.absolute()
        self.injection_lib = self.script_dir / "cupti_trace/libcupti_trace_injection.so"
        self.output_file = None
        self.temp_trace_file = None
        self.profiler_proc = None
        self.profiler_output = None
        self.parser = CuptiTraceParser()  # Initialize the parser

        # Path to CPU profiler
        self.cpu_profiler = self.script_dir / "profiler/target/release/profile"
        if not self.cpu_profiler.exists():
            print(f"Warning: CPU profiler not found at {self.cpu_profiler}", file=sys.stderr)
            self.cpu_profiler = None

        # Find CUPTI library path
        cuda_paths = [
            "/usr/local/cuda-13.0/extras/CUPTI/lib64",
            "/usr/local/cuda/extras/CUPTI/lib64",
            "/usr/local/cuda-12.0/extras/CUPTI/lib64",
        ]

        self.cupti_lib = None
        self.cupti_lib_dir = None
        for path in cuda_paths:
            cupti_path = Path(path) / "libcupti.so"
            if cupti_path.exists():
                self.cupti_lib = str(cupti_path)
                self.cupti_lib_dir = str(Path(path))
                break

        if not self.cupti_lib:
            print("Warning: Could not find CUPTI library. NVTX annotations may not work.", file=sys.stderr)
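
    # How injection works (summary of run_with_trace() below): the target is
    # launched with CUDA_INJECTION64_PATH pointing at libcupti_trace_injection.so
    # so the CUDA runtime loads the tracer inside the target process, and
    # CUPTI_TRACE_OUTPUT_FILE tells that library where to write the raw trace.
    # NVTX_INJECTION64_PATH (set to libcupti.so when found) lets NVTX range
    # annotations show up in the trace.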

    def parse_cupti_trace(self, filename):
        """Parse CUPTI trace data using the parser module"""
        return self.parser.parse_file(filename)

    def start_cpu_profiler(self, pid=None, cpu_output_file=None, cuda_lib_path=None):
        """Start CPU profiler with cudaLaunchKernel uprobe"""
        if not self.cpu_profiler:
            return None

        if not cpu_output_file:
            cpu_output_file = f"cpu_profile_{pid if pid else 'cuda'}.txt"

        # Convert to absolute path to handle working directory changes
        self.profiler_output = str(Path(cpu_output_file).absolute())

        # Find CUDA runtime library if not specified
        if not cuda_lib_path:
            cuda_paths = [
                "/usr/local/cuda-12.9/lib64/libcudart.so.12",
                "/usr/local/cuda-13.0/lib64/libcudart.so.12",
                "/usr/local/cuda/lib64/libcudart.so.12",
                "/usr/local/cuda-12.8/lib64/libcudart.so.12",
            ]
            for path in cuda_paths:
                if Path(path).exists():
                    cuda_lib_path = path
                    break

        if not cuda_lib_path:
            print("Warning: Could not find CUDA runtime library for uprobe", file=sys.stderr)
            return None

        print("Starting CPU profiler with cudaLaunchKernel hook")
        print(f"  CUDA library: {cuda_lib_path}")
        print(f"  Output: {self.profiler_output}")

        try:
            # Run profiler with cudaLaunchKernel uprobe in extended folded format
            # Format: timestamp_ns comm pid tid cpu stack1;stack2;stack3
            cmd = ["sudo", str(self.cpu_profiler),
                   "--uprobe", f"{cuda_lib_path}:cudaLaunchKernel",
                   "-E"]  # -E for extended folded format with timestamps

            # Write to the absolute output path so a later working-directory
            # change does not misplace the profile
            self.profiler_proc = subprocess.Popen(
                cmd,
                stdout=open(self.profiler_output, 'w'),
                stderr=subprocess.PIPE
            )
            # Give it a moment to attach
            time.sleep(1.0)
            return self.profiler_proc
        except Exception as e:
            print(f"Warning: Failed to start CPU profiler: {e}", file=sys.stderr)
            return None
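
    # One line of the extended folded output then looks like this (values and
    # frame names are hypothetical; the layout is the Format comment above):
    #   1712345678901234 python3 4242 4250 3 main;run_inference;cudaLaunchKernel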

    def stop_cpu_profiler(self):
        """Stop the CPU profiler gracefully"""
        if self.profiler_proc and self.profiler_proc.poll() is None:
            print("Stopping CPU profiler...")
            self.profiler_proc.terminate()
            try:
                self.profiler_proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.profiler_proc.kill()
                self.profiler_proc.wait()

        if self.profiler_output and os.path.exists(self.profiler_output):
            print(f"CPU profile saved to: {self.profiler_output}")

    def run_with_trace(self, command, output_trace=None, chrome_trace=None, cpu_profile=None, merged_trace=None, no_merge=False):
        """Run a command with CUPTI tracing and optional CPU profiling enabled"""

        # Determine if we're doing GPU profiling
        do_gpu_profiling = output_trace is not None or chrome_trace is not None

        # Check if injection library exists (only if we're doing GPU profiling)
        if do_gpu_profiling and not self.injection_lib.exists():
            print(f"Error: CUPTI injection library not found at {self.injection_lib}", file=sys.stderr)
            print("Please build it first using 'make' in the cupti_trace directory", file=sys.stderr)
            return 1

        # Set up trace output file for GPU profiling
        trace_file = None
        if do_gpu_profiling:
            if output_trace:
                # Convert to absolute path to handle target process changing directories
                trace_file = str(Path(output_trace).absolute())
            else:
                # Create temporary file for trace output
                fd, trace_file = tempfile.mkstemp(suffix=".txt", prefix="gpuperf_trace_")
                os.close(fd)
                self.temp_trace_file = trace_file
                atexit.register(self.cleanup_temp_files)

        # Set up environment variables (inject CUPTI only when GPU profiling is
        # enabled; otherwise trace_file is None and injection is pointless)
        env = os.environ.copy()
        if do_gpu_profiling:
            env['CUDA_INJECTION64_PATH'] = str(self.injection_lib)
            env['CUPTI_TRACE_OUTPUT_FILE'] = trace_file

            if self.cupti_lib:
                env['NVTX_INJECTION64_PATH'] = self.cupti_lib
                if 'LD_LIBRARY_PATH' in env:
                    env['LD_LIBRARY_PATH'] = f"{self.cupti_lib_dir}:{env['LD_LIBRARY_PATH']}"
                else:
                    env['LD_LIBRARY_PATH'] = self.cupti_lib_dir

            print(f"Running command with GPU profiling: {' '.join(command)}")
            print(f"Trace output: {trace_file}")
        else:
            print(f"Running command: {' '.join(command)}")

        # Start the target process
        target_proc = None

        try:
            # Start CPU profiler FIRST if available and requested
            if cpu_profile and self.cpu_profiler:
                # Start profiler BEFORE target process to catch all kernel launches
                self.start_cpu_profiler(cpu_output_file=cpu_profile)

            # Then start the target process
            target_proc = subprocess.Popen(command, env=env)
            target_pid = target_proc.pid
            print(f"Started target process with PID: {target_pid}")

            # Wait for the target process to complete
            return_code = target_proc.wait()

        except KeyboardInterrupt:
            print("\nInterrupted by user")
            if target_proc:
                target_proc.terminate()
                try:
                    target_proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    target_proc.kill()
            return_code = 130
        except Exception as e:
            print(f"Error running command: {e}", file=sys.stderr)
            return_code = 1
        finally:
            # Give CUPTI time to flush remaining buffered events
            # CUPTI may continue recording events after target exits
            time.sleep(0.5)

            # Stop CPU profiler if running
            self.stop_cpu_profiler()

        # Convert to Chrome trace if requested
        if chrome_trace and os.path.exists(trace_file):
            print(f"\nConverting trace to Chrome format: {chrome_trace}")
            try:
                events = self.parse_cupti_trace(trace_file)
                print(f"Parsed {len(events)} events")

                metadata = {
                    "tool": "gpuperf - GPU Performance Profiler",
                    "format": "Chrome Trace Format",
                    "command": ' '.join(command)
                }

                self.parser.save_chrome_trace(events, chrome_trace, metadata)

                print(f"\nChrome trace file written to: {chrome_trace}")
                print("\nTo visualize the trace:")
                print("1. Open Chrome or Edge browser")
                print("2. Navigate to chrome://tracing or edge://tracing")
                print("3. Click 'Load' and select the generated JSON file")
                print("\nAlternatively, visit https://ui.perfetto.dev/ and drag the JSON file there")
            except Exception as e:
                print(f"Error converting trace: {e}", file=sys.stderr)

        # Clean up temporary file if not keeping raw trace
        if not output_trace and self.temp_trace_file:
            try:
                os.unlink(self.temp_trace_file)
            except OSError:
                pass

        # Generate merged folded trace if both CPU and GPU traces are available (and not disabled)
        if not no_merge and cpu_profile and (chrome_trace or output_trace):
            merged_output = merged_trace if merged_trace else "merged_trace.folded"
            self.generate_merged_trace(
                cpu_trace=cpu_profile,
                gpu_trace=chrome_trace if chrome_trace else None,
                gpu_raw_trace=trace_file if do_gpu_profiling else None,
                output_file=merged_output
            )

        return return_code
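
    # Programmatic use (a sketch; the target command and file names below are
    # illustrative, not defaults enforced by this method):
    #   gp = GPUPerf()
    #   gp.run_with_trace(["python3", "infer.py"],
    #                     output_trace="gpu_results.txt",
    #                     chrome_trace="gpu_results.json",
    #                     cpu_profile="cpu_results.txt")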

    def generate_merged_trace(self, cpu_trace=None, gpu_trace=None, gpu_raw_trace=None, output_file=None):
        """Generate merged CPU+GPU folded trace using TraceMerger"""
        if not cpu_trace or not (gpu_trace or gpu_raw_trace):
            return  # Need both CPU and GPU traces

        if not output_file:
            output_file = "merged_trace.folded"

        print(f"\nGenerating merged CPU+GPU trace: {output_file}")

        try:
            merger = TraceMerger()

            # Parse CPU trace
            if os.path.exists(cpu_trace):
                merger.parse_cpu_trace(cpu_trace)
            else:
                print(f"Warning: CPU trace not found: {cpu_trace}")
                return

            # Parse GPU trace (prefer JSON, fall back to the raw CUPTI trace)
            if gpu_trace and os.path.exists(gpu_trace):
                merger.parse_gpu_trace(gpu_trace)
            elif gpu_raw_trace and os.path.exists(gpu_raw_trace):
                # Convert raw trace to events and hand the merger a temporary JSON file
                events = self.parse_cupti_trace(gpu_raw_trace)
                temp_json = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
                json.dump({"traceEvents": events}, temp_json)
                temp_json.close()
                merger.parse_gpu_trace(temp_json.name)
                os.unlink(temp_json.name)
            else:
                print("Warning: GPU trace not found")
                return

            # Merge traces
            merger.merge_traces()

            # Write folded output
            merger.write_folded_output(output_file)

            print(f"✓ Merged trace generated: {output_file}")
            print("\nTo generate flamegraph:")
            print(f"  /root/yunwei37/systemscope/cpu-tools/combined_flamegraph.pl {output_file} > merged_flamegraph.svg")

        except Exception as e:
            print(f"Error generating merged trace: {e}", file=sys.stderr)
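
    # The folded output is the usual flamegraph input: one semicolon-separated
    # stack per line followed by a count, e.g. (hypothetical frame names; the
    # exact naming and CPU/GPU annotation are decided by TraceMerger):
    #   python3;forward;cudaLaunchKernel;ampere_sgemm_128x64_tn 42
    # Feed it to combined_flamegraph.pl (or flamegraph.pl) to render an SVG.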

    def cleanup_temp_files(self):
        """Clean up temporary files"""
        if self.temp_trace_file and os.path.exists(self.temp_trace_file):
            try:
                os.unlink(self.temp_trace_file)
            except OSError:
                pass

    def convert_trace(self, input_file, output_file):
        """Convert existing CUPTI trace to Chrome format"""

        if not os.path.exists(input_file):
            print(f"Error: Input file '{input_file}' not found", file=sys.stderr)
            return 1

        print("Converting CUPTI trace to Chrome format...")
        print(f"Input: {input_file}")
        print(f"Output: {output_file}")

        try:
            events = self.parse_cupti_trace(input_file)
            print(f"Parsed {len(events)} events")

            metadata = {
                "tool": "gpuperf - GPU Performance Profiler",
                "format": "Chrome Trace Format"
            }

            self.parser.save_chrome_trace(events, output_file, metadata)

            print(f"\nChrome trace file written to: {output_file}")
            print("\nTo visualize the trace:")
            print("1. Open Chrome or Edge browser")
            print("2. Navigate to chrome://tracing or edge://tracing")
            print("3. Click 'Load' and select the generated JSON file")
            print("\nAlternatively, visit https://ui.perfetto.dev/ and drag the JSON file there")

            return 0
        except Exception as e:
            print(f"Error converting trace: {e}", file=sys.stderr)
            return 1
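

# For reference, a Chrome Trace Format "complete" event in the generated JSON
# looks roughly like the sketch below; the exact fields are produced by
# CuptiTraceParser.save_chrome_trace, so treat this as illustrative only:
#   {"name": "kernel_name", "ph": "X", "ts": 123.0, "dur": 45.6,
#    "pid": 4242, "tid": 1, "cat": "gpu"}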


def main():
    # Check if first argument is 'convert' for conversion mode
    if len(sys.argv) > 1 and sys.argv[1] == 'convert':
        parser = argparse.ArgumentParser(
            prog='gpuperf convert',
            description='Convert existing CUPTI trace to Chrome format'
        )
        parser.add_argument('mode', help='Operation mode')  # This will be 'convert'
        parser.add_argument('-i', '--input', required=True, help='Input CUPTI trace file')
        parser.add_argument('-o', '--output', default='trace.json', help='Output Chrome trace JSON file')
        args = parser.parse_args()

        profiler = GPUPerf()
        return profiler.convert_trace(args.input, args.output)

    # Regular run mode
    parser = argparse.ArgumentParser(
        description='gpuperf - GPU and CPU Performance Profiler',
        usage='gpuperf [options] command [args...]\n       gpuperf convert -i input.txt -o output.json'
    )

    parser.add_argument('-o', '--output', help='Save raw CUPTI trace to file (default: gpu_results.txt)')
    parser.add_argument('-c', '--chrome', help='Convert trace to Chrome format and save to file (default: gpu_results.json)')
    parser.add_argument('-p', '--cpu-profile', help='Also capture CPU profile and save to file (default: cpu_results.txt)')
    parser.add_argument('-m', '--merged', help='Save merged CPU+GPU folded trace (default: merged_trace.folded)')
    parser.add_argument('--cpu-only', action='store_true', help='Only run CPU profiler without GPU tracing')
    parser.add_argument('--no-gpu', action='store_true', help='Disable GPU profiling')
    parser.add_argument('--no-cpu', action='store_true', help='Disable CPU profiling')
    parser.add_argument('--no-merge', action='store_true', help='Disable automatic merged trace generation')
    parser.add_argument('command', nargs=argparse.REMAINDER, help='Command to run with profiling')

    args = parser.parse_args()

    profiler = GPUPerf()

    # Handle run mode
    if not args.command:
        parser.print_help()
        return 1

    # Use the command directly from REMAINDER
    full_command = args.command

    # CPU-only mode
    if args.cpu_only:
        if not profiler.cpu_profiler:
            print("Error: CPU profiler not available", file=sys.stderr)
            return 1

        # Start the process and immediately profile it
        try:
            target_proc = subprocess.Popen(full_command)
            target_pid = target_proc.pid
            print(f"Started target process with PID: {target_pid}")

            cpu_output = args.cpu_profile or "cpu_results.txt"
            profiler.start_cpu_profiler(target_pid, cpu_output)

            return_code = target_proc.wait()
            profiler.stop_cpu_profiler()
            return return_code
        except Exception as e:
            print(f"Error: {e}", file=sys.stderr)
            return 1

    # Set up default values
    gpu_output = args.output if args.output else ("gpu_results.txt" if not args.no_gpu else None)
    chrome_output = args.chrome if args.chrome else ("gpu_results.json" if not args.no_gpu else None)
    cpu_output = args.cpu_profile if args.cpu_profile else ("cpu_results.txt" if not args.no_cpu else None)

    # If user explicitly disabled GPU, don't run GPU profiling
    if args.no_gpu:
        gpu_output = None
        chrome_output = None

    # If user explicitly disabled CPU, don't run CPU profiling
    if args.no_cpu:
        cpu_output = None

    # Combined GPU and CPU profiling (or just one based on flags)
    return profiler.run_with_trace(
        full_command,
        output_trace=gpu_output,
        chrome_trace=chrome_output,
        cpu_profile=cpu_output,
        merged_trace=args.merged,
        no_merge=args.no_merge
    )


if __name__ == '__main__':
    sys.exit(main())