From f933bd475972276b1a959a582c3736d22c7589c3 Mon Sep 17 00:00:00 2001
From: officeyutong
Date: Sat, 24 May 2025 18:12:11 +0800
Subject: [PATCH] update

---
 src/47-cuda-events/.gitignore |   1 +
 src/47-cuda-events/Makefile   |  43 ++++++-
 src/47-cuda-events/README.md  | 121 +++++++++++++++++++
 src/47-cuda-events/bench.cu   | 217 ++++++++++++++++++++++++++++++++++
 4 files changed, 378 insertions(+), 4 deletions(-)
 create mode 100644 src/47-cuda-events/bench.cu

diff --git a/src/47-cuda-events/.gitignore b/src/47-cuda-events/.gitignore
index c85916a..a886bc9 100644
--- a/src/47-cuda-events/.gitignore
+++ b/src/47-cuda-events/.gitignore
@@ -8,3 +8,4 @@ ecli
 bootstrap
 cuda_events
 basic02
+bench
diff --git a/src/47-cuda-events/Makefile b/src/47-cuda-events/Makefile
index ef9d17c..a92a9f2 100644
--- a/src/47-cuda-events/Makefile
+++ b/src/47-cuda-events/Makefile
@@ -36,6 +36,7 @@ CUDA_ARCH_FLAGS ?= $(shell if [ -f $(CUDA_DETECT_SCRIPT) ]; then bash $(CUDA_DET
 NVCC_FLAGS = -O3 $(CUDA_ARCH_FLAGS)
 
 APPS = cuda_events # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
+CUDA_APPS = basic02 bench
 
 CARGO ?= $(shell which cargo)
 ifeq ($(strip $(CARGO)),)
@@ -80,12 +81,12 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc)
 $(call allow-override,LD,$(CROSS_COMPILE)ld)
 
 .PHONY: all
-all: cuda_events basic02
+all: cuda_events basic02 bench
 
 .PHONY: clean
 clean:
 	$(call msg,CLEAN)
-	$(Q)rm -rf $(OUTPUT) $(APPS)
+	$(Q)rm -rf $(OUTPUT) $(APPS) $(CUDA_APPS)
 
 $(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
 	$(call msg,MKDIR,$@)
@@ -119,8 +120,8 @@ $(BPFTOOL): | $(BPFTOOL_OUTPUT)
 	$(call msg,BPFTOOL,$@)
 	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
 
-# Build CUDA example
-basic02: basic02.cu $(CUDA_DETECT_SCRIPT)
+# Build CUDA examples
+$(CUDA_APPS): %: %.cu $(CUDA_DETECT_SCRIPT)
 	$(call msg,NVCC,$@)
 	$(Q)$(NVCC) $(NVCC_FLAGS) -o $@ $<
 	$(Q)@echo "Compiling for architecture: $(CUDA_ARCH_FLAGS)"
@@ -154,6 +155,40 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
 	$(call msg,BINARY,$@)
 	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
 
+# Benchmarking targets
+.PHONY: benchmark benchmark-no-trace benchmark-with-trace
+
+# Run benchmark without tracing
+benchmark-no-trace:
+	$(call msg,BENCH,"without tracing")
+	$(Q)./bench
+
+# Run benchmark with tracing
+benchmark-with-trace:
+	$(call msg,BENCH,"with tracing")
+	$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
+	sleep 1; \
+	./bench; \
+	sudo pkill -f "./cuda_events -p ./bench" || true
+
+# Run both benchmarks and compare
+benchmark: bench cuda_events
+	$(call msg,BENCH,"benchmark comparison")
+	$(Q)echo "============================================="
+	$(Q)echo "Running benchmark WITHOUT tracing..."
+	$(Q)echo "============================================="
+	$(Q)./bench
+	$(Q)echo ""
+	$(Q)echo "============================================="
+	$(Q)echo "Running benchmark WITH tracing..."
+	$(Q)echo "============================================="
+	$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
+	sleep 1; \
+	./bench; \
+	sudo pkill -f "./cuda_events -p ./bench" || true
+	$(Q)echo ""
+	$(Q)echo "Benchmark complete. Compare the results to see the tracing overhead."
+
 # delete failed targets
 .DELETE_ON_ERROR:
diff --git a/src/47-cuda-events/README.md b/src/47-cuda-events/README.md
index 41ef698..2629b36 100644
--- a/src/47-cuda-events/README.md
+++ b/src/47-cuda-events/README.md
@@ -429,6 +429,48 @@ This output shows the typical flow of a CUDA application:
 4. Copy results back from device to host (kind=2)
 5. Free device memory
 
+## Benchmark
+
+We also provide a benchmark tool that measures the latency of common CUDA API calls, so you can quantify the overhead the tracer adds. (A more detailed walkthrough is in the "Benchmarking Tracing Overhead" section below.)
+
+```bash
+make
+# Start the tracer in one terminal (or in the background):
+sudo ./cuda_events -p ./bench
+# Run the benchmark in another terminal:
+./bench
+```
+
+Without tracing, the results look like this:
+
+```
+Data size: 1048576 bytes (1024 KB)
+Iterations: 10000
+
+Summary (average time per operation):
+-----------------------------------
+cudaMalloc:     113.14 µs
+cudaMemcpyH2D:     365.85 µs
+cudaLaunchKernel:       7.82 µs
+cudaMemcpyD2H:     393.55 µs
+cudaFree:       0.00 µs
+```
+
+With the tracer attached, the results look like this:
+
+```
+Data size: 1048576 bytes (1024 KB)
+Iterations: 10000
+
+Summary (average time per operation):
+-----------------------------------
+cudaMalloc:     119.81 µs
+cudaMemcpyH2D:     367.16 µs
+cudaLaunchKernel:       8.77 µs
+cudaMemcpyD2H:     383.66 µs
+cudaFree:       0.00 µs
+```
+
+(`cudaFree` reads 0.00 µs because the benchmark frees its buffer once during the warm-up run; every timed iteration after that is a no-op.)
+
+In this run the tracer adds a few microseconds to each traced CUDA API call (about 6.7 µs to `cudaMalloc` and about 1 µs to `cudaLaunchKernel`, with the large copies within run-to-run noise), which is negligible for most cases.
+
 ## Command Line Options
 
 The `cuda_events` tool supports these options:
@@ -466,4 +508,83 @@ Once you're comfortable with this basic CUDA tracing tool, you could extend it to:
 - libbpf Documentation: https://libbpf.readthedocs.io/
 - Linux uprobes Documentation: https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
 
+## Benchmarking Tracing Overhead
+
+While tracing is an invaluable tool for debugging and understanding CUDA applications, it does introduce some overhead. We've included a benchmarking tool to help you measure this overhead.
+
+### The Benchmark Tool
+
+The `bench.cu` program performs several CUDA operations repeatedly and measures their execution time:
+
+1. Memory allocation (`cudaMalloc`)
+2. Memory transfers (host to device and device to host)
+3. Kernel launches
+4. Memory deallocation (`cudaFree`)
+5. Full operations (the complete sequence)
+
+Each operation is executed many times to get statistically significant results, and the average time per operation is reported in microseconds.
+
+### Running the Benchmark
+
+To build the benchmark tool:
+
+```bash
+make bench
+```
+
+To run a complete benchmark that compares performance with and without tracing:
+
+```bash
+make benchmark
+```
+
+This will run the benchmark twice:
+
+1. First without any tracing
+2. Then with the CUDA events tracer attached
+
+You can also run individual benchmarks:
+
+```bash
+# Without tracing
+make benchmark-no-trace
+
+# With tracing
+make benchmark-with-trace
+```
+
+### Interpreting the Results
+
+The benchmark output shows the average time for each CUDA operation in microseconds. Compare the times with and without tracing to understand the overhead.
+
+For example:
+
+```
+# Without tracing
+cudaMalloc       :  23.45 µs per operation
+cudaMemcpyH2D    :  42.67 µs per operation
+cudaLaunchKernel :  15.89 µs per operation
+cudaMemcpyD2H    :  38.12 µs per operation
+cudaFree         :  10.34 µs per operation
+Full Operation   : 130.47 µs per operation
+
+# With tracing
+cudaMalloc       :  25.12 µs per operation
+cudaMemcpyH2D    :  45.89 µs per operation
+cudaLaunchKernel :  17.23 µs per operation
+cudaMemcpyD2H    :  41.56 µs per operation
+cudaFree         :  11.78 µs per operation
+Full Operation   : 141.58 µs per operation
+```
+
+In this example, tracing adds roughly 7-14% overhead to the individual CUDA operations and about 8.5% to the full sequence. This is typically acceptable for debugging and profiling purposes, but it's important to be aware of this impact when interpreting the results.
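+
+To make the comparison concrete, here is the arithmetic on the sample numbers above: the full sequence slows down by 141.58 − 130.47 = 11.11 µs, i.e. 11.11 / 130.47 ≈ 8.5%, and that 11.11 µs is almost exactly the sum of the per-call deltas (1.67 + 3.22 + 1.34 + 3.44 + 1.44 ≈ 11.1 µs). The absolute per-call cost (roughly 1.3-3.4 µs here) is often the more useful figure to track, since the percentage shrinks as the traced operations themselves get longer.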
+
+### Optimization Opportunities
+
+If you find the tracing overhead too high for your use case, there are several ways to reduce it:
+
+1. Trace only specific CUDA functions that are relevant to your investigation
+2. Filter by specific process IDs to minimize the number of events captured
+3. Disable return probes using the `-r` flag if you don't need return values
+4. Consider running eBPF in user-space with tools like [bpftime](https://github.com/eunomia-bpf/bpftime) to reduce context-switching overhead
+
 If you'd like to dive deeper into eBPF, check out our tutorial repository at https://github.com/eunomia-bpf/bpf-developer-tutorial or visit our website at https://eunomia.dev/tutorials/.
diff --git a/src/47-cuda-events/bench.cu b/src/47-cuda-events/bench.cu
new file mode 100644
index 0000000..4050a31
--- /dev/null
+++ b/src/47-cuda-events/bench.cu
@@ -0,0 +1,217 @@
+/* CUDA benchmark for measuring tracing overhead
+ * This program performs a series of CUDA operations repeatedly and
+ * measures the execution time to analyze tracing overhead.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+// Number of iterations for the benchmark
+#define NUM_ITERATIONS 10000
+// Size of test data in bytes
+#define DATA_SIZE (1024 * 1024) // 1MB
+
+// CUDA kernel that performs a simple operation (multiply by 2)
+__global__ void multiplyBy2Kernel(float *data, int n) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < n) {
+        data[idx] = data[idx] * 2.0f;
+    }
+}
+
+// Function to check CUDA errors
+void checkCudaError(cudaError_t err, const char *msg) {
+    if (err != cudaSuccess) {
+        fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Function to measure the execution time of a CUDA operation
+double measureOperation(const char *name, int iterations, void (*operation)(void)) {
+    cudaEvent_t start, stop;
+    checkCudaError(cudaEventCreate(&start), "cudaEventCreate start");
+    checkCudaError(cudaEventCreate(&stop), "cudaEventCreate stop");
+
+    // Warm-up run
+    operation();
+
+    // Synchronize device before timing
+    cudaDeviceSynchronize();
+
+    // Start timing
+    checkCudaError(cudaEventRecord(start), "cudaEventRecord start");
+
+    // Run the operation multiple times
+    for (int i = 0; i < iterations; i++) {
+        operation();
+    }
+
+    // Stop timing
+    checkCudaError(cudaEventRecord(stop), "cudaEventRecord stop");
+    checkCudaError(cudaEventSynchronize(stop), "cudaEventSynchronize");
+
+    float milliseconds = 0;
+    checkCudaError(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
+
+    // Calculate average time per operation in microseconds
+    double microseconds_per_op = (milliseconds * 1000.0) / iterations;
+
+    printf("%-20s: %10.2f µs per operation (total: %.2f ms for %d iterations)\n",
+           name, microseconds_per_op, milliseconds, iterations);
+
+    // Cleanup
+    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy start");
+    checkCudaError(cudaEventDestroy(stop), "cudaEventDestroy stop");
+
+    return microseconds_per_op;
+}
+
+// CUDA operations to benchmark
+
+// Memory allocation benchmark
+float *d_data = NULL;
+void cudaMallocOperation() {
+    if (d_data != NULL) {
+        cudaFree(d_data);
+        d_data = NULL;
+    }
+    cudaMalloc((void**)&d_data, DATA_SIZE);
+}
+
+// Memory copy (host to device) benchmark
+float *h_data = NULL;
+void cudaMemcpyHToDOperation() {
+    if (h_data == NULL) {
+        h_data = (float*)malloc(DATA_SIZE);
+        for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
+            h_data[i] = (float)i;
+        }
+    }
+    if (d_data == NULL) {
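+        // The copy needs a device buffer, so allocate one here if none exists yet.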
+        cudaMalloc((void**)&d_data, DATA_SIZE);
+    }
+    cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
+}
+
+// Kernel launch benchmark
+void cudaKernelLaunchOperation() {
+    if (d_data == NULL) {
+        cudaMalloc((void**)&d_data, DATA_SIZE);
+        if (h_data == NULL) {
+            h_data = (float*)malloc(DATA_SIZE);
+            for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
+                h_data[i] = (float)i;
+            }
+        }
+        cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
+    }
+
+    int numElements = DATA_SIZE / sizeof(float);
+    int blockSize = 256;
+    int numBlocks = (numElements + blockSize - 1) / blockSize;
+
+    multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_data, numElements);
+    cudaDeviceSynchronize();
+}
+
+// Memory copy (device to host) benchmark
+void cudaMemcpyDToHOperation() {
+    if (d_data == NULL) {
+        cudaMalloc((void**)&d_data, DATA_SIZE);
+        if (h_data == NULL) {
+            h_data = (float*)malloc(DATA_SIZE);
+            for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
+                h_data[i] = (float)i;
+            }
+        }
+        cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
+
+        int numElements = DATA_SIZE / sizeof(float);
+        int blockSize = 256;
+        int numBlocks = (numElements + blockSize - 1) / blockSize;
+
+        multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_data, numElements);
+        cudaDeviceSynchronize();
+    }
+
+    cudaMemcpy(h_data, d_data, DATA_SIZE, cudaMemcpyDeviceToHost);
+}
+
+// Memory free benchmark
+void cudaFreeOperation() {
+    if (d_data != NULL) {
+        cudaFree(d_data);
+        d_data = NULL;
+    }
+}
+
+// Full operation (malloc + memcpy H2D + kernel + memcpy D2H + free)
+void fullOperation() {
+    // Allocate device memory
+    float *d_temp;
+    cudaMalloc((void**)&d_temp, DATA_SIZE);
+
+    // Allocate and initialize host data if needed
+    if (h_data == NULL) {
+        h_data = (float*)malloc(DATA_SIZE);
+        for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
+            h_data[i] = (float)i;
+        }
+    }
+
+    // Copy data to device
+    cudaMemcpy(d_temp, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
+
+    // Launch kernel
+    int numElements = DATA_SIZE / sizeof(float);
+    int blockSize = 256;
+    int numBlocks = (numElements + blockSize - 1) / blockSize;
+
+    multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_temp, numElements);
+    cudaDeviceSynchronize();
+
+    // Copy data back to host
+    cudaMemcpy(h_data, d_temp, DATA_SIZE, cudaMemcpyDeviceToHost);
+
+    // Free device memory
+    cudaFree(d_temp);
+}
+
+int main(int argc, char **argv) {
+    printf("CUDA Benchmark for Tracing Overhead\n");
+    printf("-----------------------------------\n");
+    printf("Data size: %d bytes (%d KB)\n", DATA_SIZE, DATA_SIZE / 1024);
+    printf("Iterations: %d\n\n", NUM_ITERATIONS);
+
+    // Run benchmarks
+    double malloc_time = measureOperation("cudaMalloc", NUM_ITERATIONS, cudaMallocOperation);
+    double memcpy_h2d_time = measureOperation("cudaMemcpyH2D", NUM_ITERATIONS, cudaMemcpyHToDOperation);
+    double kernel_time = measureOperation("cudaLaunchKernel", NUM_ITERATIONS, cudaKernelLaunchOperation);
+    double memcpy_d2h_time = measureOperation("cudaMemcpyD2H", NUM_ITERATIONS, cudaMemcpyDToHOperation);
+    double free_time = measureOperation("cudaFree", NUM_ITERATIONS, cudaFreeOperation);
+    double full_time = measureOperation("Full Operation", NUM_ITERATIONS, fullOperation);
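+
+    // Note: the cudaFree row in the summary is expected to read near zero,
+    // because cudaFreeOperation released d_data during its warm-up run and
+    // every timed iteration after that was a no-op.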
+
+    // Print summary
+    printf("\nSummary (average time per operation):\n");
+    printf("-----------------------------------\n");
+    printf("cudaMalloc: %10.2f µs\n", malloc_time);
+    printf("cudaMemcpyH2D: %10.2f µs\n", memcpy_h2d_time);
+    printf("cudaLaunchKernel: %10.2f µs\n", kernel_time);
+    printf("cudaMemcpyD2H: %10.2f µs\n", memcpy_d2h_time);
+    printf("cudaFree: %10.2f µs\n", free_time);
+    printf("Full Operation: %10.2f µs\n", full_time);
+
+    // Free host memory
+    if (h_data != NULL) {
+        free(h_data);
+    }
+
+    // Make sure device memory is freed
+    if (d_data != NULL) {
+        cudaFree(d_data);
+    }
+
+    return 0;
+}
\ No newline at end of file