This commit is contained in:
officeyutong
2025-05-24 18:12:11 +08:00
parent 0a7d2a40c7
commit f933bd4759
4 changed files with 378 additions and 4 deletions

View File

@@ -8,3 +8,4 @@ ecli
bootstrap
cuda_events
basic02
bench

View File

@@ -36,6 +36,7 @@ CUDA_ARCH_FLAGS ?= $(shell if [ -f $(CUDA_DETECT_SCRIPT) ]; then bash $(CUDA_DET
NVCC_FLAGS = -O3 $(CUDA_ARCH_FLAGS)
APPS = cuda_events # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
CUDA_APPS = basic02 bench
CARGO ?= $(shell which cargo)
ifeq ($(strip $(CARGO)),)
@@ -80,12 +81,12 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
.PHONY: all
all: cuda_events basic02
all: cuda_events basic02 bench
.PHONY: clean
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) $(APPS)
$(Q)rm -rf $(OUTPUT) $(APPS) $(CUDA_APPS)
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
@@ -119,8 +120,8 @@ $(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
# Build CUDA example
basic02: basic02.cu $(CUDA_DETECT_SCRIPT)
# Build CUDA examples
$(CUDA_APPS): %: %.cu $(CUDA_DETECT_SCRIPT)
$(call msg,NVCC,$@)
$(Q)$(NVCC) $(NVCC_FLAGS) -o $@ $<
$(Q)@echo "Compiling for architecture: $(CUDA_ARCH_FLAGS)"
@@ -154,6 +155,40 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
# Benchmarking targets
.PHONY: benchmark benchmark-no-trace benchmark-with-trace
# Run benchmark without tracing
benchmark-no-trace:
$(call msg,BENCH,"without tracing")
$(Q)./bench
# Run benchmark with tracing
benchmark-with-trace:
$(call msg,BENCH,"with tracing")
$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
sleep 1; \
./bench; \
sudo pkill -f "./cuda_events -p ./bench" || true
# Run both benchmarks and compare
benchmark: bench cuda_events
$(call msg,BENCH,"benchmark comparison")
$(Q)echo "============================================="
$(Q)echo "Running benchmark WITHOUT tracing..."
$(Q)echo "============================================="
$(Q)./bench
$(Q)echo ""
$(Q)echo "============================================="
$(Q)echo "Running benchmark WITH tracing..."
$(Q)echo "============================================="
$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
sleep 1; \
./bench; \
sudo pkill -f "./cuda_events -p ./bench" || true
$(Q)echo ""
$(Q)echo "Benchmark complete. Compare the results to see the tracing overhead."
# delete failed targets
.DELETE_ON_ERROR:

View File

@@ -429,6 +429,48 @@ This output shows the typical flow of a CUDA application:
4. Copy results back from device to host (kind=2)
5. Free device memory
## Benchmark
We also provide a benchmark tool to test the performance of the tracer and the latency of the CUDA API calls.
```bash
make
sudo ./cuda_events -p ./bench
./bench
```
When there is no tracing, the result is like this:
```
Data size: 1048576 bytes (1024 KB)
Iterations: 10000
Summary (average time per operation):
-----------------------------------
cudaMalloc: 113.14 µs
cudaMemcpyH2D: 365.85 µs
cudaLaunchKernel: 7.82 µs
cudaMemcpyD2H: 393.55 µs
cudaFree: 0.00 µs
```
When the tracer is attached, the result is like this:
```
Data size: 1048576 bytes (1024 KB)
Iterations: 10000
Summary (average time per operation):
-----------------------------------
cudaMalloc: 119.81 µs
cudaMemcpyH2D: 367.16 µs
cudaLaunchKernel: 8.77 µs
cudaMemcpyD2H: 383.66 µs
cudaFree: 0.00 µs
```
The tracer adds about 2 µs of overhead to each CUDA API call, which is negligible in most cases.
## Command Line Options
The `cuda_events` tool supports these options:
@@ -466,4 +508,83 @@ Once you're comfortable with this basic CUDA tracing tool, you could extend it t
- libbpf Documentation: https://libbpf.readthedocs.io/
- Linux uprobes Documentation: https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
## Benchmarking Tracing Overhead
While tracing is an invaluable tool for debugging and understanding CUDA applications, it does introduce some overhead. We've included a benchmarking tool to help you measure this overhead.
### The Benchmark Tool
The `bench.cu` program performs several CUDA operations repeatedly and measures their execution time:
1. Memory allocation (`cudaMalloc`)
2. Memory transfers (host to device and device to host)
3. Kernel launches
4. Memory deallocation (`cudaFree`)
5. Full operations (the complete sequence)
Each operation is executed many times to get statistically significant results, and the average time per operation is reported in microseconds.
### Running the Benchmark
To build the benchmark tool:
```bash
make bench
```
To run a complete benchmark that compares performance with and without tracing:
```bash
make benchmark
```
This will run the benchmark twice:
1. First without any tracing
2. Then with the CUDA events tracer attached
You can also run individual benchmarks:
```bash
# Without tracing
make benchmark-no-trace
# With tracing
make benchmark-with-trace
```
### Interpreting the Results
The benchmark output shows the average time for each CUDA operation in microseconds. Compare the times with and without tracing to understand the overhead.
For example:
```
# Without tracing
cudaMalloc : 23.45 µs per operation
cudaMemcpyH2D : 42.67 µs per operation
cudaLaunchKernel : 15.89 µs per operation
cudaMemcpyD2H : 38.12 µs per operation
cudaFree : 10.34 µs per operation
Full Operation : 130.47 µs per operation
# With tracing
cudaMalloc : 25.12 µs per operation
cudaMemcpyH2D : 45.89 µs per operation
cudaLaunchKernel : 17.23 µs per operation
cudaMemcpyD2H : 41.56 µs per operation
cudaFree : 11.78 µs per operation
Full Operation : 141.58 µs per operation
```
In this example, tracing adds about 7-10% overhead to CUDA operations. This is typically acceptable for debugging and profiling purposes, but it's important to be aware of this impact when interpreting the results.
### Optimization Opportunities
If you find the tracing overhead too high for your use case, there are several ways to reduce it:
1. Trace only specific CUDA functions that are relevant to your investigation
2. Filter by specific process IDs to minimize the number of events captured
3. Disable return probes using the `-r` flag if you don't need return values
4. Consider running eBPF in user-space with tools like [bpftime](https://github.com/eunomia-bpf/bpftime) to reduce context-switching overhead
If you'd like to dive deeper into eBPF, check out our tutorial repository at https://github.com/eunomia-bpf/bpf-developer-tutorial or visit our website at https://eunomia.dev/tutorials/.

217
src/47-cuda-events/bench.cu Normal file
View File

@@ -0,0 +1,217 @@
/* CUDA benchmark for measuring tracing overhead
* This program performs a series of CUDA operations repeatedly and
* measures the execution time to analyze tracing overhead.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
// Number of iterations for the benchmark
#define NUM_ITERATIONS 10000
// Size of test data in bytes
#define DATA_SIZE (1024 * 1024) // 1MB
// Simple CUDA kernel: doubles every element of the array in place.
__global__ void multiplyBy2Kernel(float *data, int n) {
    // Global thread index derived from block/thread coordinates.
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= n)
        return; // threads past the end of the array do nothing
    data[tid] *= 2.0f;
}
// Abort the program with a diagnostic message if a CUDA runtime call failed.
void checkCudaError(cudaError_t err, const char *msg) {
    if (err == cudaSuccess)
        return;
    fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
}
// Times `iterations` invocations of `operation` using CUDA events and
// prints the average cost per call in microseconds.
// Returns that per-operation average so the caller can summarize later.
double measureOperation(const char *name, int iterations, void (*operation)(void)) {
    cudaEvent_t start, stop;
    checkCudaError(cudaEventCreate(&start), "cudaEventCreate start");
    checkCudaError(cudaEventCreate(&stop), "cudaEventCreate stop");

    // One untimed call so lazy initialization doesn't pollute the numbers.
    operation();
    // Drain any outstanding GPU work before the timed region begins.
    cudaDeviceSynchronize();

    checkCudaError(cudaEventRecord(start), "cudaEventRecord start");
    for (int iter = 0; iter < iterations; iter++) {
        operation();
    }
    checkCudaError(cudaEventRecord(stop), "cudaEventRecord stop");
    checkCudaError(cudaEventSynchronize(stop), "cudaEventSynchronize");

    float elapsed_ms = 0;
    checkCudaError(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");

    // Convert total milliseconds into an average in microseconds.
    double avg_us = (elapsed_ms * 1000.0) / iterations;
    printf("%-20s: %10.2f µs per operation (total: %.2f ms for %d iterations)\n",
           name, avg_us, elapsed_ms, iterations);

    checkCudaError(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCudaError(cudaEventDestroy(stop), "cudaEventDestroy stop");
    return avg_us;
}
// --- CUDA operations to benchmark ---

// Device buffer shared by the individual operation benchmarks below.
float *d_data = NULL;

// Benchmark body for cudaMalloc: release any previous allocation,
// then allocate a fresh DATA_SIZE buffer on the device.
void cudaMallocOperation() {
    if (d_data) {
        cudaFree(d_data);
        d_data = NULL;
    }
    cudaMalloc((void**)&d_data, DATA_SIZE);
}
// Host-side staging buffer shared by the benchmarks below.
float *h_data = NULL;

// Benchmark body for host-to-device cudaMemcpy. Lazily creates and fills
// the host buffer, and allocates the device buffer, on first use.
void cudaMemcpyHToDOperation() {
    if (!h_data) {
        h_data = (float*)malloc(DATA_SIZE);
        int count = DATA_SIZE / sizeof(float);
        for (int i = 0; i < count; i++) {
            h_data[i] = (float)i;
        }
    }
    if (!d_data) {
        cudaMalloc((void**)&d_data, DATA_SIZE);
    }
    cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
}
// Benchmark body for a kernel launch plus completion. Lazily sets up the
// device buffer (copying initialized host data in) the first time around.
void cudaKernelLaunchOperation() {
    if (d_data == NULL) {
        cudaMalloc((void**)&d_data, DATA_SIZE);
        if (h_data == NULL) {
            h_data = (float*)malloc(DATA_SIZE);
            for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
                h_data[i] = (float)i;
            }
        }
        cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
    }
    const int n = DATA_SIZE / sizeof(float);
    const int threadsPerBlock = 256;
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    multiplyBy2Kernel<<<blocks, threadsPerBlock>>>(d_data, n);
    // Wait for the kernel so its full cost is attributed to this call.
    cudaDeviceSynchronize();
}
// Benchmark body for device-to-host cudaMemcpy. Ensures both buffers exist
// (and that the device buffer holds kernel output) before timing the copy
// back to the host.
//
// Fix: the original initialized h_data only inside the d_data == NULL branch,
// so a pre-existing device buffer with no host buffer would cudaMemcpy into a
// NULL host pointer. The host buffer is now ensured unconditionally; behavior
// in the existing call sequence (where h_data is already set) is unchanged.
void cudaMemcpyDToHOperation() {
    if (h_data == NULL) {
        h_data = (float*)malloc(DATA_SIZE);
        for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
            h_data[i] = (float)i;
        }
    }
    if (d_data == NULL) {
        cudaMalloc((void**)&d_data, DATA_SIZE);
        cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
        int numElements = DATA_SIZE / sizeof(float);
        int blockSize = 256;
        int numBlocks = (numElements + blockSize - 1) / blockSize;
        multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_data, numElements);
        cudaDeviceSynchronize();
    }
    cudaMemcpy(h_data, d_data, DATA_SIZE, cudaMemcpyDeviceToHost);
}
// Benchmark body for cudaFree: release the shared device buffer if present.
void cudaFreeOperation() {
    if (d_data == NULL)
        return;
    cudaFree(d_data);
    d_data = NULL;
}
// One complete round trip: allocate, upload, run the kernel, download, free.
// Uses a local device buffer so it doesn't disturb the shared d_data.
void fullOperation() {
    float *dev = NULL;
    cudaMalloc((void**)&dev, DATA_SIZE);

    // Lazily build the host staging buffer on first use.
    if (h_data == NULL) {
        h_data = (float*)malloc(DATA_SIZE);
        for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
            h_data[i] = (float)i;
        }
    }

    // Upload, transform on the GPU, then download the result.
    cudaMemcpy(dev, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
    int n = DATA_SIZE / sizeof(float);
    int tpb = 256;
    int nblocks = (n + tpb - 1) / tpb;
    multiplyBy2Kernel<<<nblocks, tpb>>>(dev, n);
    cudaDeviceSynchronize();
    cudaMemcpy(h_data, dev, DATA_SIZE, cudaMemcpyDeviceToHost);

    cudaFree(dev);
}
// Entry point: runs each operation benchmark in turn, prints a summary,
// and releases the shared buffers before exiting.
int main(int argc, char **argv) {
    printf("CUDA Benchmark for Tracing Overhead\n");
    printf("-----------------------------------\n");
    printf("Data size: %d bytes (%d KB)\n", DATA_SIZE, DATA_SIZE / 1024);
    printf("Iterations: %d\n\n", NUM_ITERATIONS);

    // Run each benchmark; measureOperation prints per-run detail and
    // returns the average microseconds per call.
    double t_malloc = measureOperation("cudaMalloc", NUM_ITERATIONS, cudaMallocOperation);
    double t_h2d    = measureOperation("cudaMemcpyH2D", NUM_ITERATIONS, cudaMemcpyHToDOperation);
    double t_kernel = measureOperation("cudaLaunchKernel", NUM_ITERATIONS, cudaKernelLaunchOperation);
    double t_d2h    = measureOperation("cudaMemcpyD2H", NUM_ITERATIONS, cudaMemcpyDToHOperation);
    double t_free   = measureOperation("cudaFree", NUM_ITERATIONS, cudaFreeOperation);
    double t_full   = measureOperation("Full Operation", NUM_ITERATIONS, fullOperation);

    // Summary table of the averages collected above.
    printf("\nSummary (average time per operation):\n");
    printf("-----------------------------------\n");
    printf("cudaMalloc: %10.2f µs\n", t_malloc);
    printf("cudaMemcpyH2D: %10.2f µs\n", t_h2d);
    printf("cudaLaunchKernel: %10.2f µs\n", t_kernel);
    printf("cudaMemcpyD2H: %10.2f µs\n", t_d2h);
    printf("cudaFree: %10.2f µs\n", t_free);
    printf("Full Operation: %10.2f µs\n", t_full);

    // Release the shared host and device buffers.
    if (h_data != NULL) {
        free(h_data);
    }
    if (d_data != NULL) {
        cudaFree(d_data);
    }
    return 0;
}