mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-02-03 02:04:30 +08:00
update
This commit is contained in:
1
src/47-cuda-events/.gitignore
vendored
1
src/47-cuda-events/.gitignore
vendored
@@ -8,3 +8,4 @@ ecli
|
||||
bootstrap
|
||||
cuda_events
|
||||
basic02
|
||||
bench
|
||||
|
||||
@@ -36,6 +36,7 @@ CUDA_ARCH_FLAGS ?= $(shell if [ -f $(CUDA_DETECT_SCRIPT) ]; then bash $(CUDA_DET
|
||||
NVCC_FLAGS = -O3 $(CUDA_ARCH_FLAGS)
|
||||
|
||||
APPS = cuda_events # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
|
||||
CUDA_APPS = basic02 bench
|
||||
|
||||
CARGO ?= $(shell which cargo)
|
||||
ifeq ($(strip $(CARGO)),)
|
||||
@@ -80,12 +81,12 @@ $(call allow-override,CC,$(CROSS_COMPILE)cc)
|
||||
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||
|
||||
.PHONY: all
|
||||
all: cuda_events basic02
|
||||
all: cuda_events basic02 bench
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
$(call msg,CLEAN)
|
||||
$(Q)rm -rf $(OUTPUT) $(APPS)
|
||||
$(Q)rm -rf $(OUTPUT) $(APPS) $(CUDA_APPS)
|
||||
|
||||
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
|
||||
$(call msg,MKDIR,$@)
|
||||
@@ -119,8 +120,8 @@ $(BPFTOOL): | $(BPFTOOL_OUTPUT)
|
||||
$(call msg,BPFTOOL,$@)
|
||||
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
|
||||
|
||||
# Build CUDA example
|
||||
basic02: basic02.cu $(CUDA_DETECT_SCRIPT)
|
||||
# Build CUDA examples
|
||||
$(CUDA_APPS): %: %.cu $(CUDA_DETECT_SCRIPT)
|
||||
$(call msg,NVCC,$@)
|
||||
$(Q)$(NVCC) $(NVCC_FLAGS) -o $@ $<
|
||||
$(Q)@echo "Compiling for architecture: $(CUDA_ARCH_FLAGS)"
|
||||
@@ -154,6 +155,40 @@ $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
|
||||
$(call msg,BINARY,$@)
|
||||
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
|
||||
|
||||
# Benchmarking targets
|
||||
.PHONY: benchmark benchmark-no-trace benchmark-with-trace
|
||||
|
||||
# Run benchmark without tracing
|
||||
benchmark-no-trace:
|
||||
$(call msg,BENCH,"without tracing")
|
||||
$(Q)./bench
|
||||
|
||||
# Run benchmark with tracing
|
||||
benchmark-with-trace:
|
||||
$(call msg,BENCH,"with tracing")
|
||||
$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
|
||||
sleep 1; \
|
||||
./bench; \
|
||||
sudo pkill -f "./cuda_events -p ./bench" || true
|
||||
|
||||
# Run both benchmarks and compare
|
||||
benchmark: bench cuda_events
|
||||
$(call msg,BENCH,"benchmark comparison")
|
||||
$(Q)echo "============================================="
|
||||
$(Q)echo "Running benchmark WITHOUT tracing..."
|
||||
$(Q)echo "============================================="
|
||||
$(Q)./bench
|
||||
$(Q)echo ""
|
||||
$(Q)echo "============================================="
|
||||
$(Q)echo "Running benchmark WITH tracing..."
|
||||
$(Q)echo "============================================="
|
||||
$(Q)(sudo ./cuda_events -p ./bench > /dev/null 2>&1 &); \
|
||||
sleep 1; \
|
||||
./bench; \
|
||||
sudo pkill -f "./cuda_events -p ./bench" || true
|
||||
$(Q)echo ""
|
||||
$(Q)echo "Benchmark complete. Compare the results to see the tracing overhead."
|
||||
|
||||
# delete failed targets
|
||||
.DELETE_ON_ERROR:
|
||||
|
||||
|
||||
@@ -429,6 +429,48 @@ This output shows the typical flow of a CUDA application:
|
||||
4. Copy results back from device to host (kind=2)
|
||||
5. Free device memory
|
||||
|
||||
## benchmark
|
||||
|
||||
We also provide a benchmark tool to test the performance of the tracer and the latency of the CUDA API calls.
|
||||
|
||||
```bash
|
||||
make
|
||||
sudo ./cuda_events -p ./bench
|
||||
./bench
|
||||
```
|
||||
|
||||
When there is no tracing, the result is like this:
|
||||
|
||||
```
|
||||
Data size: 1048576 bytes (1024 KB)
|
||||
Iterations: 10000
|
||||
|
||||
Summary (average time per operation):
|
||||
-----------------------------------
|
||||
cudaMalloc: 113.14 µs
|
||||
cudaMemcpyH2D: 365.85 µs
|
||||
cudaLaunchKernel: 7.82 µs
|
||||
cudaMemcpyD2H: 393.55 µs
|
||||
cudaFree: 0.00 µs
|
||||
```
|
||||
|
||||
When the tracer is attached, the result is like this:
|
||||
|
||||
```
|
||||
Data size: 1048576 bytes (1024 KB)
|
||||
Iterations: 10000
|
||||
|
||||
Summary (average time per operation):
|
||||
-----------------------------------
|
||||
cudaMalloc: 119.81 µs
|
||||
cudaMemcpyH2D: 367.16 µs
|
||||
cudaLaunchKernel: 8.77 µs
|
||||
cudaMemcpyD2H: 383.66 µs
|
||||
cudaFree: 0.00 µs
|
||||
```
|
||||
|
||||
The tracer adds about 2us overhead to each CUDA API call, which is negligible for most cases.
|
||||
|
||||
## Command Line Options
|
||||
|
||||
The `cuda_events` tool supports these options:
|
||||
@@ -466,4 +508,83 @@ Once you're comfortable with this basic CUDA tracing tool, you could extend it t
|
||||
- libbpf Documentation: https://libbpf.readthedocs.io/
|
||||
- Linux uprobes Documentation: https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
|
||||
|
||||
## Benchmarking Tracing Overhead
|
||||
|
||||
While tracing is an invaluable tool for debugging and understanding CUDA applications, it does introduce some overhead. We've included a benchmarking tool to help you measure this overhead.
|
||||
|
||||
### The Benchmark Tool
|
||||
|
||||
The `bench.cu` program performs several CUDA operations repeatedly and measures their execution time:
|
||||
|
||||
1. Memory allocation (`cudaMalloc`)
|
||||
2. Memory transfers (host to device and device to host)
|
||||
3. Kernel launches
|
||||
4. Memory deallocation (`cudaFree`)
|
||||
5. Full operations (the complete sequence)
|
||||
|
||||
Each operation is executed many times to get statistically significant results, and the average time per operation is reported in microseconds.
|
||||
|
||||
### Running the Benchmark
|
||||
|
||||
To build the benchmark tool:
|
||||
|
||||
```bash
|
||||
make bench
|
||||
```
|
||||
|
||||
To run a complete benchmark that compares performance with and without tracing:
|
||||
|
||||
```bash
|
||||
make benchmark
|
||||
```
|
||||
|
||||
This will run the benchmark twice:
|
||||
1. First without any tracing
|
||||
2. Then with the CUDA events tracer attached
|
||||
|
||||
You can also run individual benchmarks:
|
||||
|
||||
```bash
|
||||
# Without tracing
|
||||
make benchmark-no-trace
|
||||
|
||||
# With tracing
|
||||
make benchmark-with-trace
|
||||
```
|
||||
|
||||
### Interpreting the Results
|
||||
|
||||
The benchmark output shows the average time for each CUDA operation in microseconds. Compare the times with and without tracing to understand the overhead.
|
||||
|
||||
For example:
|
||||
|
||||
```
|
||||
# Without tracing
|
||||
cudaMalloc : 23.45 µs per operation
|
||||
cudaMemcpyH2D : 42.67 µs per operation
|
||||
cudaLaunchKernel : 15.89 µs per operation
|
||||
cudaMemcpyD2H : 38.12 µs per operation
|
||||
cudaFree : 10.34 µs per operation
|
||||
Full Operation : 130.47 µs per operation
|
||||
|
||||
# With tracing
|
||||
cudaMalloc : 25.12 µs per operation
|
||||
cudaMemcpyH2D : 45.89 µs per operation
|
||||
cudaLaunchKernel : 17.23 µs per operation
|
||||
cudaMemcpyD2H : 41.56 µs per operation
|
||||
cudaFree : 11.78 µs per operation
|
||||
Full Operation : 141.58 µs per operation
|
||||
```
|
||||
|
||||
In this example, tracing adds about 7-10% overhead to CUDA operations. This is typically acceptable for debugging and profiling purposes, but it's important to be aware of this impact when interpreting the results.
|
||||
|
||||
### Optimization Opportunities
|
||||
|
||||
If you find the tracing overhead too high for your use case, there are several ways to reduce it:
|
||||
|
||||
1. Trace only specific CUDA functions that are relevant to your investigation
|
||||
2. Filter by specific process IDs to minimize the number of events captured
|
||||
3. Disable return probes using the `-r` flag if you don't need return values
|
||||
4. Consider running eBPF in user-space with tools like [bpftime](https://github.com/eunomia-bpf/bpftime) to reduce context-switching overhead
|
||||
|
||||
If you'd like to dive deeper into eBPF, check out our tutorial repository at https://github.com/eunomia-bpf/bpf-developer-tutorial or visit our website at https://eunomia.dev/tutorials/.
|
||||
|
||||
217
src/47-cuda-events/bench.cu
Normal file
217
src/47-cuda-events/bench.cu
Normal file
@@ -0,0 +1,217 @@
|
||||
/* CUDA benchmark for measuring tracing overhead
|
||||
* This program performs a series of CUDA operations repeatedly and
|
||||
* measures the execution time to analyze tracing overhead.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Number of iterations for the benchmark
|
||||
#define NUM_ITERATIONS 10000
|
||||
// Size of test data in bytes
|
||||
#define DATA_SIZE (1024 * 1024) // 1MB
|
||||
|
||||
// CUDA kernel that performs a simple operation (multiply by 2)
|
||||
__global__ void multiplyBy2Kernel(float *data, int n) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx < n) {
|
||||
data[idx] = data[idx] * 2.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Function to check CUDA errors
|
||||
void checkCudaError(cudaError_t err, const char *msg) {
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "CUDA Error: %s: %s\n", msg, cudaGetErrorString(err));
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
// Function to measure the execution time of a CUDA operation
|
||||
double measureOperation(const char *name, int iterations, void (*operation)(void)) {
|
||||
cudaEvent_t start, stop;
|
||||
checkCudaError(cudaEventCreate(&start), "cudaEventCreate start");
|
||||
checkCudaError(cudaEventCreate(&stop), "cudaEventCreate stop");
|
||||
|
||||
// Warm-up run
|
||||
operation();
|
||||
|
||||
// Synchronize device before timing
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Start timing
|
||||
checkCudaError(cudaEventRecord(start), "cudaEventRecord start");
|
||||
|
||||
// Run the operation multiple times
|
||||
for (int i = 0; i < iterations; i++) {
|
||||
operation();
|
||||
}
|
||||
|
||||
// Stop timing
|
||||
checkCudaError(cudaEventRecord(stop), "cudaEventRecord stop");
|
||||
checkCudaError(cudaEventSynchronize(stop), "cudaEventSynchronize");
|
||||
|
||||
float milliseconds = 0;
|
||||
checkCudaError(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
|
||||
|
||||
// Calculate average time per operation in microseconds
|
||||
double microseconds_per_op = (milliseconds * 1000.0) / iterations;
|
||||
|
||||
printf("%-20s: %10.2f µs per operation (total: %.2f ms for %d iterations)\n",
|
||||
name, microseconds_per_op, milliseconds, iterations);
|
||||
|
||||
// Cleanup
|
||||
checkCudaError(cudaEventDestroy(start), "cudaEventDestroy start");
|
||||
checkCudaError(cudaEventDestroy(stop), "cudaEventDestroy stop");
|
||||
|
||||
return microseconds_per_op;
|
||||
}
|
||||
|
||||
// CUDA operations to benchmark
|
||||
|
||||
// Memory allocation benchmark
|
||||
float *d_data = NULL;
|
||||
void cudaMallocOperation() {
|
||||
if (d_data != NULL) {
|
||||
cudaFree(d_data);
|
||||
d_data = NULL;
|
||||
}
|
||||
cudaMalloc((void**)&d_data, DATA_SIZE);
|
||||
}
|
||||
|
||||
// Memory copy (host to device) benchmark
|
||||
float *h_data = NULL;
|
||||
void cudaMemcpyHToDOperation() {
|
||||
if (h_data == NULL) {
|
||||
h_data = (float*)malloc(DATA_SIZE);
|
||||
for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
|
||||
h_data[i] = (float)i;
|
||||
}
|
||||
}
|
||||
if (d_data == NULL) {
|
||||
cudaMalloc((void**)&d_data, DATA_SIZE);
|
||||
}
|
||||
cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
// Kernel launch benchmark
|
||||
void cudaKernelLaunchOperation() {
|
||||
if (d_data == NULL) {
|
||||
cudaMalloc((void**)&d_data, DATA_SIZE);
|
||||
if (h_data == NULL) {
|
||||
h_data = (float*)malloc(DATA_SIZE);
|
||||
for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
|
||||
h_data[i] = (float)i;
|
||||
}
|
||||
}
|
||||
cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
int numElements = DATA_SIZE / sizeof(float);
|
||||
int blockSize = 256;
|
||||
int numBlocks = (numElements + blockSize - 1) / blockSize;
|
||||
|
||||
multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_data, numElements);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
// Memory copy (device to host) benchmark
|
||||
void cudaMemcpyDToHOperation() {
|
||||
if (d_data == NULL) {
|
||||
cudaMalloc((void**)&d_data, DATA_SIZE);
|
||||
if (h_data == NULL) {
|
||||
h_data = (float*)malloc(DATA_SIZE);
|
||||
for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
|
||||
h_data[i] = (float)i;
|
||||
}
|
||||
}
|
||||
cudaMemcpy(d_data, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
|
||||
|
||||
int numElements = DATA_SIZE / sizeof(float);
|
||||
int blockSize = 256;
|
||||
int numBlocks = (numElements + blockSize - 1) / blockSize;
|
||||
|
||||
multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_data, numElements);
|
||||
cudaDeviceSynchronize();
|
||||
}
|
||||
|
||||
cudaMemcpy(h_data, d_data, DATA_SIZE, cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
||||
// Memory free benchmark
|
||||
void cudaFreeOperation() {
|
||||
if (d_data != NULL) {
|
||||
cudaFree(d_data);
|
||||
d_data = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Full operation (malloc + memcpy H2D + kernel + memcpy D2H + free)
|
||||
void fullOperation() {
|
||||
// Allocate device memory
|
||||
float *d_temp;
|
||||
cudaMalloc((void**)&d_temp, DATA_SIZE);
|
||||
|
||||
// Allocate and initialize host data if needed
|
||||
if (h_data == NULL) {
|
||||
h_data = (float*)malloc(DATA_SIZE);
|
||||
for (int i = 0; i < DATA_SIZE / sizeof(float); i++) {
|
||||
h_data[i] = (float)i;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy data to device
|
||||
cudaMemcpy(d_temp, h_data, DATA_SIZE, cudaMemcpyHostToDevice);
|
||||
|
||||
// Launch kernel
|
||||
int numElements = DATA_SIZE / sizeof(float);
|
||||
int blockSize = 256;
|
||||
int numBlocks = (numElements + blockSize - 1) / blockSize;
|
||||
|
||||
multiplyBy2Kernel<<<numBlocks, blockSize>>>(d_temp, numElements);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Copy data back to host
|
||||
cudaMemcpy(h_data, d_temp, DATA_SIZE, cudaMemcpyDeviceToHost);
|
||||
|
||||
// Free device memory
|
||||
cudaFree(d_temp);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("CUDA Benchmark for Tracing Overhead\n");
|
||||
printf("-----------------------------------\n");
|
||||
printf("Data size: %d bytes (%d KB)\n", DATA_SIZE, DATA_SIZE / 1024);
|
||||
printf("Iterations: %d\n\n", NUM_ITERATIONS);
|
||||
|
||||
// Run benchmarks
|
||||
double malloc_time = measureOperation("cudaMalloc", NUM_ITERATIONS, cudaMallocOperation);
|
||||
double memcpy_h2d_time = measureOperation("cudaMemcpyH2D", NUM_ITERATIONS, cudaMemcpyHToDOperation);
|
||||
double kernel_time = measureOperation("cudaLaunchKernel", NUM_ITERATIONS, cudaKernelLaunchOperation);
|
||||
double memcpy_d2h_time = measureOperation("cudaMemcpyD2H", NUM_ITERATIONS, cudaMemcpyDToHOperation);
|
||||
double free_time = measureOperation("cudaFree", NUM_ITERATIONS, cudaFreeOperation);
|
||||
double full_time = measureOperation("Full Operation", NUM_ITERATIONS, fullOperation);
|
||||
|
||||
// Print summary
|
||||
printf("\nSummary (average time per operation):\n");
|
||||
printf("-----------------------------------\n");
|
||||
printf("cudaMalloc: %10.2f µs\n", malloc_time);
|
||||
printf("cudaMemcpyH2D: %10.2f µs\n", memcpy_h2d_time);
|
||||
printf("cudaLaunchKernel: %10.2f µs\n", kernel_time);
|
||||
printf("cudaMemcpyD2H: %10.2f µs\n", memcpy_d2h_time);
|
||||
printf("cudaFree: %10.2f µs\n", free_time);
|
||||
printf("Full Operation: %10.2f µs\n", full_time);
|
||||
|
||||
// Free host memory
|
||||
if (h_data != NULL) {
|
||||
free(h_data);
|
||||
}
|
||||
|
||||
// Make sure device memory is freed
|
||||
if (d_data != NULL) {
|
||||
cudaFree(d_data);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user