Files
bpf-developer-tutorial/src/47-cuda-events/basic02.cu
2025-05-24 18:02:46 +08:00

63 lines
1.7 KiB
Plaintext

#include <stdio.h>
#include <cuda_runtime.h>
// Doubles a 32-bit signed integer with a single inline PTX instruction.
//
// `mul.lo.s32` multiplies x by the immediate 2 and keeps the low 32 bits of
// the product, so the result wraps exactly like a 32-bit register multiply.
//
// x: value to double.
// Returns the low 32 bits of x * 2.
__device__ int multiplyByTwo(int x) {
    int doubled;
    asm("mul.lo.s32 %0, %1, 2;"
        : "=r"(doubled)  // %0: output operand, bound to `doubled`
        : "r"(x));       // %1: input operand, bound to `x`
    return doubled;
}
// CUDA kernel using PTX inline assembly: writes multiplyByTwo(input[i]) to
// output[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used).
// The grid-stride loop makes the kernel correct for any grid/block size,
// including launches with fewer than n threads; threads beyond n do nothing.
// No shared memory is used.
//
// input:  device pointer to at least n readable ints.
// output: device pointer to at least n writable ints.
// n:      number of elements; n <= 0 makes the kernel a no-op.
__global__ void vectorMultiplyByTwoPTX(int* input, int* output, int n) {
    // 64-bit indices: blockIdx.x * blockDim.x is otherwise evaluated in
    // 32-bit arithmetic and can wrap when n is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += stride) {
        output[i] = multiplyByTwo(input[i]);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Host function: copies h_input to the device, doubles every element with
// vectorMultiplyByTwoPTX, and copies the result back into h_output.
//
// h_input:  host array of at least n ints to read.
// h_output: host array of at least n ints to fill.
// n:        number of elements; n <= 0 is a no-op.
//
// Every CUDA call is checked. On the first failure the error is printed to
// stderr, the remaining steps are skipped, and device memory is still
// released; h_output is left unmodified in that case.
void vectorMultiplyByTwo(int* h_input, int* h_output, int n) {
    // Nothing to process; also avoids a zero-block launch, which is an
    // invalid launch configuration.
    if (n <= 0) {
        return;
    }

    const size_t bytes = (size_t)n * sizeof(int);
    int* d_input = NULL;
    int* d_output = NULL;

    // Allocate device memory and copy the input over.
    bool ok = checkCuda(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)");
    ok = ok && checkCuda(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)");
    ok = ok && checkCuda(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy(host to device)");

    if (ok) {
        // Launch kernel with one thread per element.
        const int blockSize = 256;
        // Ceil-div written so that it cannot overflow for n near INT_MAX.
        const int numBlocks = (n - 1) / blockSize + 1;
        vectorMultiplyByTwoPTX<<<numBlocks, blockSize>>>(d_input, d_output, n);
        // A launch does not return a status; launch errors surface here.
        ok = checkCuda(cudaGetLastError(), "vectorMultiplyByTwoPTX launch");
    }

    // Copy result back to host. This blocking copy also waits for the kernel
    // and reports any error raised while it was executing.
    ok = ok && checkCuda(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy(device to host)");

    // Free device memory on every path; cudaFree(NULL) is a no-op.
    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");
}
// Demo driver: doubles the integers 0..n-1 on the GPU, prints the first ten
// results, and checks every result against a host-side reference.
//
// Returns 0 when all n results are correct, 1 otherwise.
int main() {
    const int n = 1000;
    int h_input[n];
    // Zero-initialized so that a GPU path that fails to fill the buffer
    // never causes indeterminate values to be read below.
    int h_output[n] = {0};

    // Initialize input data
    for (int i = 0; i < n; i++) {
        h_input[i] = i;
    }

    // Perform vector multiplication
    vectorMultiplyByTwo(h_input, h_output, n);

    // Show a sample of the results
    printf("First 10 results:\n");
    for (int i = 0; i < 10; i++) {
        printf("%d * 2 = %d\n", h_input[i], h_output[i]);
    }

    // Verify all results against the host reference
    int mismatches = 0;
    for (int i = 0; i < n; i++) {
        const int expected = h_input[i] * 2;
        if (h_output[i] != expected) {
            // Only the first mismatch is printed in detail to keep output short.
            if (mismatches == 0) {
                fprintf(stderr, "Mismatch at index %d: expected %d, got %d\n",
                        i, expected, h_output[i]);
            }
            mismatches++;
        }
    }
    if (mismatches != 0) {
        fprintf(stderr, "%d of %d results are incorrect\n", mismatches, n);
        return 1;
    }
    return 0;
}