// Mirror of https://github.com/eunomia-bpf/bpf-developer-tutorial.git
// (synced 2026-02-13 23:26:09 +08:00; 63 lines, 1.7 KiB, plaintext)
#include <stdio.h>
|
|
#include <cuda_runtime.h>
|
|
|
|
// Device helper that doubles a 32-bit signed integer with inline PTX.
// The single `mul.lo.s32` instruction multiplies x by the immediate 2 and
// keeps the low 32 bits of the product.
__device__ int multiplyByTwo(int x) {
    int doubled;
    asm("mul.lo.s32 %0, %1, 2;"
        : "=r"(doubled)   // %0: output register
        : "r"(x));        // %1: input register
    return doubled;
}
|
|
|
|
// Kernel: each thread handles at most one element, identified by its flat
// 1D index, and stores multiplyByTwo(input[i]) into output[i].
// Threads whose index is n or greater return without touching memory.
__global__ void vectorMultiplyByTwoPTX(int* input, int* output, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= n) {
        return;
    }
    output[gid] = multiplyByTwo(input[gid]);
}
|
|
|
|
// Host wrapper: copies n ints from h_input to the device, launches
// vectorMultiplyByTwoPTX over them, and copies the n results into h_output.
//
// Parameters:
//   h_input  - host buffer of n ints to read
//   h_output - host buffer of n ints to write
//   n        - number of elements; n <= 0 is a no-op
//
// The signature stays void, so the first failing CUDA call is reported on
// stderr and the remaining steps are skipped; in that case h_output is not
// written. Device allocations are released on every path.
void vectorMultiplyByTwo(int* h_input, int* h_output, int n) {
    // A zero-block launch is an invalid configuration, and a negative n would
    // wrap to a huge byte count when converted to size_t.
    if (n <= 0) {
        return;
    }

    const size_t bytes = static_cast<size_t>(n) * sizeof(int);

    // Null-initialized so the unconditional cudaFree calls below are safe
    // even when an allocation was never made.
    int* d_input = nullptr;
    int* d_output = nullptr;

    // Allocate device memory
    cudaError_t status = cudaMalloc(&d_input, bytes);
    if (status == cudaSuccess) {
        status = cudaMalloc(&d_output, bytes);
    }

    // Copy input data to device
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice);
    }

    // Launch kernel
    if (status == cudaSuccess) {
        const int blockSize = 256;
        // Ceil-division written so it cannot overflow int for n near INT_MAX
        // (n > 0 is guaranteed by the early return above).
        const int numBlocks = (n - 1) / blockSize + 1;
        vectorMultiplyByTwoPTX<<<numBlocks, blockSize>>>(d_input, d_output, n);
        // A launch does not return an error code; query it explicitly.
        status = cudaGetLastError();
    }

    // Copy result back to host. This blocking copy also reports errors that
    // occurred while the kernel was executing.
    if (status == cudaSuccess) {
        status = cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost);
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "vectorMultiplyByTwo: CUDA error: %s\n",
                cudaGetErrorString(status));
    }

    // Free device memory (cudaFree on a null pointer is a no-op)
    cudaFree(d_input);
    cudaFree(d_output);
}
|
|
|
|
// Entry point: fills h_input with 0..n-1, doubles it through
// vectorMultiplyByTwo, prints the first ten results, and then checks every
// element against 2 * input.
// Returns 0 when all n results match, 1 on the first mismatch.
int main() {
    const int n = 1000;
    int h_input[n];
    // Zero-initialized so no indeterminate values are read below if
    // vectorMultiplyByTwo does not write the buffer (e.g. a CUDA call failed).
    int h_output[n] = {0};

    // Initialize input data
    for (int i = 0; i < n; i++) {
        h_input[i] = i;
    }

    // Perform vector multiplication
    vectorMultiplyByTwo(h_input, h_output, n);

    // Show a sample of the results
    printf("First 10 results:\n");
    for (int i = 0; i < 10; i++) {
        printf("%d * 2 = %d\n", h_input[i], h_output[i]);
    }

    // Verify results: every element must equal twice its input
    for (int i = 0; i < n; i++) {
        const int expected = 2 * h_input[i];
        if (h_output[i] != expected) {
            fprintf(stderr, "Mismatch at index %d: expected %d, got %d\n",
                    i, expected, h_output[i]);
            return 1;
        }
    }

    return 0;
}