#include #include // Define a simple PTX inline assembly function that multiplies a number by 2 __device__ int multiplyByTwo(int x) { int result; asm("mul.lo.s32 %0, %1, 2;" : "=r"(result) : "r"(x)); return result; } // CUDA kernel using PTX inline assembly __global__ void vectorMultiplyByTwoPTX(int* input, int* output, int n) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { output[idx] = multiplyByTwo(input[idx]); } } // Host function to initialize data and launch kernel void vectorMultiplyByTwo(int* h_input, int* h_output, int n) { int *d_input, *d_output; // Allocate device memory cudaMalloc(&d_input, n * sizeof(int)); cudaMalloc(&d_output, n * sizeof(int)); // Copy input data to device cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice); // Launch kernel int blockSize = 256; int numBlocks = (n + blockSize - 1) / blockSize; vectorMultiplyByTwoPTX<<>>(d_input, d_output, n); // Copy result back to host cudaMemcpy(h_output, d_output, n * sizeof(int), cudaMemcpyDeviceToHost); // Free device memory cudaFree(d_input); cudaFree(d_output); } int main() { const int n = 1000; int h_input[n]; int h_output[n]; // Initialize input data for (int i = 0; i < n; i++) { h_input[i] = i; } // Perform vector multiplication vectorMultiplyByTwo(h_input, h_output, n); // Verify results printf("First 10 results:\n"); for (int i = 0; i < 10; i++) { printf("%d * 2 = %d\n", h_input[i], h_output[i]); } return 0; }