added cuda lexer and removed example from c++ samples

2025-12-08 20:38:47 +00:00 · 2013-11-05 10:57:12 +01:00
parent 6d7eae5011
commit 94b3ea3df5
4 changed files with 104 additions and 39 deletions
--- a/samples/Cuda/scalarProd_kernel.cuh
+++ b/samples/Cuda/scalarProd_kernel.cuh
@@ -0,0 +1,52 @@
+/**
+ * Computes one scalar (dot) product per pair of vectors.
+ *
+ * For every vec in [0, vectorN), the elementN products d_A[pos] * d_B[pos]
+ * for pos in [IMUL(elementN, vec), IMUL(elementN, vec) + elementN) are summed
+ * and the result is stored in d_C[vec].
+ *
+ * Launch layout: only the x dimension of the grid and of the block is used.
+ * Each block walks over vectors starting at blockIdx.x with a step of
+ * gridDim.x, so any grid size covers all vectorN vectors.
+ *
+ * Shared memory: ACCUM_N floats (statically sized accumResult array).
+ *
+ * Preconditions: ACCUM_N has to be a power of two for the tree reduction.
+ * NOTE(review): ACCUM_N and IMUL are defined outside this file; IMUL is
+ * presumably an integer multiply -- confirm against its definition.
+ *
+ * @param d_C      output, one float per vector pair (indexed by vec)
+ * @param d_A      first input, vectorN vectors of elementN floats back to back
+ * @param d_B      second input, same layout as d_A
+ * @param vectorN  number of vector pairs
+ * @param elementN number of elements per vector
+ */
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that the vector count can be different
+    // from the total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd  = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through the vector with a
+        // stride equal to the total number of accumulators, ACCUM_N.
+        // At this stage ACCUM_N is only preferred to be a multiple of the
+        // warp size to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0.0f;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be a power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            // Make the previous step's writes visible before they are read.
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        // Wait for the last reduction step to finish before any thread moves
+        // on to the next vector: threads that were idle in that step would
+        // otherwise start overwriting accumResult while it is still being read.
+        // The barrier is reached by the whole block because the enclosing loop
+        // condition depends only on blockIdx.x and gridDim.x.
+        __syncthreads();
+
+        // accumResult[0] is only ever written by thread 0, so it can publish
+        // the result without racing against the next iteration.
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
--- a/samples/Cuda/vectorAdd.cu
+++ b/samples/Cuda/vectorAdd.cu
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Element-wise vector addition: C = A + B.
+ *
+ * Each thread derives a single index from its block and thread coordinates
+ * along x and, when that index is below numElements, stores
+ * A[index] + B[index] into C[index]. Threads with an index at or beyond
+ * numElements do nothing.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads past the end of the vectors have no element to process.
+    if (idx >= numElements)
+    {
+        return;
+    }
+
+    C[idx] = A[idx] + B[idx];
+}
+
+/**
+ * Host main routine
+ *
+ * Launches vectorAdd with 256 threads per block and enough blocks to cover
+ * numElements elements, reports a failed launch, then resets the device.
+ *
+ * Returns 0 on success; terminates with EXIT_FAILURE if the kernel launch or
+ * the device reset reports an error.
+ *
+ * NOTE(review): numElements, d_A, d_B and d_C are used below but are not
+ * declared in the visible part of this file -- confirm where they come from.
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    // Round up so a partially filled last block still covers the tail.
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    // A kernel launch returns no status itself; query it explicitly.
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    // Do not silently drop a failed reset.
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}