From 94b3ea3df51edf6a7cfbd26016642e01844eb20e Mon Sep 17 00:00:00 2001
From: Kashif Rasul
Date: Tue, 5 Nov 2013 10:57:12 +0100
Subject: [PATCH] added cuda lexer and removed example from c++ samples

---
 lib/linguist/languages.yml         |  6 ++++
 samples/C++/cuda.cu                | 39 ----------------------
 samples/Cuda/scalarProd_kernel.cuh | 52 ++++++++++++++++++++++++++++++
 samples/Cuda/vectorAdd.cu          | 46 ++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 39 deletions(-)
 delete mode 100644 samples/C++/cuda.cu
 create mode 100644 samples/Cuda/scalarProd_kernel.cuh
 create mode 100644 samples/Cuda/vectorAdd.cu

diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml
index 33de0c01..dfa1fe03 100644
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -327,6 +327,12 @@ Cucumber:
   lexer: Gherkin
   primary_extension: .feature
 
+Cuda:
+  lexer: CUDA
+  primary_extension: .cu
+  extensions:
+  - .cuh
+
 Cython:
   type: programming
   group: Python
diff --git a/samples/C++/cuda.cu b/samples/C++/cuda.cu
deleted file mode 100644
index ddef40cd..00000000
--- a/samples/C++/cuda.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-void foo()
-{
-    cudaArray* cu_array;
-    texture<float, 2, cudaReadModeElementType> tex;
-
-    // Allocate array
-    cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
-    cudaMallocArray(&cu_array, &description, width, height);
-
-    // Copy image data to array
-    cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
-
-    // Set texture parameters (default)
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-    tex.filterMode = cudaFilterModePoint;
-    tex.normalized = false; // do not normalize coordinates
-
-    // Bind the array to the texture
-    cudaBindTextureToArray(tex, cu_array);
-
-    // Run kernel
-    dim3 blockDim(16, 16, 1);
-    dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
-    kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
-
-    // Unbind the array from the texture
-    cudaUnbindTexture(tex);
-} //end foo()
-
-__global__ void kernel(float* odata, int height, int width)
-{
-    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-    if (x < width && y < height) {
-        float c = tex2D(tex, x, y);
-        odata[y*width+x] = c;
-    }
-}
diff --git a/samples/Cuda/scalarProd_kernel.cuh b/samples/Cuda/scalarProd_kernel.cuh
new file mode 100644
index 00000000..7622c597
--- /dev/null
+++ b/samples/Cuda/scalarProd_kernel.cuh
@@ -0,0 +1,52 @@
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
\ No newline at end of file
diff --git a/samples/Cuda/vectorAdd.cu b/samples/Cuda/vectorAdd.cu
new file mode 100644
index 00000000..cdc21dff
--- /dev/null
+++ b/samples/Cuda/vectorAdd.cu
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    return 0;
+}
\ No newline at end of file