Merge pull request #758 from kashif/nvidia-cuda

added cuda lexer and removed example from c++ samples
Ted Nyman
2013-11-05 13:57:49 -08:00
5 changed files with 220 additions and 116 deletions


@@ -332,6 +332,12 @@ Cucumber:
  lexer: Gherkin
  primary_extension: .feature
Cuda:
  type: programming
  lexer: CUDA
  primary_extension: .cu
  extensions:
  - .cuh
Cython:
  type: programming
  group: Python
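
For orientation, below is a minimal .cu file of the kind this entry now routes to the CUDA lexer rather than to C++ — the __global__ qualifier and the <<<...>>> launch syntax are the CUDA-specific constructs involved. This snippet is illustrative only and is not one of the samples added in this commit.

// Illustrative only: a tiny CUDA program exercising the constructs
// (__global__, <<<...>>>) that distinguish CUDA source from plain C++.
#include <cuda_runtime.h>

__global__ void scale(float *data, float factor)
{
    data[threadIdx.x] *= factor;
}

int main(void)
{
    float *d_data;
    cudaMalloc((void **)&d_data, 32 * sizeof(float));
    scale<<<1, 32>>>(d_data, 2.0f);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}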


@@ -31,7 +31,6 @@
"C++": [
".cc",
".cpp",
".cu",
".h",
".hpp"
],
@@ -65,6 +64,10 @@
"CSS": [
".css"
],
"Cuda": [
".cu",
".cuh"
],
"Dart": [
".dart"
],
@@ -460,8 +463,8 @@
".gemrc"
]
},
"tokens_total": 417572,
"languages_total": 476,
"tokens_total": 417690,
"languages_total": 477,
"tokens": {
"ABAP": {
"*/**": 1,
@@ -7829,82 +7832,17 @@
"C++": {
"class": 34,
"Bar": 2,
"{": 553,
"{": 550,
"protected": 4,
"char": 122,
"*name": 6,
";": 2308,
";": 2290,
"public": 27,
"void": 152,
"void": 150,
"hello": 2,
"(": 2438,
")": 2440,
"}": 552,
"foo": 2,
"cudaArray*": 1,
"cu_array": 4,
"texture": 1,
"<float,>": 1,
"2": 1,
"cudaReadModeElementType": 1,
"tex": 4,
"cudaChannelFormatDesc": 1,
"description": 5,
"cudaCreateChannelDesc": 1,
"<float>": 1,
"cudaMallocArray": 1,
"&": 148,
"width": 5,
"height": 5,
"cudaMemcpyToArray": 1,
"image": 1,
"width*height*sizeof": 1,
"float": 9,
"cudaMemcpyHostToDevice": 1,
"tex.addressMode": 2,
"[": 204,
"]": 204,
"cudaAddressModeClamp": 2,
"tex.filterMode": 1,
"cudaFilterModePoint": 1,
"tex.normalized": 1,
"false": 43,
"//": 239,
"do": 5,
"not": 2,
"normalize": 1,
"coordinates": 1,
"cudaBindTextureToArray": 1,
"dim3": 2,
"blockDim": 2,
"gridDim": 2,
"+": 55,
"blockDim.x": 2,
"-": 227,
"/": 15,
"blockDim.y": 2,
"kernel": 2,
"<<": 19,
"<": 56,
"d_data": 1,
"cudaUnbindTexture": 1,
"//end": 1,
"__global__": 1,
"float*": 1,
"odata": 2,
"int": 148,
"unsigned": 22,
"x": 48,
"blockIdx.x*blockDim.x": 1,
"threadIdx.x": 1,
"y": 16,
"blockIdx.y*blockDim.y": 1,
"threadIdx.y": 1,
"if": 296,
"&&": 24,
"c": 52,
"tex2D": 1,
"y*width": 1,
"(": 2422,
")": 2424,
"}": 549,
"#include": 106,
"<QCoreApplication>": 1,
"<QString>": 1,
@@ -7916,6 +7854,7 @@
"NULL": 108,
"*Env": 1,
"instance": 4,
"if": 295,
"env_instance": 3,
"new": 9,
"return": 147,
@@ -7930,11 +7869,13 @@
"envvar": 2,
"name": 21,
"value": 18,
"int": 144,
"indexOfEquals": 5,
"for": 18,
"env": 3,
"envp": 4,
"*env": 1,
"+": 50,
"envvar.indexOf": 1,
"continue": 2,
"envvar.left": 1,
@@ -7951,6 +7892,7 @@
"*instance": 1,
"private": 12,
"#endif": 82,
"//": 238,
"GDSDBREADER_H": 3,
"<QDir>": 1,
"GDS_DIR": 1,
@@ -8007,6 +7949,7 @@
"A": 1,
"friend": 10,
"stream": 5,
"<<": 18,
"myclass.label": 2,
"myclass.depth": 2,
"myclass.userIndex": 2,
@@ -8020,6 +7963,7 @@
"myclass.firstLineData": 4,
"myclass.linesNumbers": 2,
"QDataStream": 2,
"&": 146,
"myclass": 1,
"//Don": 1,
"read": 1,
@@ -8066,6 +8010,7 @@
"ECDSA_SIG_recover_key_GFp": 3,
"ECDSA_SIG": 3,
"*ecsig": 1,
"unsigned": 20,
"*msg": 2,
"msglen": 2,
"recid": 3,
@@ -8084,10 +8029,13 @@
"*zero": 1,
"n": 28,
"i": 47,
"/": 13,
"-": 225,
"BN_CTX_start": 1,
"order": 8,
"BN_CTX_get": 8,
"EC_GROUP_get_order": 1,
"x": 44,
"BN_copy": 1,
"BN_mul_word": 1,
"BN_add": 1,
@@ -8126,6 +8074,7 @@
"fCompressedPubKey": 5,
"true": 39,
"Reset": 5,
"false": 42,
"EC_KEY_new_by_curve_name": 2,
"NID_secp256k1": 2,
"throw": 4,
@@ -8139,6 +8088,8 @@
"hash": 20,
"sizeof": 14,
"vchSig": 18,
"[": 201,
"]": 201,
"nSize": 2,
"vchSig.clear": 2,
"vchSig.resize": 2,
@@ -8158,6 +8109,8 @@
"nBitsR": 3,
"BN_num_bits": 2,
"nBitsS": 3,
"<": 53,
"&&": 23,
"nRecId": 4,
"<4;>": 1,
"keyRec": 5,
@@ -8498,6 +8451,7 @@
"has": 2,
"user": 2,
"friendly": 2,
"description": 3,
"use": 1,
"mapping": 1,
"dialogs.": 1,
@@ -8680,6 +8634,7 @@
"SCI_CLEAR": 1,
"DeleteBack": 1,
"SCI_DELETEBACK": 1,
"not": 1,
"at": 4,
"DeleteBackNotLine": 1,
"SCI_DELETEBACKNOTLINE": 1,
@@ -8793,6 +8748,7 @@
"unchanged.": 1,
"Valid": 1,
"control": 1,
"c": 50,
"Key_Down": 1,
"Key_Up": 1,
"Key_Left": 1,
@@ -9016,6 +8972,7 @@
"ScanHtmlComment": 3,
"LT": 2,
"next_.literal_chars": 13,
"do": 4,
"ScanString": 3,
"LTE": 1,
"ASSIGN_SHL": 1,
@@ -9083,6 +9040,7 @@
"l": 1,
"p": 5,
"w": 1,
"y": 13,
"keyword": 1,
"Unescaped": 1,
"in_character_class": 2,
@@ -9724,6 +9682,7 @@
"npy_longdouble": 1,
"__pyx_t_5numpy_longdouble_t": 1,
"complex": 2,
"float": 7,
"__pyx_t_float_complex": 27,
"_Complex": 2,
"real": 2,
@@ -13186,6 +13145,84 @@
"backdrop.fade": 1,
"backdrop.fade.in": 1
},
"Cuda": {
"__global__": 2,
"void": 3,
"scalarProdGPU": 1,
"(": 20,
"float": 8,
"*d_C": 1,
"*d_A": 1,
"*d_B": 1,
"int": 14,
"vectorN": 2,
"elementN": 3,
")": 20,
"{": 8,
"//Accumulators": 1,
"cache": 1,
"__shared__": 1,
"accumResult": 5,
"[": 11,
"ACCUM_N": 4,
"]": 11,
";": 30,
"////////////////////////////////////////////////////////////////////////////": 2,
"for": 5,
"vec": 5,
"blockIdx.x": 2,
"<": 5,
"+": 12,
"gridDim.x": 1,
"vectorBase": 3,
"IMUL": 1,
"vectorEnd": 2,
"////////////////////////////////////////////////////////////////////////": 4,
"iAccum": 10,
"threadIdx.x": 4,
"blockDim.x": 3,
"sum": 3,
"pos": 5,
"d_A": 2,
"*": 2,
"d_B": 2,
"}": 8,
"stride": 5,
"/": 2,
"__syncthreads": 1,
"if": 3,
"d_C": 2,
"#include": 2,
"<stdio.h>": 1,
"<cuda_runtime.h>": 1,
"vectorAdd": 2,
"const": 2,
"*A": 1,
"*B": 1,
"*C": 1,
"numElements": 4,
"i": 5,
"C": 1,
"A": 1,
"B": 1,
"main": 1,
"cudaError_t": 1,
"err": 5,
"cudaSuccess": 2,
"threadsPerBlock": 4,
"blocksPerGrid": 1,
"-": 1,
"<<": 1,
"<blocksPerGrid,>": 1,
"cudaGetLastError": 1,
"fprintf": 1,
"stderr": 1,
"cudaGetErrorString": 1,
"exit": 1,
"EXIT_FAILURE": 1,
"cudaDeviceReset": 1,
"return": 1
},
"Dart": {
"class": 1,
"Point": 7,
@@ -42233,7 +42270,7 @@
"Awk": 544,
"Bluespec": 1298,
"C": 58858,
"C++": 21480,
"C++": 21308,
"Ceylon": 50,
"Clojure": 510,
"COBOL": 90,
@@ -42241,6 +42278,7 @@
"Common Lisp": 103,
"Coq": 18259,
"CSS": 43867,
"Cuda": 290,
"Dart": 68,
"Diff": 16,
"DM": 169,
@@ -42354,7 +42392,7 @@
"Awk": 1,
"Bluespec": 2,
"C": 26,
"C++": 20,
"C++": 19,
"Ceylon": 1,
"Clojure": 7,
"COBOL": 4,
@@ -42362,6 +42400,7 @@
"Common Lisp": 1,
"Coq": 12,
"CSS": 2,
"Cuda": 2,
"Dart": 1,
"Diff": 1,
"DM": 1,
@@ -42464,5 +42503,5 @@
"Xtend": 2,
"YAML": 1
},
"md5": "6a064f3fdb191614ff6f065a365cb3d7"
"md5": "9fdad6d44dffa2bd9b71e18d9082cb2e"
}


@@ -1,39 +0,0 @@
void foo()
{
    cudaArray* cu_array;
    texture<float, 2, cudaReadModeElementType> tex;

    // Allocate array
    cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
    cudaMallocArray(&cu_array, &description, width, height);

    // Copy image data to array
    cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);

    // Set texture parameters (default)
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = false; // do not normalize coordinates

    // Bind the array to the texture
    cudaBindTextureToArray(tex, cu_array);

    // Run kernel
    dim3 blockDim(16, 16, 1);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
    kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);

    // Unbind the array from the texture
    cudaUnbindTexture(tex);
} // end foo()

__global__ void kernel(float* odata, int height, int width)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        float c = tex2D(tex, x, y);
        odata[y*width + x] = c;
    }
}
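
As committed, this removed fragment did not compile on its own: width, height, image, and d_data are used but never declared, and the texture reference tex is local to foo() even though kernel() reads it via tex2D (texture references must live at file scope). A minimal sketch of the declarations the fragment assumes — names kept, values hypothetical:

// Hypothetical declarations assumed by foo() and kernel() above;
// not part of the original sample.
#include <cuda_runtime.h>

texture<float, 2, cudaReadModeElementType> tex; // moved to file scope

static const int width  = 256;
static const int height = 256;
static float *image;   // host pixels, e.g. malloc(width * height * sizeof(float))
static float *d_data;  // device output, e.g. cudaMalloc'd to the same size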


@@ -0,0 +1,52 @@
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
)
{
    // Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors, taking into account that vector
    // counts can differ from the total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through the vector with a stride equal to
        // the total number of accumulators ACCUM_N. At this stage ACCUM_N is
        // only preferred to be a multiple of the warp size to meet memory
        // coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
            float sum = 0;
            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];
            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of the accumulators' results.
        // ACCUM_N has to be a power of two at this stage.
        ////////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();
            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
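
The kernel above is not self-contained: ACCUM_N and IMUL come from NVIDIA's surrounding project and are not defined in the sample. A hedged sketch of definitions and a host-side launch that would drive it — all names, sizes, and launch parameters here are assumptions, not part of the commit:

#include <cuda_runtime.h>

#define ACCUM_N 1024             // must be a power of two for the reduction above
#define IMUL(a, b) ((a) * (b))   // a plain multiply stands in for NVIDIA's macro

int main(void)
{
    const int vectorN = 256, elementN = 4096;
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, vectorN * elementN * sizeof(float));
    cudaMalloc((void **)&d_B, vectorN * elementN * sizeof(float));
    cudaMalloc((void **)&d_C, vectorN * sizeof(float));

    // Inputs left uninitialized; a real harness would cudaMemcpy data in.
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, vectorN, elementN);
    cudaDeviceSynchronize();

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}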

samples/Cuda/vectorAdd.cu (new file)

@@ -0,0 +1,46 @@
#include <stdio.h>
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Reset the device and exit
    err = cudaDeviceReset();

    return 0;
}
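
The trimmed main() above references numElements, d_A, d_B, and d_C without declaring them; the allocation and copy code of NVIDIA's full vectorAdd sample was cut when the file was shortened for the sample set. A hedged sketch of the setup main() assumes before the launch — the size is an assumption:

// Hypothetical setup preceding the launch in main(); NVIDIA's full
// sample does this with cudaMalloc/cudaMemcpy as sketched here.
// (Requires <stdlib.h> for malloc/free.)
int numElements = 50000;
size_t size = numElements * sizeof(float);

float *h_A = (float *)malloc(size); // host inputs, filled elsewhere
float *h_B = (float *)malloc(size);

float *d_A = NULL, *d_B = NULL, *d_C = NULL;
cudaMalloc((void **)&d_A, size);
cudaMalloc((void **)&d_B, size);
cudaMalloc((void **)&d_C, size);

cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// ...kernel launch and error check as in main() above, then:
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);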