	added cuda lexer and removed example from c++ samples
lib/linguist/languages.yml
@@ -327,6 +327,12 @@ Cucumber:
   lexer: Gherkin
   primary_extension: .feature
 
+Cuda:
+  lexer: CUDA
+  primary_extension: .cu
+  extensions:
+  - .cuh
+
 Cython:
   type: programming
   group: Python
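Note: the new entry maps the .cu and .cuh extensions to a dedicated CUDA lexer instead of letting them fall through to C++. As an illustration only (this snippet is not part of the commit, and the names scale/launchScale are made up), the constructs below are what distinguish a CUDA source file from plain C++ for highlighting purposes:

// Illustrative .cu snippet: the __global__ qualifier and the <<<grid, block>>>
// launch syntax are CUDA-specific extensions a plain C++ lexer does not know.
__global__ void scale(float *data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

void launchScale(float *d_data, float factor, int n)
{
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    scale<<<blocks, threads>>>(d_data, factor, n);  // kernel launch, valid only in CUDA C++
}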
Deleted C++ sample (the CUDA example removed from samples/C++, 39 lines):
@@ -1,39 +0,0 @@
-void foo()
-{
-  cudaArray* cu_array;
-  texture<float, 2, cudaReadModeElementType> tex;
-
-  // Allocate array
-  cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
-  cudaMallocArray(&cu_array, &description, width, height);
-
-  // Copy image data to array
-  cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
-
-  // Set texture parameters (default)
-  tex.addressMode[0] = cudaAddressModeClamp;
-  tex.addressMode[1] = cudaAddressModeClamp;
-  tex.filterMode = cudaFilterModePoint;
-  tex.normalized = false; // do not normalize coordinates
-
-  // Bind the array to the texture
-  cudaBindTextureToArray(tex, cu_array);
-
-  // Run kernel
-  dim3 blockDim(16, 16, 1);
-  dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
-  kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
-
-  // Unbind the array from the texture
-  cudaUnbindTexture(tex);
-} //end foo()
-
-__global__ void kernel(float* odata, int height, int width)
-{
-   unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-   unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-   if (x < width && y < height) {
-      float c = tex2D(tex, x, y);
-      odata[y*width+x] = c;
-   }
-}
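Note: the deleted sample was not only misplaced under the C++ samples, it also would not compile as written: the texture reference tex is declared as a local variable inside foo(), but the legacy CUDA texture reference API requires a file-scope declaration, and the kernel reads tex as a global. A minimal sketch of the compilable file-scope form, assuming the same float texture and point sampling (the name copyFromTexture is illustrative), looks roughly like this:

// Sketch only: file-scope texture reference, as the legacy CUDA texture API
// (deprecated in newer CUDA releases) requires.
texture<float, 2, cudaReadModeElementType> tex;

__global__ void copyFromTexture(float *odata, int height, int width)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        odata[y * width + x] = tex2D(tex, x, y);  // read through the bound texture
}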
							
								
								
									
samples/Cuda/scalarProd_kernel.cuh (new file, 52 lines)
@@ -0,0 +1,52 @@
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd  = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
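Note: as committed, this header relies on two symbols it does not define: ACCUM_N (the per-block accumulator count, which the reduction requires to be a power of two) and IMUL (an integer multiply helper, commonly defined as __mul24 in the NVIDIA SDK sample this file comes from). A hedged sketch of how an including .cu file might supply them and launch the kernel follows; the ACCUM_N value, the IMUL definition, the grid/block sizes, and the scalarProd wrapper name are assumptions, not part of the commit.

// Sketch only: definitions and a launch wrapper the .cuh file expects its includer to provide.
#include <cuda_runtime.h>

#define ACCUM_N 1024                  // assumed: must be a power of two, ideally a multiple of the warp size
#define IMUL(a, b) __mul24((a), (b))  // assumed: fast 24-bit integer multiply, as in the SDK sample

#include "scalarProd_kernel.cuh"

// d_A and d_B hold vectorN vectors of elementN floats each, laid out back to back;
// d_C receives one dot product per vector pair.
void scalarProd(float *d_C, float *d_A, float *d_B, int vectorN, int elementN)
{
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, vectorN, elementN);  // grid/block sizes are illustrative
}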
							
								
								
									
samples/Cuda/vectorAdd.cu (new file, 46 lines)
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    return 0;
+}
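Note: as committed, this sample's main() does not compile: numElements, d_A, d_B, and d_C are used without ever being declared or allocated. For Linguist's purpose (a representative source file for classification) that is harmless, but for reference, a self-contained version under the usual allocate/copy/launch/copy-back/free flow would look roughly like the sketch below; the element count and the h_*/d_* buffer names are assumptions, and host-side initialization and per-call error checks are omitted for brevity.

// Sketch only: a self-contained variant of the committed sample with the missing host setup.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
        C[i] = A[i] + B[i];
}

int main(void)
{
    int numElements = 50000;                 // assumed problem size
    size_t size = numElements * sizeof(float);

    // Host buffers (initialization omitted).
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Device buffers.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy inputs to the device, launch, then copy the result back.
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    cudaDeviceReset();
    return 0;
}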