From 94b3ea3df51edf6a7cfbd26016642e01844eb20e Mon Sep 17 00:00:00 2001
From: Kashif Rasul
Date: Tue, 5 Nov 2013 10:57:12 +0100
Subject: [PATCH] added cuda lexer and removed example from c++ samples

---
 lib/linguist/languages.yml         |  6 ++++
 samples/C++/cuda.cu                | 39 ----------------------
 samples/Cuda/scalarProd_kernel.cuh | 52 ++++++++++++++++++++++++++++++
 samples/Cuda/vectorAdd.cu          | 46 ++++++++++++++++++++++++++
 4 files changed, 104 insertions(+), 39 deletions(-)
 delete mode 100644 samples/C++/cuda.cu
 create mode 100644 samples/Cuda/scalarProd_kernel.cuh
 create mode 100644 samples/Cuda/vectorAdd.cu

diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml
index 33de0c01..dfa1fe03 100644
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -327,6 +327,12 @@ Cucumber:
   lexer: Gherkin
   primary_extension: .feature
 
+Cuda:
+  lexer: CUDA
+  primary_extension: .cu
+  extensions:
+  - .cuh
+
 Cython:
   type: programming
   group: Python
diff --git a/samples/C++/cuda.cu b/samples/C++/cuda.cu
deleted file mode 100644
index ddef40cd..00000000
--- a/samples/C++/cuda.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-void foo()
-{
-    cudaArray* cu_array;
-    texture<float, 2, cudaReadModeElementType> tex;
-
-    // Allocate array
-    cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
-    cudaMallocArray(&cu_array, &description, width, height);
-
-    // Copy image data to array
-    cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
-
-    // Set texture parameters (default)
-    tex.addressMode[0] = cudaAddressModeClamp;
-    tex.addressMode[1] = cudaAddressModeClamp;
-    tex.filterMode = cudaFilterModePoint;
-    tex.normalized = false; // do not normalize coordinates
-
-    // Bind the array to the texture
-    cudaBindTextureToArray(tex, cu_array);
-
-    // Run kernel
-    dim3 blockDim(16, 16, 1);
-    dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
-    kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
-
-    // Unbind the array from the texture
-    cudaUnbindTexture(tex);
-} //end foo()
-
-__global__ void kernel(float* odata, int height, int width)
-{
-    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-    if (x < width && y < height) {
-        float c = tex2D(tex, x, y);
-        odata[y*width+x] = c;
-    }
-}
diff --git a/samples/Cuda/scalarProd_kernel.cuh b/samples/Cuda/scalarProd_kernel.cuh
new file mode 100644
index 00000000..7622c597
--- /dev/null
+++ b/samples/Cuda/scalarProd_kernel.cuh
@@ -0,0 +1,52 @@
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
\ No newline at end of file
diff --git a/samples/Cuda/vectorAdd.cu b/samples/Cuda/vectorAdd.cu
new file mode 100644
index 00000000..cdc21dff
--- /dev/null
+++ b/samples/Cuda/vectorAdd.cu
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    return 0;
+}
\ No newline at end of file