mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	added cuda lexer and removed example from c++ samples
This commit is contained in:
		| @@ -327,6 +327,12 @@ Cucumber: | ||||
|   lexer: Gherkin | ||||
|   primary_extension: .feature | ||||
|  | ||||
| Cuda: | ||||
|   lexer: CUDA | ||||
|   primary_extension: .cu | ||||
|   extensions: | ||||
|   - .cuh | ||||
|  | ||||
| Cython: | ||||
|   type: programming | ||||
|   group: Python | ||||
|   | ||||
| @@ -1,39 +0,0 @@ | ||||
// Texture reference for the 2-D float image. The texture-reference API
// requires this declaration at file scope (the original sample declared it
// inside foo(), which does not compile and left kernel()'s tex2D() call
// dangling). NOTE: texture references are deprecated and removed in CUDA 12;
// new code should use texture objects.
texture<float, 2, cudaReadModeElementType> tex;

/*
 * Uploads a host float image into a cudaArray, binds it to the 2-D
 * texture `tex`, and launches `kernel` to sample it back out.
 *
 * NOTE(review): `width`, `height`, `image` and `d_data` are assumed to be
 * file-scope identifiers not visible in this snippet — confirm against the
 * full sample. No CUDA error checking is performed (demo code).
 */
void foo()
{
  cudaArray* cu_array;

  // Describe the element format (single 32-bit float) and allocate the array.
  cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
  cudaMallocArray(&cu_array, &description, width, height);

  // Copy image data to array. Bug fix: cudaMemcpyToArray takes explicit
  // wOffset/hOffset arguments; the original call omitted them.
  cudaMemcpyToArray(cu_array, 0, 0, image, width*height*sizeof(float), cudaMemcpyHostToDevice);

  // Set texture parameters (default): clamp out-of-range coordinates,
  // nearest-point sampling, unnormalized (texel-space) coordinates.
  tex.addressMode[0] = cudaAddressModeClamp;
  tex.addressMode[1] = cudaAddressModeClamp;
  tex.filterMode = cudaFilterModePoint;
  tex.normalized = false; // do not normalize coordinates

  // Bind the array to the texture
  cudaBindTextureToArray(tex, cu_array);

  // Run kernel: one thread per pixel, 16x16 blocks, ceil-div grid.
  dim3 blockDim(16, 16, 1);
  dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
  kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);

  // Unbind the array from the texture
  cudaUnbindTexture(tex);
} //end foo()
|  | ||||
/*
 * Copies one texel per thread from the bound 2-D texture `tex` into the
 * linear output buffer `odata` (row-major, width * height floats).
 *
 * Expected launch: a 2-D grid of 2-D blocks covering at least
 * width x height threads; the guard below handles the grid tail.
 */
__global__ void kernel(float* odata, int height, int width)
{
   const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
   const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;

   // Grid tail: threads past the image extent do nothing.
   if (col >= width || row >= height)
      return;

   odata[row*width + col] = tex2D(tex, col, row);
}
							
								
								
									
										52
									
								
								samples/Cuda/scalarProd_kernel.cuh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								samples/Cuda/scalarProd_kernel.cuh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
////////////////////////////////////////////////////////////////////////////
// Computes vectorN scalar products of vector pairs stored back to back in
// d_A and d_B (each vector elementN elements long); result i goes to d_C[i].
//
// Launch / precondition notes:
//  - 1-D grid, 1-D blocks; each block handles vectors blockIdx.x,
//    blockIdx.x + gridDim.x, ... (block-stride loop), so vector counts may
//    exceed the number of thread blocks.
//  - ACCUM_N must be a power of two (tree reduction below) and is
//    preferably a multiple of the warp size for coalesced global loads.
//  - Uses ACCUM_N * sizeof(float) of static shared memory per block.
//  - IMUL is assumed to be an integer-multiply macro supplied by the
//    including translation unit.
////////////////////////////////////////////////////////////////////////////
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
)
{
    //Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////
        // Phase 1: each of the ACCUM_N accumulators cycles through the
        // vector pair with stride equal to the total number of accumulators
        // (ACCUM_N), summing its share of the element products.
        ////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
            float sum = 0;

            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////
        // Phase 2: tree-like reduction of the partial sums. The barrier at
        // the top of each level also separates phase 1's writes from the
        // first level's reads. ACCUM_N must be a power of two here.
        ////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();

            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if (threadIdx.x == 0) d_C[vec] = accumResult[0];

        // Bug fix: barrier before the next outer iteration overwrites
        // accumResult — without it, phase 1 of vector vec + gridDim.x races
        // with this iteration's reduction reads of the same shared array.
        __syncthreads();
    }
}
							
								
								
									
										46
									
								
								samples/Cuda/vectorAdd.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								samples/Cuda/vectorAdd.cu
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | ||||
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
|  | ||||
/**
 * CUDA Kernel Device code
 *
 * Element-wise vector addition: C[i] = A[i] + B[i] for the first
 * numElements entries. One thread per element; the tail guard makes any
 * 1-D launch that covers numElements valid.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Threads in the grid tail fall past the end of the vectors.
    if (idx >= numElements)
        return;

    C[idx] = A[idx] + B[idx];
}
|  | ||||
/**
 * Host main routine
 *
 * Allocates three device vectors, launches the vectorAdd kernel over them,
 * checks for launch errors, frees the buffers, and resets the device.
 *
 * Bug fixes vs. the original sample: numElements, d_A, d_B and d_C were
 * referenced but never declared or allocated (the file could not compile),
 * and the cudaDeviceReset() result was assigned but silently discarded.
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Problem size and device buffers.
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    if ((err = cudaMalloc((void **)&d_A, size)) != cudaSuccess ||
        (err = cudaMalloc((void **)&d_B, size)) != cudaSuccess ||
        (err = cudaMalloc((void **)&d_C, size)) != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vectors (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Give the inputs a defined value (byte-wise zero fill is a valid
    // 0.0f pattern for IEEE floats).
    cudaMemset(d_A, 0, size);
    cudaMemset(d_B, 0, size);

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Release device memory before resetting the device.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Reset the device and exit; report (rather than ignore) failures.
    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to reset the device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    return 0;
}
		Reference in New Issue
	
	Block a user