	added cuda lexer and removed example from c++ samples
lib/linguist/languages.yml
@@ -327,6 +327,12 @@ Cucumber:
   lexer: Gherkin
   primary_extension: .feature
 
+Cuda:
+  lexer: CUDA
+  primary_extension: .cu
+  extensions:
+  - .cuh
+
 Cython:
   type: programming
   group: Python
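Note: the new entry maps the .cu and .cuh extensions to a dedicated CUDA lexer instead of letting them fall through to C++. As an illustration only (this snippet is not part of the commit, and the names scale/launchScale are made up), the constructs below are what distinguish a CUDA source file from plain C++ for highlighting purposes:

// Illustrative .cu snippet: the __global__ qualifier and the <<<grid, block>>>
// launch syntax are CUDA-specific extensions a plain C++ lexer does not know.
__global__ void scale(float *data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

void launchScale(float *d_data, float factor, int n)
{
    int threads = 256;
    int blocks = (n + threads - 1) / threads;
    scale<<<blocks, threads>>>(d_data, factor, n);  // kernel launch, valid only in CUDA C++
}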
Deleted C++ sample (the CUDA example removed from samples/C++, 39 lines):
@@ -1,39 +0,0 @@
-void foo()
-{
-  cudaArray* cu_array;
-  texture<float, 2, cudaReadModeElementType> tex;
-
-  // Allocate array
-  cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
-  cudaMallocArray(&cu_array, &description, width, height);
-
-  // Copy image data to array
-  cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
-
-  // Set texture parameters (default)
-  tex.addressMode[0] = cudaAddressModeClamp;
-  tex.addressMode[1] = cudaAddressModeClamp;
-  tex.filterMode = cudaFilterModePoint;
-  tex.normalized = false; // do not normalize coordinates
-
-  // Bind the array to the texture
-  cudaBindTextureToArray(tex, cu_array);
-
-  // Run kernel
-  dim3 blockDim(16, 16, 1);
-  dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
-  kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
-
-  // Unbind the array from the texture
-  cudaUnbindTexture(tex);
-} //end foo()
-
-__global__ void kernel(float* odata, int height, int width)
-{
-   unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
-   unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
-   if (x < width && y < height) {
-      float c = tex2D(tex, x, y);
-      odata[y*width+x] = c;
-   }
-}
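Note: the deleted sample was not only misplaced under the C++ samples, it also would not compile as written: the texture reference tex is declared as a local variable inside foo(), but the legacy CUDA texture reference API requires a file-scope declaration, and the kernel reads tex as a global. A minimal sketch of the compilable file-scope form, assuming the same float texture and point sampling (the name copyFromTexture is illustrative), looks roughly like this:

// Sketch only: file-scope texture reference, as the legacy CUDA texture API
// (deprecated in newer CUDA releases) requires.
texture<float, 2, cudaReadModeElementType> tex;

__global__ void copyFromTexture(float *odata, int height, int width)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height)
        odata[y * width + x] = tex2D(tex, x, y);  // read through the bound texture
}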
							
								
								
									
samples/Cuda/scalarProd_kernel.cuh (new file, 52 lines)
@@ -0,0 +1,52 @@
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd  = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
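Note: as committed, this header relies on two symbols it does not define: ACCUM_N (the per-block accumulator count, which the reduction requires to be a power of two) and IMUL (an integer multiply helper, commonly defined as __mul24 in the NVIDIA SDK sample this file comes from). A hedged sketch of how an including .cu file might supply them and launch the kernel follows; the ACCUM_N value, the IMUL definition, the grid/block sizes, and the scalarProd wrapper name are assumptions, not part of the commit.

// Sketch only: definitions and a launch wrapper the .cuh file expects its includer to provide.
#include <cuda_runtime.h>

#define ACCUM_N 1024                  // assumed: must be a power of two, ideally a multiple of the warp size
#define IMUL(a, b) __mul24((a), (b))  // assumed: fast 24-bit integer multiply, as in the SDK sample

#include "scalarProd_kernel.cuh"

// d_A and d_B hold vectorN vectors of elementN floats each, laid out back to back;
// d_C receives one dot product per vector pair.
void scalarProd(float *d_C, float *d_A, float *d_B, int vectorN, int elementN)
{
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, vectorN, elementN);  // grid/block sizes are illustrative
}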
							
								
								
									
samples/Cuda/vectorAdd.cu (new file, 46 lines)
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    return 0;
+}
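Note: as committed, this sample's main() does not compile: numElements, d_A, d_B, and d_C are used without ever being declared or allocated. For Linguist's purpose (a representative source file for classification) that is harmless, but for reference, a self-contained version under the usual allocate/copy/launch/copy-back/free flow would look roughly like the sketch below; the element count and the h_*/d_* buffer names are assumptions, and host-side initialization and per-call error checks are omitted for brevity.

// Sketch only: a self-contained variant of the committed sample with the missing host setup.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
        C[i] = A[i] + B[i];
}

int main(void)
{
    int numElements = 50000;                 // assumed problem size
    size_t size = numElements * sizeof(float);

    // Host buffers (initialization omitted).
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Device buffers.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy inputs to the device, launch, then copy the result back.
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C);

    cudaDeviceReset();
    return 0;
}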