Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-10-28 17:20:22 +00:00)
added cuda lexer and removed example from c++ samples
lib/linguist/languages.yml
@@ -327,6 +327,12 @@ Cucumber:
   lexer: Gherkin
   primary_extension: .feature
 
+Cuda:
+  lexer: CUDA
+  primary_extension: .cu
+  extensions:
+  - .cuh
+
 Cython:
   type: programming
   group: Python
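With this entry, files ending in .cu (and, via the extensions list, .cuh) are classified as Cuda and highlighted with the CUDA lexer named above. A minimal sketch of the kind of header the new .cuh mapping starts picking up; the file and kernel names are illustrative, not part of the commit:

    // saxpy.cuh -- hypothetical header, used only to illustrate the new .cuh mapping
    #pragma once

    __global__ void saxpy(int n, float a, const float *x, float *y)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            y[i] = a * x[i] + y[i];
    }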
@@ -1,39 +0,0 @@
|
||||
void foo()
|
||||
{
|
||||
cudaArray* cu_array;
|
||||
texture<float, 2, cudaReadModeElementType> tex;
|
||||
|
||||
// Allocate array
|
||||
cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
|
||||
cudaMallocArray(&cu_array, &description, width, height);
|
||||
|
||||
// Copy image data to array
|
||||
cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
// Set texture parameters (default)
|
||||
tex.addressMode[0] = cudaAddressModeClamp;
|
||||
tex.addressMode[1] = cudaAddressModeClamp;
|
||||
tex.filterMode = cudaFilterModePoint;
|
||||
tex.normalized = false; // do not normalize coordinates
|
||||
|
||||
// Bind the array to the texture
|
||||
cudaBindTextureToArray(tex, cu_array);
|
||||
|
||||
// Run kernel
|
||||
dim3 blockDim(16, 16, 1);
|
||||
dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
|
||||
kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
|
||||
|
||||
// Unbind the array from the texture
|
||||
cudaUnbindTexture(tex);
|
||||
} //end foo()
|
||||
|
||||
__global__ void kernel(float* odata, int height, int width)
|
||||
{
|
||||
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
|
||||
if (x < width && y < height) {
|
||||
float c = tex2D(tex, x, y);
|
||||
odata[y*width+x] = c;
|
||||
}
|
||||
}
|
||||
New file: samples/Cuda/scalarProd_kernel.cuh (52 lines)
@@ -0,0 +1,52 @@
+__global__ void scalarProdGPU(
+    float *d_C,
+    float *d_A,
+    float *d_B,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ float accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd  = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            float sum = 0;
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+                sum += d_A[pos] * d_B[pos];
+
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+                accumResult[iAccum] += accumResult[stride + iAccum];
+        }
+
+        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
+    }
+}
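The kernel above computes one dot product per vector pair: each thread first folds a strided slice of the pair into one of ACCUM_N shared-memory accumulators, then the accumulators are combined with a tree reduction that halves the active range each pass (hence the power-of-two requirement). ACCUM_N and IMUL are not defined in this header; they come from the surrounding CUDA SDK sample. A hedged host-side launch sketch, with the definitions and launch configuration filled in as assumptions rather than taken from the commit:

    // Hypothetical definitions and launch; values mirror typical SDK choices,
    // not anything in this commit.
    #define ACCUM_N 256                  // power of two, multiple of the warp size
    #define IMUL(a, b) __mul24(a, b)     // fast 24-bit integer multiply

    void launchScalarProd(float *d_C, float *d_A, float *d_B,
                          int vectorN, int elementN)
    {
        // One block works on one vector pair at a time; ACCUM_N threads share
        // the accumulator array declared inside the kernel.
        scalarProdGPU<<<128, ACCUM_N>>>(d_C, d_A, d_B, vectorN, elementN);
    }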
New file: samples/Cuda/vectorAdd.cu (46 lines)
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+vectorAdd(const float *A, const float *B, float *C, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        C[i] = A[i] + B[i];
+    }
+}
+
+/**
+ * Host main routine
+ */
+int
+main(void)
+{
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Launch the Vector Add CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+
+    // Reset the device and exit
+    err = cudaDeviceReset();
+
+    return 0;
+}
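As committed, this sample is trimmed down for classification purposes: numElements, d_A, d_B, and d_C are used in main but never declared, so the file exercises the lexer without being compilable on its own. A hedged sketch of the host-side setup the full SDK vectorAdd example performs before the launch; the sizes and variable handling here are assumptions, not part of the commit:

    // Hypothetical setup preceding the kernel launch (omitted from the sample).
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Host inputs h_A and h_B (allocated and filled elsewhere) would be copied in:
    // cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    // cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);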