Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 09:40:21 +00:00
added cuda lexer and removed example from c++ samples
samples/Cuda/scalarProd_kernel.cuh (new file, 52 lines added)
@@ -0,0 +1,52 @@
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
)
{
    // Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that vector counts can be different
    // from total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through vectors with
        // stride equal to the total number of accumulators ACCUM_N.
        // At this stage ACCUM_N is only preferred to be a multiple of warp size
        // to meet memory coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
            float sum = 0;

            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of accumulators' results.
        // ACCUM_N has to be a power of two at this stage
        ////////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();

            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
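This kernel computes one dot product per thread block: ACCUM_N partial accumulators each stride over a vector pair, and the shared-memory tree reduction collapses them into accumResult[0]. For context only, a minimal host-side launch might look like the sketch below; the ACCUM_N value, the IMUL definition, and the launch configuration are assumptions and are not part of this commit.

// Sketch only: ACCUM_N and IMUL are assumed here, they are not defined in this sample.
#include <cuda_runtime.h>

#define ACCUM_N 1024            // must be a power of two for the tree reduction
#define IMUL(a, b) ((a) * (b))  // plain multiply standing in for the SDK intrinsic macro

#include "scalarProd_kernel.cuh"

// Launch up to 128 blocks of 256 threads; the kernel's outer loop already
// handles the case where vectorN exceeds the number of blocks.
void launchScalarProd(float *d_C, float *d_A, float *d_B, int vectorN, int elementN)
{
    int blocks = vectorN < 128 ? vectorN : 128;
    scalarProdGPU<<<blocks, 256>>>(d_C, d_A, d_B, vectorN, elementN);
    cudaDeviceSynchronize();
}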
samples/Cuda/vectorAdd.cu (new file, 46 lines added)
@@ -0,0 +1,46 @@
#include <stdio.h>
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Reset the device and exit
    err = cudaDeviceReset();

    return 0;
}
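As committed, the host routine uses numElements, d_A, d_B, and d_C without declaring them; the sample is a trimmed version of the stock CUDA vectorAdd example. A hedged sketch of the setup that would normally precede the launch follows; the element count, test data, and allocation pattern are assumptions, not part of this commit.

// Sketch only: names and sizes are assumptions filling in the trimmed host-side setup.
// (Also needs #include <stdlib.h> for malloc/free.)
int numElements = 50000;
size_t size = numElements * sizeof(float);

// Host input/output vectors with arbitrary test data
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
for (int i = 0; i < numElements; ++i) { h_A[i] = 1.0f; h_B[i] = 2.0f; }

// Device vectors, copied from the host before the kernel launch
float *d_A = NULL, *d_B = NULL, *d_C = NULL;
cudaMalloc((void **)&d_A, size);
cudaMalloc((void **)&d_B, size);
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// ... kernel launch and error check from the sample go here ...

// Copy the result back and release all buffers
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
free(h_A); free(h_B); free(h_C);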