added cuda lexer and removed example from c++ samples

This commit is contained in:
Kashif Rasul
2013-11-05 10:57:12 +01:00
parent 6d7eae5011
commit 94b3ea3df5
4 changed files with 104 additions and 39 deletions

View File

@@ -0,0 +1,52 @@
__global__ void scalarProdGPU(
float *d_C,
float *d_A,
float *d_B,
int vectorN,
int elementN
)
{
//Accumulators cache
__shared__ float accumResult[ACCUM_N];
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
// from total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
int vectorBase = IMUL(elementN, vec);
int vectorEnd = vectorBase + elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
// stride equal to number of total number of accumulators ACCUM_N
// At this stage ACCUM_N is only preferred be a multiple of warp size
// to meet memory coalescing alignment constraints.
////////////////////////////////////////////////////////////////////////
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
float sum = 0;
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
sum += d_A[pos] * d_B[pos];
accumResult[iAccum] = sum;
}
////////////////////////////////////////////////////////////////////////
// Perform tree-like reduction of accumulators' results.
// ACCUM_N has to be power of two at this stage
////////////////////////////////////////////////////////////////////////
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
{
__syncthreads();
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
accumResult[iAccum] += accumResult[stride + iAccum];
}
if (threadIdx.x == 0) d_C[vec] = accumResult[0];
}
}

46
samples/Cuda/vectorAdd.cu Normal file
View File

@@ -0,0 +1,46 @@
#include <stdio.h>
#include <cuda_runtime.h>
/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
C[i] = A[i] + B[i];
}
}
/**
* Host main routine
*/
int
main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Reset the device and exit
err = cudaDeviceReset();
return 0;
}