added cuda lexer and removed example from c++ samples

This commit is contained in:
Kashif Rasul
2013-11-05 10:57:12 +01:00
parent 6d7eae5011
commit 94b3ea3df5
4 changed files with 104 additions and 39 deletions

View File

@@ -327,6 +327,12 @@ Cucumber:
lexer: Gherkin
primary_extension: .feature
Cuda:
lexer: CUDA
primary_extension: .cu
extensions:
- .cuh
Cython:
type: programming
group: Python

View File

@@ -1,39 +0,0 @@
void foo()
{
cudaArray* cu_array;
texture<float, 2, cudaReadModeElementType> tex;
// Allocate array
cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
cudaMallocArray(&cu_array, &description, width, height);
// Copy image data to array
cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);
// Set texture parameters (default)
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false; // do not normalize coordinates
// Bind the array to the texture
cudaBindTextureToArray(tex, cu_array);
// Run kernel
dim3 blockDim(16, 16, 1);
dim3 gridDim((width + blockDim.x - 1)/ blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);
// Unbind the array from the texture
cudaUnbindTexture(tex);
} //end foo()
__global__ void kernel(float* odata, int height, int width)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
if (x < width && y < height) {
float c = tex2D(tex, x, y);
odata[y*width+x] = c;
}
}

View File

@@ -0,0 +1,52 @@
__global__ void scalarProdGPU(
float *d_C,
float *d_A,
float *d_B,
int vectorN,
int elementN
)
{
//Accumulators cache
__shared__ float accumResult[ACCUM_N];
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
// from total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
int vectorBase = IMUL(elementN, vec);
int vectorEnd = vectorBase + elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
// stride equal to number of total number of accumulators ACCUM_N
// At this stage ACCUM_N is only preferred be a multiple of warp size
// to meet memory coalescing alignment constraints.
////////////////////////////////////////////////////////////////////////
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
float sum = 0;
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
sum += d_A[pos] * d_B[pos];
accumResult[iAccum] = sum;
}
////////////////////////////////////////////////////////////////////////
// Perform tree-like reduction of accumulators' results.
// ACCUM_N has to be power of two at this stage
////////////////////////////////////////////////////////////////////////
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
{
__syncthreads();
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
accumResult[iAccum] += accumResult[stride + iAccum];
}
if (threadIdx.x == 0) d_C[vec] = accumResult[0];
}
}

46
samples/Cuda/vectorAdd.cu Normal file
View File

@@ -0,0 +1,46 @@
#include <stdio.h>
#include <cuda_runtime.h>
/**
* CUDA Kernel Device code
*
* Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements.
*/
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
C[i] = A[i] + B[i];
}
}
/**
* Host main routine
*/
int
main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Reset the device and exit
err = cudaDeviceReset();
return 0;
}