Merge pull request #758 from kashif/nvidia-cuda

added cuda lexer and removed example from c++ samples
Ted Nyman
2013-11-05 13:57:49 -08:00
5 changed files with 220 additions and 116 deletions


@@ -332,6 +332,12 @@ Cucumber:
  lexer: Gherkin
  primary_extension: .feature
Cuda:
  type: programming
  lexer: CUDA
  primary_extension: .cu
  extensions:
  - .cuh
Cython:
  type: programming
  group: Python
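
For orientation, below is a minimal .cu file of the kind this entry now routes to the CUDA lexer rather than to C++ — the __global__ qualifier and the <<<...>>> launch syntax are the CUDA-specific constructs involved. This snippet is illustrative only and is not one of the samples added in this commit.

// Illustrative only: a tiny CUDA program exercising the constructs
// (__global__, <<<...>>>) that distinguish CUDA source from plain C++.
#include <cuda_runtime.h>

__global__ void scale(float *data, float factor)
{
    data[threadIdx.x] *= factor;
}

int main(void)
{
    float *d_data;
    cudaMalloc((void **)&d_data, 32 * sizeof(float));
    scale<<<1, 32>>>(d_data, 2.0f);
    cudaDeviceSynchronize();
    cudaFree(d_data);
    return 0;
}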


@@ -31,7 +31,6 @@
"C++": [
".cc",
".cpp",
".cu",
".h",
".hpp"
],
@@ -65,6 +64,10 @@
"CSS": [
".css"
],
"Cuda": [
".cu",
".cuh"
],
"Dart": [
".dart"
],
@@ -460,8 +463,8 @@
".gemrc"
]
},
"tokens_total": 417572,
"languages_total": 476,
"tokens_total": 417690,
"languages_total": 477,
"tokens": {
"ABAP": {
"*/**": 1,
@@ -7829,82 +7832,17 @@
"C++": {
"class": 34,
"Bar": 2,
"{": 553,
"{": 550,
"protected": 4,
"char": 122,
"*name": 6,
";": 2308,
";": 2290,
"public": 27,
"void": 152,
"void": 150,
"hello": 2,
"(": 2438,
")": 2440,
"}": 552,
"foo": 2,
"cudaArray*": 1,
"cu_array": 4,
"texture": 1,
"<float,>": 1,
"2": 1,
"cudaReadModeElementType": 1,
"tex": 4,
"cudaChannelFormatDesc": 1,
"description": 5,
"cudaCreateChannelDesc": 1,
"<float>": 1,
"cudaMallocArray": 1,
"&": 148,
"width": 5,
"height": 5,
"cudaMemcpyToArray": 1,
"image": 1,
"width*height*sizeof": 1,
"float": 9,
"cudaMemcpyHostToDevice": 1,
"tex.addressMode": 2,
"[": 204,
"]": 204,
"cudaAddressModeClamp": 2,
"tex.filterMode": 1,
"cudaFilterModePoint": 1,
"tex.normalized": 1,
"false": 43,
"//": 239,
"do": 5,
"not": 2,
"normalize": 1,
"coordinates": 1,
"cudaBindTextureToArray": 1,
"dim3": 2,
"blockDim": 2,
"gridDim": 2,
"+": 55,
"blockDim.x": 2,
"-": 227,
"/": 15,
"blockDim.y": 2,
"kernel": 2,
"<<": 19,
"<": 56,
"d_data": 1,
"cudaUnbindTexture": 1,
"//end": 1,
"__global__": 1,
"float*": 1,
"odata": 2,
"int": 148,
"unsigned": 22,
"x": 48,
"blockIdx.x*blockDim.x": 1,
"threadIdx.x": 1,
"y": 16,
"blockIdx.y*blockDim.y": 1,
"threadIdx.y": 1,
"if": 296,
"&&": 24,
"c": 52,
"tex2D": 1,
"y*width": 1,
"(": 2422,
")": 2424,
"}": 549,
"#include": 106,
"<QCoreApplication>": 1,
"<QString>": 1,
@@ -7916,6 +7854,7 @@
"NULL": 108,
"*Env": 1,
"instance": 4,
"if": 295,
"env_instance": 3,
"new": 9,
"return": 147,
@@ -7930,11 +7869,13 @@
"envvar": 2,
"name": 21,
"value": 18,
"int": 144,
"indexOfEquals": 5,
"for": 18,
"env": 3,
"envp": 4,
"*env": 1,
"+": 50,
"envvar.indexOf": 1,
"continue": 2,
"envvar.left": 1,
@@ -7951,6 +7892,7 @@
"*instance": 1,
"private": 12,
"#endif": 82,
"//": 238,
"GDSDBREADER_H": 3,
"<QDir>": 1,
"GDS_DIR": 1,
@@ -8007,6 +7949,7 @@
"A": 1,
"friend": 10,
"stream": 5,
"<<": 18,
"myclass.label": 2,
"myclass.depth": 2,
"myclass.userIndex": 2,
@@ -8020,6 +7963,7 @@
"myclass.firstLineData": 4,
"myclass.linesNumbers": 2,
"QDataStream": 2,
"&": 146,
"myclass": 1,
"//Don": 1,
"read": 1,
@@ -8066,6 +8010,7 @@
"ECDSA_SIG_recover_key_GFp": 3,
"ECDSA_SIG": 3,
"*ecsig": 1,
"unsigned": 20,
"*msg": 2,
"msglen": 2,
"recid": 3,
@@ -8084,10 +8029,13 @@
"*zero": 1,
"n": 28,
"i": 47,
"/": 13,
"-": 225,
"BN_CTX_start": 1,
"order": 8,
"BN_CTX_get": 8,
"EC_GROUP_get_order": 1,
"x": 44,
"BN_copy": 1,
"BN_mul_word": 1,
"BN_add": 1,
@@ -8126,6 +8074,7 @@
"fCompressedPubKey": 5,
"true": 39,
"Reset": 5,
"false": 42,
"EC_KEY_new_by_curve_name": 2,
"NID_secp256k1": 2,
"throw": 4,
@@ -8139,6 +8088,8 @@
"hash": 20,
"sizeof": 14,
"vchSig": 18,
"[": 201,
"]": 201,
"nSize": 2,
"vchSig.clear": 2,
"vchSig.resize": 2,
@@ -8158,6 +8109,8 @@
"nBitsR": 3,
"BN_num_bits": 2,
"nBitsS": 3,
"<": 53,
"&&": 23,
"nRecId": 4,
"<4;>": 1,
"keyRec": 5,
@@ -8498,6 +8451,7 @@
"has": 2,
"user": 2,
"friendly": 2,
"description": 3,
"use": 1,
"mapping": 1,
"dialogs.": 1,
@@ -8680,6 +8634,7 @@
"SCI_CLEAR": 1,
"DeleteBack": 1,
"SCI_DELETEBACK": 1,
"not": 1,
"at": 4,
"DeleteBackNotLine": 1,
"SCI_DELETEBACKNOTLINE": 1,
@@ -8793,6 +8748,7 @@
"unchanged.": 1,
"Valid": 1,
"control": 1,
"c": 50,
"Key_Down": 1,
"Key_Up": 1,
"Key_Left": 1,
@@ -9016,6 +8972,7 @@
"ScanHtmlComment": 3,
"LT": 2,
"next_.literal_chars": 13,
"do": 4,
"ScanString": 3,
"LTE": 1,
"ASSIGN_SHL": 1,
@@ -9083,6 +9040,7 @@
"l": 1,
"p": 5,
"w": 1,
"y": 13,
"keyword": 1,
"Unescaped": 1,
"in_character_class": 2,
@@ -9724,6 +9682,7 @@
"npy_longdouble": 1,
"__pyx_t_5numpy_longdouble_t": 1,
"complex": 2,
"float": 7,
"__pyx_t_float_complex": 27,
"_Complex": 2,
"real": 2,
@@ -13186,6 +13145,84 @@
"backdrop.fade": 1,
"backdrop.fade.in": 1
},
"Cuda": {
"__global__": 2,
"void": 3,
"scalarProdGPU": 1,
"(": 20,
"float": 8,
"*d_C": 1,
"*d_A": 1,
"*d_B": 1,
"int": 14,
"vectorN": 2,
"elementN": 3,
")": 20,
"{": 8,
"//Accumulators": 1,
"cache": 1,
"__shared__": 1,
"accumResult": 5,
"[": 11,
"ACCUM_N": 4,
"]": 11,
";": 30,
"////////////////////////////////////////////////////////////////////////////": 2,
"for": 5,
"vec": 5,
"blockIdx.x": 2,
"<": 5,
"+": 12,
"gridDim.x": 1,
"vectorBase": 3,
"IMUL": 1,
"vectorEnd": 2,
"////////////////////////////////////////////////////////////////////////": 4,
"iAccum": 10,
"threadIdx.x": 4,
"blockDim.x": 3,
"sum": 3,
"pos": 5,
"d_A": 2,
"*": 2,
"d_B": 2,
"}": 8,
"stride": 5,
"/": 2,
"__syncthreads": 1,
"if": 3,
"d_C": 2,
"#include": 2,
"<stdio.h>": 1,
"<cuda_runtime.h>": 1,
"vectorAdd": 2,
"const": 2,
"*A": 1,
"*B": 1,
"*C": 1,
"numElements": 4,
"i": 5,
"C": 1,
"A": 1,
"B": 1,
"main": 1,
"cudaError_t": 1,
"err": 5,
"cudaSuccess": 2,
"threadsPerBlock": 4,
"blocksPerGrid": 1,
"-": 1,
"<<": 1,
"<blocksPerGrid,>": 1,
"cudaGetLastError": 1,
"fprintf": 1,
"stderr": 1,
"cudaGetErrorString": 1,
"exit": 1,
"EXIT_FAILURE": 1,
"cudaDeviceReset": 1,
"return": 1
},
"Dart": {
"class": 1,
"Point": 7,
@@ -42233,7 +42270,7 @@
"Awk": 544,
"Bluespec": 1298,
"C": 58858,
"C++": 21480,
"C++": 21308,
"Ceylon": 50,
"Clojure": 510,
"COBOL": 90,
@@ -42241,6 +42278,7 @@
"Common Lisp": 103,
"Coq": 18259,
"CSS": 43867,
"Cuda": 290,
"Dart": 68,
"Diff": 16,
"DM": 169,
@@ -42354,7 +42392,7 @@
"Awk": 1,
"Bluespec": 2,
"C": 26,
"C++": 20,
"C++": 19,
"Ceylon": 1,
"Clojure": 7,
"COBOL": 4,
@@ -42362,6 +42400,7 @@
"Common Lisp": 1,
"Coq": 12,
"CSS": 2,
"Cuda": 2,
"Dart": 1,
"Diff": 1,
"DM": 1,
@@ -42464,5 +42503,5 @@
"Xtend": 2,
"YAML": 1
},
"md5": "6a064f3fdb191614ff6f065a365cb3d7"
"md5": "9fdad6d44dffa2bd9b71e18d9082cb2e"
}


@@ -1,39 +0,0 @@
void foo()
{
    cudaArray* cu_array;
    texture<float, 2, cudaReadModeElementType> tex;

    // Allocate array
    cudaChannelFormatDesc description = cudaCreateChannelDesc<float>();
    cudaMallocArray(&cu_array, &description, width, height);

    // Copy image data to array
    cudaMemcpyToArray(cu_array, image, width*height*sizeof(float), cudaMemcpyHostToDevice);

    // Set texture parameters (default)
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = false; // do not normalize coordinates

    // Bind the array to the texture
    cudaBindTextureToArray(tex, cu_array);

    // Run kernel
    dim3 blockDim(16, 16, 1);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, 1);
    kernel<<< gridDim, blockDim, 0 >>>(d_data, height, width);

    // Unbind the array from the texture
    cudaUnbindTexture(tex);
} // end foo()

__global__ void kernel(float* odata, int height, int width)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        float c = tex2D(tex, x, y);
        odata[y*width + x] = c;
    }
}
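
As committed, this removed fragment did not compile on its own: width, height, image, and d_data are used but never declared, and the texture reference tex is local to foo() even though kernel() reads it via tex2D (texture references must live at file scope). A minimal sketch of the declarations the fragment assumes — names kept, values hypothetical:

// Hypothetical declarations assumed by foo() and kernel() above;
// not part of the original sample.
#include <cuda_runtime.h>

texture<float, 2, cudaReadModeElementType> tex; // moved to file scope

static const int width  = 256;
static const int height = 256;
static float *image;   // host pixels, e.g. malloc(width * height * sizeof(float))
static float *d_data;  // device output, e.g. cudaMalloc'd to the same size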


@@ -0,0 +1,52 @@
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
)
{
    // Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors, taking into account that vector
    // counts can differ from the total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through the vector with a stride equal to
        // the total number of accumulators ACCUM_N. At this stage ACCUM_N is
        // only preferred to be a multiple of the warp size to meet memory
        // coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
            float sum = 0;
            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];
            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of the accumulators' results.
        // ACCUM_N has to be a power of two at this stage.
        ////////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();
            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}
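
The kernel above is not self-contained: ACCUM_N and IMUL come from NVIDIA's surrounding project and are not defined in the sample. A hedged sketch of definitions and a host-side launch that would drive it — all names, sizes, and launch parameters here are assumptions, not part of the commit:

#include <cuda_runtime.h>

#define ACCUM_N 1024             // must be a power of two for the reduction above
#define IMUL(a, b) ((a) * (b))   // a plain multiply stands in for NVIDIA's macro

int main(void)
{
    const int vectorN = 256, elementN = 4096;
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, vectorN * elementN * sizeof(float));
    cudaMalloc((void **)&d_B, vectorN * elementN * sizeof(float));
    cudaMalloc((void **)&d_C, vectorN * sizeof(float));

    // Inputs left uninitialized; a real harness would cudaMemcpy data in.
    scalarProdGPU<<<128, 256>>>(d_C, d_A, d_B, vectorN, elementN);
    cudaDeviceSynchronize();

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}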

samples/Cuda/vectorAdd.cu (new file)

@@ -0,0 +1,46 @@
#include <stdio.h>
#include <cuda_runtime.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Reset the device and exit
    err = cudaDeviceReset();

    return 0;
}
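
The trimmed main() above references numElements, d_A, d_B, and d_C without declaring them; the allocation and copy code of NVIDIA's full vectorAdd sample was cut when the file was shortened for the sample set. A hedged sketch of the setup main() assumes before the launch — the size is an assumption:

// Hypothetical setup preceding the launch in main(); NVIDIA's full
// sample does this with cudaMalloc/cudaMemcpy as sketched here.
// (Requires <stdlib.h> for malloc/free.)
int numElements = 50000;
size_t size = numElements * sizeof(float);

float *h_A = (float *)malloc(size); // host inputs, filled elsewhere
float *h_B = (float *)malloc(size);

float *d_A = NULL, *d_B = NULL, *d_C = NULL;
cudaMalloc((void **)&d_A, size);
cudaMalloc((void **)&d_B, size);
cudaMalloc((void **)&d_C, size);

cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

// ...kernel launch and error check as in main() above, then:
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);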