mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			52 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			52 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
////////////////////////////////////////////////////////////////////////////////
// Batched scalar (dot) product kernel.
//
// For every vec in [0, vectorN) this computes
//     d_C[vec] = sum over pos in [base, base + elementN) of d_A[pos] * d_B[pos]
// where base = IMUL(elementN, vec). IMUL is defined outside this block;
// presumably it is an integer multiply -- confirm against its definition.
//
// Parameters:
//     d_C      - output, one float written per vector (index vec)
//     d_A, d_B - inputs, read at indices [base, base + elementN) per vector
//     vectorN  - number of vector pairs
//     elementN - number of elements in each vector
//
// Launch / resource notes:
//     * Only the .x components of blockIdx/gridDim/threadIdx/blockDim are
//       used, i.e. a 1D grid of 1D blocks is expected.
//     * Blocks stride over vectors by gridDim.x and threads stride over
//       accumulators by blockDim.x, so the grid and block sizes need not
//       match vectorN or ACCUM_N.
//     * Uses ACCUM_N floats of statically allocated shared memory per block.
//     * ACCUM_N (defined outside this block) has to be a power of two for the
//       tree reduction below to cover every accumulator.
////////////////////////////////////////////////////////////////////////////////
__global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
)
{
    //Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that vector counts can be different
    // from total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through vectors with
        // stride equal to the total number of accumulators ACCUM_N.
        // At this stage ACCUM_N is only preferred to be a multiple of warp
        // size to meet memory coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
            float sum = 0.0f;

            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of accumulators' results.
        // ACCUM_N has to be power of two at this stage
        ////////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            // Make the previous stage's writes visible before they are read.
            // The loop bounds do not depend on threadIdx, so every thread of
            // the block reaches this barrier the same number of times.
            __syncthreads();

            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        // Close the final reduction step before anyone moves on. Without this
        // barrier a thread that has no work left in the last step can start
        // the next vector and overwrite accumResult[1] while thread 0 is still
        // reading it for the stride == 1 addition.
        __syncthreads();

        if (threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}