Tuesday, December 8, 2009

OpenCL Kernel for Scalar Product with Atomic Operations

The final sum of the dotproduct example is implemented on CPU. This is a solution of Scalar Product (DotProduct) without final reduction on the host side. This example uses atomic operations.


/*
* sDOT OpenCL Kernel Function for Level 1 BLAS Dot Product dot<-xy * Author; Wendell Rodrigues
* INRIA-Lille :: DaRT Team
*/
__kernel void sDOT(
__global const unsigned int N,
__global const float* X,
__global const float* Y,
__global float* DOT,
__global int* FLAG,
__local float* sdata
)
{
// get index into global data array
unsigned int tid = get_local_id(0);
unsigned int i = get_global_id(0);

sdata[tid] = (i<N) ? X[i]*Y[i] : 0;

if (i==0) {
DOT[0]=0;
*FLAG=0;
}

barrier(CLK_LOCAL_MEM_FENCE);


// do reduction in shared mem
for(unsigned int s=1; s < get_local_size(0); s *= 2)
{
int index = 2 * s * tid;

if (index < get_local_size(0))
{
sdata[index] += sdata[index + s];
}

barrier(CLK_LOCAL_MEM_FENCE);
}

// write result for this block to global mem
if (tid == 0) {
while (atom_cmpxchg(FLAG,0,1)==1);
DOT[0] += sdata[0];
atom_cmpxchg(FLAG,1,0);
}

}