In the code block there is my kernel function. It essentialy calculates which point is the farthest from all clusters and results are saved in lengths[3] (id of the point) and output[0] the distance from the belonging cluster. The while piece does a simple sum reduction. I know it is not the best method to do but I need to understand why having one cluster the code works properly insteas with two or more clusters return wrong values.
__kernel void computeDistances(__global t_cluster *points,__global t_cluster *clusters, __global float *output,__global t_cluster *support,__global short *lengths)
{
int threadId = get_global_id(0);
float bestVal = 0;
int counter, offset;
short idPoint, idCluster;
for(idPoint = 0; idPoint < lengths[0]; idPoint++)
{
for(idCluster = 0; idCluster < lengths[2]; idCluster++)
{
support[0].attributes[threadId] = pow( (points[idPoint].attributes[threadId] - clusters[idCluster].attributes[threadId]) , 2 );
counter = SIZE;
offset = 1;
while(counter != 1)
{
counter = counter / 2 + (counter % 2);
barrier(CLK_GLOBAL_MEM_FENCE);
if(threadId % (2*offset) == 0)
if(threadId + offset < lengths[1])
support[0].attributes[threadId] = support[0].attributes[threadId] + support[0].attributes[threadId+offset];
offset = offset * 2 ;
}
barrier(CLK_GLOBAL_MEM_FENCE);
if(support[0].attributes[threadId] > bestVal)
bestVal = support[0].attributes[threadId];
}
barrier(CLK_GLOBAL_MEM_FENCE);
if(threadId == 0 && bestVal > output[threadId])
{
output[0] = bestVal;
lengths[3] = idPoint;
}
}
}