In a first test it seems to work, but presumably that does not mean much: it also worked without any barrier… As usual, concurrency bugs are hard or impossible to reproduce, but in any case I think that with the barrier it should be formally correct (don’t pin me down on this; it is only based on my understanding so far).
However, here’s the test case that I used, on a good old GeForce 8800. Maybe you can test it, spot potential differences, or modify it so that the error occurs again.
import static org.jocl.CL.*;
import java.util.Arrays;
import org.jocl.*;
public class LocalMemoryTest
{
private static final String programSource =
"__kernel void test(" +"
"+
" __global int *globalArray," +"
"+
" __local int *localArray)"+"
"+
"{"+"
"+
" localArray[get_local_id(0)] = get_local_id(0);"+"
"+
""+"
"+
" barrier(CLK_LOCAL_MEM_FENCE);"+"
"+
""+"
"+
" int offset = 17; //16 OK 17 KO"+"
"+
" if (get_local_id(0) < offset)"+"
"+
" localArray[get_local_id(0)] = localArray[get_local_id(0) + offset];"+"
"+
""+"
"+
" barrier(CLK_LOCAL_MEM_FENCE);"+"
"+
""+"
"+
" int aReg = localArray[get_local_id(0)];"+"
"+
" globalArray[get_local_id(0)] = aReg;"+"
"+
"}";
private static final String kernelName = "test";
private static cl_context context;
private static cl_command_queue commandQueue;
private static cl_program program;
private static cl_kernel kernel;
public static void main(String args[])
{
defaultInitialization();
int n = 64;
int array[] = new int[n];
cl_mem mem = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
n * Sizeof.cl_int, Pointer.to(array), null);
int a = 0;
clSetKernelArg(kernel, a++, Sizeof.cl_mem, Pointer.to(mem));
clSetKernelArg(kernel, a++, n * Sizeof.cl_int, null);
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
new long[]{n}, new long[]{n}, 0, null, null);
clEnqueueReadBuffer(commandQueue, mem, CL_TRUE, 0,
n * Sizeof.cl_int, Pointer.to(array), 0, null, null);
System.out.println("Result "+Arrays.toString(array));
}
private static void defaultInitialization()
{
// The platform, device type and device number
// that will be used
final int platformIndex = 0;
final long deviceType = CL_DEVICE_TYPE_ALL;
final int deviceIndex = 0;
// Enable exceptions and subsequently omit error checks in this sample
CL.setExceptionsEnabled(true);
// Obtain the number of platforms
int numPlatformsArray[] = new int[1];
clGetPlatformIDs(0, null, numPlatformsArray);
int numPlatforms = numPlatformsArray[0];
// Obtain a platform ID
cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
clGetPlatformIDs(platforms.length, platforms, null);
cl_platform_id platform = platforms[platformIndex];
// Initialize the context properties
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);
// Obtain the number of devices for the platform
int numDevicesArray[] = new int[1];
clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
int numDevices = numDevicesArray[0];
// Obtain a device ID
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
cl_device_id device = devices[deviceIndex];
// Create a context for the selected device
context = clCreateContext(
contextProperties, 1, new cl_device_id[]{device},
null, null, null);
// Create a command-queue for the selected device
commandQueue =
clCreateCommandQueue(context, device, 0, null);
// Create the program
cl_program program = clCreateProgramWithSource(context,
1, new String[]{ programSource }, null, null);
// Build the program
clBuildProgram(program, 0, null, null, null, null);
// Create the kernel
kernel = clCreateKernel(program, kernelName, null);
}
}