Passing 2-D instance to the kernel

Soyeed · 6. Juli 2010 um 08:17

I have a 2-d array like
float srcArrayA[][] = {{5,6,7},{4,8,9}};

which I need to pass to the kernel.

The Pointer class supports 1-D objects. Is there any way to do this directly (or a workaround that does not require breaking up the data structure)?
Or do I have to break my data into 3 separate 1-D arrays?

Marco13 · 6. Juli 2010 um 08:58

Hello

I wrote a little bit about this topic in this thread. Although this was about JCuda, the situation is similar here: A 2D Java array is not stored as a contiguous block of memory. In fact, it is just an array containing object references (namely, references to array objects). For the OpenCL memory operations, the data is required to be stored as a contiguous block. So there is no way to conveniently and efficiently handle a 2D array with OpenCL. Note that this does not only apply to Java: When you are creating a 2D array in plain C like this

float **array = (float**)malloc(2*sizeof(float*));
array[0] = (float*)malloc(3*sizeof(float));
...
array[0][0] = 5;
...

this also cannot be copied to OpenCL using the memory functions, since it is not a contiguous block of memory either.

As you might have seen in the NVIDIA- and AMD OpenCL samples (for example, those involving a matrix multiplication) 2D arrays are generally handled by creating 1D-Arrays and accessing them in the form
array[column+row*numColumns] = 123;

I assume that you are already using this scheme inside your kernel (I’m not sure whether kernels can treat 2D arrays at all - I think they cannot…). In any case, the most efficient solution would be to also use 1D arrays on the Java side for this. But if you already have 2D arrays, and do not want to change this, you could create some small helper methods which can copy the “rows” of the 2D Java arrays into a single cl_mem object, as illustrated in this small example:

package test;

import org.jocl.*;

import static org.jocl.CL.*;

/**
 * Sample showing how a 2D Java array can be processed by an OpenCL kernel
 * using JOCL.
 * <p>
 * A Java {@code float[][]} is an array of references to row arrays, so it
 * cannot be transferred with a single memory operation. This sample copies
 * each row into one {@code cl_mem} buffer, lets the kernel double every
 * element, and copies the rows back.
 */
public class JOCLSample2DInput
{
    /**
     * Source of the sample kernel. It is launched over a 2D range in which
     * dimension 0 is the column and dimension 1 is the row, and addresses
     * the buffer as {@code column + numColumns * row}.
     */
    private static final String programSource =
        "__kernel void "+
        "sampleKernel(__global float *a, int numColumns)"+
        "{"+
        "    int gidX = get_global_id(0);"+
        "    int gidY = get_global_id(1);"+
        "    a[gidX+numColumns*gidY] *= 2;"+
        "}";

    // OpenCL objects shared by the methods of this sample. They are created
    // in defaultInitialization() and released in releaseResources().
    private static cl_context context;
    private static cl_command_queue commandQueue;
    private static cl_kernel kernel;
    private static cl_program program;

    /**
     * Runs the sample: uploads a small matrix, doubles every element on the
     * device, downloads the result and prints it row by row.
     *
     * @param args Not used
     * @throws IllegalStateException If no OpenCL platform or context
     *         could be obtained
     */
    public static void main(String args[])
    {
        // Create input array. The kernel assumes that all rows have the
        // same length, so the column count is taken from the first row.
        float array[][] = {{1,2,3},{4,5,6},{7,8,9}};
        int numRows = array.length;
        int numColumns = array[0].length;

        cl_mem mem = null;
        try
        {
            defaultInitialization();

            // Allocate the memory object
            mem = clCreateBuffer(context, CL_MEM_READ_WRITE,
                (long) Sizeof.cl_float * numRows * numColumns, null, null);

            // Write the source array into the buffer
            writeBuffer2D(commandQueue, mem, array);

            // Execute the kernel with one work-item per matrix element
            clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(mem));
            clSetKernelArg(kernel, 1, Sizeof.cl_int,
                Pointer.to(new int[]{ numColumns }));
            clEnqueueNDRangeKernel(commandQueue, kernel, 2, null,
                new long[]{ numColumns, numRows }, null, 0, null, null);

            // Read the buffer back to the array
            readBuffer2D(commandQueue, mem, array);
        }
        finally
        {
            // Release everything that was created, even when a step failed
            releaseResources(mem);
        }

        System.out.println("Result: ");
        for (int r=0; r<array.length; r++)
        {
            System.out.println(java.util.Arrays.toString(array[r]));
        }
    }

    /**
     * Releases the given memory object and the shared kernel, program,
     * command queue and context. Objects that are {@code null} (because
     * they were never created) are skipped.
     *
     * @param mem The memory object to release, may be {@code null}
     */
    private static void releaseResources(cl_mem mem)
    {
        if (mem != null)
        {
            clReleaseMemObject(mem);
        }
        if (kernel != null)
        {
            clReleaseKernel(kernel);
        }
        if (program != null)
        {
            clReleaseProgram(program);
        }
        if (commandQueue != null)
        {
            clReleaseCommandQueue(commandQueue);
        }
        if (context != null)
        {
            clReleaseContext(context);
        }
    }

    /**
     * Copies the rows of the given 2D array into the given buffer, one
     * blocking write per row. The rows are placed one after another,
     * starting at byte offset 0.
     *
     * @param commandQueue The command queue used for the write operations
     * @param buffer The target buffer
     * @param array The source array
     */
    private static void writeBuffer2D(cl_command_queue commandQueue, cl_mem buffer, float array[][])
    {
        long byteOffset = 0;
        for (int r=0; r<array.length; r++)
        {
            // Computed as long so that very long rows do not overflow int
            long bytes = (long) array[r].length * Sizeof.cl_float;
            if (bytes == 0)
            {
                // Nothing to transfer; a zero-size write may be rejected
                // by the OpenCL implementation
                continue;
            }
            clEnqueueWriteBuffer(
                commandQueue, buffer, CL_TRUE, byteOffset, bytes,
                Pointer.to(array[r]), 0, null, null);
            byteOffset += bytes;
        }
    }

    /**
     * Copies the contents of the given buffer into the rows of the given
     * 2D array, one blocking read per row. The rows are read one after
     * another, starting at byte offset 0.
     *
     * @param commandQueue The command queue used for the read operations
     * @param buffer The source buffer
     * @param array The target array
     */
    private static void readBuffer2D(cl_command_queue commandQueue, cl_mem buffer, float array[][])
    {
        long byteOffset = 0;
        for (int r=0; r<array.length; r++)
        {
            // Computed as long so that very long rows do not overflow int
            long bytes = (long) array[r].length * Sizeof.cl_float;
            if (bytes == 0)
            {
                // Nothing to transfer; a zero-size read may be rejected
                // by the OpenCL implementation
                continue;
            }
            clEnqueueReadBuffer(
                commandQueue, buffer, CL_TRUE, byteOffset, bytes,
                Pointer.to(array[r]), 0, null, null);
            byteOffset += bytes;
        }
    }

    /**
     * Creates the context, command queue, program and kernel used by this
     * sample and stores them in the static fields. A GPU context is tried
     * first, with a CPU context as fallback.
     *
     * @throws IllegalStateException If no platform is available or no
     *         context could be created
     */
    private static void defaultInitialization()
    {
        // Obtain the platform IDs and initialize the context properties
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        if (platforms[0] == null)
        {
            throw new IllegalStateException("No OpenCL platform available");
        }
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a GPU device. Exceptions are not
        // enabled yet, so that a failure here can be detected via the
        // null check and answered with the CPU fallback.
        context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
        if (context == null)
        {
            // If no context for a GPU device could be created,
            // try to create one for a CPU device.
            context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);

            if (context == null)
            {
                // Fail fast instead of continuing with a null context
                throw new IllegalStateException("Unable to create a context");
            }
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Query the size (in bytes) of the device list of the context
        long numBytes[] = new long[1];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_id for the first device
        int numDevices = (int) (numBytes[0] / Sizeof.cl_device_id);
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

        // Create a command-queue
        commandQueue =
            clCreateCommandQueue(context, devices[0], 0, null);

        // Create the program from the source code
        program = clCreateProgramWithSource(context,
            1, new String[]{ programSource }, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);

        // Create the kernel
        kernel = clCreateKernel(program, "sampleKernel", null);
    }
}

Soyeed · 7. Juli 2010 um 04:22

Thanks Marco
Though I had figured out that I would have to use a single-D array, your reasoning explains it all.
Thanks a lot. The help is really appreciated.