JOCL Prefix scan / Reduction

system · 3. Februar 2010 um 04:12

Hello,

Has anyone already ported either the prefix scan or the reduction algorithms to jocl? I need to use them, but I’m not sure if I should port all the host code for jocl, or just use something like JNI or Swig to call directly the host programs.

What are your suggestions?

shiftreduce · 3. Februar 2010 um 04:13

Sorry that I posted as unregistered, didn’t realize I wasn’t logged in

Marco13 · 3. Februar 2010 um 12:37

Hello

Hopefully, one day I’ll find more time for converting the samples. For now, I have done a quick (and slightly dirty :o ) conversion of the “Scan” sample. When I find the time, I’ll clean it up a little bit and add it to the samples on the website.

[code in next post]
Note that this sample uses the JCLU library fromhttp://www.ithilian.com/jclu/downloads/downloads.html but in a newer verision which may temprorarily obtained from http://www.jocl.org/JCLU-0.2-beta.jar !

Actually, I think for using a scan or reduction there is not so much code to port from C to Java at all: In general, you may use the “boilerplate” code to load a source file, compile it, and then launch the contained kernel as appropriate for your application case… However, the NVIDIA sample launches the kernel with different sizes, verifies the results and does some benchmarking, so it may be interesting for others as well…

Marco13 · 3. Februar 2010 um 12:38

/*
 * A quick port of the NVIDIA OpenCL Scan example to JOCL
 */

/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */

package test;

import static com.ithilian.jclu.CLUtils.*;
import static org.jocl.CL.*;

import java.util.Random;

import org.jocl.*;

public class JOCLScanNVIDIA
{
    public static final boolean GPU_PROFILING = true;
    
    ////////////////////////////////////////////////////////////////////////////////
    // Main program
    ////////////////////////////////////////////////////////////////////////////////
    public static void main(String argv[])
    {
        // Start logs
        shrLog("Starting...

"); 

        cl_platform_id cpPlatform;       //OpenCL platform
        cl_device_id cdDevice;           //OpenCL device
        cl_device_id cdDeviceA[] = new cl_device_id[1];
        cl_context      cxGPUContext;    //OpenCL context
        cl_command_queue cqCommandQueue; //OpenCL command que
        cl_mem d_Input, d_Output;        //OpenCL memory buffer objects

        int h_Input[], h_OutputCPU[], h_OutputGPU[];
        final int N = 13 * 1048576 / 2;

        shrLog("Allocating and initializing host arrays...
");
            h_Input     = new int[N];
            h_OutputCPU = new int[N];
            h_OutputGPU = new int[N];
            Random random = new Random(2009);
            for(int i = 0; i < N; i++)
                h_Input** = random.nextInt();

        shrLog("Initializing OpenCL...
");
            //Get the NVIDIA platform
            cpPlatform = oclGetPlatform(0);

            //Get a GPU device
            clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, cdDeviceA, null);
            cdDevice = cdDeviceA[0];
            
            //Create the context
            cxGPUContext = clCreateContext(null, 1, cdDeviceA, null, null, null);

            //Create a command-queue
            cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, null);

        shrLog("Initializing OpenCL scan...
");
            initScan(cxGPUContext, cqCommandQueue, argv);

        shrLog("Creating OpenCL memory objects...

");
            d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N * Sizeof.cl_uint, Pointer.to(h_Input), null);
            d_Output = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * Sizeof.cl_uint, null, null);
            
        boolean globalFlag = true; // init pass/fail flag to pass
        long szWorkgroup[] = new long[1];
        final int iCycles = 100;
        shrLog("*** Running GPU scan for short arrays (%d identical iterations)...

", iCycles);
        for(int arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength *= 2)
        {
            shrLog("Running scan for %d elements (%d arrays)...
", arrayLength, N / arrayLength);
                clFinish(cqCommandQueue);
                shrDeltaT(0);
                for (int i = 0; i<iCycles; i++)
                {
                    szWorkgroup[0] = scanExclusiveShort(
                        cqCommandQueue,
                        d_Output,
                        d_Input,
                        N / arrayLength,
                        arrayLength
                    );
                }
                clFinish(cqCommandQueue);
                double timerValue = shrDeltaT(0)/(double)iCycles;

            shrLog("Validating the results...
"); 
                shrLog(" ...reading back OpenCL memory
");
                    clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * Sizeof.cl_uint, Pointer.to(h_OutputGPU), 0, null, null);

                shrLog(" ...scanExclusiveHost()
");
                    scanExclusiveHost(
                        h_OutputCPU,
                        h_Input,
                        N / arrayLength,
                        arrayLength
                    );

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results
");
                    boolean localFlag = true;
                    for(int i = 0; i < N; i++)
                    {
                        if(h_OutputCPU** != h_OutputGPU**)
                        {
                            localFlag = false;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s

", (localFlag == true) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                if (GPU_PROFILING)
                {
                    if (arrayLength == MAX_SHORT_ARRAY_SIZE)
                    {
                        shrLog("
");
                        shrLog("oclScan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %d Elements, NumDevsUsed = %d, Workgroup = %d
", 
                               (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup[0]);
                        shrLog("
");
                    }
                }
        }

        shrLog("*** Running GPU scan for large arrays (%d identical iterations)...

", iCycles);
        for(int arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength *= 2)
        {
            shrLog("Running scan for %d elements (%d arrays)...
", arrayLength, N / arrayLength);
                clFinish(cqCommandQueue);
                shrDeltaT(0);
                for (int i = 0; i<iCycles; i++)
                {
                    szWorkgroup[0] = scanExclusiveLarge(
                        cqCommandQueue,
                        d_Output,
                        d_Input,
                        N / arrayLength,
                        arrayLength
                    );
                }
                clFinish(cqCommandQueue);
                double timerValue = shrDeltaT(0)/(double)iCycles;

            shrLog("Validating the results...
"); 
                shrLog(" ...reading back OpenCL memory
");
                    clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * Sizeof.cl_uint, Pointer.to(h_OutputGPU), 0, null, null);

                shrLog(" ...scanExclusiveHost()
");
                    scanExclusiveHost(
                        h_OutputCPU,
                        h_Input,
                        N / arrayLength,
                        arrayLength
                    );

                // Compare GPU results with CPU results and accumulate error for this test
                shrLog(" ...comparing the results
");
                    boolean localFlag = true;
                    for(int i = 0; i < N; i++)
                    {
                        if(h_OutputCPU** != h_OutputGPU**)
                        {
                            localFlag = false;
                            break;
                        }
                    }

                // Log message on individual test result, then accumulate to global flag
                shrLog(" ...Results %s

", (localFlag == true) ? "Match" : "DON'T Match !!!");
                globalFlag = globalFlag && localFlag;

                if (GPU_PROFILING)
                {
                    if (arrayLength == MAX_LARGE_ARRAY_SIZE)
                    {
                        shrLog("
");
                        shrLog("oclScan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %d Elements, NumDevsUsed = %d, Workgroup = %d
", 
                               (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup[0]);
                        shrLog("
");
                    }
                }
        }

        // pass or fail (cumulative... all tests in the loop)
        shrLog("TEST %s

", globalFlag ? "PASSED" : "FAILED !!!");

        shrLog("Shutting down...
");
            //Release kernels and program
            closeScan();

            //Release other OpenCL Objects
            clReleaseMemObject(d_Output);
            clReleaseMemObject(d_Input);
            clReleaseCommandQueue(cqCommandQueue);
            clReleaseContext(cxGPUContext);

            //
    }

    
    
    
    ////////////////////////////////////////////////////////////////////////////////
     // OpenCL scan kernel launchers
     ////////////////////////////////////////////////////////////////////////////////
     //OpenCL scan program handle
     static cl_program
         cpProgram;
    
     //OpenCL scan kernel handles
     static cl_kernel
         ckScanExclusiveLocal1, ckScanExclusiveLocal2, ckUniformUpdate;
    
     static cl_mem
         d_Buffer;
    
     //All three kernels run 512 threads per workgroup
     //Must be a power of two
     static final int  WORKGROUP_SIZE = 512;
     static final String compileOptions = "-D WORKGROUP_SIZE=512";
    
     static void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, String argv[]){
    
         shrLog(" ...loading Scan.cl
");
             String cScan = oclLoadProgramSource("kernels/Scan.cl");
    
         shrLog(" ...creating scan program
");
             cpProgram = clCreateProgramWithSource(cxGPUContext, 1, new String[]{cScan}, new long[]{cScan.length()}, null);
    
         shrLog(" ...building scan program
");
             clBuildProgram(cpProgram, 0, null, compileOptions, null, null);
    
         shrLog(" ...creating scan kernels
");
             ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", null);
             ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", null);
             ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", null);
    
         shrLog( " ...checking minimum supported workgroup size
");
             //Check for work group size
             cl_device_id device = new cl_device_id();
             long szScanExclusiveLocal1[] = new long[1];
             long szScanExclusiveLocal2[] = new long[1];
             long szUniformUpdate[] = new long[1];
    
             clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, Sizeof.cl_device_id, Pointer.to(device), null);
             clGetKernelWorkGroupInfo(ckScanExclusiveLocal1,  device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szScanExclusiveLocal1), null);
             clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szScanExclusiveLocal2), null);
             clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szUniformUpdate), null);
    
             if( (szScanExclusiveLocal1[0] < WORKGROUP_SIZE) || (szScanExclusiveLocal2[0] < WORKGROUP_SIZE) || (szUniformUpdate[0] < WORKGROUP_SIZE) ){
                 shrLog("ERROR: Minimum work-group size %d required by this application is not supported on this device.
", WORKGROUP_SIZE);
                 System.exit(0);
             }
    
         shrLog(" ...allocating internal buffers
");
             d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * Sizeof.cl_uint, null, null);
     }
    
     static void closeScan(){
         clReleaseMemObject(d_Buffer);
         clReleaseKernel(ckUniformUpdate);
         clReleaseKernel(ckScanExclusiveLocal2);
         clReleaseKernel(ckScanExclusiveLocal1);
         clReleaseProgram(cpProgram);
     }
    
     ////////////////////////////////////////////////////////////////////////////////
     // Common definitions
     ////////////////////////////////////////////////////////////////////////////////
     static final int MAX_BATCH_ELEMENTS = 64 * 1048576;
     static final int MIN_SHORT_ARRAY_SIZE = 4;
     static final int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
     static final int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
     static final int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
    
     static int iSnapUp(int dividend, int divisor){
         return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
     }
    
     static int factorRadix2(int log2L, int L){
         if(L==0){
             log2L = 0;
             return 0;
         }else{
             for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
             return L;
         }
     }
    
     ////////////////////////////////////////////////////////////////////////////////
     // Short scan launcher
     ////////////////////////////////////////////////////////////////////////////////
     static long scanExclusiveLocal1(
         cl_command_queue cqCommandQueue,
         cl_mem d_Dst,
         cl_mem d_Src,
         int n,
         int size
     ){
         long localWorkSize, globalWorkSize;
    
         clSetKernelArg(ckScanExclusiveLocal1, 0, Sizeof.cl_mem, Pointer.to(d_Dst));
         clSetKernelArg(ckScanExclusiveLocal1, 1, Sizeof.cl_mem, Pointer.to(d_Src));
         clSetKernelArg(ckScanExclusiveLocal1, 2, 2 * WORKGROUP_SIZE * Sizeof.cl_uint, null);
         clSetKernelArg(ckScanExclusiveLocal1, 3, Sizeof.cl_uint, Pointer.to(new int[]{size}));
    
         localWorkSize = WORKGROUP_SIZE;
         globalWorkSize = (n * size) / 4;
    
         clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal1, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
    
         return localWorkSize;
     }
    
     static long scanExclusiveShort(
         cl_command_queue cqCommandQueue,
         cl_mem d_Dst,
         cl_mem d_Src,
         int batchSize,
         int arrayLength
     ){
         //Check power-of-two factorization
         int log2L = 0;
         int factorizationRemainder = factorRadix2(log2L, arrayLength);
         oclCheckError( factorizationRemainder == 1, shrTRUE);
    
         //Check supported size range
         oclCheckError( (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE), shrTRUE );
    
         //Check total batch size limit
         oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );
    
         //Check all work-groups to be fully packed with data
         oclCheckError( (batchSize * arrayLength) % (4 * WORKGROUP_SIZE) == 0, shrTRUE);
    
         return scanExclusiveLocal1(
             cqCommandQueue,
             d_Dst,
             d_Src,
             batchSize,
             arrayLength
         );
     }
    
     ////////////////////////////////////////////////////////////////////////////////
     // Large scan launcher
     ////////////////////////////////////////////////////////////////////////////////
     static void scanExclusiveLocal2(
         cl_command_queue cqCommandQueue,
         cl_mem d_Buffer,
         cl_mem d_Dst,
         cl_mem d_Src,
         int n,
         int size
     ){
         long localWorkSize, globalWorkSize;
    
         int elements = n * size;
         clSetKernelArg(ckScanExclusiveLocal2, 0, Sizeof.cl_mem, Pointer.to(d_Buffer));
         clSetKernelArg(ckScanExclusiveLocal2, 1, Sizeof.cl_mem, Pointer.to(d_Dst));
         clSetKernelArg(ckScanExclusiveLocal2, 2, Sizeof.cl_mem, Pointer.to(d_Src));
         clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * Sizeof.cl_uint, null);
         clSetKernelArg(ckScanExclusiveLocal2, 4, Sizeof.cl_uint, Pointer.to(new int[]{elements}));
         clSetKernelArg(ckScanExclusiveLocal2, 5, Sizeof.cl_uint, Pointer.to(new int[]{size}));
    
         localWorkSize = WORKGROUP_SIZE;
         globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
    
         clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
     }
    
     static long uniformUpdate(
         cl_command_queue cqCommandQueue,
         cl_mem d_Dst,
         cl_mem d_Buffer,
         int n
     ){
         long localWorkSize, globalWorkSize;
    
         clSetKernelArg(ckUniformUpdate, 0, Sizeof.cl_mem, Pointer.to(d_Dst));
         clSetKernelArg(ckUniformUpdate, 1, Sizeof.cl_mem, Pointer.to(d_Buffer));
    
         localWorkSize = WORKGROUP_SIZE;
         globalWorkSize = n * WORKGROUP_SIZE;
    
         clEnqueueNDRangeKernel(cqCommandQueue, ckUniformUpdate, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
    
         return localWorkSize;
     }
    
     static long scanExclusiveLarge(
         cl_command_queue cqCommandQueue,
         cl_mem d_Dst,
         cl_mem d_Src,
         int batchSize,
         int arrayLength
     ){
         //Check power-of-two factorization
         int log2L = 0;
         int factorizationRemainder = factorRadix2(log2L, arrayLength);
         oclCheckError( factorizationRemainder == 1, shrTRUE);
    
         //Check supported size range
         oclCheckError( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE );
    
         //Check total batch size limit
         oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );
    
         scanExclusiveLocal1(
             cqCommandQueue,
             d_Dst,
             d_Src,
             (batchSize * arrayLength) / (4 * WORKGROUP_SIZE),
             4 * WORKGROUP_SIZE
         );
    
         scanExclusiveLocal2(
             cqCommandQueue,
             d_Buffer,
             d_Dst,
             d_Src,
             batchSize,
             arrayLength / (4 * WORKGROUP_SIZE)
         );
    
         return uniformUpdate(
             cqCommandQueue,
             d_Dst,
             d_Buffer,
             (batchSize * arrayLength) / (4 * WORKGROUP_SIZE)
         );
     }

     
     
     
     static void scanExclusiveHost(
         int dst[],
         int src[],
         int batchSize,
         int arrayLength
     ){
         int offset = 0;
         for(int i = 0; i < batchSize; i++, offset += arrayLength){
             dst[offset+0] = 0;
             for(int j = 1; j < arrayLength; j++)
                 dst[offset+j] = src[offset+j - 1] + dst[offset+j - 1];
         }
     }
     
    static final boolean shrTRUE = true;
    private static long nanoTimes[] = new long[]{-1,-1,-1};
     
    private static void oclCheckError(boolean b0, boolean b1)
    {
        if (b0 != b1)
        {
            throw new RuntimeException();
        }
    }
    
    private static void shrLog(String format, Object ... args)
    {
        System.out.printf(format, args);
    }
    
    
    private static double shrDeltaT(int n)
    {
        if (nanoTimes[n] == -1)
        {
            nanoTimes[n] = System.nanoTime();
            return -9999.0;
        }
        long old = nanoTimes[n];
        nanoTimes[n] = System.nanoTime();
        return (nanoTimes[n]-old) / 1e9;
    }
    
}

shiftreduce · 8. Februar 2010 um 05:28

Thanks a lot!

Really liking the whole JOCL development. Hope it only gets better!