/*
* A quick port of the NVIDIA OpenCL Scan example to JOCL
*/
/*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
*
* NVIDIA Corporation and its licensors retain all intellectual property and
* proprietary rights in and to this software and related documentation.
* Any use, reproduction, disclosure, or distribution of this software
* and related documentation without an express license agreement from
* NVIDIA Corporation is strictly prohibited.
*
* Please refer to the applicable NVIDIA end user license agreement (EULA)
* associated with this source code for terms and conditions that govern
* your use of this NVIDIA software.
*
*/
package test;
import static com.ithilian.jclu.CLUtils.*;
import static org.jocl.CL.*;
import java.util.Random;
import org.jocl.*;
public class JOCLScanNVIDIA
{
public static final boolean GPU_PROFILING = true;
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
public static void main(String argv[])
{
// Start logs
shrLog("Starting...
");
cl_platform_id cpPlatform; //OpenCL platform
cl_device_id cdDevice; //OpenCL device
cl_device_id cdDeviceA[] = new cl_device_id[1];
cl_context cxGPUContext; //OpenCL context
cl_command_queue cqCommandQueue; //OpenCL command que
cl_mem d_Input, d_Output; //OpenCL memory buffer objects
int h_Input[], h_OutputCPU[], h_OutputGPU[];
final int N = 13 * 1048576 / 2;
shrLog("Allocating and initializing host arrays...
");
h_Input = new int[N];
h_OutputCPU = new int[N];
h_OutputGPU = new int[N];
Random random = new Random(2009);
for(int i = 0; i < N; i++)
h_Input** = random.nextInt();
shrLog("Initializing OpenCL...
");
//Get the NVIDIA platform
cpPlatform = oclGetPlatform(0);
//Get a GPU device
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, cdDeviceA, null);
cdDevice = cdDeviceA[0];
//Create the context
cxGPUContext = clCreateContext(null, 1, cdDeviceA, null, null, null);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, null);
shrLog("Initializing OpenCL scan...
");
initScan(cxGPUContext, cqCommandQueue, argv);
shrLog("Creating OpenCL memory objects...
");
d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N * Sizeof.cl_uint, Pointer.to(h_Input), null);
d_Output = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * Sizeof.cl_uint, null, null);
boolean globalFlag = true; // init pass/fail flag to pass
long szWorkgroup[] = new long[1];
final int iCycles = 100;
shrLog("*** Running GPU scan for short arrays (%d identical iterations)...
", iCycles);
for(int arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength *= 2)
{
shrLog("Running scan for %d elements (%d arrays)...
", arrayLength, N / arrayLength);
clFinish(cqCommandQueue);
shrDeltaT(0);
for (int i = 0; i<iCycles; i++)
{
szWorkgroup[0] = scanExclusiveShort(
cqCommandQueue,
d_Output,
d_Input,
N / arrayLength,
arrayLength
);
}
clFinish(cqCommandQueue);
double timerValue = shrDeltaT(0)/(double)iCycles;
shrLog("Validating the results...
");
shrLog(" ...reading back OpenCL memory
");
clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * Sizeof.cl_uint, Pointer.to(h_OutputGPU), 0, null, null);
shrLog(" ...scanExclusiveHost()
");
scanExclusiveHost(
h_OutputCPU,
h_Input,
N / arrayLength,
arrayLength
);
// Compare GPU results with CPU results and accumulate error for this test
shrLog(" ...comparing the results
");
boolean localFlag = true;
for(int i = 0; i < N; i++)
{
if(h_OutputCPU** != h_OutputGPU**)
{
localFlag = false;
break;
}
}
// Log message on individual test result, then accumulate to global flag
shrLog(" ...Results %s
", (localFlag == true) ? "Match" : "DON'T Match !!!");
globalFlag = globalFlag && localFlag;
if (GPU_PROFILING)
{
if (arrayLength == MAX_SHORT_ARRAY_SIZE)
{
shrLog("
");
shrLog("oclScan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %d Elements, NumDevsUsed = %d, Workgroup = %d
",
(1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup[0]);
shrLog("
");
}
}
}
shrLog("*** Running GPU scan for large arrays (%d identical iterations)...
", iCycles);
for(int arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength *= 2)
{
shrLog("Running scan for %d elements (%d arrays)...
", arrayLength, N / arrayLength);
clFinish(cqCommandQueue);
shrDeltaT(0);
for (int i = 0; i<iCycles; i++)
{
szWorkgroup[0] = scanExclusiveLarge(
cqCommandQueue,
d_Output,
d_Input,
N / arrayLength,
arrayLength
);
}
clFinish(cqCommandQueue);
double timerValue = shrDeltaT(0)/(double)iCycles;
shrLog("Validating the results...
");
shrLog(" ...reading back OpenCL memory
");
clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * Sizeof.cl_uint, Pointer.to(h_OutputGPU), 0, null, null);
shrLog(" ...scanExclusiveHost()
");
scanExclusiveHost(
h_OutputCPU,
h_Input,
N / arrayLength,
arrayLength
);
// Compare GPU results with CPU results and accumulate error for this test
shrLog(" ...comparing the results
");
boolean localFlag = true;
for(int i = 0; i < N; i++)
{
if(h_OutputCPU** != h_OutputGPU**)
{
localFlag = false;
break;
}
}
// Log message on individual test result, then accumulate to global flag
shrLog(" ...Results %s
", (localFlag == true) ? "Match" : "DON'T Match !!!");
globalFlag = globalFlag && localFlag;
if (GPU_PROFILING)
{
if (arrayLength == MAX_LARGE_ARRAY_SIZE)
{
shrLog("
");
shrLog("oclScan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %d Elements, NumDevsUsed = %d, Workgroup = %d
",
(1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup[0]);
shrLog("
");
}
}
}
// pass or fail (cumulative... all tests in the loop)
shrLog("TEST %s
", globalFlag ? "PASSED" : "FAILED !!!");
shrLog("Shutting down...
");
//Release kernels and program
closeScan();
//Release other OpenCL Objects
clReleaseMemObject(d_Output);
clReleaseMemObject(d_Input);
clReleaseCommandQueue(cqCommandQueue);
clReleaseContext(cxGPUContext);
//
}
////////////////////////////////////////////////////////////////////////////////
// OpenCL scan kernel launchers
////////////////////////////////////////////////////////////////////////////////
//OpenCL scan program handle
static cl_program
cpProgram;
//OpenCL scan kernel handles
static cl_kernel
ckScanExclusiveLocal1, ckScanExclusiveLocal2, ckUniformUpdate;
static cl_mem
d_Buffer;
//All three kernels run 512 threads per workgroup
//Must be a power of two
static final int WORKGROUP_SIZE = 512;
static final String compileOptions = "-D WORKGROUP_SIZE=512";
static void initScan(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, String argv[]){
shrLog(" ...loading Scan.cl
");
String cScan = oclLoadProgramSource("kernels/Scan.cl");
shrLog(" ...creating scan program
");
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, new String[]{cScan}, new long[]{cScan.length()}, null);
shrLog(" ...building scan program
");
clBuildProgram(cpProgram, 0, null, compileOptions, null, null);
shrLog(" ...creating scan kernels
");
ckScanExclusiveLocal1 = clCreateKernel(cpProgram, "scanExclusiveLocal1", null);
ckScanExclusiveLocal2 = clCreateKernel(cpProgram, "scanExclusiveLocal2", null);
ckUniformUpdate = clCreateKernel(cpProgram, "uniformUpdate", null);
shrLog( " ...checking minimum supported workgroup size
");
//Check for work group size
cl_device_id device = new cl_device_id();
long szScanExclusiveLocal1[] = new long[1];
long szScanExclusiveLocal2[] = new long[1];
long szUniformUpdate[] = new long[1];
clGetCommandQueueInfo(cqParamCommandQue, CL_QUEUE_DEVICE, Sizeof.cl_device_id, Pointer.to(device), null);
clGetKernelWorkGroupInfo(ckScanExclusiveLocal1, device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szScanExclusiveLocal1), null);
clGetKernelWorkGroupInfo(ckScanExclusiveLocal2, device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szScanExclusiveLocal2), null);
clGetKernelWorkGroupInfo(ckUniformUpdate, device, CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(szUniformUpdate), null);
if( (szScanExclusiveLocal1[0] < WORKGROUP_SIZE) || (szScanExclusiveLocal2[0] < WORKGROUP_SIZE) || (szUniformUpdate[0] < WORKGROUP_SIZE) ){
shrLog("ERROR: Minimum work-group size %d required by this application is not supported on this device.
", WORKGROUP_SIZE);
System.exit(0);
}
shrLog(" ...allocating internal buffers
");
d_Buffer = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (MAX_BATCH_ELEMENTS / (4 * WORKGROUP_SIZE)) * Sizeof.cl_uint, null, null);
}
static void closeScan(){
clReleaseMemObject(d_Buffer);
clReleaseKernel(ckUniformUpdate);
clReleaseKernel(ckScanExclusiveLocal2);
clReleaseKernel(ckScanExclusiveLocal1);
clReleaseProgram(cpProgram);
}
////////////////////////////////////////////////////////////////////////////////
// Common definitions
////////////////////////////////////////////////////////////////////////////////
static final int MAX_BATCH_ELEMENTS = 64 * 1048576;
static final int MIN_SHORT_ARRAY_SIZE = 4;
static final int MAX_SHORT_ARRAY_SIZE = 4 * WORKGROUP_SIZE;
static final int MIN_LARGE_ARRAY_SIZE = 8 * WORKGROUP_SIZE;
static final int MAX_LARGE_ARRAY_SIZE = 4 * WORKGROUP_SIZE * WORKGROUP_SIZE;
static int iSnapUp(int dividend, int divisor){
return ((dividend % divisor) == 0) ? dividend : (dividend - dividend % divisor + divisor);
}
static int factorRadix2(int log2L, int L){
if(L==0){
log2L = 0;
return 0;
}else{
for(log2L = 0; (L & 1) == 0; L >>= 1, log2L++);
return L;
}
}
////////////////////////////////////////////////////////////////////////////////
// Short scan launcher
////////////////////////////////////////////////////////////////////////////////
static long scanExclusiveLocal1(
cl_command_queue cqCommandQueue,
cl_mem d_Dst,
cl_mem d_Src,
int n,
int size
){
long localWorkSize, globalWorkSize;
clSetKernelArg(ckScanExclusiveLocal1, 0, Sizeof.cl_mem, Pointer.to(d_Dst));
clSetKernelArg(ckScanExclusiveLocal1, 1, Sizeof.cl_mem, Pointer.to(d_Src));
clSetKernelArg(ckScanExclusiveLocal1, 2, 2 * WORKGROUP_SIZE * Sizeof.cl_uint, null);
clSetKernelArg(ckScanExclusiveLocal1, 3, Sizeof.cl_uint, Pointer.to(new int[]{size}));
localWorkSize = WORKGROUP_SIZE;
globalWorkSize = (n * size) / 4;
clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal1, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
return localWorkSize;
}
static long scanExclusiveShort(
cl_command_queue cqCommandQueue,
cl_mem d_Dst,
cl_mem d_Src,
int batchSize,
int arrayLength
){
//Check power-of-two factorization
int log2L = 0;
int factorizationRemainder = factorRadix2(log2L, arrayLength);
oclCheckError( factorizationRemainder == 1, shrTRUE);
//Check supported size range
oclCheckError( (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE), shrTRUE );
//Check total batch size limit
oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );
//Check all work-groups to be fully packed with data
oclCheckError( (batchSize * arrayLength) % (4 * WORKGROUP_SIZE) == 0, shrTRUE);
return scanExclusiveLocal1(
cqCommandQueue,
d_Dst,
d_Src,
batchSize,
arrayLength
);
}
////////////////////////////////////////////////////////////////////////////////
// Large scan launcher
////////////////////////////////////////////////////////////////////////////////
static void scanExclusiveLocal2(
cl_command_queue cqCommandQueue,
cl_mem d_Buffer,
cl_mem d_Dst,
cl_mem d_Src,
int n,
int size
){
long localWorkSize, globalWorkSize;
int elements = n * size;
clSetKernelArg(ckScanExclusiveLocal2, 0, Sizeof.cl_mem, Pointer.to(d_Buffer));
clSetKernelArg(ckScanExclusiveLocal2, 1, Sizeof.cl_mem, Pointer.to(d_Dst));
clSetKernelArg(ckScanExclusiveLocal2, 2, Sizeof.cl_mem, Pointer.to(d_Src));
clSetKernelArg(ckScanExclusiveLocal2, 3, 2 * WORKGROUP_SIZE * Sizeof.cl_uint, null);
clSetKernelArg(ckScanExclusiveLocal2, 4, Sizeof.cl_uint, Pointer.to(new int[]{elements}));
clSetKernelArg(ckScanExclusiveLocal2, 5, Sizeof.cl_uint, Pointer.to(new int[]{size}));
localWorkSize = WORKGROUP_SIZE;
globalWorkSize = iSnapUp(elements, WORKGROUP_SIZE);
clEnqueueNDRangeKernel(cqCommandQueue, ckScanExclusiveLocal2, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
}
static long uniformUpdate(
cl_command_queue cqCommandQueue,
cl_mem d_Dst,
cl_mem d_Buffer,
int n
){
long localWorkSize, globalWorkSize;
clSetKernelArg(ckUniformUpdate, 0, Sizeof.cl_mem, Pointer.to(d_Dst));
clSetKernelArg(ckUniformUpdate, 1, Sizeof.cl_mem, Pointer.to(d_Buffer));
localWorkSize = WORKGROUP_SIZE;
globalWorkSize = n * WORKGROUP_SIZE;
clEnqueueNDRangeKernel(cqCommandQueue, ckUniformUpdate, 1, null, new long[]{globalWorkSize}, new long[]{localWorkSize}, 0, null, null);
return localWorkSize;
}
static long scanExclusiveLarge(
cl_command_queue cqCommandQueue,
cl_mem d_Dst,
cl_mem d_Src,
int batchSize,
int arrayLength
){
//Check power-of-two factorization
int log2L = 0;
int factorizationRemainder = factorRadix2(log2L, arrayLength);
oclCheckError( factorizationRemainder == 1, shrTRUE);
//Check supported size range
oclCheckError( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE), shrTRUE );
//Check total batch size limit
oclCheckError( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS, shrTRUE );
scanExclusiveLocal1(
cqCommandQueue,
d_Dst,
d_Src,
(batchSize * arrayLength) / (4 * WORKGROUP_SIZE),
4 * WORKGROUP_SIZE
);
scanExclusiveLocal2(
cqCommandQueue,
d_Buffer,
d_Dst,
d_Src,
batchSize,
arrayLength / (4 * WORKGROUP_SIZE)
);
return uniformUpdate(
cqCommandQueue,
d_Dst,
d_Buffer,
(batchSize * arrayLength) / (4 * WORKGROUP_SIZE)
);
}
static void scanExclusiveHost(
int dst[],
int src[],
int batchSize,
int arrayLength
){
int offset = 0;
for(int i = 0; i < batchSize; i++, offset += arrayLength){
dst[offset+0] = 0;
for(int j = 1; j < arrayLength; j++)
dst[offset+j] = src[offset+j - 1] + dst[offset+j - 1];
}
}
static final boolean shrTRUE = true;
private static long nanoTimes[] = new long[]{-1,-1,-1};
private static void oclCheckError(boolean b0, boolean b1)
{
if (b0 != b1)
{
throw new RuntimeException();
}
}
private static void shrLog(String format, Object ... args)
{
System.out.printf(format, args);
}
private static double shrDeltaT(int n)
{
if (nanoTimes[n] == -1)
{
nanoTimes[n] = System.nanoTime();
return -9999.0;
}
long old = nanoTimes[n];
nanoTimes[n] = System.nanoTime();
return (nanoTimes[n]-old) / 1e9;
}
}