Hi
I adjusted the program accordingly, although I’m not 100% sure if this is all right.
BTW: You’ve been passing single integer values to the kernel using cl_mem-objects. For example, you declared your kernel as
__kernel void pi(__global const int * iterNumb ...)
{
int iter = iterNumb[0];
...
And I assume that you have done something like this on Java side:
int iterNumb[] = new int[1];
iterNumb[0] = 12345;
cl_mem iterNumbMem = clCreateBuffer(context, CL_MEM_READ_WRITE, 1 * Sizeof.cl_int, null, null);
clEnqueueWriteBuffer(commandQueue, iterNumbMem, true, 0, Sizeof.cl_int, Pointer.to(iterNumb), 0, null, null);
clSetKernelArg(kernel, 0, 1 * Sizeof.cl_mem, Pointer.to(iterNumbMem));
But single primitive values may be passed directly, which is much easier. Just declare your kernel as
__kernel void pi(int iter, ...)
Then you can do the following on Java side:
int iter = 12345;
clSetKernelArg(kernel, 0, Sizeof.cl_int, Pointer.to(new int[]{iter}));
Here’s the adjusted program and the respective kernel:
package test;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import static org.jocl.CL.*;
import org.jocl.*;
public class PITest2
{
private static String programSource0 = readFile("kernels/QuadFloat.cl");
private static String programSource1 = readFile("kernels/Picl2.cl");
public static void main(String args[])
{
// Create input- and output data
int n = 512;
int intervals = 10000000;
// Obtain the platform IDs and initialize the context properties
cl_platform_id platforms[] = new cl_platform_id[1];
clGetPlatformIDs(platforms.length, platforms, null);
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);
// Create an OpenCL context on a GPU device
cl_context context = clCreateContextFromType(contextProperties,
CL_DEVICE_TYPE_GPU, null, null, null);
if (context == null)
{
// If no context for a GPU device could be created,
// try to create one for a CPU device.
context = clCreateContextFromType(contextProperties,
CL_DEVICE_TYPE_CPU, null, null, null);
if (context == null)
{
System.out.println("Unable to create a context");
return;
}
}
// Enable exceptions and subsequently omit error checks in this sample
CL.setExceptionsEnabled(true);
// Get the list of GPU devices associated with the context
// and obtain the cl_device_id for the first device
long numBytes[] = new long[1];
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES,
numBytes[0], Pointer.to(devices), null);
// Create a command-queue, program and kernel
cl_command_queue commandQueue = clCreateCommandQueue(context,
devices[0], 0, null);
cl_program program = clCreateProgramWithSource(context, 2,
new String[] { programSource0, programSource1 }, null, null);
clBuildProgram(program, 0, null, null, null, null);
cl_kernel kernel = clCreateKernel(program, "pi", null);
// Allocate the memory objects for the input- and output data
cl_mem piMem = clCreateBuffer(context, CL_MEM_READ_WRITE,
Sizeof.cl_float, null, null);
long global_work_size[] = new long[] { n };
// Set the arguments for the kernel
clSetKernelArg(kernel, 0, Sizeof.cl_int,
Pointer.to(new int[] { intervals }));
clSetKernelArg(kernel, 1, Sizeof.cl_mem,
Pointer.to(piMem));
clSetKernelArg(kernel, 2, n * Sizeof.cl_float, null);
long before = System.nanoTime();
cl_event kernelEvent1 = new cl_event();
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, global_work_size,
null, 0, null, kernelEvent1);
// Wait for the the events, i.e. until the kernels have completed
System.out.println("Waiting for events...");
clWaitForEvents(1, new cl_event[] { kernelEvent1 });
long after = System.nanoTime();
double kernelTime = (double) (after - before) / 1e9;
System.out.println("kernel time(secs): " + kernelTime);
// Read the output data
float result[] = new float[1];
clEnqueueReadBuffer(commandQueue, piMem, CL_TRUE, 0,
Sizeof.cl_float, Pointer.to(result), 0, null, null);
System.out.printf("OpenCL: %.20f
", result[0]);
// Compute the result using Java (only float)
float pi = 0;
float dstArray[] = new float[n];
before = System.nanoTime();
test(intervals, dstArray, n);
after = System.nanoTime();
double javaTime = (double) (after - before) / 1e9;
System.out.println("java time(secs): " + javaTime);
for (int i = 0; i < dstArray.length; i++)
{
pi += dstArray**;
}
System.out.printf("Java : %.20f
", pi);
// Release kernel, program, and memory objects
clReleaseMemObject(piMem);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
}
private static String readFile(String fileName)
{
try
{
BufferedReader br = new BufferedReader(new FileReader(fileName));
// usamos el string builder ya que los strings son inmutables en
// java
StringBuilder sb = new StringBuilder();
String line = null;
while (true)
{
line = br.readLine();
if (line == null)
{
break;
}
sb.append(line + "
");
}
return sb.toString();
}
catch (IOException e)
{
e.printStackTrace();
return "";
}
}
private static void test(int intervals, float solution[], int get_global_size)
{
for (int pid = 0; pid < get_global_size; pid++)
{
// uint pid = get_global_id(0);
// uint globalSize = get_global_size(0);
int globalSize = get_global_size;
float intervalWidth = 0;
float x = 0;
float sum = 0.0f;
int elements = intervals / globalSize;
int rest = intervals % globalSize;
int length = elements;
if (pid == globalSize - 1)
{
length += rest;
}
intervalWidth = 1.0f / (float) intervals;
for (int i = pid * elements; i < pid * elements + length; i++)
{
x = (i + 0.5f) * intervalWidth;
sum = sum + 4.0f / (1.0f + x * x);
}
solution[pid] = sum * intervalWidth;
}
}
}
Kernel “Picl2.cl”
//OpenCL extensions for double
//#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * Midpoint-rule integration of 4/(1+x^2) over [0,1] (= pi), followed by a
 * sequential-addressing parallel reduction in local memory; work-item 0
 * writes the final sum to pi[0].
 *
 * NOTE(review): sdata is indexed with the GLOBAL id and all work-items are
 * reduced through the same __local buffer, so this kernel is only correct
 * when launched as a single work-group whose size is a power of two --
 * confirm on the host side (the sample uses global size 512, local null).
 */
__kernel void pi(unsigned int iter, __global float * pi, __local float * sdata)
{
    unsigned pid = get_global_id(0);
    unsigned nth = get_global_size(0);
    float x = 0.0f, sum = 0.0f;
    sdata[pid] = 0.0f;
    int step = iter / nth;     // intervals per work-item
    int rest = iter % nth;     // leftover intervals
    float paso = 1.0f / iter;  // interval width
    // each work-item integrates its own chunk [pid*step, (pid+1)*step)
    for (int i = pid*step; i < (pid+1)*step; i++) {
        x = (i + 0.5f) * paso;
        sum = sum + 4.0f / (1.0f + x * x);
    }
    // The last work-item also integrates the 'rest' leftover intervals.
    // BUG FIX: those are the FINAL intervals [nth*step, nth*step+rest);
    // the original looped over [pid*step, pid*step+rest), double-counting
    // intervals it had already summed and never evaluating the tail.
    if (pid == nth-1) {
        for (int i = nth*step; i < nth*step+rest; i++) {
            x = (i + 0.5f) * paso;
            sum = sum + 4.0f / (1.0f + x * x);
        }
    }
    // publish the partial result to local memory for the reduction
    sdata[pid] = sum * paso;
    // wait for all work-items to finish writing to local memory
    barrier(CLK_LOCAL_MEM_FENCE);
    // sequential-addressing reduction (no bank conflicts); s >>= 1 halves
    // the active range each pass, which assumes nth is a power of two
    for (unsigned int s = nth/2; s > 0; s >>= 1)
    {
        if (pid < s) {
            sdata[pid] += sdata[pid+s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // write the reduced result back to global memory
    if (pid == 0) pi[0] = sdata[0];
}