Hi all,
For those who like to compare:
My PC: AMD 8350 (CPU), 2x R9 380 4GB (GPU) and 16 GB RAM.
The first version is the double-precision (DP) code and the second is the single-precision (SP) version.
For me, the result for DP is:
**GPU init + buffer create was 245 ms
Time to execute kernel with 27000000 calculations; 244 ms
Control data [0,0,0]: 2.381773290676036
Control data [9,0,0]: 1.7791850202025574
Control data [0,9,0]: 51.99337081388339
Control data [0,0,1]: 301.40362615629846
Control data [9,1,9]: 899.8805224704302
Time to execute on single thread CPU 27000000 calculations; 24620 ms**
and for the SP version:
**GPU init + buffer create was 192 ms
Time to execute kernel with 27000000 calculations; 120 ms
Control data [0,0,0]: 2.3817732
Control data [9,0,0]: 1.779185
Control data [0,9,0]: 51.99337
Control data [0,0,1]: 301.4036
Control data [9,1,9]: 899.8805**
DP code:
import org.jocl.*;
import static org.jocl.CL.*;
public class Driver
{
private static int arrayXLength = 300;
private static int arrayYLength = 300;
private static int arrayZLength = 300;
private static boolean COMPARE_WITH_CPU = true;
private static String programSource =
"__kernel void sampleKernel(__global const int *in, __global double *out)"+
"{"+
" int gid = get_global_id(0);"+
" out[gid] = sin((double)in[gid]) + cos((double)in[gid]) + sqrt((double)in[gid]);"+
"}";
private static cl_context context;
private static cl_command_queue commandQueue;
private static cl_kernel kernel;
private static cl_program program;
public static void main(String args[])
{
cl_platform_id platforms[] = new cl_platform_id[1];
clGetPlatformIDs(platforms.length, platforms, null);
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);
CL.setExceptionsEnabled(true);
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
long numBytes[] = new long[1];
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0], Pointer.to(devices), null);
commandQueue = clCreateCommandQueue(context, devices[0], 0, null);
program = clCreateProgramWithSource(context, 1, new String[]{ programSource }, null, null);
clBuildProgram(program, 0, null, null, null, null);
kernel = clCreateKernel(program, "sampleKernel", null);
int n = arrayXLength * arrayYLength * arrayZLength;
int array[] = new int[n];
initArray(array);
double dstArray[] = new double[n];
Pointer dst = Pointer.to(dstArray);
Pointer ptArray = Pointer.to(array);
int[] pos = new int[]{n};
Pointer posPointer = Pointer.to(pos);
long startInit = System.currentTimeMillis();
//Create memory buffers on the device
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_int * n, ptArray, null);
cl_mem memResult = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_double * n , posPointer, null);
clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, Sizeof.cl_int * n, ptArray,0,null,null);
// Set the arguments for the kernel
clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(mem));
clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(memResult));
long endInit = System.currentTimeMillis();
System.out.println("
GPU init + buffer create was " + (endInit-startInit) + " ms");
long global_work_size[] = new long[]{n};
long local_work_size[] = new long[]{256};
long startTime = System.currentTimeMillis();
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, global_work_size, local_work_size, 0, null, null);
// Read the memory on device and store it into host mem dst variable
clEnqueueReadBuffer(commandQueue, memResult, CL_TRUE, 0, n * Sizeof.cl_double, dst, 0, null, null);
long endTime = System.currentTimeMillis();
System.out.println("Time to execute kernel with " + n + " calculations; " + (endTime-startTime) + " ms");
System.out.print("
Control data [0,0,0]: "+ getData(dstArray, 0, 0, 0));
System.out.print("
Control data [9,0,0]: "+ getData(dstArray, 9, 0, 0));
System.out.print("
Control data [0,9,0]: "+ getData(dstArray, 0, 9, 0));
System.out.print("
Control data [0,0,1]: "+ getData(dstArray, 0, 0, 1));
System.out.print("
Control data [9,1,9]: "+ getData(dstArray, 9, 1, 9));
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(mem);
clReleaseMemObject(memResult);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
if(COMPARE_WITH_CPU)
{
startTime = System.currentTimeMillis();
for(int i = 0 ; i < n ; i++)
{
float tmp = (float) (Math.sin(i) + Math.cos(i) + Math.sqrt(i));
}
endTime = System.currentTimeMillis();
System.out.println("
Time to execute on single thread CPU " + n + " calculations; " + (endTime-startTime) + " ms");
}
}
private static double getData(double[] array, int x, int y, int z)
{
return array[x + (y * arrayXLength) + (z * arrayYLength * arrayZLength)];
}
private static void initArray(int[] array)
{
for(int x = 0 ; x < arrayXLength*arrayYLength*arrayZLength ; x++)
{
array[x] = x+1;
}
}
}
And the SP version:
import org.jocl.*;
import static org.jocl.CL.*;
public class Driver
{
private static int arrayXLength = 300;
private static int arrayYLength = 300;
private static int arrayZLength = 300;
private static boolean COMPARE_WITH_CPU = true;
private static String programSource =
"__kernel void sampleKernel(__global const int *in, __global float *out)"+
"{"+
" int gid = get_global_id(0);"+
" out[gid] = sin((float)in[gid]) + cos((float)in[gid]) + sqrt((float)in[gid]);"+
"}";
private static cl_context context;
private static cl_command_queue commandQueue;
private static cl_kernel kernel;
private static cl_program program;
public static void main(String args[])
{
cl_platform_id platforms[] = new cl_platform_id[1];
clGetPlatformIDs(platforms.length, platforms, null);
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);
CL.setExceptionsEnabled(true);
context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
long numBytes[] = new long[1];
clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);
int numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
cl_device_id devices[] = new cl_device_id[numDevices];
clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0], Pointer.to(devices), null);
commandQueue = clCreateCommandQueue(context, devices[0], 0, null);
program = clCreateProgramWithSource(context, 1, new String[]{ programSource }, null, null);
clBuildProgram(program, 0, null, null, null, null);
kernel = clCreateKernel(program, "sampleKernel", null);
int n = arrayXLength * arrayYLength * arrayZLength;
int array[] = new int[n];
initArray(array);
float dstArray[] = new float[n];
Pointer dst = Pointer.to(dstArray);
Pointer ptArray = Pointer.to(array);
int[] pos = new int[]{n};
Pointer posPointer = Pointer.to(pos);
long startInit = System.currentTimeMillis();
//Create memory buffers on the device
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_int * n, ptArray, null);
cl_mem memResult = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * n , posPointer, null);
clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, Sizeof.cl_int * n, ptArray,0,null,null);
// Set the arguments for the kernel
clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(mem));
clSetKernelArg(kernel, 1, Sizeof.cl_mem, Pointer.to(memResult));
long endInit = System.currentTimeMillis();
System.out.println("
GPU init + buffer create was " + (endInit-startInit) + " ms");
long global_work_size[] = new long[]{n};
long local_work_size[] = new long[]{256};
long startTime = System.currentTimeMillis();
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel, 1, null, global_work_size, local_work_size, 0, null, null);
// Read the memory on device and store it into host mem dst variable
clEnqueueReadBuffer(commandQueue, memResult, CL_TRUE, 0, n * Sizeof.cl_float, dst, 0, null, null);
long endTime = System.currentTimeMillis();
System.out.println("Time to execute kernel with " + n + " calculations; " + (endTime-startTime) + " ms");
System.out.print("
Control data [0,0,0]: "+ getData(dstArray, 0, 0, 0));
System.out.print("
Control data [9,0,0]: "+ getData(dstArray, 9, 0, 0));
System.out.print("
Control data [0,9,0]: "+ getData(dstArray, 0, 9, 0));
System.out.print("
Control data [0,0,1]: "+ getData(dstArray, 0, 0, 1));
System.out.print("
Control data [9,1,9]: "+ getData(dstArray, 9, 1, 9));
// Release kernel, program, and memory objects
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(mem);
clReleaseMemObject(memResult);
clReleaseCommandQueue(commandQueue);
clReleaseContext(context);
if(COMPARE_WITH_CPU)
{
startTime = System.currentTimeMillis();
for(int i = 0 ; i < n ; i++)
{
float tmp = (float) (Math.sin(i) + Math.cos(i) + Math.sqrt(i));
}
endTime = System.currentTimeMillis();
System.out.println("
Time to execute on single thread CPU " + n + " calculations; " + (endTime-startTime) + " ms");
}
}
private static float getData(float[] array, int x, int y, int z)
{
return array[x + (y * arrayXLength) + (z * arrayYLength * arrayZLength)];
}
private static void initArray(int[] array)
{
for(int x = 0 ; x < arrayXLength*arrayYLength*arrayZLength ; x++)
{
array[x] = x+1;
}
}
}