I have created a “SortOfABenchmark” - it’s not really a reliable, expressive benchmark, just a very biased and artificial example to point this out. It performs two kinds of operations on all elements of two vectors:
- A simple vector addition: result = a+b
- Some useless, complex computation: result = sin(a)*sin(a)+cos(a)*cos(a)+sin(b)*sin(b)+cos(b)*cos(b)
Both operations are applied to vectors of different sizes, ranging from 262,144 to 8,388,608 (about 8.4 million) elements.
The result for the largest vector is the following on my machine:
Running with 8388608 elements...
[ms] Duration: Average:
java-simple: 29 29
jcuda-simple: 2 2
jcuda-simple-withMem: 52 52
java-complex: 42317 42317
jcuda-complex: 111 111
jcuda-complex-withMem: 163 163
This roughly means that the simple vector addition with plain Java is almost twice as fast as when it is done with JCuda (including the memory transfer time). However, the complex operation takes 42 seconds with Java, whereas with JCuda it takes 163 milliseconds (again including the memory transfer time).
Again: This is far from an objective or realistic result, and should be taken with the appropriate grain of salt, but shows that in general the main advantages of JCuda show up when there’s a lot of computing work to be done.
import static jcuda.driver.JCudaDriver.*;
import jcuda.*;
import jcuda.driver.*;
import jcuda.utils.*;
public class SortOfABenchmark
{
// A kernel performing a simple addition
private static final String sourceCodeSimple =
"extern \"C\"" + "
" +
"__global__ void compute(float *result, float *a, float *b)" + "
" +
"{" + "
" +
" int i = blockIdx.x * blockDim.x + threadIdx.x;" + "
" +
" result** = a** + b**;" + "
" +
"}";
// A kernel performing a fairly useless but complex computation
private static final String sourceCodeComplex =
"extern \"C\"" + "
" +
"__global__ void compute(float *result, float *a, float *b)" + "
" +
"{" + "
" +
" int i = blockIdx.x * blockDim.x + threadIdx.x;" + "
" +
" result** = " + "
" +
" sin(a**)*sin(a**)+cos(a**)*cos(a**) + " + "
" +
" sin(b**)*sin(b**)+cos(b**)*cos(b**);" + "
" +
"}";
private static KernelLauncher kernelSimple;
private static KernelLauncher kernelComplex;
public static void main(String args[])
{
JCudaDriver.setExceptionsEnabled(true);
// Prepare the KernelLaunchers for the simple and the complex kernel
System.out.println("Preparing the KernelLaunchers...");
kernelSimple = KernelLauncher.compile(sourceCodeSimple, "compute");
kernelComplex = KernelLauncher.compile(sourceCodeComplex, "compute");
System.out.println("Preparing the KernelLaunchers... DONE");
// Run the test with different input sizes
for (int blocks = 1024; blocks <= 32768; blocks*=2)
{
int size = 256 * blocks;
System.out.println("Running with "+size+" elements...");
Timer.createTimer("java-simple");
Timer.createTimer("jcuda-simple");
Timer.createTimer("jcuda-simple-withMem");
Timer.createTimer("java-complex");
Timer.createTimer("jcuda-complex");
Timer.createTimer("jcuda-complex-withMem");
float result[] = new float[size];
float a[] = new float[size];
float b[] = new float[size];
for (int i=0; i<size; i++)
{
a** = i;
b** = i;
}
// Run the simple computation with Java
Timer.startTimer("java-simple");
javaSimple(a,b,result);
Timer.stopTimer("java-simple");
// Run the simple computation with JCuda
jcuda("jcuda-simple", kernelSimple, a, b, result);
// Run the complex computation with Java
Timer.startTimer("java-complex");
javaComplex(a,b,result);
Timer.stopTimer("java-complex");
// Run the complex computation with JCuda
jcuda("jcuda-complex", kernelComplex, a, b, result);
Timer.prettyPrint();
}
}
private static void javaSimple(float a[], float b[], float result[])
{
for (int i=0; i<a.length; i++)
{
result** = a**+b**;
}
}
private static void javaComplex(float a[], float b[], float result[])
{
for (int i=0; i<a.length; i++)
{
result** = (float)(Math.sin(a**)*Math.sin(a**)+Math.cos(a**)*Math.cos(a**) +
Math.sin(b**)*Math.sin(b**)+Math.cos(b**)*Math.cos(b**));
}
}
private static void jcuda(String name, KernelLauncher kernelLauncher, float a[], float b[], float result[])
{
Timer.startTimer(name+"-withMem");
// Allocate the device memory and copy the input
// data to the device
int size = a.length;
CUdeviceptr dResult = new CUdeviceptr();
cuMemAlloc(dResult, size * Sizeof.FLOAT);
CUdeviceptr dA = new CUdeviceptr();
cuMemAlloc(dA, size * Sizeof.FLOAT);
cuMemcpyHtoD(dA, Pointer.to(a), size * Sizeof.FLOAT);
CUdeviceptr dB = new CUdeviceptr();
cuMemAlloc(dB, size * Sizeof.FLOAT);
cuMemcpyHtoD(dB, Pointer.to(b), size * Sizeof.FLOAT);
// Call the kernel
int gridSize = size / 256;
kernelLauncher.setGridSize(gridSize, 1);
kernelLauncher.setBlockSize(256, 1, 1);
Timer.startTimer(name);
kernelLauncher.call(dResult, dA, dB);
Timer.stopTimer(name);
JCudaDriver.cuCtxSynchronize();
// Copy the result from the device to the host
cuMemcpyDtoH(Pointer.to(result), dResult, size * Sizeof.FLOAT);
// Clean up
cuMemFree(dA);
cuMemFree(dB);
cuMemFree(dResult);
Timer.stopTimer(name+"-withMem");
}
}