OK then, the handling of the CPU part is of course not straightforward in Java, since it involves some structs. But at least accessing the GPU part via JCuda is not too complicated. It also involves a struct, but only a very simple one that is filled on the host side and read in the kernel.
Here’s a small example, mainly based on the GPU-related part of the original MersenneTwister example.
/*
* JCuda - Java bindings for NVIDIA CUDA driver and runtime API
* http://www.jcuda.org
*
* Copyright 2010 Marco Hutter - http://www.jcuda.org
*/
import java.io.*;
import java.nio.ByteBuffer;
import jcuda.*;
import jcuda.driver.*;
import static jcuda.driver.JCudaDriver.*;
/**
* This is a port of the GPU part of the NVIDIA CUDA
* Mersenne Twister Random Number Generator example.<br />
* <br />
* Required files:
* <ul>
* <li>
* <b>MersenneTwister.compute_10.sm_10.cubin</b> - The
* SM 10 CUBIN file that is created from the original
* example when keeping the preprocessed files by adding
* the <code>--keep</code> parameter to the NVCC call.
* </li>
* <li>
* <b>MersenneTwister.dat</b> - The data file that is
* contained in the original example.
* </li>
* </ul>
*/
public class JCudaDriverMersenneTwister
{
/**
* The name of the CUBIN file
*/
private static final String cubinFileName =
"MersenneTwister.compute_10.sm_10.cubin";
/**
* The name of the data file that is loaded
*/
private static final String dataFileName =
"MersenneTwister.dat";
// Variable declarations as in the original example
private static final int PATH_N = 24000;
private static final int MT_RNG_COUNT = 4096;
private static final int N_PER_RNG =
alignUp(divUp(PATH_N, MT_RNG_COUNT), 2);
private static final int RAND_N = MT_RNG_COUNT * N_PER_RNG;
private static final int SEED = 777;
/**
* This is the size of the stripped Mersenne Twister structure
* that is defined in the original sample as follows:
* <pre>
* typedef struct
* {
* unsigned int matrix_a;
* unsigned int mask_b;
* unsigned int mask_c;
* unsigned int seed;
* } mt_struct_stripped;
* </pre>
*/
private static int sizeof_mt_struct_stripped = 4 * Sizeof.INT;
/**
* This variable is originally declared as
* <code>static mt_struct_stripped h_MT[MT_RNG_COUNT];</code>
*
* Since this data is filled by reading the data from a
* file, and it has to be copied to the device, this is
* not stored as an array of Objects of a class that
* resembles the original structure, but simply as a
* direct byte buffer.
*/
private static ByteBuffer h_MT =
ByteBuffer.allocateDirect(MT_RNG_COUNT * sizeof_mt_struct_stripped);
/**
* The CUDA module that is created from the CUBIN file
*/
private static CUmodule module;
/**
* The entry point of this sample.
*
* @param args Not used
*/
public static void main(String args[])
{
// Initialize the driver and create a context for the first device.
System.out.println("Initializing CUDA driver...");
JCudaDriver.setExceptionsEnabled(true);
cuInit(0);
CUcontext pctx = new CUcontext();
CUdevice dev = new CUdevice();
cuDeviceGet(dev, 0);
cuCtxCreate(pctx, 0, dev);
// Load the module from the CUBIN file
System.out.println("Loading module from "+cubinFileName+"...");
module = new CUmodule();
cuModuleLoad(module, cubinFileName);
// Obtain the function pointers to the "RandomGPU"
// and the "BoxMuller" functions
CUfunction randomGPU = new CUfunction();
cuModuleGetFunction(randomGPU, module, "_Z9RandomGPUPfi");
CUfunction boxMullerGPU = new CUfunction();
cuModuleGetFunction(boxMullerGPU, module, "_Z12BoxMullerGPUPfi");
// Initialize the data for the samples
System.out.println("Initializing data for "+PATH_N+" samples...");
float h_RandGPU[] = new float[RAND_N];
CUdeviceptr d_Rand = new CUdeviceptr();
cuMemAlloc(d_Rand, RAND_N * Sizeof.FLOAT);
// Load the twister configuration from the input data file
System.out.println("Loading GPU twister configuration...");
loadMTGPU(dataFileName);
seedMTGPU(SEED);
System.out.println("Generating random numbers on GPU...");
int numIterations = 20;
for (int i = -1; i < numIterations; i++)
{
if (i == 0)
{
cuCtxSynchronize();
}
cuFuncSetBlockShape(randomGPU, 128, 1, 1);
cuLaunchGrid(randomGPU, 32, 1);
cuFuncSetBlockShape(boxMullerGPU, 128, 1, 1);
cuLaunchGrid(boxMullerGPU, 32, 1);
}
cuCtxSynchronize();
System.out.println("Reading back the results...");
cuMemcpyDtoH(Pointer.to(h_RandGPU), d_Rand, RAND_N * Sizeof.FLOAT);
System.out.println("Results: "+stringFor(h_RandGPU, 6));
System.out.println("Shutting down...");
cuMemFree(d_Rand);
}
/**
* Returns a String containing up to 'max' elements of the
* given array.
*
* @param array The array
* @param max The maximum number of elements
* @return The String for the array
*/
private static String stringFor(float array[], int max)
{
int n = Math.min(max, array.length);
StringBuilder sb = new StringBuilder("[");
for (int i=0; i<n; i++)
{
sb.append(String.valueOf(array**));
if (i<n-1)
{
sb.append(", ");
}
}
if (max < array.length)
{
sb.append(", ...");
}
sb.append("]");
return sb.toString();
}
/**
* Align a to nearest higher multiple of b
*
* @param a The value to align
* @param b The alignment
* @return The aligned value
*/
private static int alignUp(int a, int b)
{
return ((a % b) != 0) ? (a - a % b + b) : a;
}
/**
* Computes the quotient a/b, rounded up to the next highest
* integral value
*
* @param a Dividend
* @param b Divisor
* @return The rounded quotient
*/
private static int divUp(int a, int b)
{
return ((a % b) != 0) ? (a / b + 1) : (a / b);
}
/**
* Load the twister configuration from the file with the given name,
* and store its contents in the "h_MT" ByteBuffer
*
* @param fname The file name
*/
private static void loadMTGPU(String fname)
{
FileInputStream fis = null;
try
{
fis = new FileInputStream(fname);
byte buffer[] = new byte[sizeof_mt_struct_stripped];
for (int i = 0; i < MT_RNG_COUNT; i++)
{
fis.read(buffer);
h_MT.put(buffer);
}
h_MT.position(0);
}
catch (IOException e)
{
e.printStackTrace();
try
{
fis.close();
}
catch (IOException ex)
{}
}
}
/**
* Initialize the twister with the given seed.
*
* @param seed The seed
*/
private static void seedMTGPU(int seed)
{
int i;
ByteBuffer MT = ByteBuffer.allocateDirect(
MT_RNG_COUNT * sizeof_mt_struct_stripped);
byte buffer[] = new byte[sizeof_mt_struct_stripped];
for (i = 0; i < MT_RNG_COUNT; i++)
{
// In the original example, this is simply an assignment:
// MT** = h_MT**;
// Since the data here is not stored in arrays of
// structures, but in direct buffers, the data
// is copied from the first buffer and stored in
// the second
h_MT.get(buffer);
MT.put(buffer);
// The last field of the structure is the int
// that stores the seed. In the original
// example, this is just an assignment
// MT**.seed = seed;
// Since the structures are stored in byte buffers
// this value is set manually here:
MT.position(MT.position() - Sizeof.INT);
MT.putInt(seed);
}
// Copy the current data to the global 'ds_MT' variable
// of the module
CUdeviceptr ds_MT = new CUdeviceptr();
cuModuleGetGlobal(ds_MT, new int[1], module, "ds_MT");
cuMemcpyHtoD(ds_MT, Pointer.to(MT),
MT_RNG_COUNT * sizeof_mt_struct_stripped);
}
}
If it turns out to be helpful, I may also upload it on the website.
By the way: I already addressed the issue of handling structs in the context of JOCL, but I think that it is very similar for JOCL and for JCuda, so I’ll possibly upload some utility classes for struct handling soon. Another item in the queue...