Ok,
So what I am finding is that even though my simulation works correctly with JCuda, the plain CPU version is still very slightly faster than the CUDA one.
I have spent some time looking it over, but I can’t figure out how to make it quicker. It may simply be that I’m not making an efficient use of CUDA in this case. I’ll post the code I’m using for only a CPU simulation, and then I’ll post what I did using CUDA. If anyone can help in identifying what might be particularly slowing down the simulation, then I would be very grateful.
CPU only simulation:
/**
 * Performs one simulation event: picks a random element from {@code concern}
 * and rebuilds its neighbourhood list ({@code local}) with every other
 * element whose Euclidean distance is within the element's range.
 *
 * Side effects: updates the {@code theselection} and {@code dist} fields,
 * clears and refills {@code e.getLocal()}, and temporarily removes {@code e}
 * from {@code elements} (re-added at the end, so it moves to the list tail).
 */
public void Event()
{
    // NOTE(review): consider holding one Random in a field instead of
    // constructing a fresh one on every event.
    Random selectelement = new Random();
    theselection = (selectelement.nextInt(concern.size()));
    Element e = concern.get(theselection);
    ArrayList<Element> local = e.getLocal();
    // Remove e so it is not compared against itself in the loop below.
    elements.remove(e);
    local.clear();

    // Loop-invariant values hoisted out of the distance loop — the original
    // re-fetched range and position on every iteration.
    final double r  = e.getRange();
    final double x1 = e.getXposition();
    final double y1 = e.getYposition();

    for (Element a : elements) {
        final double dx = x1 - a.getXposition();
        final double dy = y1 - a.getYposition();
        // d*d is cheaper than Math.pow(d, 2). The true Euclidean distance is
        // still computed because the dist field may be read elsewhere; if it
        // is not, comparing dx*dx + dy*dy against r*r would avoid the sqrt.
        dist = Math.sqrt(dx * dx + dy * dy);
        if (dist <= r) {
            local.add(a);
        }
    }
    elements.add(e);
... the rest of the code after is the same in both simulations ...
With CUDA:
private CUfunction function;
private CUcontext context;
...
public void Event(int numsteps) throws IOException
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Create the PTX file by calling the NVCC
String ptxFileName = preparePtxFile("CalculateNeighbourhood.cu");
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Load the ptx file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "distance" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "distance");
// Allocate and fill the host input data
float hostInputX[] = new float[numElements];
float hostInputY[] = new float[numElements];
int numberofcycles = 0;
while(numberofcycles < numsteps) {
Random selectelement = new Random();
theselection = (selectelement.nextInt(concern.size()));
Element e = concern.get(theselection);
ArrayList<Element> local = e.getLocal();
elements.remove(e);
local.clear();
double eXX = e.getXposition();
double eYY = e.getYposition();
eX = (float) eXX;
eY = (float) eYY;
// Fill the float arrays with the correct positional data
while(elementcount <= (numberofelements - 2)) {
Element a = elements.get(elementcount);
float x1;
float y1;
double x2;
double y2;
x2 = a.getXposition();
y2 = a.getYposition();
x1 = (float) x2;
y1 = (float) y2;
hostInputX[elementcount] = x1;
hostInputY[elementcount] = y1;
elementcount++;
}
elementcount = 0;
// Allocate the device input data, and copy the
// host input data to the device
CUdeviceptr deviceInputX = new CUdeviceptr();
cuMemAlloc(deviceInputX, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputX, Pointer.to(hostInputX), numElements * Sizeof.FLOAT);
CUdeviceptr deviceInputY = new CUdeviceptr();
cuMemAlloc(deviceInputY, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputY, Pointer.to(hostInputY), numElements * Sizeof.FLOAT);
// Allocate device output memory
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{numElements}),
Pointer.to(deviceInputX),
Pointer.to(deviceInputY),
Pointer.to(new float[]{eX}),
Pointer.to(new float[]{eY}),
Pointer.to(deviceOutput)
);
// Call the kernel function.
int blockSizeX = 512;
int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
gridSizeX, 1, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Allocate host output memory and copy the device output
// to the host.
float hostOutput[] = new float[numElements];
cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, numElements * Sizeof.FLOAT);
for(int i = 0; i < numElements; i++) {
if(Math.abs(hostOutput[i]) > 0) {
Element b = elements.get(i);
local.add(b);
}
}
// Clean up.
cuMemFree(deviceInputX);
cuMemFree(deviceInputY);
cuMemFree(deviceOutput);
... More code the same in both...
... Then at the end of the Method:
cuCtxDestroy(context);
The Kernel:
// For each of the n input points, flags whether (a[i], b[i]) lies within a
// fixed interaction range of the query point (c, d).
//   n      - number of points
//   a, b   - device arrays of x / y coordinates
//   c, d   - x / y coordinate of the element being updated
//   local  - device output array: local[i] = 1 if point i is in range, else 0
//
// The kernel itself is trivial (a handful of FLOPs per point), so for modest
// n the per-step host overhead — cuMemAlloc / cuMemcpyHtoD / cuMemcpyDtoH /
// cuMemFree executed inside the simulation loop — will dominate and can make
// the GPU path slower than the CPU one. Allocate the buffers once outside
// the loop and reuse them.
//
// (The original post's "a**", "b**", "local**" were markdown-garbled "[i]"
// indices; they are restored below.)
extern "C"
__global__ void distance(int n, float *a, float *b, float c, float d, float *local)
{
    // One thread per point.
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        float dx = a[i] - c;
        float dy = b[i] - d;
        // Compare squared distances — no sqrt needed.
        float dist2 = dx * dx + dy * dy;
        // NOTE(review): the range is hard-coded here, while the CPU path uses
        // e.getRange(); pass it as a kernel parameter to keep the two in sync.
        float range = 28;
        if (dist2 <= range * range) local[i] = 1;
        else local[i] = 0;
    }
}
Both versions do the same thing: take the element in question and work out which other elements in the simulation are within range of it.
I know it’s almost rude to throw out a pile of code and ask if anyone can clean it up, but I am really stuck with this. It may simply be that I’m not doing enough with the kernel in order for this simulation to be improved with CUDA, but I don’t know.
Thanks
Sam