For those who are very good at this, just to let you know.
My kernel is not well designed just because I am trying to figure out and understand why the value I set the sumSetOutput variable at index [0] inside the kernel is not returning back from the kernel to the host.
Please bare in mind that the full functionality of this topic is not fully express…because of my basic understanding of JCuda, I am still trying to get past the little details like why the device pointers are not returning values to the host?
If someone who can understand what I am trying to do here can run the code and tell me why, as I think It should be happening, why I am not getting the value 1000 back from the output variable sumSetOutput from the kernel.
running JCuda version: 0.6.5 for 32 bit machine
public static void main(String[] args)
{
try
{
// //this works fine, this produces the *.ptx file from the *.cu
// found at .\JCudaKernelLibrary\ folder
PTXFileMaker.makePtx(".\\JCudaKernelLibrary", "HFoldSumSetKernel");
// basic device initialization
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// dependent variables
// [dim] stands for dimension, this the number of vertices a polygon
// has, in this case 3 vertices, I am computing a sumset for a
// triangle
int dim = 3;
// the actual [polygon], or the vertices that it will be made off,
// as you can see, this is an actually very, very small triangle.
float[] polygon = { -1, -1, 1, 1, -1, 1, -1, 1, 1 };
// the [sumSet], this variable holds the first set of points that
// make up the triangle
float[] sumSet = { -1, -1, 1, 1, -1, 1, -1, 1, 1 };
// the [hFoldSumSet] this is the output variable, so if you take a
// nest for loop, and you run the addition of the vertices from the
// polygon and
// the vertices from the sumSet, you get the hFoldSumSet, and the
// [h] in this case, means the power at which you raise outcome of
// points found
// inside the forgoing polygon.
float[] hFoldSumSet = new float[9];
// initializing the Cuda Module for invoking the kernel
CUmodule module = new CUmodule();
cuModuleLoad(module, ".\\JCudaKernelLibrary\\HFoldSumSetKernel.ptx");
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "HFoldSumSet");
// I only take this loop to HFold 3, and you will see the output
// outcomes out as 9, 27, 81, that's how rapping the polygon grows.
for (int i = 0; i < 3; i++)
{
// /passing parameters to the function that invokes the kernel
// /supposedly the hFoldSumSet variable should hold the result
// from the sumSet and Polygon addition,
// / but for simplicity, Inside the kernel I only have
// sumSetOutput[0] = 1000 just to see what I am doing;
// /but the code is returning 0.0s inside the output variable;
hFoldSumSet = hFoldSumSet(function, hFoldSumSet, sumSet,
polygon, dim);
System.out.println("output size=" + hFoldSumSet.length);
// /here I set the [sumSet] variable equal to the output just
// because in the next sequence of the loop, the next SumSet set
// has to have
// /the previous set of vertices that make up the last
// computation between the sumset and polygon nested loop.
sumSet = hFoldSumSet;
hFoldSumSet = new float[sumSet.length * dim];
// /because I am setting [sumSet] to the output variable, I
// believe I should see that value 1000 inside the first index
// from the output variable
// / but that's not so.
for (int j = 0; j < sumSet.length; j += 3)
{
System.out.println("x=" + sumSet[j + 0] + " y=" +
sumSet[j + 1] + " z=" + sumSet[j + 2]);
}
}
}
catch (IOException e)
{
e.printStackTrace();
}
}
////this method is where I invoke the Cuda kernel
private float[] hFoldSumSet(CUfunction function, float[] hFoldSumSet, float[] sumSet, float[] polygon, int dim) {
//device pointers definitions
CUdeviceptr devHFoldSumSetOutput = new CUdeviceptr();
CUdeviceptr devSumSetInput = new CUdeviceptr();
CUdeviceptr devPolygonInput = new CUdeviceptr();
cuMemAlloc(devHFoldSumSetOutput, hFoldSumSet.length * Sizeof.FLOAT);
cuMemAlloc(devSumSetInput, sumSet.length * Sizeof.FLOAT);
cuMemcpyHtoD(devSumSetInput, Pointer.to(sumSet), sumSet.length * Sizeof.FLOAT);
cuMemAlloc(devPolygonInput, polygon.length * Sizeof.FLOAT);
cuMemcpyHtoD(devPolygonInput, Pointer.to(polygon), polygon.length * Sizeof.FLOAT);
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{dim}),
Pointer.to(devSumSetInput),
Pointer.to(devPolygonInput),
Pointer.to(devHFoldSumSetOutput)
);
cuLaunchKernel(function,
1, 1, 1,
1, 1, 1,
0, null,
kernelParameters, null
);
cuMemcpyDtoH(Pointer.to(hFoldSumSet), devHFoldSumSetOutput, hFoldSumSet.length * Sizeof.FLOAT);
cuCtxSynchronize();
cuMemFree(devHFoldSumSetOutput);
cuMemFree(devSumSetInput);
cuMemFree(devPolygonInput);
return hFoldSumSet;
}
This is my HFoldSumSetKernel.cu file content:
extern "C"
__global__ void hFoldSumSet(int dimension, float*sumSet, float*polygon, float*sumSetOutput){
//this is only for simplicity, but I am trying to figure out why it is not working.
sumSetOutput[0] = 1000;
//help!
}