Hi,
I have just begun working with CUDA and have recently been exploring JCuda. I have written a small test program to see how JCuda works. However, I have encountered an exception that puzzles me. Below is my Java code for the kernel calls. Apologies for the messy code ><
// Launch configuration: grid size, the running byte offset used while packing
// kernel parameters, and the number of threads per block.
int numOfBlocks = 1536;
int offset = 0;
int numThreadperBlock = 512;
String cubinFilename = "/home/alan/DNSKernel.cubin";

// Initialise the driver API, then create a context on device ordinal 0.
JCudaDriver.cuInit(0);
CUdevice dev = new CUdevice();
CUcontext context = new CUcontext();
JCudaDriver.cuDeviceGet(dev, 0);
JCudaDriver.cuCtxCreate(context, 0, dev);

// Load the module from the cubin file and look up the first kernel by name.
CUmodule module = new CUmodule();
CUfunction function = new CUfunction();
JCudaDriver.cuModuleLoad(module, cubinFilename);
JCudaDriver.cuModuleGetFunction(function, module, "gpuAvgRTTcompute");
// Byte counts for the three input arrays and the two result buffers.
// Each element is sized as Sizeof.FLOAT (presumably the yrNNQrt arrays are float[] — confirm).
int inputBytes07 = yr07Qrt.length * Sizeof.FLOAT;
int inputBytes08 = yr08Qrt.length * Sizeof.FLOAT;
int inputBytes09 = yr09Qrt.length * Sizeof.FLOAT;
int tempResultBytes = numOfBlocks * Sizeof.FLOAT; // one float per block
int finalResultBytes = 3 * Sizeof.FLOAT;

// Device buffers for the inputs.
CUdeviceptr dev07 = new CUdeviceptr();
CUdeviceptr dev08 = new CUdeviceptr();
CUdeviceptr dev09 = new CUdeviceptr();
// Device buffers for the intermediate and final results.
CUdeviceptr tempResult = new CUdeviceptr();
CUdeviceptr devfinalResult = new CUdeviceptr();

// NOTE(review): no matching cuMemFree is visible in this excerpt — confirm the
// buffers are released elsewhere.
JCudaDriver.cuMemAlloc(dev07, inputBytes07);
JCudaDriver.cuMemAlloc(dev08, inputBytes08);
JCudaDriver.cuMemAlloc(dev09, inputBytes09);
JCudaDriver.cuMemAlloc(tempResult, tempResultBytes);
JCudaDriver.cuMemAlloc(devfinalResult, finalResultBytes);

// Upload the host input arrays into their device buffers.
JCudaDriver.cuMemcpyHtoD(dev07, Pointer.to(yr07Qrt), inputBytes07);
JCudaDriver.cuMemcpyHtoD(dev08, Pointer.to(yr08Qrt), inputBytes08);
JCudaDriver.cuMemcpyHtoD(dev09, Pointer.to(yr09Qrt), inputBytes09);
// Host-side Pointer wrappers that are later handed to cuParamSetv.

// Wrappers around the device pointers (inputs, then the two result buffers).
Pointer d07 = Pointer.to(dev07);
Pointer d08 = Pointer.to(dev08);
Pointer d09 = Pointer.to(dev09);
Pointer tmpResult = Pointer.to(tempResult);
Pointer fResult = Pointer.to(devfinalResult);

// Element counts of the three input arrays, each boxed in a one-element int array
// so that it can be passed by pointer.
int[] length07 = { yr07Qrt.length };
int[] length08 = { yr08Qrt.length };
int[] length09 = { yr09Qrt.length };
Pointer size07 = Pointer.to(length07);
Pointer size08 = Pointer.to(length08);
Pointer size09 = Pointer.to(length09);
JCudaDriver.cuFuncSetBlockShape(function, numThreadperBlock, 1, 1);
//Parameter setup for 1st kernel call
offset = JCudaDriver.align(offset, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, d07, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = JCudaDriver.align(offset, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, d08, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = JCudaDriver.align(offset, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, d09, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = JCudaDriver.align(offset, Sizeof.INT);
JCudaDriver.cuParamSetv(function, offset, size07, Sizeof.INT);
offset += Sizeof.INT;
offset = JCudaDriver.align(offset, Sizeof.INT);
JCudaDriver.cuParamSetv(function, offset, size08, Sizeof.INT);
offset += Sizeof.INT;
offset = JCudaDriver.align(offset, Sizeof.INT);
JCudaDriver.cuParamSetv(function, offset, size09, Sizeof.INT);
offset += Sizeof.INT;
offset = JCudaDriver.align(offset, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, tmpResult, Sizeof.POINTER);
offset += Sizeof.POINTER;
//Launch 1st kernel
JCudaDriver.cuParamSetSize(function, offset);
JCudaDriver.cuLaunchGrid(function, numOfBlocks, 1);
JCudaDriver.cuCtxSynchronize();
// Second kernel: reuse the same CUfunction handle, now bound to "gpuAvgRTTcompute2".
JCudaDriver.cuModuleGetFunction(function, module, "gpuAvgRTTcompute2");

// Pack the two pointer arguments (final result buffer, then the intermediate
// buffer), restarting the parameter offset from zero.
offset = 0;
for (Pointer secondKernelArg : new Pointer[] { fResult, tmpResult }) {
    offset = JCudaDriver.align(offset, Sizeof.POINTER);
    JCudaDriver.cuParamSetv(function, offset, secondKernelArg, Sizeof.POINTER);
    offset += Sizeof.POINTER;
}
JCudaDriver.cuParamSetSize(function, offset);

// Same block shape as the first kernel, but a 3 x 1 grid; then wait for completion.
JCudaDriver.cuFuncSetBlockShape(function, numThreadperBlock, 1, 1);
JCudaDriver.cuLaunchGrid(function, 3, 1);
JCudaDriver.cuCtxSynchronize();
//Copy out the final results
JCudaDriver.cuMemcpyDtoH(Pointer.to(finalResults), devfinalResult,3*Sizeof.FLOAT);
However, I get the error message:
Exception in thread "main" jcuda.CudaException: CUDA_ERROR_LAUNCH_FAILED
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:153)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:723)
at dnstest.main(dnstest.java:185)
when running the program. I'm pretty sure the code in the kernel is working, as I've tested it in CUDA. I would appreciate any help here. Thanks a lot.