Hi,
I am working on a larger project for a plugin development in ImageJ and I’m in the process of implementing GPU bound processing of some of the more time consuming algorithms. The CUDA code has been developed in .net with CUDAFY.net and I generate a .ptx file from there. The code runs fine in that enviroment and now I am trying to set up calls to the .ptx file but am running into problems at the initial stages. Currently my call function is modeled more or less directly from the examples on jcuda.org, currently looking as below. The width and depth variables were in C# sent directly to the gpu, but I’ve not been able to find how to send shared variables in the kernel call using jcuda yet, so the .ptx code has been modified to handle this.
short[] width = {50}; // filter window width.
int[] depth = {1000}; // z.
short framewidth = 64;
short frameheight = 64;
int N = depth[0] * framewidth * frameheight; // size of entry.
float[] meanVector = generateTest(depth[0]);
int[] test_Data = generateTest(N,depth[0]);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Load the PTX that contains the kernel.
CUmodule module = new CUmodule();
cuModuleLoad(module, "medianFiltering.ptx");
// Obtain a handle to the kernel function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "medianKernel");
// Allocate the device input data, and copy the
// host input data to the device
int numElements = ((2 * width[0] + 1) * framewidth * frameheight);
CUdeviceptr device_window = new CUdeviceptr();
cuMemAlloc(device_window, numElements * Sizeof.FLOAT);
numElements = 1;
CUdeviceptr device_width = new CUdeviceptr();
cuMemAlloc(device_width, numElements * Sizeof.SHORT);
cuMemcpyHtoD(device_width, Pointer.to(width),
numElements * Sizeof.SHORT);
numElements = test_Data.length;
CUdeviceptr device_test_Data = new CUdeviceptr();
cuMemAlloc(device_test_Data, numElements * Sizeof.INT);
cuMemcpyHtoD(device_test_Data, Pointer.to(test_Data),
numElements * Sizeof.INT);
numElements = 1;
CUdeviceptr device_depth = new CUdeviceptr();
cuMemAlloc(device_depth, numElements * Sizeof.INT);
cuMemcpyHtoD(device_depth, Pointer.to(depth),
numElements * Sizeof.INT);
numElements = meanVector.length;
CUdeviceptr deviceMeanVector = new CUdeviceptr();
cuMemAlloc(deviceMeanVector, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceMeanVector, Pointer.to(meanVector),
numElements * Sizeof.FLOAT);
numElements = test_Data.length;
// Allocate device output memory
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, numElements * Sizeof.FLOAT);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters = Pointer.to(
Pointer.to(device_width),
Pointer.to(device_window),
Pointer.to(device_depth),
Pointer.to(device_test_Data),
Pointer.to(deviceOutput)
);
int blockSizeX = 1;
int gridSizeY = 64;
int gridSizeX = 64;//(int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
gridSizeX, gridSizeY, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
numElements = test_Data.length;
float hostOutput[] = new float[numElements];
cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
numElements * Sizeof.FLOAT);
cuMemFree(device_width);
cuMemFree(device_window);
cuMemFree(device_depth);
cuMemFree(device_test_Data);
cuMemFree(deviceOutput);
I’m running this using the new 8.0.0 build of jcuda on a gtx1080 card on a windows 7 machine. The error message I’m getting is:
A fatal error has been detected by the Java Runtime Environment:
EXCEPTION_ACCESS_VIOLATION (0xc0000005) at pc=0x000007fee50b7044, pid=7848, tid=8980
JRE version: Java™ SE Runtime Environment (8.0_91-b15) (build 1.8.0_91-b15)
Java VM: Java HotSpot™ 64-Bit Server VM (25.91-b15 mixed mode windows-amd64 compressed oops)
Problematic frame:
C [nvcuda.dll+0x327044]
Failed to write core dump. Minidumps are not enabled by default on client versions of Windows
It looks to me that java can’t find nvcuda.dll, a .dll file not included in the CUDA 8.0 toolkit bin folder. Any suggestions on how to proceed would be greatly appriciated.