Hello everyone.
I heard that the Driver and Runtime APIs can be used together as of CUDA 3.0.
I want to use as few Driver API calls as possible (only to prepare the .cubin files and create the kernel functions).
So I want to use cudaConfigureCall() and the related Runtime API calls instead of cuParamSetv(), etc.
I’m trying to write a sample program. My kernel function “MatrixMul” works when it is launched through the Driver API, but the mixed implementation does not work: the kernel function returns zeros in the result array. cudaConfigureCall() and cudaSetupArgument() return “cudaSuccess”, but cudaLaunch() returns “cudaErrorInvalidDeviceFunction”.
Code:
public static void mulMatrixCUDrvCURun(float[] aCU,float[] bCU,float[] cCU,int n) throws IOException{
int sizeMatrix=aCU.length;
// Initialize the driver and create a context for the first device.
JCudaDriver.cuInit(0);
CUcontext context = new CUcontext();
CUdevice device = new CUdevice();
JCudaDriver.cuDeviceGet(device, 0);
JCudaDriver.cuCtxCreate(context, 0, device);
String cubinFileName = prepareCubinFile("myMatrMul.cu");
// Load the CUBIN file.
CUmodule module = new CUmodule();
JCudaDriver.cuModuleLoad(module, cubinFileName);
// Obtain a function pointer to the "MatrixMul" function.
CUfunction function = new CUfunction();
JCudaDriver.cuModuleGetFunction(function, module, "MatrixMul");
// Allocate memory on the device using JCuda
Pointer deviceA = new Pointer();
Pointer deviceB = new Pointer();
Pointer deviceC = new Pointer();
JCuda.cudaMalloc(deviceA, sizeMatrix * Sizeof.FLOAT);
JCuda.cudaMalloc(deviceB, sizeMatrix * Sizeof.FLOAT);
JCuda.cudaMalloc(deviceC, sizeMatrix * Sizeof.FLOAT);
// Copy memory from host to device using JCuda
JCuda.cudaMemcpy(deviceA, Pointer.to(aCU), sizeMatrix * Sizeof.FLOAT,cudaMemcpyKind.cudaMemcpyHostToDevice);
JCuda.cudaMemcpy(deviceB, Pointer.to(bCU), sizeMatrix * Sizeof.FLOAT,cudaMemcpyKind.cudaMemcpyHostToDevice);
JCuda.cudaMemcpy(deviceC, Pointer.to(cCU), sizeMatrix * Sizeof.FLOAT,cudaMemcpyKind.cudaMemcpyHostToDevice);
int offset = 0;
dim3 blocks=new dim3(n/BLOCK_SIZE, n/BLOCK_SIZE, 1);
dim3 threads=new dim3(BLOCK_SIZE, BLOCK_SIZE, 1);
int err=JCuda.cudaConfigureCall(blocks,threads,0,null);
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
err=JCuda.cudaSetupArgument(Pointer.to(deviceA), Sizeof.POINTER, offset);
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
offset+=Sizeof.POINTER;
err=JCuda.cudaSetupArgument(Pointer.to(deviceB), Sizeof.POINTER, offset);
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
offset+=Sizeof.POINTER;
err=JCuda.cudaSetupArgument(Pointer.to(deviceC), Sizeof.POINTER, offset);
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
offset+=Sizeof.POINTER;
JCuda.cudaSetupArgument(Pointer.to(new int[]{n}), Sizeof.INT, offset);
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
offset+=Sizeof.INT;
// Call the function.
err=JCuda.cudaLaunch("MatrixMul");
System.out.print(jcuda.runtime.cudaError.stringFor(err)+" ");
JCuda.cudaThreadSynchronize();
JCuda.cudaMemcpy(Pointer.to(cCU), deviceC, sizeMatrix * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToHost);
// Clean up
JCuda.cudaFree(deviceA);
JCuda.cudaFree(deviceB);
JCuda.cudaFree(deviceC);
}```
What am I doing wrong? Thanks for the help.