Hi Marco, I am trying to run a matrix multiplication program. Can you please help me resolve this CUDA_ERROR_UNKNOWN exception?
console:
Exception in thread "main" jcuda.CudaException: CUDA_ERROR_UNKNOWN
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:288)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1852)
at Matrix_mul_2.main(Matrix_mul_2.java:99)
import static jcuda.driver.JCudaDriver.cuCtxCreate;
import static jcuda.driver.JCudaDriver.cuCtxDestroy;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoad;
import java.util.Locale;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;
/**
* An example that computes the sums of the rows of a matrix,
* and writes the results into an array
*/
public class Matrix_mul_2
{
public static void main(String[] args)
{
// Enable exceptions and omit subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Create a context for the first device
cuInit(0);
CUcontext context = new CUcontext();
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
cuCtxCreate(context, 0, device);
// Load the module and obtain the pointer to the kernel function
CUmodule module = new CUmodule();
cuModuleLoad(module, "C://Users//workspace//Jcuda_Matrix_multiplication//JCudaMatmul.ptx");
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "matrixMulKernel");
// Create the input matrix in host memory
int width = 5;
// Allocate and fill the host input data
float hostMatrixA[] = new float[width*width];
float hostMatrixB[] = new float[width*width];
float result_matrix[] = new float[width*width];
for(int i = 0; i < width*width; i++) //initializing arrays
{
hostMatrixA** = 5;
hostMatrixB** = i+2;
result_matrix** = 0;
}
// float hostMatrix[] = createExampleMatrix(rows, cols);
// Copy the host data to the device
CUdeviceptr deviceMatrixA = new CUdeviceptr();
cuMemAlloc(deviceMatrixA, width * width * Sizeof.FLOAT);
cuMemcpyHtoD(deviceMatrixA, Pointer.to(hostMatrixA),
width * width * Sizeof.FLOAT);
CUdeviceptr deviceMatrixB = new CUdeviceptr();
cuMemAlloc(deviceMatrixB, width * width * Sizeof.FLOAT);
cuMemcpyHtoD(deviceMatrixB, Pointer.to(hostMatrixB),
width * width * Sizeof.FLOAT);
// Allocate memory for the result on the device
CUdeviceptr deviceSums = new CUdeviceptr();
cuMemAlloc(deviceSums, width * width * Sizeof.FLOAT);
// Set up and launch the kernel
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{ width }),
Pointer.to(deviceMatrixA),
Pointer.to(deviceMatrixB),
Pointer.to(deviceSums)
);
int blockSizeX = 256;
int gridSizeX = (int)(Math.ceil((width * width) + blockSizeX - 1) / blockSizeX);
System.out.println(" grid size : "+gridSizeX);
cuLaunchKernel(function,
gridSizeX, 1, 1,
blockSizeX, 1, 1,
0, null, kernelParameters, null);
cuCtxSynchronize();
// Copy the result from the device to the host
float hostSums[] = new float[width];
cuMemcpyDtoH(Pointer.to(hostSums), deviceSums, width *width * Sizeof.FLOAT);
System.out.println("Row sums:
");
// System.out.println(createString2D(hostSums, width, 1));
// Clean up
cuMemFree(deviceSums);
cuMemFree(deviceMatrixB);
cuMemFree(deviceMatrixA);
cuCtxDestroy(context);
}
private static float[] createExampleMatrix(int rows, int cols)
{
float matrix[] = new float[rows*cols];
int counter = 0;
for (int r = 0; r < rows; r++)
{
for (int c = 0; c < cols; c++)
{
matrix[r * cols + c] = counter;
counter++;
}
}
return matrix;
}
}
I have checked that the .cu file runs successfully in CUDA (C-CUDA).