I am trying to do some Matrix multiplication with JCuda but I am stuck at this error:
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:359)
at jcuda.driver.JCudaDriver.cuLaunchKernel(JCudaDriver.java:17119)
at oroarmor.jcuda.JCudaMatrixMultiply.main(JCudaMatrixMultiply.java:77)
My main code:
public class JCudaMatrixMultiply {
public static void main(String[] args) {
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC
String ptxFileName = JCudaSamplesUtils.preparePtxFile("src/main/resources/kernels/matrixMultiply.cu");
// Load the ptx file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "multiply");
int n = 1 << 10;
int mSize = n * n;
System.out.println(mSize);
int[] hostA = new int[mSize];
int[] hostB = new int[mSize];
for (int i = 0; i < mSize; i++) {
hostA[i] = 1;
hostB[i] = 1;
}
CUdeviceptr deviceAPtr = new CUdeviceptr();
cuMemAlloc(deviceAPtr, mSize * Sizeof.INT);
cuMemcpyHtoD(deviceAPtr, Pointer.to(hostA), mSize * Sizeof.INT);
CUdeviceptr deviceBPtr = new CUdeviceptr();
cuMemAlloc(deviceBPtr, mSize * Sizeof.INT);
cuMemcpyHtoD(deviceBPtr, Pointer.to(hostB), mSize * Sizeof.INT);
CUdeviceptr deviceO = new CUdeviceptr();
cuMemAlloc(deviceO, mSize * Sizeof.INT);
Pointer kernelParameters = Pointer.to(Pointer.to(new int[] { n }), Pointer.to(deviceAPtr),
Pointer.to(deviceBPtr), Pointer.to(deviceO));
// int blockSizeX = 512;
// int gridSizeX = (int) Math.ceil((double) n / blockSizeX);
cuLaunchKernel(//
function, 2, 2, 1, // Grid dimension
512, 512, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize();
int hostO[] = new int[mSize];
for (int i = 0; i < mSize; i++) {
hostO[i] = -1;
}
cuMemcpyDtoH(Pointer.to(hostO), deviceO, mSize * Sizeof.INT);
for (int i = 0; i < n / (1 << 7); i++) {
System.out.print("[ ");
for (int j = 0; j < n / (1 << 7); j++) {
System.out.print(hostO[i * n + j] + ", ");
}
System.out.print(" ] [ ");
for (int j = 0; j < n / (1 << 7); j++) {
int sum = 0;
for (int k = 0; k < n; k++) {
sum += hostA[i * n + k] * hostB[j + k * n];
}
System.out.print(sum + ", ");
}
System.out.println(" ]");
}
System.out.println("finished");
}
}
My kernel:
extern "C"
__global__ void multiply(int n, int *a, int *b, int *sum)
{
int i = blockIdx.x * blockDim.x + threadIdx.x; //output x
int j = blockIdx.y * blockDim.y + threadIdx.y; //output y
int product = 0;
if (i < n && j < n)
{
for(int k = 0; k < n; k++){
product += a[i + k * n] * b[j * n + k];
}
}
sum[i+j*n] = 3;
}
JCudaSamplesUtils.preparePtxFile(fileName) is from the sample code. What am I doing wrong so that I can fix this? Thank you!