Hi,
I’ve written a simple Matrix Multiplikation Kernel
#include "stdio.h"
__global__ void KernelMul(float *A, float *B, float *C,int ar,int ac, int br, int bc)
{
float Cvalue = 0;
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if(row > ar || col > ac) return;
for (int e = 0; e < ac; ++e) {
Cvalue += A[row * ac + e] * B[e * bc + col];
printf("%f
", Cvalue);
}
C[row * bc + col] = Cvalue;
}```
and the Java Code using this is
```public static float[] mul(float[] hostInputA,float[] hostInputB,int ar,int ac,int br,int bc) {
JCudaDriver.setExceptionsEnabled(true);
String ptxFileName = "src\\Matrix\\MatrixMul.ptx";
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Load the ptx file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "KernelMul");
CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, hostInputA.length * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA),
hostInputA.length * Sizeof.FLOAT);
CUdeviceptr deviceInputB = new CUdeviceptr();
cuMemAlloc(deviceInputB, hostInputB.length * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB),
hostInputB.length * Sizeof.FLOAT);
/*CUdeviceptr deviceInputar = new CUdeviceptr();
cuMemAlloc(deviceInputB, Sizeof.INT);
cuMemcpyHtoD(deviceInputB, Pointer.to(ar),Sizeof.INT);
CUdeviceptr deviceInputac = new CUdeviceptr();
cuMemAlloc(deviceInputB, Sizeof.INT);
cuMemcpyHtoD(deviceInputB, Pointer.to(ac),Sizeof.INT);
CUdeviceptr deviceInputbr = new CUdeviceptr();
cuMemAlloc(deviceInputB, Sizeof.INT);
cuMemcpyHtoD(deviceInputB, Pointer.to(br),Sizeof.INT);
CUdeviceptr deviceInputbc = new CUdeviceptr();
cuMemAlloc(deviceInputB, Sizeof.INT);
cuMemcpyHtoD(deviceInputB, Pointer.to(bc),Sizeof.INT);*/
// Allocate device output memory
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, hostInputA.length * Sizeof.FLOAT);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParameters = Pointer.to(
Pointer.to(deviceInputA),
Pointer.to(deviceInputB),
Pointer.to(deviceOutput),
Pointer.to(new int[]{ar}),
Pointer.to(new int[]{ac}),
Pointer.to(new int[]{br}),
Pointer.to(new int[]{bc})
);
// Call the kernel function.
int blockSizeX = 16;
int blockSizeY = 16;
int gridSizeX = (int)Math.ceil(((bc+blockSizeX-1)/blockSizeX));
int gridSizeY = (int)Math.ceil(((ar+blockSizeY-1)/blockSizeY));
//System.out.println(gridSizeX);
//System.out.println(gridSizeY);
JCudaDriver.cuCtxSetLimit(CUlimit.CU_LIMIT_PRINTF_FIFO_SIZE, 4096);
cuLaunchKernel(function,
gridSizeX, gridSizeY, 1, // Grid dimension
blockSizeX, blockSizeY, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
int k = cuCtxSynchronize();
System.out.println(JCuda.cudaGetErrorString(k));
/*cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != CUDA_SUCCESS)
printf("kernel launch failed with error \"%s\".
",
cudaGetErrorString(cudaerr));*/
// Allocate host output memory and copy the device output
// to the host.
float hostOutput[] = new float[hostInputA.length];
cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
hostInputA.length * Sizeof.FLOAT);
cuMemFree(deviceInputA);
cuMemFree(deviceInputB);
cuMemFree(deviceOutput);
return hostOutput;
}```
When Multiplying the 4 x 4 Matrix with all 1 the Result is either correct so ((4,4,4,4),(4,4,4,4),(4,4,4,4),(4,4,4,4)) or a wrong one ((4,4,4,4),(4,4,4,4),(3,4,4,4),(4,4,4,4)).
It seems to be Random if it misscalculates or not, so how could I debug a Problem ?
Also I have tested a few of the samples and they seem to have worked.
I hope someone can help.