Hi Marco,
I’ve been trying to port a basic example of a matrix inverse based on Robert Crovella’s answer on SO.
Below is a simplified working example of the original CUDA code:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
int main(int argc, char const *argv[])
{
int n = 3;
float a[n*n] = {0,3,4,
1,3,10,
4,9,16};
float result[n*n];
cublasHandle_t handle;
cublasCreate_v2(&handle);
int batchSize = 1;
int *P, *INFO;
cudaMalloc<int>(&P, n * batchSize * sizeof(int));
cudaMalloc<int>(&INFO, batchSize * sizeof(int));
int lda = n;
float* src_d, *dst_d;
cudaMalloc<float>(&src_d, n * n * sizeof(float));
cudaMemcpy(src_d, a, n * n * sizeof(float),cudaMemcpyHostToDevice);
cudaMalloc<float>(&dst_d, n * n * sizeof(float));
float *A[] = { src_d };
float** A_d;
cudaMalloc<float*>(&A_d,sizeof(A));
cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice);
cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize);
int INFOh = 0;
cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost);
fprintf(stderr, "INFOh %d\n", INFOh);
if(INFOh == 17)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudaMalloc<float*>(&C_d,sizeof(C));
cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice);
cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize);
cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost);
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaMemcpy(result, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost);
fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*n+j]);
fprintf(stdout,"\n");
}
fprintf(stdout,"\n\n");
fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",result[i*n+j]);
fprintf(stdout,"\n");
}
cudaFree(P), cudaFree(INFO), cudaFree(src_d), cudaFree(dst_d), cublasDestroy_v2(handle);
return 0;
}
Compiling and running the above:
INFOh 0
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
Here is my attempt to port this code to jcublas:
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasStatus;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import static jcuda.jcublas.JCublas2.*;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
public class JCublasInvTest {
@Test
public void testInv() {
cublasHandle handle = new cublasHandle();
cublasCreate(handle);
float[] a = {0,3,4,1,3,10,4,9,16};
int batchSize = 1;
int n = 3;
int lda = 3;
long nEls = 9;
Pointer hA = Pointer.to(a);
Pointer dA = new Pointer();
cudaMalloc(dA, nEls * Sizeof.FLOAT);
cudaMemcpy(dA, hA, nEls * Sizeof.FLOAT, cudaMemcpyHostToDevice);
/* Not sure about this */
Pointer phA = Pointer.to(dA);
Pointer pdA = new Pointer();
cudaMalloc(pdA, Sizeof.FLOAT);
cudaMemcpy(pdA, phA, Sizeof.FLOAT, cudaMemcpyHostToDevice);
Pointer dP = new Pointer();
cudaMalloc(dP, (long) n * batchSize * Sizeof.INT);
Pointer dInfo = new Pointer();
cudaMalloc(dInfo, batchSize * Sizeof.INT);
/* Previously tried dA directly, but still not working */
int status = cublasSgetrfBatched(handle, n, pdA, lda, dP, dInfo, batchSize);
System.out.println("Status of cublasSgetrfBatched: " + status);
int[] info = new int[] { 0 };
Pointer hInfo = Pointer.to(info);
cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
System.out.println("Info after cublasSgetrfBatched: " + info[0]);
float[] cData = new float[(int) nEls];
Pointer dC = new Pointer();
cudaMalloc(dC, nEls * Sizeof.FLOAT);
/* Not sure about this */
Pointer phC = Pointer.to(dC);
Pointer pdC = new Pointer();
cudaMalloc(pdC, Sizeof.FLOAT);
cudaMemcpy(pdC, phC, Sizeof.FLOAT, cudaMemcpyHostToDevice);
/* Previously tried dA and dC directly, but still not working */
status = cublasSgetriBatched(handle, n, pdA, lda, dP, pdC, lda, dInfo, batchSize);
System.out.println("status cublasSgetriBatched = " + status + " error string: " + cublasStatus.stringFor(status));
cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
System.out.println(info[0]);
cudaMemcpy(Pointer.to(cData), dC, nEls * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
System.out.println(Arrays.toString(cData));
cudaFree(dA);
cudaFree(dC);
cudaFree(pdA);
cudaFree(pdC);
cudaFree(dP);
cudaFree(dInfo);
cublasDestroy(handle);
}
}
Output:
Status of cublasSgetrfBatched: 0
Info after cublasSgetrfBatched: 0
status cublasSgetriBatched = 13 error string: CUBLAS_STATUS_EXECUTION_FAILED
0
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
I wasn’t sure how to correctly port the c/cpp:
float *A[] = { src_d };
float** A_d;
I tried creating a Pointer to the Pointer, but I’m still not getting it work.
Hopefully it’s not a silly error from me.
Many thanks again for all your help.