Correct usage of cublasSgetriBatched for matrix inverse

Hi Marco,

I’ve been trying to port a basic example of a matrix inverse based on Robert Crovella’s answer on SO.

Below is a simplified working example of the original CUDA code:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Inverts a single 3x3 matrix on the GPU via the batched cuBLAS LU routines:
// cublasSgetrfBatched (LU factorization) followed by cublasSgetriBatched
// (inversion from the factors). batchSize is 1, so the "batch" holds one matrix.
int main(int argc, char const *argv[])
{
	// const so the array sizes below are constant expressions (no VLA extension).
	const int n = 3;

    // Host input, written row-major here. cuBLAS is column-major, so the device
    // actually factorizes the transpose of the matrix as printed; reading the
    // result back and printing it row-major transposes again, so the printed
    // inverse does correspond to the printed input.
    float a[n*n] = {0,3,4,
                    1,3,10,
                    4,9,16};

    float result[n*n];

	cublasHandle_t handle;
    cublasCreate_v2(&handle);

    const int batchSize = 1;

    // P: pivot indices (n per matrix). INFO: per-matrix status of getrf/getri.
    int *P, *INFO;

    cudaMalloc<int>(&P, n * batchSize * sizeof(int));
    cudaMalloc<int>(&INFO, batchSize * sizeof(int));

    const int lda = n;

	// src_d holds the input (overwritten in place by its LU factors);
	// dst_d receives the inverse (getri is out-of-place).
	float* src_d, *dst_d;

    cudaMalloc<float>(&src_d, n * n * sizeof(float));
    cudaMemcpy(src_d, a, n * n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMalloc<float>(&dst_d, n * n * sizeof(float));

	// The batched API takes a DEVICE array of device pointers (float**):
	// build the pointer list on the host, then copy the pointer values over.
	float *A[] = { src_d };
    float** A_d;

    cudaMalloc<float*>(&A_d, sizeof(A));
    cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice);

    // In-place batched LU factorization with partial pivoting.
    cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize);

    int INFOh = 0;
    cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost);

    fprintf(stderr, "INFOh %d\n", INFOh);

    // getrf reports a singular matrix via a nonzero INFO (INFO = i > 0 means
    // U(i,i) == 0). The original code compared against the literal 17, which
    // can never occur for a 3x3 matrix, so the check was dead.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

	// Same device-array-of-pointers setup for the output matrices.
	float* C[] = { dst_d };
    float** C_d;
    cudaMalloc<float*>(&C_d, sizeof(C));
    cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice);

    // Compute the inverse from the LU factors, writing into C_d (ldc = n).
    cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize);

    cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost);

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    cudaMemcpy(result, dst_d, n * n * sizeof(float), cudaMemcpyDeviceToHost);

    fprintf(stdout, "Input:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout,"%f\t",a[i*n+j]);
        fprintf(stdout,"\n");
    }

    fprintf(stdout,"\n\n");


    fprintf(stdout, "Inverse:\n\n");
    for(int i=0; i<n; i++)
    {
        for(int j=0; j<n; j++)
            fprintf(stdout,"%f\t",result[i*n+j]);
        fprintf(stdout,"\n");
    }

    // Free the pointer-list allocations as well (A_d/C_d leaked originally).
    cudaFree(A_d);
    cudaFree(C_d);
    cudaFree(P);
    cudaFree(INFO);
    cudaFree(src_d);
    cudaFree(dst_d);
    cublasDestroy_v2(handle);
	return 0;
}

Compiling and running the above:

INFOh 0
Input:

0.000000	3.000000	4.000000	
1.000000	3.000000	10.000000	
4.000000	9.000000	16.000000	


Inverse:

-0.700000	-0.200000	0.300000	
0.400000	-0.266667	0.066667	
-0.050000	0.200000	-0.050000

Here is my attempt to port this code to jcublas:

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasStatus;
import org.junit.jupiter.api.Test;

import java.util.Arrays;

import static jcuda.jcublas.JCublas2.*;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

// Attempt to port the CUDA C batched-inverse example to JCuda/JCublas2.
// NOTE(review): this version fails with CUBLAS_STATUS_EXECUTION_FAILED —
// see the discussion below this listing: the pointer-to-pointer buffers are
// sized with Sizeof.FLOAT where Sizeof.POINTER is required.
public class JCublasInvTest {
  @Test
  public void testInv() {
    cublasHandle handle = new cublasHandle();
    cublasCreate(handle);

    // 3x3 input matrix, flattened (same values as the C example).
    float[] a = {0,3,4,1,3,10,4,9,16};

    int batchSize = 1;
    int n = 3;
    int lda = 3;
    long nEls = 9;

    // Copy the matrix data itself to the device.
    Pointer hA = Pointer.to(a);
    Pointer dA = new Pointer();
    cudaMalloc(dA, nEls * Sizeof.FLOAT);
    cudaMemcpy(dA, hA, nEls * Sizeof.FLOAT, cudaMemcpyHostToDevice);

    /* Not sure about this */
    // Intended as the float** argument: a device buffer holding one device
    // pointer. BUG (identified later in the thread): one pointer occupies
    // Sizeof.POINTER bytes (8 on 64-bit), not Sizeof.FLOAT (4), so only half
    // of the pointer value is copied here.
    Pointer phA = Pointer.to(dA);
    Pointer pdA = new Pointer();
    cudaMalloc(pdA, Sizeof.FLOAT);
    cudaMemcpy(pdA, phA, Sizeof.FLOAT, cudaMemcpyHostToDevice);

    // Pivot indices (n per matrix) and per-matrix info values.
    Pointer dP = new Pointer();
    cudaMalloc(dP, (long) n * batchSize * Sizeof.INT);
    Pointer dInfo = new Pointer();
    cudaMalloc(dInfo, batchSize * Sizeof.INT);

    /* Previously tried dA directly, but still not working */
    // In-place batched LU factorization.
    int status = cublasSgetrfBatched(handle, n, pdA, lda, dP, dInfo, batchSize);
    System.out.println("Status of cublasSgetrfBatched: " + status);

    int[] info = new int[] { 0 };
    Pointer hInfo = Pointer.to(info);
    cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    System.out.println("Info after cublasSgetrfBatched: " + info[0]);

    // Output buffer for the inverse.
    float[] cData = new float[(int) nEls];
    Pointer dC = new Pointer();
    cudaMalloc(dC, nEls * Sizeof.FLOAT);

    /* Not sure about this */
    // Same pointer-list construction for the output — same Sizeof.FLOAT bug.
    Pointer phC = Pointer.to(dC);
    Pointer pdC = new Pointer();
    cudaMalloc(pdC, Sizeof.FLOAT);
    cudaMemcpy(pdC, phC, Sizeof.FLOAT, cudaMemcpyHostToDevice);

    /* Previously tried dA and dC directly, but still not working */
    // Out-of-place inversion from the LU factors; fails with status 13 here.
    status = cublasSgetriBatched(handle, n, pdA, lda, dP, pdC, lda, dInfo, batchSize);
    System.out.println("status cublasSgetriBatched = " + status + " error string: " + cublasStatus.stringFor(status));

    cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
    System.out.println(info[0]);

    cudaMemcpy(Pointer.to(cData), dC, nEls * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
    System.out.println(Arrays.toString(cData));

    cudaFree(dA);
    cudaFree(dC);
    cudaFree(pdA);
    cudaFree(pdC);
    cudaFree(dP);
    cudaFree(dInfo);
    cublasDestroy(handle);
  }
}

Output:

Status of cublasSgetrfBatched: 0
Info after cublasSgetrfBatched: 0
status cublasSgetriBatched = 13 error string: CUBLAS_STATUS_EXECUTION_FAILED
0
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

I wasn’t sure how to correctly port the c/cpp:

    float *A[] = { src_d };
    float** A_d;

I tried creating a Pointer to the Pointer, but I’m still not getting it work.
Hopefully it’s not a silly error from me.

Many thanks again for all your help.

I started trying to „map“ between the stack overflow code that you linked to, and the code that you posted here, but might need a bit more time to look into the details.


The main reason for the error seems to be that you used a wrong size for some of the memory copies. Specifically, at both places where you commented with /* Not sure about this */. The C version of these parts is this:

float *A[] = { src_d };
float** A_d;
cudaMalloc<float*>(&A_d,sizeof(A));
cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice);

The sizeof is applied to an array of pointers (and the cudaMalloc call is parameterized with float*, and not with float, FWIW).

Comparing that to the Java version

    Pointer phA = Pointer.to(dA);
    Pointer pdA = new Pointer();
    cudaMalloc(pdA, Sizeof.FLOAT);
    cudaMemcpy(pdA, phA, Sizeof.FLOAT, cudaMemcpyHostToDevice);

You’ll notice that the Sizeof.FLOAT may not be the right thing: pdA is supposed to be a float** variable. And in this case, this is a pointer to device memory that contains further pointers.

The „minimal modification“ that should make it work would be to change this to

    Pointer phA = Pointer.to(dA);
    Pointer pdA = new Pointer();
    cudaMalloc(pdA, Sizeof.POINTER);
    cudaMemcpy(pdA, phA, Sizeof.POINTER, cudaMemcpyHostToDevice);

(changing Sizeof.FLOAT to Sizeof.POINTER). This works in this case, because there is only a single pointer. More generically, you would use Sizeof.POINTER * numberOfPointersToCopy, of course.


An aside: This can be a bit tricky even in C: When you don’t have an array (like float *A[] = { src_d };), but instead, are allocating your pointers dynamically, it can be confusing. I wrote a few words about this in CUDA Double pointer memory copy - Stack Overflow, and the „overly elaborate variable names“ might help here.


Further asides:

  • I usually call JCuda.setExceptionsEnabled(true) and JCublas2.setExceptionsEnabled(true) at the beginning of the main function, just to simplify the error checks for quick tests.
  • There is an example for non-batched matrix inversion at JCublas2MatrixInvert.java
  • There is an example for a batched SGEMM at JCublas2SgemmBatched.java

(These examples combined may contain some useful snippets - e.g. for getting that „pointers to pointers“-handling right…)


Here is the „quick-fixed“ version of the code.

(Again: Note that you might have to use Sizeof.POINTER * numberOfPointers if you intend to generalize this!)

package jcuda.jcublas.test;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasStatus;

import java.util.Arrays;

import static jcuda.jcublas.JCublas2.*;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;

public class JCublasInvTest {

    public static void main(String[] args)
    {
        JCublas2.setExceptionsEnabled(true);
        
        cublasHandle handle = new cublasHandle();
        cublasCreate(handle);

        float[] a = {0,3,4,1,3,10,4,9,16};

        int batchSize = 1;
        int n = 3;
        int lda = 3;
        long nEls = 9;

        Pointer hA = Pointer.to(a);
        Pointer dA = new Pointer();
        cudaMalloc(dA, nEls * Sizeof.FLOAT);
        cudaMemcpy(dA, hA, nEls * Sizeof.FLOAT, cudaMemcpyHostToDevice);

        /* Not sure about this */
        Pointer phA = Pointer.to(dA);
        Pointer pdA = new Pointer();
        cudaMalloc(pdA, Sizeof.POINTER);
        cudaMemcpy(pdA, phA, Sizeof.POINTER, cudaMemcpyHostToDevice);

        Pointer dP = new Pointer();
        cudaMalloc(dP, (long) n * batchSize * Sizeof.INT);
        Pointer dInfo = new Pointer();
        cudaMalloc(dInfo, batchSize * Sizeof.INT);

        /* Previously tried dA directly, but still not working */
        int status = cublasSgetrfBatched(handle, n, pdA, lda, dP, dInfo, batchSize);
        System.out.println("Status of cublasSgetrfBatched: " + status);

        int[] info = new int[] { 0 };
        Pointer hInfo = Pointer.to(info);
        cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
        System.out.println("Info after cublasSgetrfBatched: " + info[0]);

        float[] cData = new float[(int) nEls];
        Pointer dC = new Pointer();
        cudaMalloc(dC, nEls * Sizeof.FLOAT);

        /* Not sure about this */
        Pointer phC = Pointer.to(dC);
        Pointer pdC = new Pointer();
        cudaMalloc(pdC, Sizeof.POINTER);
        cudaMemcpy(pdC, phC, Sizeof.POINTER, cudaMemcpyHostToDevice);

        /* Previously tried dA and dC directly, but still not working */
        status = cublasSgetriBatched(handle, n, pdA, lda, dP, pdC, lda, dInfo, batchSize);
        System.out.println("status cublasSgetriBatched = " + status + " error string: " + cublasStatus.stringFor(status));

        cudaMemcpy(hInfo, dInfo, Sizeof.INT, cudaMemcpyDeviceToHost);
        System.out.println(info[0]);

        cudaMemcpy(Pointer.to(cData), dC, nEls * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
        System.out.println(Arrays.toString(cData));

        cudaFree(dA);
        cudaFree(dC);
        cudaFree(pdA);
        cudaFree(pdC);
        cudaFree(dP);
        cudaFree(dInfo);
        cublasDestroy(handle);
    }
}

The output is

Status of cublasSgetrfBatched: 0
Info after cublasSgetrfBatched: 0
status cublasSgetriBatched = 0 error string: CUBLAS_STATUS_SUCCESS
0
[-0.70000005, -0.2, 0.3, 0.40000004, -0.26666668, 0.06666667, -0.04999999, 0.2, -0.050000004]

which appears to be the right answer.


EDIT, a final aside: On a 32bit system, your code would have worked: There, Sizeof.FLOAT and Sizeof.POINTER would both be 4. But on 64bit systems, Sizeof.POINTER is 8.

1 „Gefällt mir“

Awesome! Thank you so much, Marco!

Your answer is full of beneficial points - I learned a lot just from this.
Thank you also for the SO and GitHub links - super-useful.

I tried to distill the SO code into one sequence of actions just for the purposes of porting and comparing side-to-side Java/CPP. I guess the major change I made was removing the identity-extension workaround (that Crovella used to fix the previous buggy versions of cublas).

Thanks again!