Matrix multiplication in JCuda using two CUDA devices

Hi, I am working with JCuda on the matrix multiplication example, but I
have two CUDA devices in a single PC and I don't know how to use the
second device in this example.
This is the code:

package org.jppf.example.prueba;
import java.util.Random;

import jcuda.*;
import jcuda.jcublas.JCublas;

public class TestMatrixMultiplication {
        //final int size;

        public TestMatrixMultiplication(final int newSize)
        //public static void main(String args[])
        {
                //this.size = newSize;
                testSgemm(newSize);
        }



        public static void testSgemm(int n)
        {
                float alpha = 1.0f;
                float beta = 0.0f;
                int nn = n * n;

                System.out.println("Creando datos de entrada...");
                float h_A[] = createRandomFloatData(nn);
                float h_B[] = createRandomFloatData(nn);
                float h_C[] = createRandomFloatData(nn);
                float h_C_ref[] = h_C.clone();

                System.out.println("Ejecutando la mutiplicacion de Matrices sequencial...");
                long A=System.currentTimeMillis();
                sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref);
                long B=System.currentTimeMillis()-A;
                System.out.println("Tiempo de ejecución igual a: "+B+" milisegundos");
                System.out.println("Ejecutando la multiplicacion de matrices con JCublas...");
                A=System.currentTimeMillis();
                sgemmJCublas(n, alpha, h_A, h_B, beta, h_C);
                B=System.currentTimeMillis()-A;
                System.out.println("Tiempo de ejecución igual a: "+B+" milisegundos");
                boolean passed = isCorrectResult(h_C, h_C_ref);
                System.out.println("testMatrixMultiplication: "+(passed?"PASSED":"FAILED"));
        }

        private static void sgemmJCublas(int n, float alpha, float A[], float B[],
                        float beta, float C[])
        {
                int nn = n * n;

                // Initialize JCublas
                JCublas.cublasInit();

                // Allocate memory on the device
                Pointer d_A = new Pointer();
                Pointer d_B = new Pointer();
                Pointer d_C = new Pointer();
                JCublas.cublasAlloc(nn, Sizeof.FLOAT, d_A);
                JCublas.cublasAlloc(nn, Sizeof.FLOAT, d_B);
                JCublas.cublasAlloc(nn, Sizeof.FLOAT, d_C);

                // Copy the memory from the host to the device
                JCublas.cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
                JCublas.cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
                JCublas.cublasSetVector(nn, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);

                // Execute sgemm
                JCublas.cublasSgemm(
                                'n', 'n', n, n, n, alpha, d_A, n, d_B, n, beta, d_C, n);

                // Copy the result from the device to the host
                JCublas.cublasGetVector(nn, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);

                // Clean up
                JCublas.cublasFree(d_A);
                JCublas.cublasFree(d_B);
                JCublas.cublasFree(d_C);

                JCublas.cublasShutdown();
        }

        private static void sgemmJava(int n, float alpha, float A[], float B[],
                        float beta, float C[])
        {
                for (int i = 0; i < n; ++i)
                {
                        for (int j = 0; j < n; ++j)
                        {
                                float prod = 0;
                                for (int k = 0; k < n; ++k)
                                {
                                        prod += A[k * n + i] * B[j * n + k];
                                }
                                C[j * n + i] = alpha * prod + beta * C[j * n + i];
                        }
                }
        }


        private static float[] createRandomFloatData(int n)
        {
                Random random = new Random();
                float x[] = new float[n];
                for (int i = 0; i < n; i++)
                {
                        x** = random.nextFloat();
                }
                return x;
        }


        private static boolean isCorrectResult(float result[], float reference[])
        {
                float errorNorm = 0;
                float refNorm = 0;
                for (int i = 0; i < result.length; ++i)
                {
                        float diff = reference** - result**;
                        errorNorm += diff * diff;
                        refNorm += reference** * result**;
                }
                errorNorm = (float) Math.sqrt(errorNorm);
                refNorm = (float) Math.sqrt(refNorm);
                if (Math.abs(refNorm) < 1e-6)
                {
                        return false;
                }
                return (errorNorm / refNorm < 1e-6f);
        }

}

Hello

Since I’m still stuck with a single device, I can not post a sample that is guaranteed to work right now. But according to the CUBLAS documentation at http://docs.nvidia.com/cuda/cublas/index.html , it should be possible to associate different devices with a CUBLAS context:

…the application can use cudaSetDevice() to associate different devices with different host threads and in each of those host threads it can initialize a unique handle to the CUBLAS library context, which will use the particular device associated with that host thread. Then, the CUBLAS library function calls made with different handle will automatically dispatch the computation to different devices.

Of course, the best usage pattern then depends highly on the purpose of the program. If you intend to let your matrix multiplications be performed by two devices simultaneously, you’ll have to think about the best threading and synchronization strategies, probably really using multiple Java Threads and CUDA streams.

But if your intention is only to run a basic test, the following program might be sufficient: It is similar to the original JCublas2 example, but initializes the SGEMM input memory for all available devices, then runs the sgemm on all devices, and finally copies the results from all devices back and verifies them (That means that each of these steps is done sequentially for all devices, and the CUBLAS context is ONLY created for the SGEMM call. For „real world“ applications, you would not do this, of course).

Note that I could only test this with a single device until now

But I hope it works for multiple devices as well:

import static jcuda.jcublas.JCublas2.*;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.runtime.JCuda.*;
import static jcuda.runtime.cudaMemcpyKind.*;

import java.util.Random;

import jcuda.*;
import jcuda.jcublas.cublasHandle;

public class JCublasMultiDevice
{
    public static void main(String args[])
    {
        testSgemm(100);
    }

    public static void testSgemm(int n)
    {
        float alpha = 0.3f;
        float beta = 0.7f;
        int nn = n * n;

        int deviceCountArray[] = { 0 };
        cudaGetDeviceCount(deviceCountArray);
        int deviceCount = deviceCountArray[0];
        System.out.println("Runing test on "+deviceCount+" devices");
        
        System.out.println("Creating input data...");
        float h_A[] = createRandomFloatData(nn);
        float h_B[] = createRandomFloatData(nn);
        float h_C0[] = createRandomFloatData(nn);
        float h_C_ref[] = h_C0.clone();
        
        System.out.println("Performing Sgemm with Java...");
        sgemmJava(n, alpha, h_A, h_B, beta, h_C_ref);

        
        System.out.println("Copying data to devices...");
        float h_C[][] = new float[deviceCount][];
        for (int i=0; i<deviceCount; i++)
        {
            h_C** = h_C0.clone();
        }
        
        Pointer d_A[] = new Pointer[deviceCount];
        Pointer d_B[] = new Pointer[deviceCount];
        Pointer d_C[] = new Pointer[deviceCount];
        for (int i=0; i<deviceCount; i++)
        {
            System.out.println("Copying data to device "+i+" of "+deviceCount);
            d_A** = createDevicePointer(i, h_A);
            d_B** = createDevicePointer(i, h_B);
            d_C** = createDevicePointer(i, h_C**);
        }

        System.out.println("Performing Sgemm with JCublas...");
        for (int i=0; i<deviceCount; i++)
        {
            System.out.println("Performing Sgemm with device "+i+" of "+deviceCount);
            sgemmJCublas(i, n, alpha, d_A**, d_B**, beta, d_C**);
        }
        
        System.out.println("Obtaining results...");
        for (int i=0; i<deviceCount; i++)
        {
            System.out.println("Obtaining results from device "+i+" of "+deviceCount);
            releaseDevicePointer(i, d_A**, null);
            releaseDevicePointer(i, d_B**, null);
            releaseDevicePointer(i, d_C**, h_C**);
        }
        
        
        boolean passed = true;
        System.out.println("Verifying results...");
        for (int i=0; i<deviceCount; i++)
        {
            System.out.println("Verifying results from device "+i+" of "+deviceCount);
            boolean p = isCorrectResult(h_C**, h_C_ref);
            if (!p)
            {
                System.out.println("Error in result of device "+i);
            }
            passed &= p;
            
        }
        
        System.out.println("testSgemm "+(passed?"PASSED":"FAILED"));
        
    }
    
    
    /**
     * Allocate a device pointer on the specified device that has the
     * same contents as the given array
     *  
     * @param device The device index
     * @param array The array
     * @return The new device pointer
     */
    private static Pointer createDevicePointer(int device, float array[])
    {
        cudaSetDevice(device);
        Pointer devicePointer = new Pointer();
        cudaMalloc(devicePointer, array.length * Sizeof.FLOAT);
        cudaMemcpy(devicePointer, Pointer.to(array), 
            array.length*Sizeof.FLOAT, cudaMemcpyHostToDevice);
        return devicePointer;
    }
    
    /**
     * Copy the contents of the given device pointer into the given 
     * array if it is not <code>null</code>, and release the pointer
     * 
     * @param device The device index
     * @param devicePointer The device pointer
     * @param array The optional target array
     */
    private static void releaseDevicePointer(int device, Pointer devicePointer, float array[])
    {
        cudaSetDevice(device);
        if (array != null)
        {
            cudaMemcpy(Pointer.to(array), devicePointer, 
                array.length*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
        }
        cudaFree(devicePointer);
    }

    /**
     * Implementation of sgemm using JCublas
     */
    private static void sgemmJCublas(
        int device, int n, float alpha, Pointer d_A, Pointer d_B,
        float beta, Pointer d_C)
    {
        // Select the specified device
        cudaSetDevice(device);

        // Create a CUBLAS handle
        cublasHandle handle = new cublasHandle();
        cublasCreate(handle);

        // Execute sgemm
        Pointer pAlpha = Pointer.to(new float[]{alpha});
        Pointer pBeta = Pointer.to(new float[]{beta});
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, 
            pAlpha, d_A, n, d_B, n, pBeta, d_C, n);

        cublasDestroy(handle);
    }

    /**
     * Simple implementation of sgemm, using plain Java
     */
    private static void sgemmJava(int n, float alpha, float A[], float B[],
                    float beta, float C[])
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                float prod = 0;
                for (int k = 0; k < n; ++k)
                {
                    prod += A[k * n + i] * B[j * n + k];
                }
                C[j * n + i] = alpha * prod + beta * C[j * n + i];
            }
        }
    }


    /**
     * Creates an array of the specified size, containing some random data
     */
    private static float[] createRandomFloatData(int n)
    {
        Random random = new Random();
        float x[] = new float[n];
        for (int i = 0; i < n; i++)
        {
            x** = random.nextFloat();
        }
        return x;
    }

    /**
     * Compares the given result against a reference, and returns whether the
     * error norm is below a small epsilon threshold
     */
    private static boolean isCorrectResult(float result[], float reference[])
    {
        float errorNorm = 0;
        float refNorm = 0;
        for (int i = 0; i < result.length; ++i)
        {
            float diff = reference** - result**;
            errorNorm += diff * diff;
            refNorm += reference** * result**;
        }
        errorNorm = (float) Math.sqrt(errorNorm);
        refNorm = (float) Math.sqrt(refNorm);
        if (Math.abs(refNorm) < 1e-6)
        {
            return false;
        }
        return (errorNorm / refNorm < 1e-6f);
    }
}