JCuda - convert row-major to column-major

Hello everybody…Please a little help…

This is my kernel…


/* ----------------- invert_mapping() --------------------- */
/* inverts data array from row-major to column-major.

   [p0,dim0][p0,dim1][p0,dim2] ... 
   [p1,dim0][p1,dim1][p1,dim2] ... 
   [p2,dim0][p2,dim1][p2,dim2] ... 
										to
   [dim0,p0][dim0,p1][dim0,p2] ...
   [dim1,p0][dim1,p1][dim1,p2] ...
   [dim2,p0][dim2,p1][dim2,p2] ...
*/
__global__ void invert_mapping(float *input,			/* original */
							   float *output,			/* inverted */
							   int npoints,				/* npoints */
							   int nfeatures)			/* nfeatures */
{
	int point_id = threadIdx.x + blockDim.x*blockIdx.x;	/* id of thread */
	int i;

	if(point_id < npoints){
		for(i=0;i<nfeatures;i++)
			output[point_id + npoints*i] = input[point_id*nfeatures + i];
	}
	return;
}
/* ----------------- invert_mapping() end --------------------- */

I think its easy to understand what this code is doing.

I am trying to pass theese arguments on the kernel using JCuda but I don’t really know what is going wrong…And I have the same wrong results all the time.

I think all the parameters are passed correctly but I have a question about the block and thread size…

TO set this parameters I wrote this:

JCudaDriver.cuFuncSetBlockShape(function, Global.num_threads, Global.num_threads, 1);

JCudaDriver.cuLaunchGrid(function, Global.num_blocks, Global.num_blocks);

Because I want 2-d blocks and 2-d grids. Is this right?? I really don’t know what is so wrong :((

    public void allocateMemory(int npoints, int nfeatures, int nclusters, float[][] features, int start, Pointer qw) throws IOException
    {
        String cubinFileName = prepareCubinFile("kmeans_cuda_kernel.cu");

        // Initialize the driver and create a context for the first device.
        JCudaDriver.cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        JCudaDriver.cuDeviceGet(dev, 0);
        JCudaDriver.cuCtxCreate(pctx, 0, dev);

        // Load the CUBIN file.
        CUmodule module = new CUmodule();
        JCudaDriver.cuModuleLoad(module, cubinFileName);

        // Obtain a function pointer to the "sampleKernel" function.
        CUfunction function = new CUfunction();
        JCudaDriver.cuModuleGetFunction(function, module, "invert_mapping");

        Global.num_blocks = npoints / Global.num_threads;
        if (npoints % Global.num_threads > 0) // defeat truncation
            Global.num_blocks++;

        Global.num_blocks_perdim = (int)Math.sqrt((double)Global.num_blocks);
        while (Global.num_blocks_perdim * Global.num_blocks_perdim < Global.num_blocks)
            // defeat truncation (should run once)
            Global.num_blocks_perdim++;

        Global.num_blocks = Global.num_blocks_perdim * Global.num_blocks_perdim;

        membership_new = new int[npoints];
        for (int i = 0; i < npoints; i++)
        {
            membership_new** = -1;
        }

        feature_d = new float[npoints * nfeatures];
        feature_flipped_d = new float[npoints * nfeatures];
        // JCuda.cudaMalloc(Pointer.to(feature_flipped_d),
        // npoints*nfeatures*Sizeof.FLOAT);

        float bla[] = new float[npoints * nfeatures];
        int s = 0;
        int h = 0;
        for (int i = 0; i < npoints * nfeatures; i++)
        {
            if (s == 10)
            {
                s = 0;
                h++;
            }
            bla** = features[h][s];
            s++;
        }

        // for (int i=0;i<npoints;i++){
        // for (int j=0;j<nfeatures+1;j++){
        // System.out.println("features[" + start+ "]" + "[" + j + "] " +
        // bla[j]);
        // }
        // }

        CUdeviceptr dfeature_d = new CUdeviceptr();
        cuMemAlloc(dfeature_d, npoints * nfeatures * Sizeof.FLOAT);
        // JCuda.cudaMalloc(Pointer.to(feature_d),
        // npoints*nfeatures*Sizeof.FLOAT);

        CUdeviceptr dfeature_flipped_d = new CUdeviceptr();
        cuMemAlloc(dfeature_flipped_d, npoints * nfeatures * Sizeof.FLOAT);
        JCuda.cudaMemcpy(dfeature_flipped_d, qw, npoints * nfeatures * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyHostToDevice);

        /*// Allocate arrays on the device, one for each row. The pointers
        // to these array are stored in host memory.
        for (int i=start; i < npoints; i++){
        Global.feature_flipped_d** = new Pointer();
        JCuda.cudaMalloc(Global.feature_flipped_d**, nfeatures*Sizeof.FLOAT);
        }
    
        // Copy the contents of the rows from the host input data to
        // the device arrays that have just been allocated.
        for (int i=start; i < npoints; i++){
        JCuda.cudaMemcpy(Global.feature_flipped_d**, Pointer.to(hostfeature_flipped_d**), nfeatures*Sizeof.FLOAT,
        cudaMemcpyKind.cudaMemcpyHostToDevice);
        }
    
        // Allocate device memory for the array pointers, and copy
        // the array pointers from the host to the device.
        JCuda.cudaMalloc(deviceInput, npoints*Sizeof.POINTER);
        JCuda.cudaMemcpy(deviceInput, Pointer.to(Global.feature_flipped_d), npoints*Sizeof.POINTER,
        cudaMemcpyKind.cudaMemcpyHostToDevice);
    
    
        JCuda.cudaMalloc(feature_d, npoints*nfeatures*Sizeof.FLOAT);*/

        System.out.println("Calling the kernel...");

        // Set up the execution parameters.
        JCudaDriver.cuFuncSetBlockShape(function, Global.num_threads, Global.num_threads, 1);

        Pointer pSize1 = Pointer.to(new int[]
        { npoints });
        Pointer pSize2 = Pointer.to(new int[]
        { nfeatures });
        Pointer pfeature_d = Pointer.to(dfeature_d);
        Pointer pfeature_flipped_d = Pointer.to(dfeature_flipped_d);

        int offset = 0;
        offset = JCudaDriver.align(offset, npoints * Sizeof.POINTER);
        JCudaDriver.cuParamSetv(function, offset, pfeature_flipped_d, Sizeof.POINTER);
        offset += Sizeof.POINTER;

        offset = JCudaDriver.align(offset, npoints * nfeatures * Sizeof.FLOAT);
        JCudaDriver.cuParamSetv(function, offset, pfeature_d, Sizeof.POINTER);
        offset += Sizeof.POINTER;

        offset = JCudaDriver.align(offset, Sizeof.INT);
        JCudaDriver.cuParamSetv(function, offset, pSize1, Sizeof.INT);
        offset += Sizeof.INT;

        offset = JCudaDriver.align(offset, Sizeof.INT);
        JCudaDriver.cuParamSetv(function, offset, pSize2, Sizeof.INT);
        offset += Sizeof.INT;

        JCudaDriver.cuParamSetSize(function, offset);

        // Call the function.
        JCudaDriver.cuLaunchGrid(function, Global.num_blocks, Global.num_blocks);
        JCudaDriver.cuCtxSynchronize();

        /*float hOutput[] = new float[npoints*nfeatures];
        cuMemcpyDtoH(Pointer.to(hOutput), dfeature_d, Sizeof.FLOAT * npoints*nfeatures);
    
        float hhoutput[] = new float[npoints*nfeatures];
        cuMemcpyDtoH(Pointer.to(hhoutput), dfeature_flipped_d, Sizeof.FLOAT * npoints*nfeatures);
    
        System.out.println("mplaaaaaa " + Arrays.toString(hOutput) );*/

        float feature_dd[] = new float[npoints * nfeatures];
        cuMemcpyDtoH(Pointer.to(feature_dd), dfeature_d, Sizeof.FLOAT * npoints * nfeatures);

        for (int i = 0; i < nfeatures; i++)
        {
            // for (int j=0;j<nfeatures;j++){
            System.out.println("feature_dddd[" + i + "]" + feature_dd[200]);
            // }
        }

    }

And this is the hole code on this function…If anyone can help I could really appreciate because I am really stucked!!

Thank you guys!!

Hello

If I understood this right, you just want to transpose a matrix (i.e. the 2D input array)? Note that the NVIDIA SDK already contains a sample called ‚transpose‘, which is an optimized kernel for such matrix transposes.

But of course, it may also make sense to try this without looking at the sample, to see how stuff works :slight_smile:

From the code that you posted it is not clear what is the error is exactly. A small, self-contained example containing all relevant information, namely the parameters of the function call and the expected results would be helpful. But I’ll try to have a closer look at this tomorrow, maybe I can figure out what the error is.

bye

Hello,

OK, I had a look at the source code and there have been some… errors. For example, the alignment that is computed with
offset = JCudaDriver.align(offset, npoints * nfeatures * Sizeof.FLOAT);
is horribly wrong, since it should be aligned to the size of the type, and this is just a POINTER in this case.

Additionally, you have set up
JCudaDriver.cuParamSetv(function, offset, pfeature_flipped_d, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, pfeature_d, Sizeof.POINTER);
(in this order) although the kernel expected the input as the first argument and the output as the second.

One important aspect is also that you did not declare your kernel as extern “C”, so the name got mangled and it probably did not even find the kernel function. You could have detected this very early by just calling
JCudaDriver.setExceptionsEnabled(true);
at the beginning of your program.

For short: There have been some caveats. I have created a very simple example of how your task might be achieved. But note that this is a naive implementation, which does NOT use shared memory, and does NOT obey the rules for coalesced memory access, so it will probably be slow. For an optimized version of a matrix transpose, you may want to have a look at the NVIDIA sample.

The source code

import java.io.*;

import jcuda.*;
import jcuda.driver.*;
import static jcuda.driver.JCudaDriver.*;


public class RowColumSwap
{
    public static void main(String args[]) throws IOException
    {
        String cubinFileName = prepareCubinFile("simpleTranspose.cu");

        JCudaDriver.setExceptionsEnabled(true);
        
        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        cuDeviceGet(dev, 0);
        cuCtxCreate(pctx, 0, dev);
 
        // Load the CUBIN file.
        CUmodule module = new CUmodule();
        cuModuleLoad(module, cubinFileName);
 
        // Obtain a function pointer to the function.
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "simpleTranspose");

        // Set up the number of threads and blocks
        int numThreadsX = 2;
        int numBlocksX = 3;
        int numThreadsY = 3;
        int numBlocksY = 4;
 
        // Compute the size of the matrix
        int inputColumns = numThreadsX * numBlocksX;
        int inputRows = numThreadsY * numBlocksY;
        int outputColumns = inputRows;
        int size = inputRows * inputColumns;
        
        // Create the input and output arrays
        float input[] = new float[size];
        float output[] = new float[size];
        for (int i = 0; i < size; i++)
        {
            input** = i;
        }

        System.out.println("Input");
        printMatrix(input, inputColumns);

        // Create the input array on the device
        CUdeviceptr dInput = new CUdeviceptr();
        cuMemAlloc(dInput, size * Sizeof.FLOAT);
        cuMemcpyHtoD(dInput, Pointer.to(input), size*Sizeof.FLOAT);
        
        // Create the output array on the device
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, size * Sizeof.FLOAT);
        cuMemsetD32(dOutput, 0, size);

        // Set up the arguments
        Pointer pInput = Pointer.to(dInput);
        Pointer pOutput = Pointer.to(dOutput);
        Pointer pSize1 = Pointer.to(new int[]{ inputRows });
        Pointer pSize2 = Pointer.to(new int[]{ inputColumns });
 
        int offset = 0;
        offset = align(offset, Sizeof.POINTER);
        cuParamSetv(function, offset, pInput, Sizeof.POINTER);
        offset += Sizeof.POINTER;
 
        offset = align(offset, Sizeof.POINTER);
        cuParamSetv(function, offset, pOutput, Sizeof.POINTER);
        offset += Sizeof.POINTER;
 
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, pSize1, Sizeof.INT);
        offset += Sizeof.INT;
 
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, pSize2, Sizeof.INT);
        offset += Sizeof.INT;
 
        // Set up the execution parameters and call the function
        cuFuncSetBlockShape(function, numThreadsX, numThreadsY, 1);
        cuParamSetSize(function, offset);
        cuLaunchGrid(function, numBlocksX, numBlocksY);
        cuCtxSynchronize();
 
        // Obtain the result
        cuMemcpyDtoH(Pointer.to(output), dOutput, size * Sizeof.FLOAT);

        System.out.println("Output");
        printMatrix(output, outputColumns);
    }

    
    //-- Only helper functions below...
    
    private static void printMatrix(float matrix[], int columns)
    {
        for (int i=0; i<matrix.length; i++)
        {
            System.out.printf("%4.1f ", matrix**);
            if ((i+1) % columns == 0)
            {
                System.out.println();
            }
        }
    }
    
    
    private static String prepareCubinFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        if (endIndex == -1)
        {
            endIndex = cuFileName.length()-1;
        }
        String cubinFileName = cuFileName.substring(0, endIndex+1)+"cubin";
        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }
        String modelString = "-m"+System.getProperty("sun.arch.data.model");        
        String command = 
            "nvcc " + modelString + " -arch sm_11 -cubin "+
            cuFile.getPath()+" -o "+cubinFileName;
        
        System.out.println("Executing
"+command);
        Process process = Runtime.getRuntime().exec(command);

        String errorMessage = new String(toByteArray(process.getErrorStream()));
        String outputMessage = new String(toByteArray(process.getInputStream()));
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for nvcc output", e);
        }

        System.out.println("nvcc process exitValue "+exitValue);
        if (exitValue != 0)
        {
            System.out.println("errorMessage:
"+errorMessage);
            System.out.println("outputMessage:
"+outputMessage);
            throw new IOException("Could not create .cubin file: "+errorMessage);
        }
        return cubinFileName;
    }
    private static byte[] toByteArray(InputStream inputStream) throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }
    
}

The kernel code, stored in ‘simpleTranspose.cu’:


extern "C"
__global__ void simpleTranspose(float *input,
                                float *output,
                                int inputRows,
                                int inputColumns)
{
    int inputColumn = threadIdx.x + blockDim.x*blockIdx.x;
    int inputRow = threadIdx.y + blockDim.y*blockIdx.y;
    int outputRow = inputColumn;
    int outputColumn = inputRow;
    int outputRows = inputColumns;
    int outputColumns = inputRows;

    if(inputColumn < inputColumns && inputRow < inputRows)
    {
        int inputIndex = inputColumn+inputRow*inputColumns;
        int outputIndex = outputColumn+outputRow*outputColumns;
        output[outputIndex] = input[inputIndex];
    }
    return;
}

Oh, concerning the intention to use 2D blocks and grids: If you set up 2D blocks and grids, you also have to consider this in your kernel, that is, you also have to use threadIdx**.y** etc. The example posted above shows how this may be done.

By the way: The KernelLauncher class is a small example that shows how the invocation of kernels may be simplified, to avoid errors like the ones that you had in your program concerning the alignment and the order of the kernel arguments.

An updated version of the KernelLauncher will be uploaded soon, but here’s how your task could be solved using the current version:

(Expecting the “simpleTranspose.cu” file to be present, as posted above)


import java.io.*;

import jcuda.*;
import jcuda.driver.*;
import static jcuda.driver.JCudaDriver.*;

public class RowColumnSwapSimple
{
    public static void main(String args[]) throws IOException
    {
        // Set up the number of threads and blocks
        int numThreadsX = 2;
        int numBlocksX = 3;
        int numThreadsY = 3;
        int numBlocksY = 4;

        // Compute the size of the matrix
        int inputColumns = numThreadsX * numBlocksX;
        int inputRows = numThreadsY * numBlocksY;
        int outputColumns = inputRows;
        int size = inputRows * inputColumns;
        
        // Create the input and output arrays
        float input[] = new float[size];
        float output[] = new float[size];
        for (int i = 0; i < size; i++)
        {
            input** = i;
        }

        System.out.println("Input");
        printMatrix(input, inputColumns);

        // Create the input array on the device
        CUdeviceptr dInput = new CUdeviceptr();
        cuMemAlloc(dInput, size * Sizeof.FLOAT);
        cuMemcpyHtoD(dInput, Pointer.to(input), size*Sizeof.FLOAT);
        
        // Create the output array on the device
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, size * Sizeof.FLOAT);
        cuMemsetD32(dOutput, 0, size);

        KernelLauncher kernelLauncher = 
            KernelLauncher.create("simpleTranspose.cu", "simpleTranspose", true);
        kernelLauncher.setBlockSize(numThreadsX, numThreadsY, 1);
        kernelLauncher.setGridSize(numBlocksX, numBlocksY);
        kernelLauncher.call(dInput, dOutput, inputRows, inputColumns);
 
        // Obtain the result
        cuMemcpyDtoH(Pointer.to(output), dOutput, size * Sizeof.FLOAT);

        System.out.println("Output");
        printMatrix(output, outputColumns);
    }

    private static void printMatrix(float matrix[], int columns)
    {
        for (int i=0; i<matrix.length; i++)
        {
            System.out.printf("%4.1f ", matrix**);
            if ((i+1) % columns == 0)
            {
                System.out.println();
            }
        }
    }
}

A little bit shorter and simpler, I think.