Char ** response and JCUDA.Pointer

I’ve recently started learning CUDA/jCUDA and I’m using the JDriverCubinSample.java as a template to call the following function in my .cubin file:

void tokenizeTest(char* inputString, char** returnval)

The java code executes without failure however I’m having difficulty extracting the response from returnval, does anyone know where i could find a sample where the response from a call to the device is char** ?

I’ve been hacking away at something along the lines of the following but can’t quite get it … I do realize that the code below may need to be changed to retrieve a Pointer to Pointers, which would then need to be iterated?

char hostOutput[] = new char[x];
JCudaDriver.cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput,
x * Sizeof.CHAR);

Any help/guidance would be much appreciated …

Nick.

Hello

First of all, you should be aware that a Java ‘char’ has 16 bits (2 bytes), and a C ‘char’ has only 8 bits (1 byte). So when you are using ‘char’ in a kernel, you always have to use ‘byte’ on Java side.

Concerning the actual problem: It’s basically just the opposite direction of what is shown in the original sample. The critical part (which should also be pointed out by the sample) is that the float** (or char**) is a pointer to a pointer, which may be considered as an “array of pointers”. Each of these pointers points to device memory. But the array of pointers also has to be stored in device memory (although it can only be allocated on the host).

I have modified the example a little bit. It now calls a kernel that receives a String (as a byte[] array), and each thread just copies the contents of this input array into one row of a 2D output array.

import static jcuda.driver.JCudaDriver.*;

import java.io.*;

import jcuda.*;
import jcuda.driver.*;

public class JCudaCharPointersTest
{
    public static void main(String args[]) throws IOException
    {
        CUfunction function = defaultInit();
        
        // Create the host input memory: The bytes of a String
        String inputString = "Hello pointers";
        byte hostInput[] = inputString.getBytes();
        int size = hostInput.length;

        // Allocate and fill the device input memory
        CUdeviceptr deviceInput = new CUdeviceptr();
        cuMemAlloc(deviceInput, size * Sizeof.BYTE);
        cuMemcpyHtoD(deviceInput, Pointer.to(hostInput),
            size * Sizeof.BYTE);

        // Allocate the host output memory: 'numThreads' arrays
        // of bytes, each receiving a copy of the input
        int numThreads = 8;
        byte hostOutput[][] = new byte[numThreads][size];

        // Allocate arrays on the device, one for each row. The pointers
        // to these array are stored in host memory.
        CUdeviceptr hostDevicePointers[] = new CUdeviceptr[numThreads];
        for(int i = 0; i < numThreads; i++)
        {
            hostDevicePointers** = new CUdeviceptr();
            cuMemAlloc(hostDevicePointers**, size * Sizeof.BYTE);
        }

        // Allocate device memory for the array pointers, and copy
        // the array pointers from the host to the device.
        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, numThreads * Sizeof.POINTER);
        cuMemcpyHtoD(deviceOutput, Pointer.to(hostDevicePointers),
            numThreads * Sizeof.POINTER);

        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        Pointer kernelParams = Pointer.to(
            Pointer.to(new int[]{size}), 
            Pointer.to(deviceInput), 
            Pointer.to(deviceOutput)
        );
        
        // Call the kernel function.
        cuLaunchKernel(function, 
            1, 1, 1,           // Grid dimension 
            numThreads, 1, 1,  // Block dimension
            0, null,           // Shared memory size and stream 
            kernelParams, null // Kernel- and extra parameters
        ); 
        cuCtxSynchronize();

        // Copy each row back from the device to the host
        for(int i = 0; i < numThreads; i++)
        {
            cuMemcpyDtoH(Pointer.to(hostOutput**), hostDevicePointers**,
                size * Sizeof.BYTE);
        }
        
        // Print the results
        boolean passed = true;
        for(int i = 0; i < numThreads; i++)
        {
            String s = new String(hostOutput**);
            if (!s.equals(inputString))
            {
                passed = false;
            }
            System.out.println(s);
        }
        System.out.println("Test "+(passed?"PASSED":"FAILED"));

        // Clean up.
        for(int i = 0; i < numThreads; i++)
        {
            cuMemFree(hostDevicePointers**);
        }
        cuMemFree(deviceInput);
        cuMemFree(deviceOutput);
    }
    
    private static CUfunction defaultInit() throws IOException
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
        
        // Create the PTX file by calling the NVCC
        String ptxFileName = preparePtxFile("JCudaCharPointersTestKernel.cu");
        
        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        // Load the ptx file.
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFileName);

        // Obtain a function pointer to the "sampleKernel" function.
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "sampleKernel");

        return function;
    }
    
    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        if (endIndex == -1)
        {
            endIndex = cuFileName.length()-1;
        }
        String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
        File ptxFile = new File(ptxFileName);
        if (ptxFile.exists())
        {
            return ptxFileName;
        }
        
        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }
        String modelString = "-m"+System.getProperty("sun.arch.data.model");        
        String command = 
            "nvcc " + modelString + " -ptx "+
            cuFile.getPath()+" -o "+ptxFileName;
        
        System.out.println("Executing
"+command);
        Process process = Runtime.getRuntime().exec(command);

        String errorMessage = 
            new String(toByteArray(process.getErrorStream()));
        String outputMessage = 
            new String(toByteArray(process.getInputStream()));
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:
"+errorMessage);
            System.out.println("outputMessage:
"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }
        
        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    private static byte[] toByteArray(InputStream inputStream) 
        throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }
    
    
}

Kernel:

// Each thread copies the 'size' bytes of globalInputData into the output
// row selected by its threadIdx.x. globalOutputData is an array of row
// pointers; only threadIdx.x is used as the row index, so the number of
// rows must be at least the block's x-dimension.
extern "C"
__global__ void sampleKernel(int size, char* globalInputData, char** globalOutputData)
{
  const unsigned int tidX = threadIdx.x;
  for (int i=0; i<size; i++)
  {
    globalOutputData[tidX][i] = globalInputData[i];
  }
}

Thanks Marco ! I’ll let you know once i have it working.

is cuLaunchKernel specific to 4.0 ? I’m getting method not found and unable to find it, I’m running 3.2

The sample code I’ve been using as template is from JCudaDriverCubinSample which has no reference to cuLaunchKernel.

Nick.

Oh, yes, sorry, this was the updated version for CUDA 4.0.

You may use the launch methods from original JCudaDriverCubinSample - it should mainly be a copy&paste from the sample, but note that the order of the arguments was changed (to size, input, output). If you encounter problems, I’ll post a modified version of the test example that only uses CUDA 3.2 functions.

Almost got it …

Have it executing, however in my .cu file I’m taking a string and then tokenizing it and returning each token (keyword) as an element of the array in the char** response. I’m running the example with a single thread (numThreads=1) so the intent is for the response to be an array of words, however it seems as if the response gets interpreted as a single string, instead of an array of elements where each word in the string is an element of the array.

So in the output, instead of

    for(int i = 0; i < numThreads; i++)
    {
        String s = new String(hostOutput[i]);
        System.out.println("s:" + s);  //this prints out the entire sentence passed in, but i want each keyword as an array element of char** output
    }

    I'm looking for something along the lines of the following:

    for(int i = 0; i < numThreads; i++)
    {
        String[] s = new String(hostOutput[i]);
         for (String token : s)
             System.out.println("token:" + token);
    }

Hopefully this makes sense to you, let me know what you think, and thanks for all your help thus far !

Nick.

Hello

At the moment, not really… What do you intend to achieve when using a single thread?

Admittedly, I did not completely understand the example:


        for(int i = 0; i < numThreads; i++)
        {
            String[] s = new String(hostOutput[i]);
             for (String token : s)
                 System.out.println("token:" + token);
        }

Each element of the hostOutput array is a byte array, containing the bytes of a single String. If you want it to be an array of Strings, it will either have to be a char*** (three indirections) or each String will have to be separated, for example, with ‘\0’ (“null”) characters. Both would not be so desirable: Passing a 3D array to a kernel is a hassle, really (as you’ve seen, 2D is already rather complicated). And marking the token boundaries with ‘\0’ characters may also be complicated - not to mention the difficulties of choosing appropriate sizes for the arrays…

So could you explain what exactly you want to achieve, maybe by giving an example of the input and output of the program, and point out where exactly you want to exploit the data-parallel capabilities of CUDA there?

Please do NOT assume that arbitrary tasks automa’g’ically become faster when they are done with CUDA - CUDA is only efficient for really data-parallel problems, and there’s a lot of expertise and experience required to achieve a good speedup.

bye
Marco

[QUOTE=Marco13]Oh, yes, sorry, this was the updated version for CUDA 4.0.

I’ll post a modified version of the test example that only uses CUDA 3.2 functions.[/QUOTE]

Hi, I’m still waiting for the sample for CUDA 3.2!

Or, for a quick answer: Which method can I use instead of cuLaunchKernel()?

It said: If you encounter any problems. You could have asked specifically for that, or reminded me earlier, but anyway, here you go:

import static jcuda.driver.JCudaDriver.*;

import java.io.*;

import jcuda.*;
import jcuda.driver.*;

public class JCudaCharPointersTest
{
    public static void main(String args[]) throws IOException
    {
        CUfunction function = defaultInit();
       
        // Create the host input memory: The bytes of a String
        String inputString = "Hello pointers";
        byte hostInput[] = inputString.getBytes();
        int size = hostInput.length;

        // Allocate and fill the device input memory
        CUdeviceptr deviceInput = new CUdeviceptr();
        cuMemAlloc(deviceInput, size * Sizeof.BYTE);
        cuMemcpyHtoD(deviceInput, Pointer.to(hostInput),
            size * Sizeof.BYTE);

        // Allocate the host output memory: 'numThreads' arrays
        // of bytes, each receiving a copy of the input
        int numThreads = 8;
        byte hostOutput[][] = new byte[numThreads][size];

        // Allocate arrays on the device, one for each row. The pointers
        // to these array are stored in host memory.
        CUdeviceptr hostDevicePointers[] = new CUdeviceptr[numThreads];
        for(int i = 0; i < numThreads; i++)
        {
            hostDevicePointers** = new CUdeviceptr();
            cuMemAlloc(hostDevicePointers**, size * Sizeof.BYTE);
        }

        // Allocate device memory for the array pointers, and copy
        // the array pointers from the host to the device.
        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, numThreads * Sizeof.POINTER);
        cuMemcpyHtoD(deviceOutput, Pointer.to(hostDevicePointers),
            numThreads * Sizeof.POINTER);

        // Set up the execution parameters for the kernel
        cuFuncSetBlockShape(function, numThreads, 1, 1);
        Pointer sizePointer = Pointer.to(new int[]{size});
        Pointer deviceInputPointer = Pointer.to(deviceInput);
        Pointer deviceOutputPointer = Pointer.to(deviceOutput);
        int offset = 0;

        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, sizePointer, Sizeof.INT);
        offset += Sizeof.INT;

        offset = align(offset, Sizeof.POINTER);
        cuParamSetv(function, offset, deviceInputPointer, Sizeof.POINTER);
        offset += Sizeof.POINTER;

        offset = align(offset, Sizeof.POINTER);
        cuParamSetv(function, offset, deviceOutputPointer, Sizeof.POINTER);
        offset += Sizeof.POINTER;
        
        cuParamSetSize(function, offset);
        
        // Call the kernel function.
        cuLaunchGrid(function, 1, 1);
        cuCtxSynchronize();

        // Copy each row back from the device to the host
        for(int i = 0; i < numThreads; i++)
        {
            cuMemcpyDtoH(Pointer.to(hostOutput**), hostDevicePointers**,
                size * Sizeof.BYTE);
        }
       
        // Print the results
        boolean passed = true;
        for(int i = 0; i < numThreads; i++)
        {
            String s = new String(hostOutput**);
            if (!s.equals(inputString))
            {
                passed = false;
            }
            System.out.println(s);
        }
        System.out.println("Test "+(passed?"PASSED":"FAILED"));

        // Clean up.
        for(int i = 0; i < numThreads; i++)
        {
            cuMemFree(hostDevicePointers**);
        }
        cuMemFree(deviceInput);
        cuMemFree(deviceOutput);
    }

    private static CUfunction defaultInit() throws IOException
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
       
        // Create the PTX file by calling the NVCC
        String ptxFileName = preparePtxFile("JCudaCharPointersTestKernel.cu");
       
        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);

        // Load the ptx file.
        CUmodule module = new CUmodule();
        cuModuleLoad(module, ptxFileName);

        // Obtain a function pointer to the "sampleKernel" function.
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "sampleKernel");

        return function;
    }

    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        if (endIndex == -1)
        {
            endIndex = cuFileName.length()-1;
        }
        String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
        File ptxFile = new File(ptxFileName);
        if (ptxFile.exists())
        {
            return ptxFileName;
        }
       
        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }
        String modelString = "-m"+System.getProperty("sun.arch.data.model");        
        String command =
            "nvcc " + modelString + " -ptx "+
            cuFile.getPath()+" -o "+ptxFileName;
       
        System.out.println("Executing
"+command);
        Process process = Runtime.getRuntime().exec(command);

        String errorMessage =
            new String(toByteArray(process.getErrorStream()));
        String outputMessage =
            new String(toByteArray(process.getInputStream()));
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:
"+errorMessage);
            System.out.println("outputMessage:
"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }
       
        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    private static byte[] toByteArray(InputStream inputStream)
        throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }


}