Hello
First of all, you should be aware that a Java ‘char’ has 16 bits (2 bytes), and a C ‘char’ has only 8 bits (1 byte). So when you are using ‘char’ in a kernel, you always have to use ‘byte’ on the Java side.
Concerning the actual problem: It’s basically just the opposite direction of what is shown in the original sample. The critical part (which should also be pointed out by the sample) is that the float** (or char**) is a pointer to a pointer, which may be considered as an “array of pointers”. Each of these pointers points to device memory. But the array of pointers itself also has to be stored in device memory (although the allocation can only be issued from host code).
I have modified the example a little bit. It now calls a kernel that receives a String (as a byte[] array), and each thread just copies the contents of this input array into one row of a 2D output array.
import static jcuda.driver.JCudaDriver.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import jcuda.*;
import jcuda.driver.*;
public class JCudaCharPointersTest
{
public static void main(String args[]) throws IOException
{
CUfunction function = defaultInit();
// Create the host input memory: The bytes of a String
String inputString = "Hello pointers";
byte hostInput[] = inputString.getBytes();
int size = hostInput.length;
// Allocate and fill the device input memory
CUdeviceptr deviceInput = new CUdeviceptr();
cuMemAlloc(deviceInput, size * Sizeof.BYTE);
cuMemcpyHtoD(deviceInput, Pointer.to(hostInput),
size * Sizeof.BYTE);
// Allocate the host output memory: 'numThreads' arrays
// of bytes, each receiving a copy of the input
int numThreads = 8;
byte hostOutput[][] = new byte[numThreads][size];
// Allocate arrays on the device, one for each row. The pointers
// to these array are stored in host memory.
CUdeviceptr hostDevicePointers[] = new CUdeviceptr[numThreads];
for(int i = 0; i < numThreads; i++)
{
hostDevicePointers** = new CUdeviceptr();
cuMemAlloc(hostDevicePointers**, size * Sizeof.BYTE);
}
// Allocate device memory for the array pointers, and copy
// the array pointers from the host to the device.
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, numThreads * Sizeof.POINTER);
cuMemcpyHtoD(deviceOutput, Pointer.to(hostDevicePointers),
numThreads * Sizeof.POINTER);
// Set up the kernel parameters: A pointer to an array
// of pointers which point to the actual values.
Pointer kernelParams = Pointer.to(
Pointer.to(new int[]{size}),
Pointer.to(deviceInput),
Pointer.to(deviceOutput)
);
// Call the kernel function.
cuLaunchKernel(function,
1, 1, 1, // Grid dimension
numThreads, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParams, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Copy each row back from the device to the host
for(int i = 0; i < numThreads; i++)
{
cuMemcpyDtoH(Pointer.to(hostOutput**), hostDevicePointers**,
size * Sizeof.BYTE);
}
// Print the results
boolean passed = true;
for(int i = 0; i < numThreads; i++)
{
String s = new String(hostOutput**);
if (!s.equals(inputString))
{
passed = false;
}
System.out.println(s);
}
System.out.println("Test "+(passed?"PASSED":"FAILED"));
// Clean up.
for(int i = 0; i < numThreads; i++)
{
cuMemFree(hostDevicePointers**);
}
cuMemFree(deviceInput);
cuMemFree(deviceOutput);
}
/**
 * Performs the driver setup that the sample needs before it can launch
 * anything: makes sure the PTX file for the kernel exists, initializes the
 * driver API, creates a context on device 0, loads the PTX module and looks
 * up the "sampleKernel" function in it.
 *
 * @return The handle of the "sampleKernel" function
 * @throws IOException If the PTX file could not be prepared
 */
private static CUfunction defaultInit() throws IOException
{
    // Let JCuda report failures as exceptions, so that the calls below
    // do not have to check any return codes.
    JCudaDriver.setExceptionsEnabled(true);

    // Make sure the compiled PTX for the kernel source is available.
    final String ptxPath = preparePtxFile("JCudaCharPointersTestKernel.cu");

    // Bring up the driver API and open a context on the device with index 0.
    cuInit(0);
    final CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    final CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // Load the PTX into a module.
    final CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxPath);

    // Look up the kernel entry point by its name.
    final CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, ptxModule, "sampleKernel");
    return kernel;
}
private static String preparePtxFile(String cuFileName) throws IOException
{
int endIndex = cuFileName.lastIndexOf('.');
if (endIndex == -1)
{
endIndex = cuFileName.length()-1;
}
String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
File ptxFile = new File(ptxFileName);
if (ptxFile.exists())
{
return ptxFileName;
}
File cuFile = new File(cuFileName);
if (!cuFile.exists())
{
throw new IOException("Input file not found: "+cuFileName);
}
String modelString = "-m"+System.getProperty("sun.arch.data.model");
String command =
"nvcc " + modelString + " -ptx "+
cuFile.getPath()+" -o "+ptxFileName;
System.out.println("Executing
"+command);
Process process = Runtime.getRuntime().exec(command);
String errorMessage =
new String(toByteArray(process.getErrorStream()));
String outputMessage =
new String(toByteArray(process.getInputStream()));
int exitValue = 0;
try
{
exitValue = process.waitFor();
}
catch (InterruptedException e)
{
Thread.currentThread().interrupt();
throw new IOException(
"Interrupted while waiting for nvcc output", e);
}
if (exitValue != 0)
{
System.out.println("nvcc process exitValue "+exitValue);
System.out.println("errorMessage:
"+errorMessage);
System.out.println("outputMessage:
"+outputMessage);
throw new IOException(
"Could not create .ptx file: "+errorMessage);
}
System.out.println("Finished creating PTX file");
return ptxFileName;
}
/**
 * Drains the given stream: reads from it until the end of the stream is
 * reported and returns all bytes that were read, in order. The stream is
 * not closed by this method.
 *
 * @param inputStream The stream to read from
 * @return The bytes that were read from the stream
 * @throws IOException If reading from the stream fails
 */
private static byte[] toByteArray(InputStream inputStream)
    throws IOException
{
    final byte[] chunk = new byte[8192];
    final ByteArrayOutputStream collected = new ByteArrayOutputStream();
    // read() reports the end of the stream with -1
    for (int count = inputStream.read(chunk);
         count != -1;
         count = inputStream.read(chunk))
    {
        collected.write(chunk, 0, count);
    }
    return collected.toByteArray();
}
}
Kernel:
// Copies the first 'size' bytes of globalInputData into one row of
// globalOutputData. The row is selected by the x-index of the thread within
// its block, so every thread fills the row globalOutputData[threadIdx.x].
//
// size             : number of bytes to copy into each row
// globalInputData  : the bytes to copy
// globalOutputData : array of row pointers, one row per thread
//
// NOTE: there is no bounds check on the row index and the block index is
// not taken into account, so the caller has to launch a single block with
// no more threads than there are rows.
extern "C"
__global__ void sampleKernel(int size, char* globalInputData, char** globalOutputData)
{
    const unsigned int tidX = threadIdx.x;
    char* outputRow = globalOutputData[tidX];
    for (int i = 0; i < size; i++)
    {
        outputRow[i] = globalInputData[i];
    }
}