OK, it’s just a quick example, but maybe it helps. It only performs some “dummy” operation on a list of words (adding 1 to each character), but shows the two aforementioned ways of passing a list of words to a kernel.
Note that this is really only a quick example. If you intend to write a more sophisticated application, you should carefully think about the best structures for this task (including the question about the appropriate location for the conversion from Java Strings into CUDA pointers and vice versa).
Depending on what you intend to do, there might be further options. For example, there might be cases where it is advantageous to pretend that all words have the same length, and thus have the possibility to use a 1D array (as in the “processSinglePointer” function), but only interpret it as a 2D array (with the respective width), and may easily access each word as one “row” of the implicit 2D array.
All this does also not yet take into account aspects like memory coalescing or shared memory, but this heavily depends on the actual algorithm that is to be applied.
package tests;
import static jcuda.driver.JCudaDriver.*;
import java.io.*;
import java.util.*;
import jcuda.*;
import jcuda.driver.*;
/**
* A sample demonstrating two different ways of passing strings
* to a kernel:
* - As an array of pointers, where each pointer points to
* a single word
* - As a single data block, where all words are concatenated
* and the word boundaries are given explicitly
*/
public class JCudaStringsSample
{
/**
* The kernel function using multiple pointers
* (i.e. an array of pointers)
*/
private static CUfunction multiplePointersKernel = null;
/**
* The kernel function using a single pointer to
* a memory block containing the concatenated words
*/
private static CUfunction singlePointerKernel = null;
/**
* Entry point of this sample
*
* @param args Not used
* @throws IOException If an IO error occurs
*/
public static void main(String args[]) throws IOException
{
// Perform the CUDA initialization
init();
// Fill a list with dummy words
List<String> wordList = new ArrayList<String>();
int numWords = 100;
for (int i=0; i<numWords; i++)
{
wordList.add("string"+i);
}
// Process the word list in several ways
List<String> result0 = processMultiplePointers(wordList);
List<String> result1 = processSinglePointer(wordList);
List<String> resultRef = processHost(wordList);
// Verify the result
boolean passed = true;
for (int i=0; i<wordList.size(); i++)
{
String word = wordList.get(i);
String word0 = result0.get(i);
String word1 = result1.get(i);
String wordRef = resultRef.get(i);
passed &= word0.equals(wordRef);
passed &= word1.equals(wordRef);
if (i < 10)
{
System.out.printf(
"Input: %-10s Reference: %-10s Multi: %-10s Single: %-10s
",
word, wordRef, word0, word1);
}
}
System.out.println("Test "+(passed ? "PASSED" : "FAILED"));
}
/**
* Host implementation of what is done in the kernel: It will
* only add 1 to the ASCII code of each character, and return
* the resulting strings
*
* @param wordList The word list
* @return The new word list
*/
private static List<String> processHost(List<String> wordList)
{
List<String> result = new ArrayList<String>();
for (String word : wordList)
{
byte hostWordData[] = word.getBytes();
for (int i=0; i<hostWordData.length; i++)
{
hostWordData** += 1;
}
String resultWord = new String(hostWordData);
result.add(resultWord);
}
return result;
}
/**
* Process the word list by creating one pointer for each word,
* and passing these to the kernel as an array of pointers.
*
* @param wordList The word list
* @return The new word list
*/
private static List<String> processMultiplePointers(List<String> wordList)
{
int numWords = wordList.size();
// Allocate and fill arrays on the device:
// - One one for each input word, which is filled
// with the byte data for the respective word
// - One for each output word
CUdeviceptr dWordInputPointers[] = new CUdeviceptr[numWords];
CUdeviceptr dWordOutputPointers[] = new CUdeviceptr[numWords];
int wordLengths[] = new int[numWords];
for(int i = 0; i < numWords; i++)
{
String word = wordList.get(i);
byte hostWordData[] = word.getBytes();
wordLengths** = hostWordData.length;
dWordInputPointers** = new CUdeviceptr();
cuMemAlloc(dWordInputPointers**, wordLengths** * Sizeof.BYTE);
cuMemcpyHtoD(dWordInputPointers**,
Pointer.to(hostWordData), wordLengths** * Sizeof.BYTE);
dWordOutputPointers** = new CUdeviceptr();
cuMemAlloc(dWordOutputPointers**, wordLengths** * Sizeof.BYTE);
}
// Allocate device memory for the array of pointers
// that point to the individual input words, and copy
// the input word pointers from the host to the device.
CUdeviceptr dWordInputPointersArray = new CUdeviceptr();
cuMemAlloc(dWordInputPointersArray, numWords * Sizeof.POINTER);
cuMemcpyHtoD(dWordInputPointersArray,
Pointer.to(dWordInputPointers),
numWords * Sizeof.POINTER);
// Allocate device memory for the array of pointers
// that point to the individual output words, and copy
// the output word pointers from the host to the device.
CUdeviceptr dWordOutputPointersArray = new CUdeviceptr();
cuMemAlloc(dWordOutputPointersArray, numWords * Sizeof.POINTER);
cuMemcpyHtoD(dWordOutputPointersArray,
Pointer.to(dWordOutputPointers),
numWords * Sizeof.POINTER);
// Allocate and fill the device array for the word lengths
CUdeviceptr dWordLengths = new CUdeviceptr();
cuMemAlloc(dWordLengths, numWords * Sizeof.INT);
cuMemcpyHtoD(dWordLengths, Pointer.to(wordLengths),
numWords * Sizeof.INT);
// Set up the kernel parameters
Pointer kernelParams = Pointer.to(
Pointer.to(new int[]{numWords}),
Pointer.to(dWordInputPointersArray),
Pointer.to(dWordLengths),
Pointer.to(dWordOutputPointersArray)
);
// Call the kernel function.
int blockDimX = 256;
int gridDimX = (int)Math.ceil((double)numWords/blockDimX);
cuLaunchKernel(multiplePointersKernel,
gridDimX, 1, 1, // Grid dimension
blockDimX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParams, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Copy the contents of each output pointer of the
// device back into a host array, create a string
// from each array and store it in the result list
List<String> result = new ArrayList<String>();
for(int i = 0; i < numWords; i++)
{
byte hostWordData[] = new byte[wordLengths**];
cuMemcpyDtoH(Pointer.to(hostWordData), dWordOutputPointers**,
wordLengths** * Sizeof.BYTE);
String word = new String(hostWordData);
result.add(word);
}
// Clean up.
for(int i = 0; i < numWords; i++)
{
cuMemFree(dWordInputPointers**);
cuMemFree(dWordOutputPointers**);
}
cuMemFree(dWordInputPointersArray);
cuMemFree(dWordOutputPointersArray);
cuMemFree(dWordLengths);
return result;
}
/**
* Process the word list by creating one large memory block
* that contains all words, and pass this to the kernel
* together with additional information about the word
* boundaries
*
* @param wordList The word list
* @return The new word list
*/
private static List<String> processSinglePointer(List<String> wordList)
{
int numWords = wordList.size();
// Compute the word lengths and the index
// that the end of each word will have
// in a large, combined array
int wordLengths[] = new int[numWords];
int wordEndIndices[] = new int[numWords];
int offset = 0;
for(int i = 0; i < numWords; i++)
{
String word = wordList.get(i);
wordLengths** = word.length();
offset += word.length();
wordEndIndices** = offset;
}
int totalLength = offset;
// Allocate and fill the device array for the word lengths
CUdeviceptr dWordLengths = new CUdeviceptr();
cuMemAlloc(dWordLengths, numWords * Sizeof.INT);
cuMemcpyHtoD(dWordLengths, Pointer.to(wordLengths),
numWords * Sizeof.INT);
// Allocate and fill the device array for the word end indices
CUdeviceptr dWordEndIndices = new CUdeviceptr();
cuMemAlloc(dWordEndIndices, numWords * Sizeof.INT);
cuMemcpyHtoD(dWordEndIndices, Pointer.to(wordEndIndices),
numWords * Sizeof.INT);
// Allocate and fill the device memory for the actual
// input- and output word data
CUdeviceptr dInputWords = new CUdeviceptr();
cuMemAlloc(dInputWords, totalLength * Sizeof.BYTE);
offset = 0;
for(int i = 0; i < numWords; i++)
{
String word = wordList.get(i);
byte hostWordData[] = word.getBytes();
CUdeviceptr d = dInputWords.withByteOffset(offset * Sizeof.BYTE);
cuMemcpyHtoD(d, Pointer.to(hostWordData),
wordLengths** * Sizeof.BYTE);
offset += wordLengths**;
}
CUdeviceptr dOutputWords = new CUdeviceptr();
cuMemAlloc(dOutputWords, totalLength * Sizeof.BYTE);
// Set up the kernel parameters
Pointer kernelParams = Pointer.to(
Pointer.to(new int[]{numWords}),
Pointer.to(dInputWords),
Pointer.to(dWordEndIndices),
Pointer.to(dWordLengths),
Pointer.to(dOutputWords)
);
// Call the kernel function.
int blockDimX = 256;
int gridDimX = (int)Math.ceil((double)numWords/blockDimX);
cuLaunchKernel(singlePointerKernel,
gridDimX, 1, 1, // Grid dimension
blockDimX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParams, null // Kernel- and extra parameters
);
cuCtxSynchronize();
// Copy the each word from the output device pointer
// device back into a host array, create a string
// from each array, and put it into the result list
List<String> result = new ArrayList<String>();
offset = 0;
for(int i = 0; i < numWords; i++)
{
byte wordHostData[] = new byte[wordLengths**];
CUdeviceptr d = dOutputWords.withByteOffset(offset * Sizeof.BYTE);
cuMemcpyDtoH(Pointer.to(wordHostData), d,
wordLengths** * Sizeof.BYTE);
String word = new String(wordHostData);
result.add(word);
offset += wordLengths**;
}
// Clean up.
cuMemFree(dInputWords);
cuMemFree(dOutputWords);
cuMemFree(dWordLengths);
cuMemFree(dWordEndIndices);
return result;
}
private static void init() throws IOException
{
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Create the PTX file by calling the NVCC
String ptxFileName = preparePtxFile("JCudaStringsSampleKernel.cu");
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Load the ptx file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain function pointers to the kernel functions.
multiplePointersKernel = new CUfunction();
cuModuleGetFunction(multiplePointersKernel, module, "multiplePointers");
singlePointerKernel = new CUfunction();
cuModuleGetFunction(singlePointerKernel, module, "singlePointer");
}
private static String preparePtxFile(String cuFileName) throws IOException
{
System.out.println("Creating PTX file");
File cuFile = new File(cuFileName);
if (!cuFile.exists())
{
throw new IOException("Input file not found: "+cuFileName);
}
int endIndex = cuFileName.lastIndexOf('.');
if (endIndex == -1)
{
endIndex = cuFileName.length()-1;
}
String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
File ptxFile = new File(ptxFileName);
if (ptxFile.exists())
{
if (ptxFile.lastModified() > cuFile.lastModified())
{
return ptxFileName;
}
else
{
System.out.println("Updating PTX file");
}
}
String modelString = "-m"+System.getProperty("sun.arch.data.model");
String command =
"nvcc " + modelString + " -ptx "+
cuFile.getPath()+" -o "+ptxFileName;
System.out.println("Executing
"+command);
Process process = Runtime.getRuntime().exec(command);
String errorMessage =
new String(toByteArray(process.getErrorStream()));
String outputMessage =
new String(toByteArray(process.getInputStream()));
int exitValue = 0;
try
{
exitValue = process.waitFor();
}
catch (InterruptedException e)
{
Thread.currentThread().interrupt();
throw new IOException(
"Interrupted while waiting for nvcc output", e);
}
if (exitValue != 0)
{
System.out.println("nvcc process exitValue "+exitValue);
System.out.println("errorMessage:
"+errorMessage);
System.out.println("outputMessage:
"+outputMessage);
throw new IOException(
"Could not create .ptx file: "+errorMessage);
}
System.out.println("Finished creating PTX file");
return ptxFileName;
}
private static byte[] toByteArray(InputStream inputStream)
throws IOException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte buffer[] = new byte[8192];
while (true)
{
int read = inputStream.read(buffer);
if (read == -1)
{
break;
}
baos.write(buffer, 0, read);
}
return baos.toByteArray();
}
}
The kernels, stored as a file called “JCudaStringsSampleKernel.cu”:
extern "C"
__global__ void multiplePointers(
int numWords,
char** inputWords,
int* wordLengths,
char** outputWords)
{
const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numWords)
{
char *inputWord = inputWords[tid];
char *outputWord = outputWords[tid];
int wordLength = wordLengths[tid];
for (int i=0; i<wordLength; i++)
{
// Dummy: Just copy input to output and add 1 to
// the ASCII code of each character
outputWord** = inputWord** + 1;
}
}
}
extern "C"
__global__ void singlePointer(
int numWords,
char* inputWords,
int* wordEndIndices,
int* wordLengths,
char* outputWords)
{
const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < numWords)
{
int offset = 0;
if (tid > 0)
{
offset = wordEndIndices[tid-1];
}
char *inputWord = inputWords + offset;
char *outputWord = outputWords + offset;
int wordLength = wordLengths[tid];
for (int i=0; i<wordLength; i++)
{
// Dummy: Just copy input to output and add 1 to
// the ASCII code of each character
outputWord** = inputWord** + 1;
}
}
}