Hello,
OK, I had a look at the source code and there have been some… errors. For example, the alignment that is computed with
offset = JCudaDriver.align(offset, npoints * nfeatures * Sizeof.FLOAT);
is horribly wrong, since it should be aligned to the size of the type, and this is just a POINTER in this case.
Additionally, you have set up
JCudaDriver.cuParamSetv(function, offset, pfeature_flipped_d, Sizeof.POINTER);
JCudaDriver.cuParamSetv(function, offset, pfeature_d, Sizeof.POINTER);
(in this order) although the kernel expected the input as the first argument and the output as the second.
One important aspect is also that you did not declare your kernel as extern “C”, so the name got mangled and it probably did not even find the kernel function. You could have detected this very early by just calling
JCudaDriver.setExceptionsEnabled(true);
at the beginning of your program.
For short: There have been some caveats. I have created a very simple example of how your task might be achieved. But note that this is a naive implementation, which does NOT use shared memory, and does NOT obey the rules for coalesced memory access, so it will probably be slow. For an optimized version of a matrix transpose, you may want to have a look at the NVIDIA sample.
The source code
import java.io.*;
import jcuda.*;
import jcuda.driver.*;
import static jcuda.driver.JCudaDriver.*;
public class RowColumSwap
{
public static void main(String args[]) throws IOException
{
String cubinFileName = prepareCubinFile("simpleTranspose.cu");
JCudaDriver.setExceptionsEnabled(true);
// Initialize the driver and create a context for the first device.
cuInit(0);
CUcontext pctx = new CUcontext();
CUdevice dev = new CUdevice();
cuDeviceGet(dev, 0);
cuCtxCreate(pctx, 0, dev);
// Load the CUBIN file.
CUmodule module = new CUmodule();
cuModuleLoad(module, cubinFileName);
// Obtain a function pointer to the function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "simpleTranspose");
// Set up the number of threads and blocks
int numThreadsX = 2;
int numBlocksX = 3;
int numThreadsY = 3;
int numBlocksY = 4;
// Compute the size of the matrix
int inputColumns = numThreadsX * numBlocksX;
int inputRows = numThreadsY * numBlocksY;
int outputColumns = inputRows;
int size = inputRows * inputColumns;
// Create the input and output arrays
float input[] = new float[size];
float output[] = new float[size];
for (int i = 0; i < size; i++)
{
input** = i;
}
System.out.println("Input");
printMatrix(input, inputColumns);
// Create the input array on the device
CUdeviceptr dInput = new CUdeviceptr();
cuMemAlloc(dInput, size * Sizeof.FLOAT);
cuMemcpyHtoD(dInput, Pointer.to(input), size*Sizeof.FLOAT);
// Create the output array on the device
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, size * Sizeof.FLOAT);
cuMemsetD32(dOutput, 0, size);
// Set up the arguments
Pointer pInput = Pointer.to(dInput);
Pointer pOutput = Pointer.to(dOutput);
Pointer pSize1 = Pointer.to(new int[]{ inputRows });
Pointer pSize2 = Pointer.to(new int[]{ inputColumns });
int offset = 0;
offset = align(offset, Sizeof.POINTER);
cuParamSetv(function, offset, pInput, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = align(offset, Sizeof.POINTER);
cuParamSetv(function, offset, pOutput, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, pSize1, Sizeof.INT);
offset += Sizeof.INT;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, pSize2, Sizeof.INT);
offset += Sizeof.INT;
// Set up the execution parameters and call the function
cuFuncSetBlockShape(function, numThreadsX, numThreadsY, 1);
cuParamSetSize(function, offset);
cuLaunchGrid(function, numBlocksX, numBlocksY);
cuCtxSynchronize();
// Obtain the result
cuMemcpyDtoH(Pointer.to(output), dOutput, size * Sizeof.FLOAT);
System.out.println("Output");
printMatrix(output, outputColumns);
}
//-- Only helper functions below...
private static void printMatrix(float matrix[], int columns)
{
for (int i=0; i<matrix.length; i++)
{
System.out.printf("%4.1f ", matrix**);
if ((i+1) % columns == 0)
{
System.out.println();
}
}
}
private static String prepareCubinFile(String cuFileName) throws IOException
{
int endIndex = cuFileName.lastIndexOf('.');
if (endIndex == -1)
{
endIndex = cuFileName.length()-1;
}
String cubinFileName = cuFileName.substring(0, endIndex+1)+"cubin";
File cuFile = new File(cuFileName);
if (!cuFile.exists())
{
throw new IOException("Input file not found: "+cuFileName);
}
String modelString = "-m"+System.getProperty("sun.arch.data.model");
String command =
"nvcc " + modelString + " -arch sm_11 -cubin "+
cuFile.getPath()+" -o "+cubinFileName;
System.out.println("Executing
"+command);
Process process = Runtime.getRuntime().exec(command);
String errorMessage = new String(toByteArray(process.getErrorStream()));
String outputMessage = new String(toByteArray(process.getInputStream()));
int exitValue = 0;
try
{
exitValue = process.waitFor();
}
catch (InterruptedException e)
{
Thread.currentThread().interrupt();
throw new IOException("Interrupted while waiting for nvcc output", e);
}
System.out.println("nvcc process exitValue "+exitValue);
if (exitValue != 0)
{
System.out.println("errorMessage:
"+errorMessage);
System.out.println("outputMessage:
"+outputMessage);
throw new IOException("Could not create .cubin file: "+errorMessage);
}
return cubinFileName;
}
private static byte[] toByteArray(InputStream inputStream) throws IOException
{
ByteArrayOutputStream baos = new ByteArrayOutputStream();
byte buffer[] = new byte[8192];
while (true)
{
int read = inputStream.read(buffer);
if (read == -1)
{
break;
}
baos.write(buffer, 0, read);
}
return baos.toByteArray();
}
}
The kernel code, stored in ‘simpleTranspose.cu’:
extern "C"
__global__ void simpleTranspose(float *input,
float *output,
int inputRows,
int inputColumns)
{
int inputColumn = threadIdx.x + blockDim.x*blockIdx.x;
int inputRow = threadIdx.y + blockDim.y*blockIdx.y;
int outputRow = inputColumn;
int outputColumn = inputRow;
int outputRows = inputColumns;
int outputColumns = inputRows;
if(inputColumn < inputColumns && inputRow < inputRows)
{
int inputIndex = inputColumn+inputRow*inputColumns;
int outputIndex = outputColumn+outputRow*outputColumns;
output[outputIndex] = input[inputIndex];
}
return;
}