[QUOTE=Marco13]Hello
That’s strange: I just did a quick test, and until now, it seems as if it really is related to the byte order. (Did you verify the results, or do you assume that they are correct because they are not ‘NaN’ or ‘7FFF’?). In this test (see the code below) I introduced a flag ‘reverseOrderTest’, which causes the byte order of the ‘jcufft[]’ byte array to be reversed (pragmatically). The results are compared to the float-version that you posted, and until now, it seems that the byte order is the reason. (At least here, on my Win32 machine…)
Yes, definitely.
The CUDA device query does not say anything about a limit. I found some forum threads indicating that there might be a limit, but no clear official statements so far…
I’d recommend to setExceptionsEnabled(true) during the development in general. Otherwise, checking for errors is tedious. Even the NVIDIA CUDA SDK contains Utility macros (CUDA_SAFE_CALL etc) to make this easier.
Certainly, that sounds feasible. I added another test in this code: In the “runTestMapped”-Method, the plan creation and memory allocation is pulled out of the loop. Additionally, I’m using Memory Mapped files there. As far as I know, these are intended for a use case like this: Mapping a region of a (unmanagably large) file into memory, manipulating it, and committing the data back - and they are said to be fast. The byte order reversal is still necessary, but maybe some time can be saved there anyhow. Note that I have not really used the mechanism of Memory Mapped files before, and I have not tested it extensively (especially not with really “large” files), but it may be worth a try…
package tests.jcufft;
import java.io.*;
import java.nio.*;
import java.nio.channels.*;
import java.nio.channels.FileChannel.MapMode;
import jcuda.Pointer;
import jcuda.jcufft.*;
import jcuda.runtime.*;
public class JCufftByteOrderTest
{
private static boolean reverseOrderTest = false;
public static void main(String[] args) throws IOException
{
JCuda.setExceptionsEnabled(true);
JCufft.setExceptionsEnabled(true);
int fftSize = 128;
int bufferSize = fftSize * 4;
int totalSize = bufferSize * 4;
int batches = bufferSize / fftSize;
File input = new File("JCufftBatchedTest_input.dat");
if (!input.exists())
{
createDummyData(input, totalSize);
}
File outputFloat = new File("JCufftBatchedTest_output_float.dat");
File outputByte = new File("JCufftBatchedTest_output_byte.dat");
File outputByteRev = new File("JCufftBatchedTest_output_byte_rev.dat");
File outputMapped = new File("JCufftBatchedTest_output_mapped.dat");
System.out.println("Float:");
runTestFloat(fftSize, bufferSize, batches, input, outputFloat);
printOutputData(outputFloat, totalSize);
System.out.println("Byte (not reversed)");
reverseOrderTest = false;
runTestByte(fftSize, bufferSize, batches, input, outputByte);
printOutputData(outputByte, totalSize);
System.out.println("Byte (reversed)");
reverseOrderTest = true;
runTestByte(fftSize, bufferSize, batches, input, outputByteRev);
printOutputData(outputByteRev, totalSize);
System.out.println("Mapped");
runTestMapped(fftSize, bufferSize, batches, input, outputMapped);
printOutputData(outputMapped, totalSize);
}
private static void runTestByte(int fftSize, int bufferSize, int batches, File input, File output) throws FileNotFoundException, IOException
{
ByteBuffer inputBuff = ByteBuffer.allocateDirect(bufferSize);
// bufferSize here is a gigabyte, since I'm processing 1024^3 bytes at once
byte jcufft[] = new byte[bufferSize];
FileChannel inchannel = new FileInputStream(input).getChannel();
FileChannel outchannel = new FileOutputStream(output).getChannel();
long size = inchannel.size();
do {
// Read and transfer data to byte array
inputBuff.clear();
inchannel.read(inputBuff);
inputBuff.rewind();
inputBuff.get(jcufft);
if (reverseOrderTest)
{
jcufft = reverseByteOrder(jcufft);
}
// Create pointers to host and device memory, allocate memory
Pointer byte_host_input = Pointer.to(jcufft);
Pointer byte_device_input = new Pointer();
JCuda.cudaMalloc(byte_device_input, bufferSize);
// Copy data to device, perform FFT, copy back to host
JCuda.cudaMemcpy(byte_device_input, byte_host_input, bufferSize, cudaMemcpyKind.cudaMemcpyHostToDevice);
cufftHandle plan = new cufftHandle();
JCufft.cufftPlan1d(plan, fftSize, cufftType.CUFFT_R2C, batches);
JCufft.cufftExecR2C(plan, byte_device_input, byte_device_input);
JCuda.cudaMemcpy(byte_host_input, byte_device_input, bufferSize, cudaMemcpyKind.cudaMemcpyDeviceToHost);
JCufft.cufftDestroy(plan);
JCuda.cudaFree(byte_device_input);
if (reverseOrderTest)
{
jcufft = reverseByteOrder(jcufft);
}
// Write data to file
inputBuff.clear();
inputBuff.put(jcufft);
inputBuff.rewind();
outchannel.write(inputBuff);
} while (inchannel.position() != size);
outchannel.close();
inchannel.close();
}
private static void runTestFloat(int fftSize, int bufferSize, int batches, File input, File output) throws FileNotFoundException, IOException
{
ByteBuffer inputBuff = ByteBuffer.allocateDirect(bufferSize);
FloatBuffer floatBuff = inputBuff.asFloatBuffer();
// bufferSize here should be 1GB, each float is 4 bytes
float jcufft[] = new float[bufferSize / 4];
FileChannel inchannel = new FileInputStream(input).getChannel();
FileChannel outchannel = new FileOutputStream(output).getChannel();
long size = inchannel.size();
do
{
// Read and transfer data to byte array
inputBuff.clear();
inchannel.read(inputBuff);
floatBuff.rewind();
floatBuff.get(jcufft);
// Create pointers to host and device memory, allocate memory
Pointer float_host_input = Pointer.to(jcufft);
Pointer float_device_input = new Pointer();
JCuda.cudaMalloc(float_device_input, bufferSize);
// Copy data to device, perform FFT, copy back to host
JCuda.cudaMemcpy(float_device_input, float_host_input, bufferSize, cudaMemcpyKind.cudaMemcpyHostToDevice);
cufftHandle plan = new cufftHandle();
JCufft.cufftPlan1d(plan, fftSize, cufftType.CUFFT_R2C, batches);
JCufft.cufftExecR2C(plan, float_device_input, float_device_input);
JCuda.cudaMemcpy(float_host_input, float_device_input, bufferSize, cudaMemcpyKind.cudaMemcpyDeviceToHost);
JCufft.cufftDestroy(plan);
JCuda.cudaFree(float_device_input);
// Write data to file
floatBuff.clear();
floatBuff.put(jcufft);
inputBuff.rewind();
outchannel.write(inputBuff);
} while (inchannel.position() != size);
outchannel.close();
inchannel.close();
}
private static void runTestMapped(int fftSize, int bufferSize, int batches, File input, File output) throws FileNotFoundException, IOException
{
// bufferSize here is a gigabyte, since I'm processing 1024^3 bytes at once
float jcufft[] = new float[bufferSize/4];
FileChannel inchannel = new FileInputStream(input).getChannel();
FileChannel outchannel = new RandomAccessFile(output, "rw").getChannel();
long size = inchannel.size();
long position = 0;
// Create pointers to host and device memory, allocate memory
Pointer byte_host_input = Pointer.to(jcufft);
Pointer byte_device_input = new Pointer();
JCuda.cudaMalloc(byte_device_input, bufferSize);
cufftHandle plan = new cufftHandle();
JCufft.cufftPlan1d(plan, fftSize, cufftType.CUFFT_R2C, batches);
do
{
ByteBuffer mappedInput = inchannel.map(MapMode.READ_ONLY, position, bufferSize);
FloatBuffer inputBuffer = mappedInput.asFloatBuffer();
inputBuffer.get(jcufft);
// Copy data to device, perform FFT, copy back to host
JCuda.cudaMemcpy(byte_device_input, byte_host_input, bufferSize, cudaMemcpyKind.cudaMemcpyHostToDevice);
JCufft.cufftExecR2C(plan, byte_device_input, byte_device_input);
JCuda.cudaMemcpy(byte_host_input, byte_device_input, bufferSize, cudaMemcpyKind.cudaMemcpyDeviceToHost);
// Write data to file
MappedByteBuffer mappedOutput = outchannel.map(MapMode.READ_WRITE, position, bufferSize);
FloatBuffer outputBuffer = mappedOutput.asFloatBuffer();
outputBuffer.put(jcufft);
mappedOutput.force();
position += bufferSize;
} while (position < size);
JCufft.cufftDestroy(plan);
JCuda.cudaFree(byte_device_input);
outchannel.close();
inchannel.close();
}
private static byte[] reverseByteOrder(byte input[])
{
byte[] output = new byte[input.length];
for (int i=0; i<input.length; i+=4)
{
output[i+0] = input[i+3];
output[i+1] = input[i+2];
output[i+2] = input[i+1];
output[i+3] = input[i+0];
}
return output;
}
private static void createDummyData(File file, int size) throws IOException
{
DataOutputStream dos = new DataOutputStream(
new FileOutputStream(file));
for (int i=0; i<size; i++)
{
dos.writeFloat((float)Math.sin(i*0.1f));
}
dos.close();
}
private static void printOutputData(File file, int size) throws IOException
{
DataInputStream dis = new DataInputStream(
new FileInputStream(file));
for (int i=0; i<size; i++)
{
float f = dis.readFloat();
System.out.printf("%7s", String.format("%.3f", f));
if ((i+1)%20 == 0)
{
System.out.println(", ");
}
else
{
System.out.print(", ");
}
}
System.out.println("
");
dis.close();
}
}
```[/QUOTE]
Well, I've sort of pinpointed the first exception I was getting - the cudaErrorLaunchFailure (JCuda.checkResult(357), JCuda.cudaMemcpy(2964)). If you look back at the code you wrote in this post, runTestFloat is identical to what I am doing right now, and the Memcpy call is what is throwing the exception. I tried creating a separate float array, even a separate byte array, but the copy back to host fails, and I can't figure out why. I'm running with 6G of heapspace and plenty of RAM/disk space, so I don't think it's a space issue..