CUDA code: compiled with: “nvcc -cubin -m64 -arch sm_21 file.cu -o file.cubin”
One thing to note. I first compiled this code in linux and copied the cubin file from linux into windows and loaded it from windows. Do I need to generate the cubin file in windows or should nvcc be cross compatible between windows and linux.
extern "C"
__global__ void
elt_prod_conj(cufftDoubleComplex *fc, cufftDoubleComplex * c1,
cufftDoubleComplex * c2, int size)
{
__shared__ cufftDoubleComplex sfc[THREADS_PER_BLOCK];
__shared__ cufftDoubleComplex sc1[THREADS_PER_BLOCK];
__shared__ cufftDoubleComplex sc2[THREADS_PER_BLOCK];
int idx = threadIdx.x + blockIdx.x * THREADS_PER_BLOCK;
if (idx >= size)
return;
sc1[threadIdx.x] = c1[idx];
sc2[threadIdx.x] = c2[idx];
__syncthreads();
sfc[threadIdx.x] = cuCmul(sc1[threadIdx.x], cuConj(sc2[threadIdx.x]));
double mag = cuCabs(sfc[threadIdx.x]);
fc[idx] = make_cuDoubleComplex(cuCreal(sfc[threadIdx.x]) / mag,
cuCimag(sfc[threadIdx.x]) / mag);
}
Here is the java code for doing the FFTs and launching the above kernel:
First initialization:
public static CUfunction elt_prod_function;
public static int fftSize;
public static cufftHandle plan_fwd;
public static cufftHandle plan_bwd;
...
initPlans(int width, int height)
{
checkError(JCudaDriver.cuInit(0));
CUdevice device = new CUdevice();
checkError(JCudaDriver.cuDeviceGet(device, 0));
CUcontext context = new CUcontext();
checkError(JCudaDriver.cuCtxCreate(context, 0, device));
CUmodule module = new CUmodule();
checkError(JCudaDriver.cuModuleLoad(module, "util-cuda-bin.cubin"));
elt_prod_function = new CUfunction();
checkError(JCudaDriver.cuModuleGetFunction(elt_prod_function, module, "elt_prod_conj"));
fftSize = (width/2+1)*height;
plan_fwd = new cufftHandle();
plan_bwd = new cufftHandle();
JCufft.cufftPlan2d(plan_fwd, height, width, cufftType.CUFFT_D2Z);
JCufft.cufftPlan2d(plan_bwd, height, width, cufftType.CUFFT_Z2D);
}
// ... Compute the fft
public void computeFFT()
{
if (!isTileRead())
readTile();
if (hasFFT())
return;
double tempJ[] = new double[super.getWidth()*super.getHeight()];
for (int i = 0; i < super.getPixels().length; i++)
{
tempJ** = super.getPixels()**;
}
fft = new CUdeviceptr();
CUdeviceptr ptr = new CUdeviceptr();
JCUDAImageTile.checkError(JCudaDriver.cuMemAlloc(ptr, super.getWidth()*super.getHeight()*Sizeof.DOUBLE));
JCUDAImageTile.checkError(JCudaDriver.cuMemAlloc(fft, fftSize*Sizeof.DOUBLE*2));
JCUDAImageTile.checkError(JCudaDriver.cuMemcpyHtoD(ptr, Pointer.to(tempJ), super.getWidth()*super.getHeight()*Sizeof.DOUBLE));
JCUDAImageTile.checkError(JCufft.cufftExecD2Z(plan_fwd, ptr, fft));
JCudaDriver.cuMemFree(ptr);
JCudaDriver.cuCtxSynchronize();
}
... checkError:
public static void checkError(int val)
{
if (val != 0)
{
Log.msg(Log.LOG_LEVEL_MANDATORY, "Error: " + val);
}
}
... and finally using the kernel using two ffts.
func(ImageTile t1, ImageTile t2)
{
if (!t1.hasFFT())
t1.computeFFT();
if (!t2.hasFFT())
t2.computeFFT();
int numThreads = 256;
int numBlocks = (int)Math.ceil((double)JCUDAImageTile.fftSize / (double)numThreads);
System.out.println(JCUDAImageTile.fftSize + " blocks : " + numBlocks);
CUdeviceptr ptr = new CUdeviceptr();
CUdeviceptr ptrOut = new CUdeviceptr();
JCUDAImageTile.checkError(JCudaDriver.cuMemAlloc(ptr, JCUDAImageTile.fftSize*Sizeof.DOUBLE*2));
JCUDAImageTile.checkError(JCudaDriver.cuMemAlloc(ptrOut, t1.getWidth()*t1.getHeight()*Sizeof.DOUBLE));
jcuda.Pointer kernelParams = jcuda.Pointer.to(
jcuda.Pointer.to(t1.getFFT()),
jcuda.Pointer.to(t2.getFFT()),
jcuda.Pointer.to(ptr),
jcuda.Pointer.to(new int[]{JCUDAImageTile.fftSize}));
JCUDAImageTile.checkError(JCudaDriver.cuLaunchKernel(JCUDAImageTile.elt_prod_function,
numBlocks, 1, 1,
numThreads, 1, 1,
16384, null,
kernelParams, null));
JCUDAImageTile.checkError(JCufft.cufftExecZ2D(JCUDAImageTile.plan_bwd, ptr, ptrOut));
JCUDAImageTile.checkError(JCudaDriver.cuCtxSynchronize());
double [] valsOut = new double[t1.getWidth()*t1.getHeight()];
JCUDAImageTile.checkError(JCudaDriver.cuMemcpyDtoH(jcuda.Pointer.to(valsOut), ptrOut, t1.getWidth()*t1.getHeight()*Sizeof.DOUBLE));
JCUDAImageTile.checkError(JCudaDriver.cuMemFree(ptr));
JCUDAImageTile.checkError(JCudaDriver.cuMemFree(ptrOut));
}
Just a note: I implemented the algorithm using FFTW and some native C bindings and I am able to get the correct answer. Also, I am now trying to take some code I had written in C++ that uses CUDA and porting it into Java.
I ran my tests with exceptions turned on and logging level of debug and no messages were printed to the console.