Hello,
I’d like to write 3D filters for ImageJ.
I’m very new to CUDA, and as you will see, I only patched this together from the different examples I found on JCuda.org.
The „simpleJCuda plugin“ (SJCP) (I will refer to it several times) works fine on my system, but I’d like to write filters using 3D textures.
First: I have a problem loading the kernel. I put the .cubin file in the .jar file, as in the SJCP.
-When I use the KernelLauncher, it works.
-When I use cuModuleLoad(module, cubinFileName); it doesn’t. I get the exception:
„jcuda.CudaException: CUDA_ERROR_FILE_NOT_FOUND
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
at jcuda.driver.JCudaDriver.cuModuleLoad(JCudaDriver.java:1400)
at JCuda_3DFilters.setup(JCuda_3DFilters.java:78)“
Anyway, what is the difference between the two ways of loading a kernel?
So I tried with the KernelLauncher as in the SJCP, and it can’t copy the host input to the array in the function mallocTex3D of my code. I get the following exception:
„jcuda.CudaException: CUDA_ERROR_INVALID_VALUE
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
at jcuda.driver.JCudaDriver.cuMemcpy3D(JCudaDriver.java:4487)
at JCuda_3DFilters.mallocTex3D(JCuda_3DFilters.java:195)
at JCuda_3DFilters.run(JCuda_3DFilters.java:87)“
I would also like to understand the difference between „cudaMemcpy“ and „cuMemcpyDtoH“ for copying data back to the host.
Here is my code. You will see that I first tried using a module, as in the texture example on JCuda.org; since I couldn’t load the .cubin that way, I tried the KernelLauncher and commented out the important lines related to the module.
/**
* ImageJ Plugin using JCuda
*
*
*/
import java.util.Arrays;
import ij.*;
import ij.process.*;
import ij.gui.*;
import ij.plugin.filter.*;
import static jcuda.driver.JCudaDriver.*;
import static jcuda.driver.CUfilter_mode.*;
import static jcuda.driver.CUaddress_mode.*;
import static jcuda.driver.CUarray_format.*;
import jcuda.*;
import jcuda.driver.*;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.utils.KernelLauncher;
import java.io.InputStream;
/**
* A simple example for an ImageJ Plugin that uses JCuda.
*/
public class JCuda_3DFilters implements PlugInFilter {
/**
* The current image to operate on
*/
private ImagePlus img = null;
public int sizeX, sizeY, sizeZ, sizeXY, sizeXYZ;
public int[] pixels;
public float radius;
public float scalexy, scalez;
private static CUmodule module;
public CUarray array;
public int[] threads_per_block, Max_blockZ, Max_blockX, Max_blockY, Max_gridX, Max_gridY, Max_gridZ;
public int blockZ, blockX, blockY, gridX, gridY, gridZ;
private KernelLauncher kernelLauncher = null;
@Override
public int setup(String arg, ImagePlus imagePlus) {
img = imagePlus;
this.getPixels();
// Initialize the driver and create a context for the first device.
JCudaDriver.setExceptionsEnabled(true);
cuInit(0);
CUcontext pctx = new CUcontext();
CUdevice dev = new CUdevice();
//cuDeviceGetCount(); pour gerer les multidevices
cuDeviceGet(dev, 0);
this.threads_per_block=new int[1];
this.Max_gridX=new int[1];
this.Max_gridY=new int[1];
cuDeviceGetAttribute(this.threads_per_block, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
//cuDeviceGetAttribute(this.Max_blockX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
//cuDeviceGetAttribute(this.Max_blockY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
//cuDeviceGetAttribute(this.Max_blockZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
cuDeviceGetAttribute(this.Max_gridX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
cuDeviceGetAttribute(this.Max_gridY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
//cuDeviceGetAttribute(this.Max_gridZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
cuCtxCreate(pctx, 0, dev);
this.blockX=(int)Math.sqrt(this.threads_per_block[0]);
this.blockY=this.blockX;
IJ.log("block size:" + this.blockX);
this.gridX=(this.sizeX+this.blockX-1)/this.blockX;
this.gridY=(this.sizeY+this.blockY-1)/this.blockY;
// Load the CUBIN file containing the kernels
String cubinFileName = "JCuda_3DFilters_Kernel.cubin";
module = new CUmodule();
//cuModuleLoad(module, cubinFileName);
// Create the kernelLauncher that will execute the kernel
InputStream cubinInputStream = getClass().getResourceAsStream(cubinFileName);
kernelLauncher = KernelLauncher.load(cubinInputStream, "min_3D");
return DOES_16;
}
@Override
public void run (ImageProcessor imageProcessor) {
this.mallocTex3D();
this.execute();
}
void execute() {
// Prepare the output device memory
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, Sizeof.INT * sizeXYZ);
// Obtain the function
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "min_3D");
cuFuncSetBlockShape(function, this.blockX, this.blockY, 1);
// Set up the function parameters
Pointer pOutput = Pointer.to(dOutput);
Pointer psX = Pointer.to(new int[]{ this.sizeX });
Pointer psY = Pointer.to(new int[]{ this.sizeY });
Pointer psXY = Pointer.to(new int[]{ this.sizeXY });
Pointer psZ = Pointer.to(new int[]{ this.sizeZ });
Pointer pRadius = Pointer.to(new float[]{ radius });
int offset = 0;
offset = align(offset, Sizeof.POINTER);
cuParamSetv(function, offset, pOutput, Sizeof.POINTER);
offset += Sizeof.POINTER;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, psX, Sizeof.INT);
offset += Sizeof.INT;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, psY, Sizeof.INT);
offset += Sizeof.INT;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, psXY, Sizeof.INT);
offset += Sizeof.INT;
offset = align(offset, Sizeof.INT);
cuParamSetv(function, offset, psZ, Sizeof.INT);
offset += Sizeof.INT;
offset = align(offset, Sizeof.FLOAT);
cuParamSetv(function, offset, pRadius, Sizeof.FLOAT);
offset += Sizeof.FLOAT;
cuParamSetSize(function, offset);
// Call the function.
//cuLaunchGrid(function, this.gridX, this.gridY);
//cuCtxSynchronize();
kernelLauncher.setGridSize(this.gridX, this.gridY);
kernelLauncher.setBlockSize(this.blockX, this.blockY, 1);
kernelLauncher.call(pOutput, this.sizeX, this.sizeY, this.sizeXY, this.sizeZ, this.radius);
// Obtain the output on the host
int hOutput[] = new int[sizeXYZ];
cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.INT * sizeXYZ);
buildImg(hOutput, "output");
// Clean up
cuArrayDestroy(array);
cuMemFree(dOutput);
}
void getPixels () {
this.sizeX=img.getWidth();
this.sizeY=img.getHeight();
this.sizeZ=img.getNSlices();
this.sizeXY=this.sizeX*this.sizeY;
this.sizeXYZ=this.sizeXY*this.sizeZ;
this.pixels=new int[sizeXYZ];
java.lang.Object[] ips=this.img.getStack().getImageArray();
for (int slice=0; slice<this.sizeZ; slice++){
short[] cur_slice=(short[])ips[slice];
int offsetZ=slice*this.sizeXY;
for (int y=0; y<this.sizeY; y++) {
int offsetY=y*this.sizeX;
for (int x=0; x<this.sizeX; x++) {
short curr_val=cur_slice[offsetY+x];
this.pixels[offsetY+offsetZ+x]=curr_val;
}
}
}
}
void mallocTex3D (){
// Create the array on the device
array = new CUarray();
CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
ad.Format = CU_AD_FORMAT_UNSIGNED_INT16;
ad.Width = sizeX;
ad.Height = sizeY;
ad.Depth = sizeZ;
ad.NumChannels = 1;
cuArray3DCreate(array, ad);
// Copy the host input to the array
CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
copy.srcHost = Pointer.to(pixels);
copy.srcPitch = sizeX * Sizeof.INT;
copy.srcHeight = sizeY;
copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
copy.dstArray = array;
copy.dstHeight = sizeX;
copy.WidthInBytes = sizeX * Sizeof.INT;
copy.Height = sizeY;
copy.Depth = sizeZ;
cuMemcpy3D(copy);
// Set up the texture reference
CUtexref texref = new CUtexref();
cuModuleGetTexRef(texref, module, "min_3D");
cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
}
public ImagePlus buildImg(int[] array, String title){
double min=array[0];
double max=array[0];
ImagePlus img=NewImage.createImage(title, this.sizeX, this.sizeY, this.sizeZ, 16, 1);
for (int z=0; z<this.sizeZ; z++){
IJ.showStatus("Creating the image...");
img.setSlice(z+1);
int offsetZ=z*sizeXY;
for (int y=0; y<this.sizeY; y++){
int offsetY=y*sizeX;
for (int x=0; x<this.sizeX; x++){
int currVal=array[x+offsetZ+offsetY];
min=Math.min(min, currVal);
max=Math.max(max, currVal);
img.getProcessor().putPixel(x,y, currVal);
}
}
}
IJ.showStatus("");
img.getProcessor().setMinAndMax(min, max);
return img;
}
}
Thank you
jeannot