JCuda & ImageJ : 3D filters

Hello,
I’d like to write 3D filters for ImageJ.
I’m very new to CUDA, and you will see I only patchworked from the different examples i found on JCuda.org.
The „simpleJCuda plugin“ (SJCP) (i will use it serveral times :stuck_out_tongue: ) works fine on my system, but i’d like to write filters using 3D textures.

First: I have a problem to load the Kernel. I put the .cubin file in the .jar file, as in the SJCP
-when i use the kernelLauncher i works
-when i use cuModuleLoad(module, cubinFileName); it doesn’t. I get the exception:
„jcuda.CudaException: CUDA_ERROR_FILE_NOT_FOUND
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
at jcuda.driver.JCudaDriver.cuModuleLoad(JCudaDriver.java:1400)
at JCuda_3DFilters.setup(JCuda_3DFilters.java:78)“

anyway what is the difference between the 2 ways of loading a kernel?

So I tried with the kernelLaucher as in SJCP, and It can’t copy the host input to the array in the function mallocTex3D of my code. I get the following exception:
„jcuda.CudaException: CUDA_ERROR_INVALID_VALUE
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
at jcuda.driver.JCudaDriver.cuMemcpy3D(JCudaDriver.java:4487)
at JCuda_3DFilters.mallocTex3D(JCuda_3DFilters.java:195)
at JCuda_3DFilters.run(JCuda_3DFilters.java:87)“

I also would like to understand the difference between „cudaMemcpy“ and „cuMemcpyDtoH“ to copy data back to host

Here is my code, you will see i tried using module, as in the texture example of JCuda.org, as i couldn’t load the .cubin i tried the kernellaucher and commented the important lines related to the module.

/**
 * ImageJ Plugin using JCuda
 *
 *
 */

import java.util.Arrays;

import ij.*;
import ij.process.*;
import ij.gui.*;
import ij.plugin.filter.*;


import static jcuda.driver.JCudaDriver.*;
import static jcuda.driver.CUfilter_mode.*;
import static jcuda.driver.CUaddress_mode.*;
import static jcuda.driver.CUarray_format.*;

import jcuda.*;
import jcuda.driver.*;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.utils.KernelLauncher;
import java.io.InputStream;


/**
 * A simple example for an ImageJ Plugin that uses JCuda.
 */
public class JCuda_3DFilters implements PlugInFilter {
	/**
	 * The current image to operate on
	 */
    private ImagePlus img = null;
    public int sizeX, sizeY, sizeZ, sizeXY, sizeXYZ;
    public int[] pixels;
    public float radius;
    public float scalexy, scalez;
    private static CUmodule module;
    public CUarray array;
    public int[] threads_per_block, Max_blockZ, Max_blockX, Max_blockY, Max_gridX, Max_gridY, Max_gridZ;
    public int blockZ, blockX, blockY, gridX, gridY, gridZ;
    private KernelLauncher kernelLauncher = null;

    @Override
    public int setup(String arg, ImagePlus imagePlus) {
        img = imagePlus;
        this.getPixels();
        // Initialize the driver and create a context for the first device.
        JCudaDriver.setExceptionsEnabled(true);
        cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        //cuDeviceGetCount(); pour gerer les multidevices
        cuDeviceGet(dev, 0);
        this.threads_per_block=new int[1];
        this.Max_gridX=new int[1];
        this.Max_gridY=new int[1];
        cuDeviceGetAttribute(this.threads_per_block, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        //cuDeviceGetAttribute(this.Max_blockX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        //cuDeviceGetAttribute(this.Max_blockY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        //cuDeviceGetAttribute(this.Max_blockZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        cuDeviceGetAttribute(this.Max_gridX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        cuDeviceGetAttribute(this.Max_gridY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        //cuDeviceGetAttribute(this.Max_gridZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        cuCtxCreate(pctx, 0, dev);
        this.blockX=(int)Math.sqrt(this.threads_per_block[0]);
        this.blockY=this.blockX;
        IJ.log("block size:" + this.blockX);
        this.gridX=(this.sizeX+this.blockX-1)/this.blockX;
        this.gridY=(this.sizeY+this.blockY-1)/this.blockY;
               // Load the CUBIN file containing the kernels
        String cubinFileName = "JCuda_3DFilters_Kernel.cubin";
        module = new CUmodule();
        //cuModuleLoad(module, cubinFileName);

        // Create the kernelLauncher that will execute the kernel
    	InputStream cubinInputStream = getClass().getResourceAsStream(cubinFileName);
        kernelLauncher = KernelLauncher.load(cubinInputStream, "min_3D");
    	return DOES_16;
    }

    @Override
    public void run (ImageProcessor imageProcessor) {
    	
        this.mallocTex3D();
        this.execute();
    }

    void execute() {
    	 // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.INT * sizeXYZ);
        
        // Obtain the function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "min_3D");

        cuFuncSetBlockShape(function, this.blockX, this.blockY, 1);

        // Set up the function parameters
        Pointer pOutput = Pointer.to(dOutput);
        Pointer psX = Pointer.to(new int[]{ this.sizeX });
        Pointer psY = Pointer.to(new int[]{ this.sizeY });
        Pointer psXY = Pointer.to(new int[]{ this.sizeXY });
        Pointer psZ = Pointer.to(new int[]{ this.sizeZ });
        Pointer pRadius = Pointer.to(new float[]{ radius });

        int offset = 0;
        offset = align(offset, Sizeof.POINTER);
        cuParamSetv(function, offset, pOutput, Sizeof.POINTER);
        offset += Sizeof.POINTER;
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, psX, Sizeof.INT);
        offset += Sizeof.INT;
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, psY, Sizeof.INT);
        offset += Sizeof.INT;
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, psXY, Sizeof.INT);
        offset += Sizeof.INT;
        offset = align(offset, Sizeof.INT);
        cuParamSetv(function, offset, psZ, Sizeof.INT);
        offset += Sizeof.INT;
        offset = align(offset, Sizeof.FLOAT);
        cuParamSetv(function, offset, pRadius, Sizeof.FLOAT);
        offset += Sizeof.FLOAT;
        cuParamSetSize(function, offset);

        // Call the function.

        //cuLaunchGrid(function, this.gridX, this.gridY);
        //cuCtxSynchronize();

        kernelLauncher.setGridSize(this.gridX, this.gridY);
        kernelLauncher.setBlockSize(this.blockX, this.blockY, 1);
        kernelLauncher.call(pOutput, this.sizeX, this.sizeY, this.sizeXY, this.sizeZ, this.radius);

        // Obtain the output on the host
        int hOutput[] = new int[sizeXYZ];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.INT * sizeXYZ);
        buildImg(hOutput, "output");
        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);
        

    }

    void getPixels () {
        this.sizeX=img.getWidth();
	this.sizeY=img.getHeight();
	this.sizeZ=img.getNSlices();
	this.sizeXY=this.sizeX*this.sizeY;
	this.sizeXYZ=this.sizeXY*this.sizeZ;
	this.pixels=new int[sizeXYZ];
        java.lang.Object[] ips=this.img.getStack().getImageArray();
        for (int slice=0; slice<this.sizeZ; slice++){
            short[] cur_slice=(short[])ips[slice];
            int offsetZ=slice*this.sizeXY;
            for (int y=0; y<this.sizeY; y++) {
                int offsetY=y*this.sizeX;
                for (int x=0; x<this.sizeX; x++) {
                    short curr_val=cur_slice[offsetY+x];
                    this.pixels[offsetY+offsetZ+x]=curr_val;
                }
            }
	}
    }

    void mallocTex3D (){
        // Create the array on the device
        array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_UNSIGNED_INT16;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 1;
        cuArray3DCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(pixels);
        copy.srcPitch = sizeX * Sizeof.INT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.INT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "min_3D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
    }



    

    public ImagePlus buildImg(int[] array, String title){
		double min=array[0];
		double max=array[0];
		ImagePlus img=NewImage.createImage(title, this.sizeX, this.sizeY, this.sizeZ, 16, 1);

		for (int z=0; z<this.sizeZ; z++){
			IJ.showStatus("Creating the image...");
			img.setSlice(z+1);
			int offsetZ=z*sizeXY;
			for (int y=0; y<this.sizeY; y++){
				int offsetY=y*sizeX;
				for (int x=0; x<this.sizeX; x++){
					int currVal=array[x+offsetZ+offsetY];
					min=Math.min(min, currVal);
					max=Math.max(max, currVal);
					img.getProcessor().putPixel(x,y, currVal);
				}
			}
		}
		IJ.showStatus("");
		img.getProcessor().setMinAndMax(min, max);
                return img;
	}
    

}

Thank you
jeannot

Hello

The „simpleJCuda plugin“ (SJCP) …

Not to be confused with SCJP :wink:

I’m not so deeply involved in ImageJ, but created the sample to provide a starting point for those who want to use JCuda in ImageJ. But I’ll try to answer the general questions for now:

-when i use cuModuleLoad(module, cubinFileName); it doesn’t. I get the exception:
"jcuda.CudaException: CUDA_ERROR_FILE_NOT_FOUND

Die cuModuleLoad call (as all JCuda methods) directly delegates to the cuModuleLoad function. This function will do a „low-level“ File Access, and of course, can not find the CUBIN that is „hidden“ in the JAR. If you want to load the CUBIN manually, you will have to read the CUBIN data from the JAR file, and load this data using cuModuleLoadDataEx. You may want to have a look at how this is done in the
public static KernelLauncher load(InputStream cubinInputStream, String functionName)
method in the KernelLauncher source code. It’s a little bit complicated … that’s why I tried to make it simpler through the KernelLauncher :wink:

It can’t copy the host input to the array in the function mallocTex3D of my code. I get the following exception:
"jcuda.CudaException: CUDA_ERROR_INVALID_VALUE

That’s one thing for which I would have to examine the source code in more detail, but from a short glance: It seems that you are using the ‚module‘ in some places to access the textures. But this module seems not to be initialized (i.e. it’s not the module that has been loaded by the KernelLauncher). The module which has been loaded by the KernelLauncher may be obtained with „KernelLauncher#getModule()“. This module may afterwards be used to access the textures.

I also would like to understand the difference between „cudaMemcpy“ and „cuMemcpyDtoH“ to copy data back to host

There is none :slight_smile: At least, not really: The first one is part of the „CUDA Runtime API“, and the second one is from the „CUDA Driver API“. Effectively, they are doing the same thing (and fortunately, since CUDA 3.0, they are interoperable). In fact, about 90% of the Runtime API and the Driver API are „structurally equal“. There is primarily one important difference: The module management is only possible with the Driver API. So the Driver API can be seen as being slightly more „low-level“, but IMHO, the difference is mainly the different naming schemes in both APIs: In the Runtime API, all structures and function names start with „cuda“, and in the Driver API, they start with „CU“, but they basically mean the same in both cases…

Hello Marco! thank you for your answer, it helps me a lot understanding what i’m doing (but i still have some dark boxes in my code :stuck_out_tongue: )

ok, the kernellaucher is really useful (the offset thing was really boring :stuck_out_tongue: )
so if i understood you well, i corrected my code this way:


InputStream cubinInputStream = getClass().getResourceAsStream(cubinFileName);
kernelLauncher = KernelLauncher.load(cubinInputStream, "min_3D");
module = kernelLauncher.getModule();

i still get the same exception. i think the problem is before the set up of the texture reference.
i seems the array is created on the device but the data are not copied.
the problem may be here but i can’t figure out where:


array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_SIGNED_INT16; //CU_AD_FORMAT_UNSIGNED_INT8?
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 1;
        cuArray3DCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(pixels);
        copy.srcPitch = sizeX * Sizeof.INT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.INT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);


I have other questions, this time about the kernel.
Is there a specific function to go through the neighboring voxels of a voxel in a tex3D?
more broadly, what is the fastest way to go through the volume and the surface of a 3D sphere around a voxels using tex3D.

And another question, from what i understood, it is not possible to use a 3D grid, so what is the fastest way to go through the z-dimension? i’m currently doing a loop on the z-axis in my kernel

Thanks a lot
jean

Hello,

To get this right: The CUDA_ERROR_FILE_NOT_FOUND is solved, and now it’s only reporting the „CUDA_ERROR_INVALID_VALUE“ from the cuMemcpy3D call? The first step would be, of course, to verify the contents of the „CUDA_MEMCPY3D“ object (unfortunately, the error codes can not be so specific in this case). I’m currently not on my „development PC“, but I can try to start the Plugin early next week, maybe I can find the reason for this error message.

Concerning the other questions:

**Is there a specific function to go through the neighboring voxels of a voxel in a tex3D? **

I’m not aware of a specific function for that - but this does not mean that there is none, but only means that I’m not a CUDA expert, and have not worked very much with CUDA itself in general, and even less with textures specifically. If the question only refers to performance (and not simplicity of the code) I think this should not be so much a problem: The textures are cached, so reading elements that are „close“ to each other should be relatively fast (compared to usual global memory accesses).

from what i understood, it is not possible to use a 3D grid, so what is the fastest way to go through the z-dimension? i’m currently doing a loop on the z-axis in my kernel

In fact, in CUDA 4.0 it will be possible to use 3D grids on some cards, but this may still take some time. For now, I think the easiest is to use a loop on host side, if this is possible. Depending on the exact application case, it might be faster (and in some cases, it might even be necessary) to enlarge the grid. For example, if you want a 3D grid with 5x4x3 elements, then you can use a 2D grid, with 3*(5x4) = 15x4 elements - or 5x4*3 = 5x12 elements, respectively. This will cause some „nasty“ index computations, but should be doable.

bye

Hello,

I have updated to CUDA 4.0RC, and it will probably take some time until I can run tests for ImageJ again. Does the problem with the “CUDA_ERROR_INVALID_VALUE” from the cuMemcpy3D call still exist?

bye
Marco

[QUOTE=Marco13]Hello,

I have updated to CUDA 4.0RC, and it will probably take some time until I can run tests for ImageJ again. Does the problem with the „CUDA_ERROR_INVALID_VALUE“ from the cuMemcpy3D call still exist?

bye
Marco[/QUOTE]

Hello Marco,
Sorry i didn’t get an email to notice your answers.

so the CUDA_ERROR_FILE_NOT_FOUND is solved
and i still have the exception „CUDA_ERROR_INVALID_VALUE“ from the cuMemcpy3D. i really don’t know where is the mistake. do you think to code i posted is correct? it is actually the code from the texture example on you site JCuda.org, i just changed the float format to integer…

About the 3D grid, if my cards have a 2.0 computing capability, will it be possible to use 3D grids? if not i’ll get back to my nasty index computations :stuck_out_tongue:
why looping on host side would be faster than on device side? (in my case i can reuse information from the previous computation so i guess i’ll loop on the device)

Thanks
jean

Hello,

Admittedly, I don’t have extensively worked with many different (2D- and 3D) texture formats and configurations yet, and this can certainly be … rather complicated. I also had some difficulties with the first example, so I created the “JCudaDriverTextureTest.java” to at least get a little bit more familiar with this. I assume that you used the part from the example that dealt with 3D float textures, namely the part of “test_float_3D()” ?

If you are doing any modifications, … these modifications may have consequences which I can not foresee by simply “looking at” the CUDA_ARRAY3D_DESCRIPTOR or the CUDA_MEMCPY3D… But in this specific case, for example, you changed the “ad.Format” from CU_AD_FORMAT_FLOAT to CU_AD_FORMAT_SIGNED_INT16. This INT16 format does not correspond to the Java ‘int’ type (which has 32 bits), but the Java ‘short’ type (which is a 16 bit integer).

So which data type are you really going to use for the pixels?

In general, I think fewer modifications might be necessary when you used CU_AD_FORMAT_SIGNED_INT32 instead of CU_AD_FORMAT_SIGNED_INT16, but of course, if your application requires 16bit ints, it has to be _INT16.

If you change the data type to INT16, you must also make sure that the texture reference in the .CU input file is changed accordingly. So originally, there was
texture<float, 3, cudaReadModeElementType> texture_float_3D;
but now you need… well, probably something like
texture<short, 3, cudaReadModeElementType> texture_3D;
?! Additionally, I’m not 100% sure whether all "cudaReadMode"s are supported for all data types, I would have to look this up in the documentation…

Related to the same modification, and also very important: You changed the line
copy.srcPitch = sizeX * Sizeof.FLOAT;
to
copy.srcPitch = sizeX * Sizeof.INT;
(similarly for other lines) - for INT16, this should probably be Sizeof.SHORT instead. But again: All this depends on the type of pixels that you are really going to use.

As I mentioned, I’m currently in the process of migrating to CUDA 4, so I have limited possibilities for really testing this, especially directly as the ImageJ plugin. But in general, I’d recommend you to create some small “test application”, which “simulates” the calls from ImageJ. This way, it’s easier to break down individual tasks. For example, if you know that you have to handle 3D arrays of 16bit ints (Java ‘short’ values), it might be the best to create a small test app, similar to the “JCudaDriverTextureTest.java”, modified to use ‘short’ instead of ‘float’, and which just copies and reads the array. This may make it easier to get the individual sub-steps of your image processing pipeline working and to find possible bugs.

As soon as I have a running and working development environment again, I can try to take some more time for support.

By the way: I don’t know how “mature” the support for 3D grids already is, and how to find out on which cards it is supported, but I can try to find something about this in the release notes.
bye
Marco

Hallelujah!! Thanks marco it’s working! and it’s fast :slight_smile:
you were right i mixed the types.
so the correction for the copy :

        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(pixels);
        copy.srcPitch = sizeX * Sizeof.**SHORT**;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.**SHORT**;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

2 „small“ mistakes that kept me stuck :o (between the [ B ] ! )

There was another mistake in my plugin, for those who are interested:
it’s very tricky because IJ’s shortprocessor use its own unsigned short type, so as there is no unsigned short type in java, you have to use the .get(x, y) method of shortprocessor, that returns a 32bit int, otherwise you loose everything > 32000…
so my getPixel function was uncorrect, here is one that works
(there may be a faster way but i’m still investigating, if anyone has a better one i’ll be grateful)


    private void getPixels () {
        this.sizeX=img.getWidth();
	this.sizeY=img.getHeight();
	this.sizeZ=img.getNSlices();
	this.sizeXY=this.sizeX*this.sizeY;
	this.sizeXYZ=this.sizeXY*this.sizeZ;
	this.pixels=new int[this.sizeXYZ];
        int offsetZ=0;
        for (int slice=0; slice<this.sizeZ; slice++){
            int offsetY=0;
            ImageProcessor ip = img.getImageStack().getProcessor(slice+1);
            for (int y=0; y<this.sizeY; y++) {
                for (int x=0; x<this.sizeX; x++) {
                    this.pixels[offsetY+offsetZ+x]=ip.get(x, y);
                }
                offsetY+=this.sizeX;
            }
            offsetZ+=this.sizeXY;
	}
    }

Thank you for your help,
bye
jean

Great to hear that :slight_smile:

BTW: In many cases a conversion between signed and unsigned values is not necessary: When the values are only passed to CUDA, it’s up to CUDA to interpret the value appropriately. One could say: These are not numbers, they are only bit patterns :wink: Even if you have a short value >32768 (which has become negative) you have not lost any information - this negative value can be converted into the appropriate positive „„unsigned““ int if necessary:

class ShortTest
{
    public static void main(String args[])
    {
        short s0 = (short)(32000);
        short s1 = (short)(32000+3000);

        System.out.println("32000 as short: "+s0);
        System.out.println("35000 as short: "+s1);

        int i0 = asUnsigned(s0);
        int i1 = asUnsigned(s1);

        System.out.println("32000 as int  : "+i0);
        System.out.println("35000 as int  : "+i1);
    }

    private static int asUnsigned(short s)
    {
        if (s < 0)
        {
            return s-Short.MIN_VALUE*2;
        }
        return s;
    }
}

But I’m not sure if this also the case here: In some cases, it is necessary (or at least easier or more efficient) to peform a conversion to int, in order to avoid problems with the signs.

bye

Hello Marco,
i get your point about the short type, but in my case i’ll have to manipulate the value both in java and cuda. So i think to simplify i’ll convert everything to float.

I’m sorry i was wrong in my previous post, i forgot to execute the function in my code. So it’s not working yet, I get an other exception:
jcuda.CudaException: CUDA_ERROR_LAUNCH_FAILED
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1337)
at jcuda.utils.KernelLauncher.call(KernelLauncher.java:1040)
at JCuda_3DFilters.execute(JCuda_3DFilters.java:102)

I really can’t figure out where is the problem.
In the plugin i’m trying to do, i just mix your simple JCuda plugin with JCudaDriverTextureTest, to test the use of textures with imageJ. I tried to simplify a lot (maybe i should simplify even more)
here is my kernel (i use the same as you in you JCudaDriverTextureTest example) : i just want to return the value of a single pixel for now:

texture<float,  3, cudaReadModeElementType> input;
extern "C"
__global__ void test(float *output, float posX, float posY, float posZ)
{
    float result = tex3D(input, posX, posY, posZ);
    output[0] = result;
}

and here is the execution part, where i get the exception:


    void execute() {

    	 // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * sizeXYZ);
        Pointer pOutput = Pointer.to(dOutput);
        kernelLauncher.setGridSize(this.gridSize, this.gridSize);
        kernelLauncher.setBlockSize(this.blockSize, this.blockSize, 1);
        float posX= (float) 103/sizeX;
        float posY= (float) 45/sizeY;
        float posZ= (float) 0/sizeZ;
        kernelLauncher.call(pOutput, posX, posY, posZ); //(line 102!!!!!!!!!!)

        // Obtain the output on the host
        float hOutput[] = new float[sizeXYZ];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * sizeXYZ);
        buildImg(hOutput, "output").show();
        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);
    }

here is the initialisation of the module:

        this.blockSize=16;
        this.gridSize = (int)Math.ceil((double)Math.max(this.sizeX, this.sizeY)/this.blockSize);
        IJ.log("block size:" + this.blockSize+ "grid size: "+this.gridSize);
               // Load the CUBIN file containing the kernels
        String cubinFileName = "JCuda_3DFilters_Kernel.cubin";

        // Create the kernelLauncher that will execute the kernel
    	InputStream cubinInputStream = getClass().getResourceAsStream(cubinFileName);
        kernelLauncher = KernelLauncher.load(cubinInputStream, "test");
        module = kernelLauncher.getModule();

here is the copy part, i made modifications to have everything in float, and i don’t get exception for this part so i suppose it’s ok:
(i also tested the part where i convert the stackprocessor to a 1D float array and i works)

        void mallocTex3D (){
        // Create the array on the device
        array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 1;
        cuArray3DCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(pixels);
        copy.srcPitch = sizeX * Sizeof.FLOAT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.FLOAT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "input");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        //cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

    }

I know you’re very busy, but if you have any idea of where the problem might be, i’ll be grateful.
Bye
Jean

Hello,

I’ll try to test this and se if I can find the error early this week. Maybe it would be easiest if you could send me the source for the test in a form that allows compiling and testing it quickly, possibly as a ZIP file in a PM or via mail. Otherwise, I’ll try to build the rest of the Plugin around the code you posted. Do I need specific input files to really test this?

bye
Marco

I’ll post my responste to the mail here (with permission from Jeannot) :

Hello

I tested the Plugin that you sent me. As I mentioned, for this “initial debugging” phase it might be helpful to run it as a normal Java application, to avoid having to create the Plugin and run it via ImageJ again and again - debugging CUDA may sometimes… well, at least a little bit of “trial and error”…

The modified code is attached. The most important modifications are marked with “XXX”:

  • Added a “main()”
  • Created a setup method for standalone use
  • Read the CUBIN via a FileInputStream
  • Important: I have set the grid/block dimensions for the KernelLauncher call to (1,1) and (1,1,1). Otherwise, the kernel would be executed by hundreds of threads in parallel, and all threads would write into the “output” simultaneously - this would cause invalid float values to be written. Every thread may only write to ONE memory location.
  • Most important (this probably caused the LAUNCH_FAILED crash that you described) : The pointer that is passed as the “output” to the KernelLauncher call may NOT be a “pointer to a pointer”, but ONLY the pointer itself! (The KernelLauncher internally does all this inconvenient “pointer-to-pointer”-stuff that was necessary when using “cuParamSetv”)

I have not tested it as a Plugin, but maybe this helps to proceed with your tests.

import static jcuda.driver.CUaddress_mode.CU_TR_ADDRESS_MODE_CLAMP;
import static jcuda.driver.CUarray_format.CU_AD_FORMAT_FLOAT;
import static jcuda.driver.CUfilter_mode.CU_TR_FILTER_MODE_LINEAR;
import static jcuda.driver.JCudaDriver.CU_TRSA_OVERRIDE_FORMAT;
import static jcuda.driver.JCudaDriver.*;
import ij.IJ;
import ij.ImagePlus;
import ij.gui.NewImage;
import ij.plugin.filter.PlugInFilter;
import ij.process.FloatProcessor;
import ij.process.ImageProcessor;
 
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.Arrays;
 
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUDA_ARRAY3D_DESCRIPTOR;
import jcuda.driver.CUDA_MEMCPY3D;
import jcuda.driver.CUarray;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUmemorytype;
import jcuda.driver.CUmodule;
import jcuda.driver.CUtexref;
import jcuda.driver.JCudaDriver;
import jcuda.utils.KernelLauncher;
 
 
/**
 * A simple example for an ImageJ Plugin that uses JCuda.
 */
public class JCuda_3DFilters implements PlugInFilter {
   
    // XXX To be able to start it as an application
    public static void main(String args[])
    {
        JCuda_3DFilters p = new JCuda_3DFilters();
        p.sizeX = 16;
        p.sizeY = 16;
        p.sizeZ = 16;
        p.sizeXY = p.sizeX * p.sizeY;
        p.sizeXYZ = p.sizeXY * p.sizeZ;
        p.pixels = new float[p.sizeXYZ];
        for (int i=0; i<p.sizeXYZ; i++)
        {
            p.pixels** = 0.123f;
        }
        p.setupInternal();
        p.run(null);
    }
   
   
    /**
     * The current image to operate on
     */
    private ImagePlus img = null;
    public int sizeX, sizeY, sizeZ, sizeXY, sizeXYZ;
    public float[] pixels;
    public float radius;
    public float scalexy, scalez;
    public CUmodule module;
    public CUarray array;
    public int[] threads_per_block, Max_blockZ, Max_blockX, Max_blockY, Max_gridX, Max_gridY, Max_gridZ;
    public int blockSize, gridSize;
    private KernelLauncher kernelLauncher = null;
 
    @Override
    public int setup(String arg, ImagePlus imagePlus) {
        img = imagePlus;
        this.getPixels();
        return setupInternal();
    }
 
    // XXX Called when starting it as an application
    public int setupInternal()
    {
        // Initialize the driver and create a context for the first device.
        JCudaDriver.setExceptionsEnabled(true);
        cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        //cuDeviceGetCount(); pour gerer les multidevices
        cuDeviceGet(dev, 0);
        this.threads_per_block=new int[1];
        //this.Max_gridX=new int[1];
        cuDeviceGetAttribute(this.threads_per_block, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        //cuDeviceGetAttribute(this.Max_blockX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        //cuDeviceGetAttribute(this.Max_blockY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        //cuDeviceGetAttribute(this.Max_blockZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        //cuDeviceGetAttribute(this.Max_gridX, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        //cuDeviceGetAttribute(this.Max_gridY, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        //cuDeviceGetAttribute(this.Max_gridZ, jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        cuCtxCreate(pctx, 0, dev);
        //this.blockSize=(int)Math.sqrt(this.threads_per_block[0]);
        this.blockSize=16;
        this.gridSize = (int)Math.ceil((double)Math.max(this.sizeX, this.sizeY)/this.blockSize);
        IJ.log("block size:" + this.blockSize+ " grid size: "+this.gridSize);
        //this.gridSize=(this.sizeX+this.blockX-1)/this.blockX;
               // Load the CUBIN file containing the kernels
        String cubinFileName = "JCuda_3DFilters_Kernel.cubin";
 
        // Create the kernelLauncher that will execute the kernel
        // XXX InputStream cubinInputStream = getClass().getResourceAsStream(cubinFileName);
        InputStream cubinInputStream = null;
        try
        {
            cubinInputStream = new FileInputStream(cubinFileName);
        }
        catch (FileNotFoundException e)
        {
            e.printStackTrace();
        }
        kernelLauncher = KernelLauncher.load(cubinInputStream, "test");
        module = kernelLauncher.getModule();
        return DOES_16;
    }
 
    @Override
    public void run (ImageProcessor imageProcessor) {
        //buildImg(pixels, "lalala").show();
        this.mallocTex3D();
        this.execute();
    }
 
    void execute() {
 
         // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * sizeXYZ);
        // XXX Pointer pOutput = Pointer.to(dOutput);
        //kernelLauncher.setGridSize(this.gridSize, this.gridSize); XXX
        //kernelLauncher.setBlockSize(this.blockSize, this.blockSize, 1); XXX
        kernelLauncher.setGridSize(1, 1);
        kernelLauncher.setBlockSize(1, 1, 1);
        float posX= (float) 0;
        float posY= (float) 0;
        float posZ= (float) 0;
        // XXX kernelLauncher.call(pOutput, posX, posY, posZ);
        kernelLauncher.call(dOutput, posX, posY, posZ);
 
        // Obtain the output on the host
        float hOutput[] = new float[sizeXYZ];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * sizeXYZ);
       
        //testArray();
        System.out.println("output[0] is "+hOutput[0]);
       
        buildImg(hOutput, "output").show();
        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);
    }
   
   
 
    private void getPixels () {
        this.sizeX=img.getWidth();
    this.sizeY=img.getHeight();
    this.sizeZ=img.getNSlices();
    this.sizeXY=this.sizeX*this.sizeY;
    this.sizeXYZ=this.sizeXY*this.sizeZ;
    this.pixels=new float[this.sizeXYZ];
        int offsetZ=0;
        FloatProcessor fp = null;
        for (int slice=0; slice<this.sizeZ; slice++){
            System.arraycopy(img.getImageStack().getProcessor(slice+1).toFloat(0, fp).getPixels(), 0, pixels, offsetZ, sizeXY);
            offsetZ+=this.sizeXY;
    }
    }
 
    void mallocTex3D (){
        // Create the array on the device
        array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 1;
        cuArray3DCreate(array, ad);
 
        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(pixels);
        copy.srcPitch = sizeX * Sizeof.FLOAT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.FLOAT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);
       
        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "input");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        //cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER);
        //cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);
 
        //testArray();
    }
 
 


    public ImagePlus buildImg(float[] array, String title) {
        double min=array[0];
        double max=array[0];
        ImagePlus out=NewImage.createImage(title, this.sizeX, this.sizeY, this.sizeZ, 32, 1);
        for (int z=0; z<this.sizeZ; z++){
            IJ.showStatus("Creating the image...");
            out.setSlice(z+1);
            int offsetZ=z*sizeXY;
            for (int y=0; y<this.sizeY; y++){
                int offsetY=y*sizeX;
                for (int x=0; x<this.sizeX; x++){
                    float currVal=array[x+offsetZ+offsetY];
                    min=Math.min(min, currVal);
                    max=Math.max(max, currVal);
                    out.getProcessor().putPixelValue(x,y, currVal);
                }
            }
        }
        IJ.showStatus("");
        out.getProcessor().setMinAndMax(min, max);
                return out;
    }
 
   
    /**
     * Read the contents of the array (which may NOT be null)
     * and print it on the console
     */
    private void testArray()
    {
        float temp[] = new float[pixels.length];
       
        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.srcArray = array;
        copy.srcPitch = sizeX * Sizeof.FLOAT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.dstHost = Pointer.to(temp);
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.FLOAT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);
       
        System.out.println("Read from array "+Arrays.toString(temp));
    }
}

(Here is my answer)
Thanks Marco!

So the problem was the pointer to pointer stuff in the kernel launcher, so i just removed the pointer to dOutput in the kernellaucher.call call and it worked with imageJ :D:

kernelLauncher.call(dOutput, sizeX, sizeY, sizeXY, sizeZ);

Bye,
Jean