INVALID CUresult: 999 in Hadoop-Job

Hallo,
I am trying to run a JCuda-supported Hadoop Map/Reduce job.
The job is pretty simple. It is an edge detection with the Sobel-Filter.

When i try my program in a simple java application without Hadoop everything works fine. No Errors!

BUT on Hadoop i get Errors.
The Hadoop job reads the image files from a SequenceFile and processes the images via my JCuda-supported Sobel filter (see below).
When i try it with a smaller SequenceFile (44.62 MB) it works fine.
But when I try it with a bigger SequenceFile (370.63 MB) it fails with

	at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:170)
	at jcuda.driver.JCudaDriver.cuCtxCreate(JCudaDriver.java:958)
	at CudaSobel.sobel(CudaSobel.java:175)
	at HadoopCUDASobelConverter$ImageMd5Mapper.map(HadoopCUDASobelConverter.java:129)
	at HadoopCUDASobelConverter$ImageMd5Mapper.map(HadoopCUDASobelConverter.java:1)
	at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
	at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:621)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:305)
	at org.apache.hadoop.mapred.Child.main(Child.java:170)
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUctx_flags;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;

public class CudaSobel {
        CUcontext pctx = null;
	CUdevice dev = null;
	CUmodule module = null;
	String localcubinFileName = "";
	CUdeviceptr d_InArray = null;
	CUdeviceptr d_OutArray = null;
	CUdeviceptr d_SobelMartix = null;
	int[] h_SobelMartix = { -1, 0, 1, -2, 0, 2, -1, 0, 1 };
	// int []h_SobelMartix={-3,0,4,-10,0,10,-3,0,3};

	int[] h_OutArray = null;

	public CudaSobel(String cubinFileName) {
		// TODO Auto-generated constructor stub
		JCudaDriver.cuInit(0);
		JCudaDriver.setExceptionsEnabled(true);
		pctx = new CUcontext();
		dev = new CUdevice();
		JCudaDriver.cuDeviceGet(dev, 0);

		localcubinFileName = checkCubinFile(cubinFileName);

	}

	public int[] sobel(int[] h_InArray, int iw, int ih) {
		long vor = System.currentTimeMillis();
		h_OutArray = new int[iw * ih];
/*
* ERROR HERE
*/
		//JCudaDriver.cuCtxCreate(pctx, CUctx_flags.CU_CTX_BLOCKING_SYNC, dev);
		
		JCudaDriver.cuCtxCreate(pctx, CUctx_flags.CU_CTX_SCHED_AUTO, dev);
/*
* ERROR HERE
*/		

		d_InArray = new CUdeviceptr();
		d_OutArray = new CUdeviceptr();
		d_SobelMartix = new CUdeviceptr();

		// Load the CUBIN file.
		CUmodule module = new CUmodule();
		JCudaDriver.cuModuleLoad(module, localcubinFileName);

		// Obtain a function pointer to the "find" function.
		CUfunction function = new CUfunction();
		JCudaDriver.cuModuleGetFunction(function, module, "SobelFilter");

		JCudaDriver.cuMemFree(d_InArray);
		JCudaDriver.cuMemFree(d_SobelMartix);
		JCudaDriver.cuMemFree(d_OutArray);

		// Allocate arrays on the device, one for each row. The pointers
		// to these array are stored in host memory.

		JCudaDriver.cuMemAlloc(d_InArray, h_InArray.length * Sizeof.INT);
		JCudaDriver.cuMemcpyHtoD(d_InArray, Pointer.to(h_InArray),
				h_InArray.length * Sizeof.INT);

		JCudaDriver.cuMemAlloc(d_OutArray, h_OutArray.length * Sizeof.INT);
		JCudaDriver.cuMemcpyHtoD(d_OutArray, Pointer.to(h_OutArray),
				h_OutArray.length * Sizeof.INT);

		JCudaDriver
				.cuMemAlloc(d_SobelMartix, h_SobelMartix.length * Sizeof.INT);
		JCudaDriver.cuMemcpyHtoD(d_SobelMartix, Pointer.to(h_SobelMartix),
				h_SobelMartix.length * Sizeof.INT);

		// Set up the parameters for the function call: One pointer (to
		// pointers) for the input, one int for the size, and one pointer
		// for the output array. Note that for 'cuParamSetv' you have
		// to pass a pointer to a pointer, in order to set the value
		// of the pointer as the parameter.
		Pointer dpInArray = Pointer.to(d_InArray);
		Pointer dpOutArray = Pointer.to(d_OutArray);
		Pointer dpSobelMartix = Pointer.to(d_SobelMartix);
		Pointer pih = Pointer.to(new int[] { ih });
		Pointer piw = Pointer.to(new int[] { iw });

		int offset = 0;
		offset = JCudaDriver.align(offset, Sizeof.POINTER);
		JCudaDriver.cuParamSetv(function, offset, dpInArray, Sizeof.POINTER);
		offset += Sizeof.POINTER;

		offset = JCudaDriver.align(offset, Sizeof.POINTER);
		JCudaDriver.cuParamSetv(function, offset, dpOutArray, Sizeof.POINTER);
		offset += Sizeof.POINTER;

		offset = JCudaDriver.align(offset, Sizeof.INT);
		JCudaDriver.cuParamSetv(function, offset, piw, Sizeof.INT);
		offset += Sizeof.INT;

		offset = JCudaDriver.align(offset, Sizeof.INT);
		JCudaDriver.cuParamSetv(function, offset, pih, Sizeof.INT);
		offset += Sizeof.INT;

		offset = JCudaDriver.align(offset, Sizeof.POINTER);
		JCudaDriver
				.cuParamSetv(function, offset, dpSobelMartix, Sizeof.POINTER);
		offset += Sizeof.POINTER;

		JCudaDriver.cuParamSetSize(function, offset);

		// Set up the execution parameters.
		// JCudaDriver.cuFuncSetBlockShape(function, 32, 16, 1);
		// int grid_width = (inImage.getWidth(null)+31)/32;
		// int grid_height = (ih+15)/16;
		int TILE_WIDTH = 20;
		int TILE_HEIGHT = 16;
		JCudaDriver.cuFuncSetBlockShape(function, TILE_WIDTH, TILE_HEIGHT, 1);
		int grid_width = (iw + (TILE_WIDTH - 1)) / TILE_WIDTH;
		int grid_height = (ih + (TILE_HEIGHT - 1)) / TILE_HEIGHT;
		// System.out.println(grid_width + "  " + grid_height);

		long nach = System.currentTimeMillis();
		System.out.println("Prepare Kernellunch: " + (nach - vor));
		vor = System.currentTimeMillis();

		JCudaDriver.cuLaunchGrid(function, grid_width, grid_height);
		JCudaDriver.cuCtxSynchronize();

		nach = System.currentTimeMillis();
		System.out.println("Kernellunch: " + (nach - vor));

		// Allocate host output memory and copy the device output
		// to the host.

		vor = System.currentTimeMillis();
		JCudaDriver.cuMemcpyDtoH(Pointer.to(h_OutArray), d_OutArray,
				h_OutArray.length * Sizeof.INT);
		nach = System.currentTimeMillis();
		System.out.println("Copy Result from Device: " + (nach - vor));

		JCudaDriver.cuMemFree(d_InArray);
		JCudaDriver.cuMemFree(d_OutArray);
		JCudaDriver.cuMemFree(d_SobelMartix);

		JCudaDriver.cuCtxDestroy(pctx);

		return h_OutArray;
	}

}

I don’t know what’s wrong?!?

I use:
ubuntu 9.10 64bit
hadoop-0.20.2
jcuda-0.3.2a (with libJCudaDriver-linux-x86_64)
also tried
jcuda-0.3.2RC (with libJCudaDriver-linux-x86_64)

I could upload my project if that would help.
Anyone any idea?
Do you need any other information?

thanks for help

Hello

The error code ‘999’ stands for CUDA_ERROR_UNKNOWN (The fact that it is printed as an ‘INVALID error code’ was a bug in version 0.3.2, but this referred solely to the error message itself, and is thus not related to the actual “CUDA_ERROR_UNKNOWN”, which indeed seems to be reported by the cuCtxCreate function).

When you are talking about a sequence of images, I assume that this method is called several times. Depending on the remaining setup, it might be necessary to cuCtxDestroy() at the end of the function. (I mean it might be the case that several contexts are created but never destroyed, and after a while it fails to create new contexts - but this is just a guess).

Is it right that this function is called several times, and that it only fails after it has been called a few times?

bye

Hi and thanks for replying …

Yes you are right. I call the function several times…

For each call I create the context at the beginning (the cuCtxCreate call at the top of sobel); that is where the code fails…

At the end I destroy the context with cuCtxDestroy() (the last call before the return in sobel)…
I don’t have a finally block here, but I also tried it with a finally block and the error remained…

In my local test setup (without Hadoop) I called this function over 100 times without this error… I just got some LaunchTimeouts, but that’s another issue

any other ideas?

Sorry, I missed that cuCtxDestroy. So this should be fine. The next guess would be that a
cuModuleUnload(module);
might be worth a try. Admittedly, I’m not sure how “clever” CUDA is in this case, i.e. whether a module will automatically be destroyed/freed when the context is destroyed (would have to look this up in the spec - I guess it should be unloaded automatically, but I’m not sure)

Calling “cuMemFree” for the device memory which has not yet been allocated should not be necessary, and since the specification says that the memory “…must have been returned by a previous call to cuMemAlloc()…”, it might even cause errors in the worst case. You should remove these lines at the beginning of the method (but keep them at the end, of course!)

Apart from that, I don’t see any “obvious” possible reasons for the error. Unfortunately, it’s hard to figure out where a “CUDA_ERROR_UNKNOWN” might come from…

It might look like an attempt for a cheaty workaround for the bug, but just as a general hint: It might be worthwhile examining whether it is possible to do as much of the “redundant” work only once. At the moment, you are creating the context many times. And you are allocating/freeing the memory many times. And especially the loading of the CUBIN file might be rather time consuming and should preferably be done only once. Depending on how this class is used, it might be better to have a pattern like

CudaSobel cs = new CudaSobel(...);
cs.initializeEverything();
for (int i=0; i<100; i++) { cs.sobel(frames[i],...); }
cs.shutDownEverything();

But I’m not sure in how far this is applicable here.

In any case, it should work even if everything is initialized and torn down for each call to ‘sobel’, so it would be good to know whether unloading the module and removing the cuMemFree calls already solved the problem.

bye