[MENTION=3721]typecheck[/MENTION]
Thanks again for pointing this out. I should probably have added some sort of … typecheck … for the pointers. In any case, I reviewed the types now: Some parameters of [inline]cudnnBatchNormalizationBackward[/inline] (and [inline]cudnnSpatialTfSamplerBackward[/inline]) had to be in host memory. Additionally, the last parameters of [inline]cudnnBatchNormalizationBackward[/inline] are optional (i.e. they may be [inline]null[/inline]). The update is in https://github.com/jcuda/jcudnn/commit/76d3c83498ee2e0d0c086b5793de31de8c6c0aba
I also updated and ran the test. Some remarks:

Important: You did not use [inline]cudnnCreate[/inline] to create the [inline]cudnnHandle[/inline].

Is there a particular reason to use the JCublas functions for allocating and copying memory? Strictly speaking and technically, it does not really matter, because (since CUDA 3.0), the CUBLAS functions like [inline]cublasAlloc[/inline] are basically just wrappers for [inline]cudaMalloc[/inline] etc. But it seems more “natural” to use the CUDA functions, because cuDNN is otherwise unrelated to CUBLAS.

Admittedly, in Java (due to the lack of [inline]typedef[/inline]) the type information for types that map to primitive types is lost. But I’d strongly recommend to use the type constants instead of plain [inline]int[/inline] values. E.g. instead of
int mode = 1; // spatial mode
...
cudnnSetTensor4dDescriptor(descriptor,
0, 0,
dims[0], dims[1], dims[2], dims[3]);
one should probably use
int mode = CUDNN_BATCHNORM_SPATIAL;
...
cudnnSetTensor4dDescriptor(descriptor,
CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
dims[0], dims[1], dims[2], dims[3]);
(with the appropriate static imports)
Here is the updated test code (as an application) :
package jcuda.jcudnn.test;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
import static jcuda.jcudnn.JCudnn.cudnnCreate;
import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
import static jcuda.jcudnn.cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_FLOAT;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import java.util.Arrays;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcudnn.JCudnn;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.runtime.JCuda;
/**
 * Standalone test application for
 * {@link JCudnn#cudnnBatchNormalizationBackward}.
 *
 * Sets up a 2x3x1x1 input tensor "x" and a per-channel (1x3x1x1) scale
 * tensor, runs the spatial batch-normalization backward pass, and prints
 * the return code and the resulting input gradient "dx".
 */
public class JCudnnBatchNormalizationTest
{
    public static void main(String[] args)
    {
        // Fail fast with exceptions instead of silently returned error codes
        JCudnn.setExceptionsEnabled(true);
        JCuda.setExceptionsEnabled(true);

        cudnnHandle cudnnHandle = new cudnnHandle();
        cudnnCreate(cudnnHandle);

        // Descriptor for the data tensors x, dy and dx: N=2, C=3, H=1, W=1
        int[] dims = {2, 3, 1, 1};
        cudnnTensorDescriptor descriptor = new cudnnTensorDescriptor();
        cudnnCreateTensorDescriptor(descriptor);
        cudnnSetTensor4dDescriptor(descriptor,
            CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
            dims[0], dims[1], dims[2], dims[3]);

        // Descriptor for the per-channel batch-norm parameters: 1xCx1x1
        cudnnTensorDescriptor norm_dptr = new cudnnTensorDescriptor();
        cudnnCreateTensorDescriptor(norm_dptr);
        cudnnSetTensor4dDescriptor(norm_dptr,
            CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, dims[1], 1, 1);

        // Device buffers: input x, output gradient dy, and the input
        // gradient dx that the backward pass will compute
        Pointer x = new Pointer();
        Pointer dy = new Pointer();
        Pointer dx = new Pointer();
        int size = dims[0]*dims[1]*dims[2]*dims[3];
        cudaMalloc(x, size * Sizeof.FLOAT);
        cudaMalloc(dy, size * Sizeof.FLOAT);
        cudaMalloc(dx, size * Sizeof.FLOAT);
        cudaMemcpy(x, Pointer.to(new float[] {1,2,3,4,5,6}),
            size * Sizeof.FLOAT, cudaMemcpyHostToDevice);
        // NOTE(review): "dy" is never filled with data, so the backward
        // pass reads uninitialized device memory. For a meaningful
        // gradient, copy a defined output gradient into "dy" here.

        int norm_size = dims[1];
        Pointer scale = new Pointer();
        Pointer d_scale = new Pointer();
        Pointer d_bias = new Pointer();
        Pointer saved_mean = new Pointer();
        Pointer saved_variance = new Pointer();
        cudaMalloc(scale, norm_size * Sizeof.FLOAT);
        cudaMalloc(d_scale, norm_size * Sizeof.FLOAT);
        cudaMalloc(d_bias, norm_size * Sizeof.FLOAT);
        cudaMalloc(saved_mean, norm_size * Sizeof.FLOAT);
        cudaMalloc(saved_variance, norm_size * Sizeof.FLOAT);
        cudaMemcpy(scale, Pointer.to(new float[] {10,100,1000}),
            norm_size * Sizeof.FLOAT, cudaMemcpyHostToDevice);
        // NOTE(review): saved_mean / saved_variance are optional (they
        // may be null); when passed, they should hold the values saved
        // by the corresponding forward pass. Here they are uninitialized.

        int mode = CUDNN_BATCHNORM_SPATIAL;
        // Scaling factors alpha=1, beta=0; these reside in host memory
        Pointer one = Pointer.to(new float[]{1.0f});
        Pointer zero = Pointer.to(new float[]{0.0f});
        double epsilon = 1;
        int ret = cudnnBatchNormalizationBackward(
            cudnnHandle, mode, one, zero, one, zero,
            descriptor, x, descriptor, dy, descriptor, dx,
            norm_dptr, scale, d_scale, d_bias,
            epsilon, saved_mean, saved_variance);
        System.out.println(ret);

        float[] result = new float[size];
        cudaMemcpy(Pointer.to(result), dx,
            size * Sizeof.FLOAT, cudaMemcpyDeviceToHost);
        System.out.println(Arrays.toString(result));

        // Release all device allocations and cuDNN resources
        // (previously leaked)
        JCuda.cudaFree(x);
        JCuda.cudaFree(dy);
        JCuda.cudaFree(dx);
        JCuda.cudaFree(scale);
        JCuda.cudaFree(d_scale);
        JCuda.cudaFree(d_bias);
        JCuda.cudaFree(saved_mean);
        JCuda.cudaFree(saved_variance);
        JCudnn.cudnnDestroyTensorDescriptor(descriptor);
        JCudnn.cudnnDestroyTensorDescriptor(norm_dptr);
        JCudnn.cudnnDestroy(cudnnHandle);
    }
}
Regarding the update: Which OS are you using? Or specifically: If you want to, I can provide the updated Win64 binary, if you don’t want to compile it yourself. (I’ll likely not create a new “release” for this, because the update to CUDA 8.0 (final) is already pending anyhow — although I cannot say an exact date, it should be available “soon”).
*** Edit ***
EDIT: BTW, the output that is printed for “dx” is still a 0-vector. But I’ll have to wrap my head around what the function is actually supposed to do before I can tell whether this is correct or not. (There are many input parameters, and some of them are 0-vectors as well, so I’m not sure)
*** Edit ***
Another EDIT: I have checked this against the corresponding C implementation
#include <cuda.h>
#include <cudnn.h>
#include <cstdio>
int main(int argc, char* argv[])
{
cudnnHandle_t cudnnHandle;
cudnnCreate(&cudnnHandle);
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
int dims[] = { 2, 3, 1, 1 };
cudnnTensorDescriptor_t descriptor;
cudnnCreateTensorDescriptor(&descriptor);
cudnnSetTensor4dDescriptor(descriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, dims[0], dims[1], dims[2], dims[3]);
cudnnTensorDescriptor_t norm_dptr;
cudnnCreateTensorDescriptor(&norm_dptr);
cudnnSetTensor4dDescriptor(norm_dptr, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, dims[1], 1, 1);
double epsilon = 1;
float one = 1.0f;
float zero = 0.0f;
float *x;
float *dy;
float *dx;
int size = dims[0] * dims[1] * dims[2] * dims[3];
cudaMalloc(&x, size * sizeof(float));
cudaMalloc(&dy, size * sizeof(float));
cudaMalloc(&dx, size * sizeof(float));
float h_x[] = { 1, 2, 3, 4, 5, 6 };
cudaMemcpy(x, h_x, size * sizeof(float), cudaMemcpyHostToDevice);
int norm_size = dims[1];
float *scale;
float *d_scale;
float *d_bias;
float *saved_mean;
float *saved_variance;
cudaMalloc(&scale, norm_size * sizeof(float));
cudaMalloc(&d_scale, norm_size * sizeof(float));
cudaMalloc(&d_bias, norm_size * sizeof(float));
cudaMalloc(&saved_mean, norm_size * sizeof(float));
cudaMalloc(&saved_variance, norm_size * sizeof(float));
float h_scale[] = { 10, 100, 1000 };
cudaMemcpy(scale, h_scale, norm_size * sizeof(float), cudaMemcpyHostToDevice);
int ret = cudnnBatchNormalizationBackward(cudnnHandle, mode, &one, &zero, &one, &zero,
descriptor, x, descriptor, dy, descriptor, dx,
norm_dptr, scale, d_scale, d_bias,
epsilon, saved_mean, saved_variance);
float *result = new float[size];
cudaMemcpy(result, dx, size * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < size; i++)
{
printf("At %d have %f
", i, result[i]);
}
printf("Done");
return 0;
}
And this also prints the 0-vector, so it is at least reasonable to say that this is “correct”…