Thanks for the fast answer.
I had a closer look at the kernel I want to use.
It contains some templates.
So the new question is: is it possible to access templated kernels via JCuda? (Below the kernel code I have sketched what I would try.)
Here is the kernel code:
#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_
#include <stdio.h>
// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory
{
__device__ inline operator T*()
{
extern __shared__ int __smem[];
return (T*)__smem;
}
__device__ inline operator const T*() const
{
extern __shared__ int __smem[];
return (T*)__smem;
}
};
template<>
struct SharedMemory<double>
{
__device__ inline operator double*()
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
__device__ inline operator const double*() const
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
};
template <class T>
__global__ void
reduce0(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2) {
// modulo arithmetic is slow!
if ((tid % (2*s)) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T>
__global__ void
reduce1(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
__syncthreads();
// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
int index = 2 * s * tid;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T>
__global__ void
reduce2(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T>
__global__ void
reduce3(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockDim.x < n)
mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] = mySum = mySum + sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T, unsigned int blockSize>
__global__ void
reduce4(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
mySum += g_idata[i+blockSize];
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem
for(unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s)
{
sdata[tid] = mySum = mySum + sdata[tid + s];
}
__syncthreads();
}
if (tid < 32)
{
volatile T *smem = sdata;
if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; }
if (blockSize >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; }
if (blockSize >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; }
if (blockSize >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T, unsigned int blockSize>
__global__ void
reduce5(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
mySum += g_idata[i+blockSize];
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
volatile T* smem = sdata;
if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; }
if (blockSize >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; }
if (blockSize >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; }
if (blockSize >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; }
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void
reduce6(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
T mySum = 0;
while (i < n)
{
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += g_idata[i+blockSize];
i += gridSize;
}
sdata[tid] = mySum;
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] = mySum = mySum + sdata[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
volatile T* smem = sdata;
if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; }
if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; }
if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid + 8]; }
if (blockSize >= 8) { smem[tid] = mySum = mySum + smem[tid + 4]; }
if (blockSize >= 4) { smem[tid] = mySum = mySum + smem[tid + 2]; }
if (blockSize >= 2) { smem[tid] = mySum = mySum + smem[tid + 1]; }
}
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
extern "C"
bool isPow2(unsigned int x);
////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template <class T>
void
reduce(int size, int threads, int blocks,
int whichKernel, T *d_idata, T *d_odata)
{
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
// choose which of the optimized versions of reduction to launch
switch (whichKernel)
{
case 0:
reduce0<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 1:
reduce1<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 2:
reduce2<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 3:
reduce3<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
break;
case 4:
switch (threads)
{
case 512:
reduce4<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce4<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce4<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce4<T, 64><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce4<T, 32><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce4<T, 16><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce4<T, 8><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce4<T, 4><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce4<T, 2><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce4<T, 1><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
break;
case 5:
switch (threads)
{
case 512:
reduce5<T, 512><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce5<T, 256><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce5<T, 128><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce5<T, 64><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce5<T, 32><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce5<T, 16><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce5<T, 8><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce5<T, 4><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce5<T, 2><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce5<T, 1><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
break;
case 6:
default:
if (isPow2(size))
{
switch (threads)
{
case 512:
reduce6<T, 512, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce6<T, 256, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce6<T, 128, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce6<T, 64, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce6<T, 32, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce6<T, 16, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce6<T, 8, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce6<T, 4, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce6<T, 2, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce6<T, 1, true><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
}
else
{
switch (threads)
{
case 512:
reduce6<T, 512, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 256:
reduce6<T, 256, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 128:
reduce6<T, 128, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 64:
reduce6<T, 64, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 32:
reduce6<T, 32, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 16:
reduce6<T, 16, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 8:
reduce6<T, 8, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 4:
reduce6<T, 4, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 2:
reduce6<T, 2, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
case 1:
reduce6<T, 1, false><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size); break;
}
}
break;
}
}
// Instantiate the reduction function for 3 types
template void
reduce<int>(int size, int threads, int blocks,
int whichKernel, int *d_idata, int *d_odata);
template void
reduce<float>(int size, int threads, int blocks,
int whichKernel, float *d_idata, float *d_odata);
template void
reduce<double>(int size, int threads, int blocks,
int whichKernel, double *d_idata, double *d_odata);
#endif // #ifndef _REDUCE_KERNEL_H_
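My current guess (untested, so please correct me if I am wrong): the templated kernels only end up in the PTX file as concrete, name-mangled entry points once they are instantiated for concrete types, so I would add explicit instantiations at the end of the .cu file, roughly like this. The instantiation lines and the mangled name in the comment are my own guesses, not part of the NVIDIA sample:

// Untested sketch: explicit instantiations, so that nvcc emits
// concrete kernels into the PTX. The entry points then carry
// mangled names (I would expect something like _Z7reduce0IiEvPT_S1_j
// for reduce0<int>, but I would read the real names out of the PTX file).
template __global__ void reduce0<int>(int *g_idata, int *g_odata, unsigned int n);
template __global__ void reduce0<float>(float *g_idata, float *g_odata, unsigned int n);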
And here is the Java code:
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package jcudavecreduce;
import jcuda.jcublas.JCublas;
import static jcuda.driver.JCudaDriver.*;
import java.io.*;
import jcuda.*;
import jcuda.driver.*;
import static jcuda.runtime.cudaMemcpyKind.*;
import static jcuda.runtime.JCuda.*;
public class JCudaVecReduce {
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException{
JCublas.initialize();
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
// Create the PTX file by calling the NVCC
String ptxFileName = preparePtxFile("reduction_kernel.cu");
// Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
// Load the ptx file.
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "reduce" function.
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "reduce");
int numElements = 10000000;
int[] input = new int[numElements];
for(int i=0;i<numElements;i++){
input[i] = 1;
}
int[] output = new int[numElements];
CUdeviceptr dInput = new CUdeviceptr();
cuMemAlloc(dInput, numElements * Sizeof.INT);
cudaMemcpy(dInput, Pointer.to(input), numElements * Sizeof.INT, cudaMemcpyHostToDevice);
CUdeviceptr dOutput = new CUdeviceptr();
cuMemAlloc(dOutput, numElements * Sizeof.INT);
cudaMemcpy(dOutput, Pointer.to(output), numElements * Sizeof.INT, cudaMemcpyHostToDevice);
int threadPerBlock = 512;
int GridDimX = (numElements+threadPerBlock-1)/threadPerBlock;
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{numElements}),
Pointer.to(new int[] {threadPerBlock}),
Pointer.to(new int[] {GridDimX}),
Pointer.to(new int[] {0}),
Pointer.to(dInput),
Pointer.to(dOutput)
);
//CUstream stream = new CUstream();
// Call the kernel function.
//int blockSizeX = 256;
//int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
long ownStart=System.currentTimeMillis();
cuLaunchKernel(function,
GridDimX, 1, 1, // Grid dimension
threadPerBlock, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null // Kernel- and extra parameters
);
cuCtxSynchronize(); // wait for the kernel to finish, otherwise only the launch is timed
long ownEnd=System.currentTimeMillis();
cudaMemcpy(Pointer.to(output),dOutput,numElements*Sizeof.INT, cudaMemcpyDeviceToHost);
System.out.println(output[0]);
}
}
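If that is the right direction, I would then look up one concrete instantiation by its mangled name and pass only the actual kernel arguments. As far as I can see, the "reduce" function at the bottom of the .cu file is a host-side wrapper, not a __global__ kernel, so cuModuleGetFunction probably cannot find it at all. Here is a rough, untested sketch of what I have in mind (the mangled name is only my guess and would have to be taken from the generated PTX file):

// Untested sketch: look up reduce0<int> under its mangled name
// (the name below is a guess; the real one is in the PTX file).
CUfunction reduce0Int = new CUfunction();
cuModuleGetFunction(reduce0Int, module, "_Z7reduce0IiEvPT_S1_j");

// The kernel itself only takes (g_idata, g_odata, n); size, threads,
// blocks and the kernel number are parameters of the host wrapper,
// not of the kernel.
Pointer reduceParams = Pointer.to(
    Pointer.to(dInput),
    Pointer.to(dOutput),
    Pointer.to(new int[]{numElements})
);

// The kernels read their data from dynamically allocated shared
// memory, so the shared memory size has to be passed at launch time.
int sharedMemSize = threadPerBlock * Sizeof.INT;
cuLaunchKernel(reduce0Int,
    GridDimX, 1, 1,       // grid dimension
    threadPerBlock, 1, 1, // block dimension
    sharedMemSize, null,  // shared memory size and stream
    reduceParams, null    // kernel- and extra parameters
);
cuCtxSynchronize();
// (this produces one partial sum per block in dOutput, not the final sum)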
Greetings, Felix