OK, I tried to create a basic test for dynamic parallelism with the driver API. This may be a bit of a hassle, for several reasons, but here is what I did:
I created a backup of the ‘cdpSimplePrint’ sample project, and just built a test based on this project. I changed the ‘cdpSimplePrint.cu’ main file to
cdpSimplePrint.cu
[spoiler]
// Standard headers
#include <iostream>
#include <string>
#include <cstring>
#include <cstdio>
#include <cstdlib>

// CUDA driver API
#include <cuda.h>
#include <builtin_types.h>

using namespace std;

// Driver-API handles shared by the whole test program.
CUdevice device;          // device ordinal handle
CUcontext context;        // context created on 'device'
CUmodule module;          // module loaded from the CUBIN file
CUfunction kernelFunction; // entry point 'cdp_kernel' inside the module
// Check a driver-API result; on failure print the numeric CUresult code
// (CUDA_SUCCESS == 0) and bail out of main with that code.
#define CHECK_CU(call)                                                \
    do {                                                              \
        CUresult err_ = (call);                                       \
        if (err_ != CUDA_SUCCESS) {                                   \
            printf("Error %d\n", (int)err_);                          \
            return (int)err_;                                         \
        }                                                             \
    } while (0)

/**
 * Minimal driver-API test for CUDA dynamic parallelism: loads
 * 'SimplePrintKernel.cubin' from the working directory, obtains its
 * 'cdp_kernel' entry point and launches it from the host.
 */
int main(int argc, char **argv)
{
    int max_depth = 2;  // nesting-depth limit forwarded to the kernel
    int devID = 0;      // ordinal of the device to use

    CHECK_CU(cuInit(0));
    CHECK_CU(cuDeviceGet(&device, devID));
    CHECK_CU(cuCtxCreate(&context, 0, device));

    // The CUBIN must match this device's compute capability and, for
    // dynamic parallelism, must have been linked against cudadevrt.
    string module_path = "SimplePrintKernel.cubin";
    CHECK_CU(cuModuleLoad(&module, module_path.c_str()));
    CHECK_CU(cuModuleGetFunction(&kernelFunction, module, "cdp_kernel"));

    // Kernel arguments: addresses of the host values, in declaration order
    // of cdp_kernel(max_depth, depth, thread, parent_uid).
    int threadsPerBlock = 2;
    int blocksPerGrid = 2;
    int depth = 0;
    int thread = 0;
    int parent_uid = 0;
    void *args[] = { &max_depth, &depth, &thread, &parent_uid };

    // Launch the kernel from the CPU.
    printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n");
    CHECK_CU(cuLaunchKernel(kernelFunction, blocksPerGrid, 1, 1,
                            threadsPerBlock, 1, 1,
                            0,                 // no dynamic shared memory
                            NULL, args, NULL));

    // Wait until the whole launch tree (parent grid plus all device-side
    // child grids) has finished before tearing anything down.
    CHECK_CU(cuCtxSynchronize());

    // Release driver resources before exiting.
    cuModuleUnload(module);
    cuCtxDestroy(context);
    return EXIT_SUCCESS;
}
[/spoiler]
(Yes, it is a VERY basic test…)
It tries to load a ‘SimplePrintKernel.cubin’ file from the same directory. This CUBIN was created from this source code
SimplePrintKernel.cu
[spoiler]
#include <stdio.h>
////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
// Thread 0 of every block (host-launched or device-launched) increments it
// with atomicAdd to obtain that block's UID.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;
////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
//
//   depth      - nesting level of this block (0 == launched by the host)
//   thread     - index of the parent thread that launched this block
//   uid        - unique identifier of this block (taken from g_uids)
//   parent_uid - unique identifier of the parent block
//
// Only thread 0 prints; all threads of the block join the trailing barrier.
////////////////////////////////////////////////////////////////////////////////
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    if (threadIdx.x == 0)
    {
        if (depth == 0)
            printf("BLOCK %d launched by the host\n", uid);
        else
        {
            // Indentation prefix: three characters per nesting level.
            // Clamp the level so the 32-byte buffer (3*indent + 1 bytes
            // used) can never overflow for large depths.
            char buffer[32];
            int indent = (depth < 10) ? depth : 10;
            for (int i = 0 ; i < indent ; ++i)
            {
                buffer[3*i+0] = '|';
                buffer[3*i+1] = ' ';
                buffer[3*i+2] = ' ';
            }
            buffer[3*indent] = '\0';
            printf("%sBLOCK %d launched by thread %d of block %d\n",
                   buffer, uid, thread, parent_uid);
        }
    }
    // Barrier is outside the divergent branch: every thread reaches it.
    __syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.  // requires SM35+, cudadevrt
//
// It generates a unique identifier for each block and prints information
// about that block. Finally, if 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
//
// Expected launch layout: 1-D grid of 1-D blocks.
////////////////////////////////////////////////////////////////////////////////
extern "C"
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // Unique ID per block: thread 0 fetches it and shares it with the
    // other threads of the block through shared memory.
    __shared__ int s_uid;

    printf("Thread %d\n", threadIdx.x);

    if (threadIdx.x == 0)
    {
        s_uid = atomicAdd(&g_uids, 1);
    }
    __syncthreads();  // make s_uid visible to all threads of the block

    // We print the ID of the block and information about its parent.
    print_info(depth, thread, s_uid, parent_uid);

    // Stop recursing once the maximum nesting depth is reached.
    if (++depth >= max_depth)
    {
        return;
    }

    // NOTE(review): EVERY thread launches a child grid here, so the total
    // number of blocks grows as (gridDim.x * blockDim.x)^depth — keep
    // max_depth small, or restrict the launch to a single thread.
    cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_uid);
}
[/spoiler]
The command line for compiling the kernel into a CUBIN should be
nvcc -m64 -cubin -arch sm_35 SimplePrintKernel.cu -lcudadevrt
Of course, this is for a CC 3.5-device. For my test, I had to remove the recursive launch and compile it for CC 2.0. But I think that it COULD basically work to use dynamic parallelism even from the driver API like this.
There are some sections about the compilation process in the CUDA- and NVCC documentation ( http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#using-separate-compilation-in-cuda , http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compiling-and-linking …), but admittedly, I did not (yet) read them in detail (it’s quite a lot, and there’s probably only a few lines really relevant for this issue…).
However, if the above works, I think that with the same CUBIN file, it should work exactly the same way in JCuda.
Any information about whether it works (either in CUDA-C or in JCuda/Java, or both) would be helpful…