You had already asked me about Hyper-Q in a mail (30.01.2021, 03:50), and I already mentioned that I have not yet used this technology, and cannot provide profound help here.
But looking at the example (from professional-cuda-c-programming/simpleHyperqDepth.cu at master · deeperlearning/professional-cuda-c-programming · GitHub ) and the error message:
You are apparently trying to compile a CUDA runtime API file. (I say “apparently” because you didn’t provide any context.)
The given file is a C file that contains a CUDA kernel. It has to be compiled with NVCC, which uses a C compiler in the background. The syntax of
kernel_1 <<<grid, block, 0, streams[i] >>>();
should already have told you that, if you had any idea about what you are doing there.
However, here is the same example, ported to the CUDA Driver API, with JCuda:
package jcuda.test;
import static jcuda.driver.JCudaDriver.cuCtxCreate;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuDeviceGetAttribute;
import static jcuda.driver.JCudaDriver.cuEventCreate;
import static jcuda.driver.JCudaDriver.cuEventElapsedTime;
import static jcuda.driver.JCudaDriver.cuEventRecord;
import static jcuda.driver.JCudaDriver.cuEventSynchronize;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoadData;
import static jcuda.driver.JCudaDriver.cuStreamCreate;
import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdevice_attribute;
import jcuda.driver.CUevent;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.CUstream;
import jcuda.driver.JCudaDriver;
import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram;
// Based on https://github.com/deeperlearning/professional-cuda-c-programming/
// blob/master/examples/chapter06/simpleHyperqDepth.cu
/**
 * A port of the "simpleHyperqDepth" CUDA sample to the CUDA Driver API,
 * using JCuda. Four trivial kernels are compiled at runtime with NVRTC
 * and then launched, one after another, into each of several streams.
 * The total time for all launches is measured with CUDA events and
 * printed to the standard output.
 */
public class JCudaHyperqExample
{
    /**
     * The source code of the program that will be compiled at runtime:
     *
     * Note: The functions should be declared as
     * extern "C"
     * to make sure that they can be found under the given names.
     */
    private static final String PROGRAM_SOURCE_CODE =
        "#define N 300000" + "\n" +
        "#define NSTREAM 4" + "\n" +
        "" + "\n" +
        "extern \"C\"" + "\n" +
        "__global__ void kernel_1()" + "\n" +
        "{" + "\n" +
        " double sum = 0.0;" + "\n" +
        "" + "\n" +
        " for(int i = 0; i < N; i++)" + "\n" +
        " {" + "\n" +
        " sum = sum + tan(0.1) * tan(0.1);" + "\n" +
        " }" + "\n" +
        "}" + "\n" +
        "" + "\n" +
        "extern \"C\"" + "\n" +
        "__global__ void kernel_2()" + "\n" +
        "{" + "\n" +
        " double sum = 0.0;" + "\n" +
        "" + "\n" +
        " for(int i = 0; i < N; i++)" + "\n" +
        " {" + "\n" +
        " sum = sum + tan(0.1) * tan(0.1);" + "\n" +
        " }" + "\n" +
        "}" + "\n" +
        "" + "\n" +
        "extern \"C\"" + "\n" +
        "__global__ void kernel_3()" + "\n" +
        "{" + "\n" +
        " double sum = 0.0;" + "\n" +
        "" + "\n" +
        " for(int i = 0; i < N; i++)" + "\n" +
        " {" + "\n" +
        " sum = sum + tan(0.1) * tan(0.1);" + "\n" +
        " }" + "\n" +
        "}" + "\n" +
        "" + "\n" +
        "extern \"C\"" + "\n" +
        "__global__ void kernel_4()" + "\n" +
        "{" + "\n" +
        " double sum = 0.0;" + "\n" +
        "" + "\n" +
        " for(int i = 0; i < N; i++)" + "\n" +
        " {" + "\n" +
        " sum = sum + tan(0.1) * tan(0.1);" + "\n" +
        " }" + "\n" +
        "}" + "\n";

    /**
     * The names of the kernels defined in {@link #PROGRAM_SOURCE_CODE},
     * in the order in which they are launched into each stream.
     */
    private static final String[] KERNEL_NAMES =
    {
        "kernel_1", "kernel_2", "kernel_3", "kernel_4"
    };

    /**
     * The number of streams that the kernels are launched into
     */
    private static final int NUM_STREAMS = 4;

    /**
     * Entry point of this sample
     *
     * @param args Not used
     */
    public static void main(String[] args)
    {
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
        JNvrtc.setExceptionsEnabled(true);

        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        CUcontext context = new CUcontext();
        cuCtxCreate(context, 0, device);
        try
        {
            runExample(device);
        }
        finally
        {
            // Destroy the context even when one of the calls above threw,
            // so that no driver resources outlive a failed run.
            JCudaDriver.cuCtxDestroy(context);
        }
    }

    /**
     * Compiles the kernels, launches each of them into each stream,
     * prints the measured time, and releases all resources that have
     * been created by this method. A CUDA context for the given device
     * must already have been created by the caller.
     *
     * @param device The device that the current context was created for
     */
    private static void runExample(CUdevice device)
    {
        // Check that concurrent kernels are supported
        int[] attributeArray = { 0 };
        cuDeviceGetAttribute(attributeArray,
            CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
            device);
        System.out.println(
            "Concurrent kernels supported? " + attributeArray[0]);

        // Compile the source code and obtain the functions
        CUmodule module = new CUmodule();
        cuModuleLoadData(module, compileToPtx(PROGRAM_SOURCE_CODE));
        CUfunction[] kernels = new CUfunction[KERNEL_NAMES.length];
        for (int k = 0; k < kernels.length; k++)
        {
            kernels[k] = new CUfunction();
            cuModuleGetFunction(kernels[k], module, KERNEL_NAMES[k]);
        }

        CUstream[] streams = new CUstream[NUM_STREAMS];
        for (int i = 0; i < streams.length; i++)
        {
            streams[i] = new CUstream();
            cuStreamCreate(streams[i], 0);
        }

        // Both events are recorded with a null stream argument.
        // NOTE(review): the timing presumably relies on the streams
        // (created with flags 0) synchronizing with that stream -
        // confirm before changing the stream creation flags.
        CUevent start = new CUevent();
        CUevent stop = new CUevent();
        cuEventCreate(start, 0);
        cuEventCreate(stop, 0);
        cuEventRecord(start, null);

        int numElements = 100000000;
        int blockSizeX = 256;
        // Integer ceiling of numElements / blockSizeX
        int gridSizeX = (numElements + blockSizeX - 1) / blockSizeX;

        // Launch all kernels, in order, into each of the streams.
        // The kernels do not take any parameters.
        for (CUstream stream : streams)
        {
            for (CUfunction kernel : kernels)
            {
                cuLaunchKernel(kernel,
                    gridSizeX, 1, 1,
                    blockSizeX, 1, 1,
                    0, stream,
                    null, null
                );
            }
        }

        cuEventRecord(stop, null);
        cuEventSynchronize(stop);
        float[] milliseconds = { 0.0f };
        cuEventElapsedTime(milliseconds, start, stop);
        System.out.printf("Measured time for parallel execution = %.3fs\n",
            milliseconds[0] / 1000.0f);

        // Clean up: release the streams, the events and the module
        for (CUstream stream : streams)
        {
            JCudaDriver.cuStreamDestroy(stream);
        }
        JCudaDriver.cuEventDestroy(start);
        JCudaDriver.cuEventDestroy(stop);
        JCudaDriver.cuModuleUnload(module);
    }

    /**
     * Compiles the given CUDA source code with NVRTC and returns the
     * resulting PTX code. The compilation log is printed to the standard
     * output, also when the compilation fails, and the NVRTC program is
     * destroyed in any case.
     *
     * @param sourceCode The CUDA source code
     * @return The PTX code
     */
    private static String compileToPtx(String sourceCode)
    {
        nvrtcProgram program = new nvrtcProgram();
        nvrtcCreateProgram(program, sourceCode, null, 0, null, null);
        try
        {
            try
            {
                nvrtcCompileProgram(program, 0, null);
            }
            finally
            {
                // With exceptions enabled, a failed compilation throws.
                // Printing the log here keeps the compiler messages
                // visible in that case.
                String[] programLog = new String[1];
                nvrtcGetProgramLog(program, programLog);
                System.out.println(
                    "Program compilation log:\n" + programLog[0]);
            }
            String[] ptx = new String[1];
            nvrtcGetPTX(program, ptx);
            return ptx[0];
        }
        finally
        {
            nvrtcDestroyProgram(program);
        }
    }
}
Compile it. Run it. Now you can write into your thesis that you used Hyper-Q, and you can write into your CV that you have experience with modern technologies like NVIDIA CUDA Hyper-Q. Isn’t that great?