Hi all,
I’m trying to take a simple CUDA kernel that computes a vector dot product and invoke it via JCuda. The kernel works fine when invoked from C++, but core dumps inside libjcuda when invoked via JCuda. I’ve compared the C++ and Java code, and also compared the Java against the vector_add sample provided with JCuda, but I still can’t find the error/discrepancy. I’m hoping someone wiser than me can point out the problem!
The kernel:
/**
 * dotproduct_cuda - this is the kernel for the GPU
 * a:      input vector a
 * b:      input vector b
 * result: float for the result
 * N:      size of the input vectors
 */
extern "C"
__global__
void __dotproduct_cuda(float *a, float *b, float *result, size_t N, bool verbose) {
__shared__
float temp[THREADS_PER_BLOCK];
// grid-stride loop
// stride size is blockDim.x * gridDim.x - if n < stride, the loop executes exactly once
temp[threadIdx.x] = 0;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
if (verbose)
printf(" Thread[%u,%u] += %d * %d\n", blockIdx.x, threadIdx.x, a[i], b[i]);
temp[threadIdx.x] += a[i] * b[i];
}
// Make sure all threads are done multiplying before aggregating the results
__syncthreads();
// Thread 0 aggregates all of the results
if (threadIdx.x == 0) {
float sum = 0;
for (int i = 0; i < blockDim.x; i++) {
sum += temp[i];
}
if (verbose)
printf(" Total for block %u: %f\n", blockIdx.x, sum);
atomicAdd(result, sum);
}
}
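Before the code below runs, the kernel is compiled to PTX and loaded with the usual JCuda boilerplate to obtain `function`. I've left that out of the listing since I don't think it's the problem area, but roughly it looks like this (a sketch only - the PTX file name is a placeholder for however the kernel is actually built):

cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);

// Load the PTX (e.g. built with "nvcc -ptx") and look up the kernel by its extern "C" name
CUmodule module = new CUmodule();
cuModuleLoad(module, "dotproduct.ptx");
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, "__dotproduct_cuda");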
Invocation from Java:
// Allocate the device input data, and copy the host input data to the device
CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numSamples * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numSamples * Sizeof.FLOAT);
CUdeviceptr deviceInputB = new CUdeviceptr();
cuMemAlloc(deviceInputB, numSamples * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB), numSamples * Sizeof.FLOAT);
// Allocate device output memory
CUdeviceptr deviceOutput = new CUdeviceptr();
cuMemAlloc(deviceOutput, Sizeof.FLOAT);
// Set up the kernel parameters: a pointer to an array
// of pointers which point to the actual values
// (see the note on the vector_add sample's parameter setup after this listing).
Pointer kernelParameters = Pointer.to(
        Pointer.to(deviceInputA),
        Pointer.to(deviceInputB),
        Pointer.to(deviceOutput),
        Pointer.to(new int[] { numSamples, (kernelVerbose ? 1 : 0) }));
// Determine our size requirements
// Once N exceeds MAX_BLOCKS * THREADS_PER_BLOCK, the grid-stride pattern is used
if (blockSizeX == 0)
    blockSizeX = DEFAULT_BLOCK_SIZE;
if (gridSizeX == 0) {
    gridSizeX = (int) Math.ceil((float) numSamples / DEFAULT_BLOCK_SIZE);
    if (gridSizeX == 1)
        blockSizeX = numSamples;
    if (gridSizeX > MAX_BLOCKS) // this will trigger grid-stride loops
        gridSizeX = MAX_BLOCKS;
}
logger.debug("blocks {}, threads {}", gridSizeX, blockSizeX);
// Call the kernel function.
// __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, d_result, N, verbose);
cuLaunchKernel(function,
        gridSizeX, 1, 1,       // Grid dimension
        blockSizeX, 1, 1,      // Block dimension
        32768, null,           // Shared memory size and stream
        kernelParameters, null // Kernel- and extra parameters
);
logger.debug("Kernel launched");
// Synchronize the devices
cuCtxSynchronize();
logger.debug("Context synchronized");
// Allocate host output memory and copy the device output to the host.
float[] hostOutput = new float[1];
cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, numSamples * Sizeof.FLOAT);
// Clean up.
cuMemFree(deviceInputA);
cuMemFree(deviceInputB);
cuMemFree(deviceOutput);
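For reference, the vector_add sample I compared against builds its kernel parameters with exactly one Pointer entry per kernel argument, in the same order as the kernel signature (its kernel is declared as extern "C" __global__ void add(int n, float *a, float *b, float *sum)). I'm quoting it roughly from memory, so treat this as a sketch rather than the exact sample code:

Pointer kernelParameters = Pointer.to(
        Pointer.to(new int[] { numElements }), // int n - the scalar gets its own entry
        Pointer.to(deviceInputA),              // float *a
        Pointer.to(deviceInputB),              // float *b
        Pointer.to(deviceOutput)               // float *sum
);

In other words, in that sample each scalar argument is wrapped in its own Pointer.to(new int[] { ... }) rather than several scalars sharing one array.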
Invocation from C++:
float result_gpu = 0;
// Allocate device memory
cudaMalloc((void**) &d_a, sizeof(float) * size);
cudaMalloc((void**) &d_b, sizeof(float) * size);
cudaMalloc((void**) &d_result, sizeof(float)); // a single float
// Transfer data from host to device memory
cudaMemcpy(d_a, a, sizeof(float) * size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(float) * size, cudaMemcpyHostToDevice);
cudaMemcpy(d_result, &result_gpu, sizeof(float), cudaMemcpyHostToDevice);
// Determine our size requirements
// Once N exceeds MAX_BLOCKS * THREADS_PER_BLOCK, the grid-stride pattern is used
if (threads == 0)
    threads = THREADS_PER_BLOCK;
if (blocks == 0) {
    blocks = ceil((float) N / THREADS_PER_BLOCK);
    if (blocks == 1)
        threads = N;
    if (blocks > MAX_BLOCKS) // this will trigger grid-stride loops
        blocks = MAX_BLOCKS;
}
if (verbose)
    printf("blocks %d, threads %d\n", blocks, threads);
// Execute kernel
__dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, d_result, N, verbose);
// Make the host block until the device is finished
cudaDeviceSynchronize();
// Check for CUDA errors (I don't have a Java-side equivalent of this yet - see the note after this listing)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(error));
    exit(-1);
}
// Transfer result back to host memory
cudaMemcpy(&result_gpu, d_result, sizeof(float), cudaMemcpyDeviceToHost);
// Deallocate host memory
free(a);
free(b);
// Deallocate device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
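As noted above, I don't currently do the equivalent error checking on the Java side. As far as I understand it, the JCuda way would be something like the sketch below (either enabling exceptions globally, or checking the CUresult codes that the driver calls return) - I haven't wired this in yet:

// Make every failing driver call throw a CudaException instead of
// returning an error code that might get ignored.
JCudaDriver.setExceptionsEnabled(true);

// Or check return codes explicitly, analogous to the cudaGetLastError block above.
int status = cuCtxSynchronize();
if (status != CUresult.CUDA_SUCCESS) {
    logger.error("CUDA error: {}", CUresult.stringFor(status));
}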
I appreciate any assistance!