Coredump invoking kernel in JCuda, kernel works fine invoked natively

rsand · 24. Oktober 2022 um 16:31

Hi all,

I’m trying to take a simple Cuda kernel which computes a vector dot-product and invoke it via JCuda. The kernel works fine invoked via C++ but coredumps when invoked via JCuda inside libjcuda. I’ve compared the C++ and Java, and also compared the Java to the vector_add sample provided by JCuda, but still can’t find the error/discrepancy. I’m hoping someone wiser than me can point out the problem!

The kernel:

/**
 * dotproduct_cuda - this is the kernal for the GPU
 * a: input vector a
 * b: input vector b
 * result: float for result
 * N: size of input vectors
 */
extern "C"
__global__
void __dotproduct_cuda(float *a, float *b, float *result, size_t N, bool verbose) {
    __shared__
    float temp[THREADS_PER_BLOCK];

    // grid-stride loop
    // stride size is blockDim.x * gridDim.x - if n < stride, the loop executes exactly once
    temp[threadIdx.x] = 0;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
        if (verbose)
            printf("    Thread[%u,%u] += %d * %d\n", blockIdx.x, threadIdx.x, a[i], b[i]);
        temp[threadIdx.x] += a[i] * b[i];
    }

    // Make sure all threads are done multiplying before aggregating the results
    __syncthreads();

    // Thread 0 aggregates all of the results
    if (threadIdx.x == 0) {
        float sum = 0;
        for (int i = 0; i < blockDim.x; i++) {
            sum += temp[i];
        }
        if (verbose)
            printf("    Total for block %u: %f\n", blockIdx.x, sum);
        atomicAdd(result, sum);
    }
}

Invocation from Java:

        // Allocate the device input data, and copy the host input data to the device
        CUdeviceptr deviceInputA = new CUdeviceptr();
        cuMemAlloc(deviceInputA, numSamples * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numSamples * Sizeof.FLOAT);
        CUdeviceptr deviceInputB = new CUdeviceptr();
        cuMemAlloc(deviceInputB, numSamples * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB), numSamples * Sizeof.FLOAT);

        // Allocate device output memory
        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, Sizeof.FLOAT);

        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        Pointer kernelParameters = Pointer.to(
                Pointer.to(deviceInputA),
                Pointer.to(deviceInputB),
                Pointer.to(deviceOutput),
                Pointer.to(new int[] { numSamples, (kernelVerbose ? 1 : 0) }));

        // Determine our size requirements
        // Once N exceeds MAX_BLOCKS *THREADS_PER_BLOCK, the grid-stride pattern is used
        if (blockSizeX == 0)
            blockSizeX = DEFAULT_BLOCK_SIZE;
        if (gridSizeX == 0) {
            gridSizeX = (int) Math.ceil((float) numSamples / DEFAULT_BLOCK_SIZE);
            if (gridSizeX == 1)
                blockSizeX = numSamples;
            if (gridSizeX > MAX_BLOCKS) // this will trigger grid-stride loops
                gridSizeX = MAX_BLOCKS;
        }

        logger.debug("blocks {}, threads {}", gridSizeX, blockSizeX);

        // Call the kernel function.
        // __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, * d_result, N, verbose);
        cuLaunchKernel(function,
                gridSizeX, 1, 1, // Grid dimension
                blockSizeX, 1, 1, // Block dimension
                32768, null, // Shared memory size and stream
                kernelParameters, null // Kernel- and extra parameters
        );

        logger.debug("Kernel launched");

        // Synchronize the devices
        cuCtxSynchronize();

        logger.debug("Context synchronized");

        // Allocate host output memory and copy the device output to the host.
        float[] hostOutput = new float[1];
        cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, numSamples * Sizeof.FLOAT);

        // Clean up.
        cuMemFree(deviceInputA);
        cuMemFree(deviceInputB);
        cuMemFree(deviceOutput);

Invocation from C++:

    float result_gpu = 0;

    // Allocate device memory
    cudaMalloc((void**) &d_a, sizeof(float) * size);
    cudaMalloc((void**) &d_b, sizeof(float) * size);
    cudaMalloc((void**) &d_result, sizeof(float)); // a single float

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_result, &result_gpu, sizeof(float), cudaMemcpyHostToDevice);

    // Determine our size requirements
    // Once N exceeds MAX_BLOCKS * THREADS_PER_BLOCK, the grid-stride pattern is used
    if (threads == 0)
        threads = THREADS_PER_BLOCK;
    if (blocks == 0) {
        blocks = ceil((float) N / THREADS_PER_BLOCK);
        if (blocks == 1)
            threads = N;
        if (blocks > MAX_BLOCKS)  // this will trigger grid-stride loops
            blocks = MAX_BLOCKS;
    }
    if (verbose)
        printf("blocks %d, threads %d\n", blocks, threads);

    // Execute kernel
    __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, d_result, N, verbose);

    // Make the host block until the device is finished
    cudaDeviceSynchronize();

    // Check for CUDA errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }

    // Transfer result back to host memory
    cudaMemcpy(&result_gpu, d_result, sizeof(float), cudaMemcpyDeviceToHost);

    // Deallocate host memory
    free(a);
    free(b);

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);

I appreciate any assistance!

rsand · 24. Oktober 2022 um 18:57

The final cuMemcpyDtoH has the wrong size, it should be just Sizeof.FLOAT. But I’m still getting the coredump. I feel like the coredump is happening before the kernel starts to execute e.g. its in the initialization somehow. The first „printf“ statement in the kernel is not appearing.

Marco13 · 24. Oktober 2022 um 23:44

I did not look into all details here - for example, the details of the computation of the blockSizeX and gridSizeX, or the kernel, or the shared memory size etc. And I had to make some guesses about the surrounding code, and just pragmatically used fairly arbitrary values here and there. But from this (quick!) test, I did not receive a core dump, but only a CUDA_ERROR_INVALID_VALUE when launching the kernel (thrown as an exception when JCudaDriver.setExceptionsEnabled(true); was called at the beginning of the main function).

I think the main issue might just have been that the kernel parameters haven’t been set up correctly. The kernel expects size_t N, bool verbose as the last parameters, and you apparently tried to pass these in „at once“, with Pointer.to(new int[] { numSamples, (kernelVerbose ? 1 : 0) }). That’s not supposed to work.

Try changing the setup to

    // Set up the kernel parameters: A pointer to an array
    // of pointers which point to the actual values.
    Pointer kernelParameters = Pointer.to(
            Pointer.to(deviceInputA),
            Pointer.to(deviceInputB),
            Pointer.to(deviceOutput),
            Pointer.to(new int[] { numSamples }),
            Pointer.to(new int[] { kernelVerbose ? 1 : 0 }));

NOTE: This assumes that a size_t and a bool has the size of an int, and that may be plainly wrong. I’d have to look up the CUDA specs to see what a size_t and bool in CUDA actually is - does a size_t really have 4 bytes? How many bytes does a bool have? I’d strongly recommend to avoid confusion, and not use size_t and bool there. In doubt, just change the types in the kernel to int, unless there’s a strong reason to not do that…

If this does not solve the core dump, maybe you can enter the input values and size definitions that you are using in this quick test snippet:

// For https://forum.byte-welt.net/t/coredump-invoking-kernel-in-jcuda-kernel-works-fine-invoked-natively/24087
package jcuda.driver.test;

import static jcuda.driver.JCudaDriver.cuCtxCreate;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuDeviceGet;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemFree;
import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleLoad;

import java.util.Arrays;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.CUfunction;
import jcuda.driver.CUmodule;
import jcuda.driver.JCudaDriver;

public class JCudaDotProductTest
{
    public static void main(String[] args)
    {
        // Enable exceptions and omit subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);

        // Create a context for the first device
        cuInit(0);
        CUcontext context = new CUcontext();
        CUdevice device = new CUdevice();
        cuDeviceGet(device, 0);
        cuCtxCreate(context, 0, device);

        // Load the module and obtain the pointer to the kernel function
        CUmodule module = new CUmodule();
        cuModuleLoad(module, "JCudaDotProductKernel.ptx");
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "__dotproduct_cuda");
        
        // Guesses and dummy data...
        int numSamples = 1000;
        float hostInputA[] = new float[numSamples];
        float hostInputB[] = new float[numSamples];
        Arrays.fill(hostInputA, 2.0f);
        Arrays.fill(hostInputB, 2.0f);
        boolean kernelVerbose = false;
        int THREADS_PER_BLOCK = 128; // Set as #define in kernel
        int DEFAULT_BLOCK_SIZE = 128;
        int blockSizeX = 0;
        int gridSizeX = 0;
        int MAX_BLOCKS = 256;
        
        // Allocate the device input data, and copy the host input data to the device
        CUdeviceptr deviceInputA = new CUdeviceptr();
        cuMemAlloc(deviceInputA, numSamples * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numSamples * Sizeof.FLOAT);
        CUdeviceptr deviceInputB = new CUdeviceptr();
        cuMemAlloc(deviceInputB, numSamples * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB), numSamples * Sizeof.FLOAT);

        // Allocate device output memory
        CUdeviceptr deviceOutput = new CUdeviceptr();
        cuMemAlloc(deviceOutput, Sizeof.FLOAT);

        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        Pointer kernelParameters = Pointer.to(
                Pointer.to(deviceInputA),
                Pointer.to(deviceInputB),
                Pointer.to(deviceOutput),
                Pointer.to(new int[] { numSamples }),
                Pointer.to(new int[] { kernelVerbose ? 1 : 0 }));

        // Determine our size requirements
        // Once N exceeds MAX_BLOCKS *THREADS_PER_BLOCK, the grid-stride pattern is used
        if (blockSizeX == 0)
            blockSizeX = DEFAULT_BLOCK_SIZE;
        if (gridSizeX == 0) {
            gridSizeX = (int) Math.ceil((float) numSamples / DEFAULT_BLOCK_SIZE);
            if (gridSizeX == 1)
                blockSizeX = numSamples;
            if (gridSizeX > MAX_BLOCKS) // this will trigger grid-stride loops
                gridSizeX = MAX_BLOCKS;
        }

        //logger.debug("blocks {}, threads {}", gridSizeX, blockSizeX);
        System.out.println("blockSizeX "+blockSizeX);
        System.out.println("gridSizeX "+gridSizeX);

        // Call the kernel function.
        // __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, * d_result, N, verbose);
        cuLaunchKernel(function,
                gridSizeX, 1, 1, // Grid dimension
                blockSizeX, 1, 1, // Block dimension
                32768, null, // Shared memory size and stream
                kernelParameters, null // Kernel- and extra parameters
        );

        System.out.println("Kernel launched");

        // Synchronize the devices
        cuCtxSynchronize();

        System.out.println("Context synchronized");

        // Allocate host output memory and copy the device output to the host.
        float[] hostOutput = new float[1];
        cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, Sizeof.FLOAT);
        
        System.out.println("Result "+hostOutput[0]);

        // Clean up.
        cuMemFree(deviceInputA);
        cuMemFree(deviceInputB);
        cuMemFree(deviceOutput);        
    }
}

Marco13 · 24. Oktober 2022 um 23:51

An aside, depending on what your actual goal is: If you just want to compute a dot product (or similar standard operations) with CUDA via JCuda, you should have a look at JCublas. It has built-in functions for that - for example, the dot product.

(The relevant lines in an otherwise unrelated sample - some other JCublas samples are at jcuda-samples/JCudaSamples/src/main/java/jcuda/jcublas/samples at master · jcuda/jcuda-samples · GitHub )

rsand · 25. Oktober 2022 um 13:09

Marco, you’re a hero! That’s exactly what the issue was, I had to pass the size_t and boolean in separate arrays. Thank you!

Marco13 · 25. Oktober 2022 um 14:01

To emphasize that again: You should have a closer look at the data type sizes of size_t and bool in CUDA. When they are „wrong“ (i.e. different than the size of int in Java), then, in the worst case, it might seem to work now, but might cause inexplicable crashes at a later point.