Inconsistent Float computation

I have a very simple computation with float values that gives different results depending on whether the parameter is supplied in a float variable or as a float literal. Is there any explanation for that? Is this caused by the OpenCL compiler, or does it originate on the GPU?
(working with OpenCL 1.2, using Nvidia Quadro RTX 3000).

/*
This little program produces the following output on Quadro RTX 3000:
0x4a,0x8f,0x4d,0x3e
0x48,0x8f,0x4d,0x3e
Notice the difference in the first (actually the last) byte.
*/

// Minimal reproducer: computes the same expression twice — once with the
// multiplier held in a float variable, once with it written as a literal —
// and prints the raw bytes of each result.
__kernel void TestNvidia()
{
// The constant, stored in a variable first.
float floatValue = 0.19976422f;
// Via the variable: prints 0x4a,0x8f,0x4d,0x3e on the Quadro RTX 3000.
printf("%#v4hhx\n", as_uchar4(1.199563f - 5 * floatValue));
// Via the literal: prints 0x48,0x8f,0x4d,0x3e — differs in the last byte.
printf("%#v4hhx\n", as_uchar4(1.199563f - 5 * 0.19976422f));
}

The same test on an AMD Radeon R9 M375 does not show that difference.

It’s hard to give a definite reason, beyond some observations. When obtaining the program binaries for the kernel with this…

package org.jocl.test;

import static org.jocl.CL.*;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

import org.jocl.*;

/**
 * Builds the test kernel with JOCL and dumps the program binaries
 * (for NVIDIA, the PTX text), so the compile-time constant folding
 * of the two float expressions can be inspected.
 */
public class JOCLGetBinaryTest
{
    /**
     * OpenCL kernel source: the same computation twice, once with the
     * constant held in a float variable and once written as a literal.
     */
    private static String programSource =
        "__kernel void "+
        "sampleKernel()"+
        "{"+
        "    float floatValue = 0.19976422f;"+
        "    printf(\"%#v4hhx\\n\", as_uchar4(1.199563f - 5 * floatValue));"+
        "    printf(\"%#v4hhx\\n\", as_uchar4(1.199563f - 5 * 0.19976422f));"+
        "}";

    // Number of devices in the context, set during initialization
    private static int numDevices;
    private static cl_program program;
    private static cl_context context;
    
    public static void main(String args[])
    {
        printBytes();

        // Abort if no context/program could be created: the queries
        // below would otherwise throw a NullPointerException because
        // 'program' would still be null.
        if (!defaultInitialization())
        {
            return;
        }
        
        // Obtain the length of the binary data that will be queried, 
        // for each device
        long binaryDataSizes[] = new long[numDevices];
        clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, 
            numDevices * Sizeof.size_t, Pointer.to(binaryDataSizes), null);
        
        // Allocate arrays that will store the binary data, each
        // with the appropriate size
        byte binaryDatas[][] = new byte[numDevices][];
        for (int i=0; i<numDevices; i++)
        {
            int binaryDataSize = (int)binaryDataSizes[i];
            binaryDatas[i] = new byte[binaryDataSize];
        }
        
        // Create a pointer to an array of pointers which are pointing
        // to the binary data arrays
        Pointer binaryDataPointers[] = new Pointer[numDevices];
        for (int i=0; i<numDevices; i++)
        {
            binaryDataPointers[i] = Pointer.to(binaryDatas[i]);
        }

        // Query the binary data
        Pointer pointerToBinaryDataPointers = Pointer.to(binaryDataPointers);
        clGetProgramInfo(program, CL_PROGRAM_BINARIES, 
            numDevices * Sizeof.POINTER, pointerToBinaryDataPointers, null);

        // Print the binary data (for NVIDIA, this is the PTX data).
        // PTX is plain ASCII text, so decode with an explicit charset
        // instead of the platform default.
        for (int i=0; i<numDevices; i++)
        {
            System.out.println("Binary data for device "+i+":");
            System.out.println(
                new String(binaryDatas[i], StandardCharsets.US_ASCII));
        }
        
        clReleaseProgram(program);
        clReleaseContext(context);
    }
    
    
    /**
     * Creates an OpenCL context (preferring a GPU device, falling back
     * to a CPU device), builds the test program from source, and fills
     * in {@link #numDevices}, {@link #context} and {@link #program}.
     *
     * @return whether initialization succeeded
     */
    private static boolean defaultInitialization()
    {
        long numBytes[] = {0};
        
        // Obtain the platform IDs and initialize the context properties
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        if (platforms[0] == null)
        {
            // Exceptions are not enabled yet, so a missing platform
            // shows up as a null entry rather than an exception.
            System.out.println("No OpenCL platform available");
            return false;
        }
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);
        
        // Create an OpenCL context on a GPU device
        context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
        if (context == null)
        {
            // If no context for a GPU device could be created,
            // try to create one for a CPU device.
            context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
            
            if (context == null)
            {
                System.out.println("Unable to create a context");
                return false;
            }
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);
        
        // Query the size of the device list associated with the context
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes); 
        
        // Obtain the cl_device_ids of all devices in the context
        numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],  
            Pointer.to(devices), null);

        // Create the program from the source code
        program = clCreateProgramWithSource(context,
            1, new String[]{ programSource }, null, null);
        
        // Build the program
        clBuildProgram(program, 0, null, null, null, null);
        return true;
    }
    
    
    /**
     * Prints the native-byte-order int values of the two byte patterns
     * the kernel emits, for comparison with the immediates that appear
     * in the PTX dump (1045270346 and 1045270344).
     */
    private static void printBytes()
    {
        ByteBuffer bb0 = ByteBuffer.wrap(new byte[]
            { 0x4a, (byte) 0x8f, 0x4d, 0x3e });
        int i0 = bb0.order(ByteOrder.nativeOrder()).asIntBuffer().get(0);
        ByteBuffer bb1 = ByteBuffer.wrap(new byte[]
            { 0x48, (byte) 0x8f, 0x4d, 0x3e });
        int i1 = bb1.order(ByteOrder.nativeOrder()).asIntBuffer().get(0);
        System.out.println(i0);
        System.out.println(i1);
    }
}

then the output is

Binary data for device 0:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on LLVM 3.4svn
//

.version 7.0
.target sm_75, texmode_independent
.address_size 64

    // .globl    sampleKernel
.func  (.param .b32 func_retval0) vprintf
(
    .param .b64 vprintf_param_0,
    .param .b64 vprintf_param_1
)
;
.global .align 1 .b8 $str[9] = {37, 35, 118, 52, 104, 104, 120, 10, 0};

.entry sampleKernel(

)
{
    .local .align 8 .b8     __local_depot0[16];
    .reg .b64     %SP;
    .reg .b64     %SPL;
    .reg .b32     %r<5>;
    .reg .b64     %rd<7>;


    mov.u64     %SPL, __local_depot0;
    cvta.local.u64     %SP, %SPL;
    add.u64     %rd1, %SP, 0;
    add.u64     %rd2, %SPL, 0;
    add.u64     %rd3, %SP, 8;
    add.u64     %rd4, %SPL, 8;
    mov.u64     %rd5, $str;
    cvta.global.u64     %rd6, %rd5;
    mov.u32     %r1, 1045270346;
    st.local.u32     [%rd4], %r1;
    // Callseq Start 0
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b64 param0;
    st.param.b64    [param0+0], %rd6;
    .param .b64 param1;
    st.param.b64    [param1+0], %rd3;
    .param .b32 retval0;
    call.uni (retval0), 
    vprintf, 
    (
    param0, 
    param1
    );
    ld.param.b32    %r2, [retval0+0];

    //{
    }// Callseq End 0
    mov.u32     %r3, 1045270344;
    st.local.u32     [%rd2], %r3;
    // Callseq Start 1
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b64 param0;
    st.param.b64    [param0+0], %rd6;
    .param .b64 param1;
    st.param.b64    [param1+0], %rd1;
    .param .b32 retval0;
    call.uni (retval0), 
    vprintf, 
    (
    param0, 
    param1
    );
    ld.param.b32    %r4, [retval0+0];

    //{
    }// Callseq End 1
    ret;
}

Although I’ll certainly not claim to understand what all the code means, it shows these lines:

    mov.u32     %r1, 1045270346;
    mov.u32     %r3, 1045270344;

and these are exactly the "int representations" of the four bytes that are printed. This suggests that these values are already computed at compile time.

This does not exactly explain where the difference comes from, but at that point, I can only vaguely say: "Floating-point precision is limited; this may be part of some optimization (although disabling optimizations did not seem to have an effect)…".