It’s hard to give a definite reason, beyond some observations. When obtaining the program binaries for the kernel with this test program:
package org.jocl.test;

import static org.jocl.CL.*;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;

import org.jocl.*;
/**
 * JOCL sample that compiles a small OpenCL kernel, queries the resulting
 * program binaries for each device via {@code CL_PROGRAM_BINARIES} (for
 * NVIDIA this is the PTX text), and prints them. It also prints the int
 * representation of two 4-byte patterns for comparison with the constants
 * that appear in the generated PTX.
 */
public class JOCLGetBinaryTest
{
    /**
     * Kernel source: prints the {@code as_uchar4} bytes of the same float
     * expression, once computed via a local variable and once folded from
     * literals, to compare compile-time vs. runtime evaluation.
     */
    private static final String programSource =
        "__kernel void "+
        "sampleKernel()"+
        "{"+
        "    float floatValue = 0.19976422f;"+
        "    printf(\"%#v4hhx\\n\", as_uchar4(1.199563f - 5 * floatValue));"+
        "    printf(\"%#v4hhx\\n\", as_uchar4(1.199563f - 5 * 0.19976422f));"+
        "}";

    /** Number of devices in the context, set by {@link #defaultInitialization()}. */
    private static int numDevices;
    /** Built OpenCL program; null if initialization failed. */
    private static cl_program program;
    /** OpenCL context; null if initialization failed. */
    private static cl_context context;

    /**
     * Prints the reference byte patterns, initializes OpenCL, queries the
     * program binaries for each device and prints them.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        printBytes();
        defaultInitialization();

        // If no context or program could be created, the queries below
        // would throw a NullPointerException — bail out instead.
        if (context == null || program == null)
        {
            return;
        }

        // Obtain the length of the binary data that will be queried,
        // for each device
        long binaryDataSizes[] = new long[numDevices];
        clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
            numDevices * Sizeof.size_t, Pointer.to(binaryDataSizes), null);

        // Allocate arrays that will store the binary data, each
        // with the appropriate size
        byte binaryDatas[][] = new byte[numDevices][];
        for (int i = 0; i < numDevices; i++)
        {
            int binaryDataSize = (int) binaryDataSizes[i];
            binaryDatas[i] = new byte[binaryDataSize];
        }

        // Create a pointer to an array of pointers which are pointing
        // to the binary data arrays
        Pointer binaryDataPointers[] = new Pointer[numDevices];
        for (int i = 0; i < numDevices; i++)
        {
            binaryDataPointers[i] = Pointer.to(binaryDatas[i]);
        }

        // Query the binary data
        Pointer pointerToBinaryDataPointers = Pointer.to(binaryDataPointers);
        clGetProgramInfo(program, CL_PROGRAM_BINARIES,
            numDevices * Sizeof.POINTER, pointerToBinaryDataPointers, null);

        // Print the binary data (for NVIDIA, this is the PTX data).
        // PTX is ASCII text; use an explicit charset so the output does
        // not depend on the platform default encoding.
        for (int i = 0; i < numDevices; i++)
        {
            System.out.println("Binary data for device " + i + ":");
            System.out.println(new String(binaryDatas[i], StandardCharsets.UTF_8));
        }

        clReleaseProgram(program);
        clReleaseContext(context);
    }

    /**
     * Creates an OpenCL context (preferring a GPU device, falling back to
     * a CPU device), determines the devices of that context, and builds
     * the program from {@link #programSource}. On failure, {@link #context}
     * and {@link #program} remain null and a message is printed.
     */
    private static void defaultInitialization()
    {
        long numBytes[] = { 0 };

        // Obtain the platform IDs and initialize the context properties
        cl_platform_id platforms[] = new cl_platform_id[1];
        clGetPlatformIDs(platforms.length, platforms, null);
        cl_context_properties contextProperties = new cl_context_properties();
        contextProperties.addProperty(CL_CONTEXT_PLATFORM, platforms[0]);

        // Create an OpenCL context on a GPU device
        context = clCreateContextFromType(
            contextProperties, CL_DEVICE_TYPE_GPU, null, null, null);
        if (context == null)
        {
            // If no context for a GPU device could be created,
            // try to create one for a CPU device.
            context = clCreateContextFromType(
                contextProperties, CL_DEVICE_TYPE_CPU, null, null, null);
            if (context == null)
            {
                System.out.println("Unable to create a context");
                return;
            }
        }

        // Enable exceptions and subsequently omit error checks in this sample
        CL.setExceptionsEnabled(true);

        // Get the list of devices associated with the context
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, null, numBytes);

        // Obtain the cl_device_id for each device
        numDevices = (int) numBytes[0] / Sizeof.cl_device_id;
        cl_device_id devices[] = new cl_device_id[numDevices];
        clGetContextInfo(context, CL_CONTEXT_DEVICES, numBytes[0],
            Pointer.to(devices), null);

        // Create the program from the source code
        program = clCreateProgramWithSource(context,
            1, new String[] { programSource }, null, null);

        // Build the program
        clBuildProgram(program, 0, null, null, null, null);
    }

    /**
     * Prints the int values obtained by reinterpreting two 4-byte patterns
     * in native byte order. On a little-endian machine these are
     * 0x3E4D8F4A (1045270346) and 0x3E4D8F48 (1045270344) — the constants
     * that show up in the generated PTX.
     */
    private static void printBytes()
    {
        ByteBuffer bb0 = ByteBuffer.wrap(new byte[]
            { 0x4a, (byte) 0x8f, 0x4d, 0x3e });
        int i0 = bb0.order(ByteOrder.nativeOrder()).asIntBuffer().get(0);
        ByteBuffer bb1 = ByteBuffer.wrap(new byte[]
            { 0x48, (byte) 0x8f, 0x4d, 0x3e });
        int i1 = bb1.order(ByteOrder.nativeOrder()).asIntBuffer().get(0);
        System.out.println(i0);
        System.out.println(i1);
    }
}
then the output is
Binary data for device 0:
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on LLVM 3.4svn
//
.version 7.0
.target sm_75, texmode_independent
.address_size 64
// .globl sampleKernel
.func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.global .align 1 .b8 $str[9] = {37, 35, 118, 52, 104, 104, 120, 10, 0};
.entry sampleKernel(
)
{
.local .align 8 .b8 __local_depot0[16];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<5>;
.reg .b64 %rd<7>;
mov.u64 %SPL, __local_depot0;
cvta.local.u64 %SP, %SPL;
add.u64 %rd1, %SP, 0;
add.u64 %rd2, %SPL, 0;
add.u64 %rd3, %SP, 8;
add.u64 %rd4, %SPL, 8;
mov.u64 %rd5, $str;
cvta.global.u64 %rd6, %rd5;
mov.u32 %r1, 1045270346;
st.local.u32 [%rd4], %r1;
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd3;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r2, [retval0+0];
//{
}// Callseq End 0
mov.u32 %r3, 1045270344;
st.local.u32 [%rd2], %r3;
// Callseq Start 1
{
.reg .b32 temp_param_reg;
// <end>}
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 param1;
st.param.b64 [param1+0], %rd1;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r4, [retval0+0];
//{
}// Callseq End 1
ret;
}
Although I’ll certainly not claim to understand what all the code means, it shows these lines:
mov.u32 %r1, 1045270346;
mov.u32 %r3, 1045270344;
and these are exactly the „int-representations“ of the four bytes that are printed. This suggests that these values are already computed at compile time.
This does not exactly explain where the difference comes from, but at that point, I have to vaguely say: "Floating-point precision is limited, and this may be part of some optimization (although disabling the optimizations did not seem to have an effect)…".