[QUOTE=Marco13]Yes, I was curious, and tried it right now Until now I could not reproduce the error. I assume that you are using the
public static KernelLauncher create(
String cuFileName, String functionName, String ... nvccArguments)
function, directly passing it the name of the āCUā file, and are not manually parsing the file and using another function to create the kernelLauncher (e.g. from a String that was read from the file).
Quite confusing is the fact that it seems to create a valid PTX file, but then fails to locate the function that it obviously contains.
Iām not sure what is the best way to find the possible reason for this bug. Could you post the contents of the PTX file here? Then I could try to load this one manually, and see whether this works (although, admittedly, I cannot imagine why it should not work: The function is there, and itās only identified by its nameā¦)[/QUOTE]
Actually noā¦ I didnāt know that I can provide the actual file name to the compile method so I created another function that gets the file name and returns the content of that file as a string.
Therefore I am using this one:
KernelLauncher.compile(String, āfunction_nameā);
Here is my kernel source code: (donāt laugh at my code! I know itās not good but I only started learning CUDA 4 days ago)
device void push(double* stack, int *sp, double a) {
(*sp)++;
stack[*sp] = a;
}
device double pop(double* stack, int *sp) {
double a = stack[*sp];
(*sp)ā;
return a;
}
extern āCā
global void evaluate(char* expression, int expLength, double *x, double *y, double *result, double *hits)
{
double stack[100];
int sp = -1;
int i = threadIdx.x;
double first;
double second;
/* for whatever reason it does not move the pointer CHAR forward 2 bytes at a time */
for (int k = 0; k < expLength * 2; k+= 2)
{
switch(expression[k])
{
case 'x':
push(stack, &sp, x**);
break;
case 'y':
push(stack, &sp, y**);
break;
case '+':
second = pop(stack, &sp);
first = pop(stack, &sp);
push(stack, &sp, first + second);
break;
case '*':
second = pop(stack, &sp);
first = pop(stack, &sp);
push(stack, &sp, first * second);
break;
case '-':
second = pop(stack, &sp);
first = pop(stack, &sp);
push(stack, &sp, first - second);
break;
case '/':
second = pop(stack, &sp);
if (second == 0) {
pop(stack, &sp);
push(stack, &sp, 1);
break;
}
first = pop(stack, &sp);
push(stack, &sp, first / second);
break;
case 's':
push(stack, &sp, sin(pop(stack, &sp)));
break;
}
}
double obtainedResult = pop(stack, &sp);
hits** = abs(obtainedResult - result**);
/*TODO reduction*/
}
And the PTX:
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda-5.0/open64/lib//be
// nvopencc 4.1 built on 2012-09-21
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000994_00000000-9_Regression.cpp3.i (/tmp/ccBI#.9U8Tt5)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_00000994_00000000-8_Regression.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.7/include/stddef.h"
.file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/bin/../include/host_defines.h"
.file 6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file 7 "/usr/local/cuda/bin/../include/device_types.h"
.file 8 "/usr/local/cuda/bin/../include/driver_types.h"
.file 9 "/usr/local/cuda/bin/../include/surface_types.h"
.file 10 "/usr/local/cuda/bin/../include/texture_types.h"
.file 11 "/usr/local/cuda/bin/../include/vector_types.h"
.file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file 14 "Regression.cu"
.file 15 "/usr/local/cuda/bin/../include/common_functions.h"
.file 16 "/usr/local/cuda/bin/../include/math_functions.h"
.file 17 "/usr/local/cuda/bin/../include/math_constants.h"
.file 18 "/usr/local/cuda/bin/../include/device_functions.h"
.file 19 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file 20 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file 21 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file 22 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file 23 "/usr/local/cuda/bin/../include/sm_35_atomic_functions.h"
.file 24 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file 25 "/usr/local/cuda/bin/../include/sm_30_intrinsics.h"
.file 26 "/usr/local/cuda/bin/../include/sm_35_intrinsics.h"
.file 27 "/usr/local/cuda/bin/../include/surface_functions.h"
.file 28 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file 29 "/usr/local/cuda/bin/../include/texture_indirect_functions.h"
.file 30 "/usr/local/cuda/bin/../include/surface_indirect_functions.h"
.file 31 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.entry evaluate (
.param .u64 __cudaparm_evaluate_expression,
.param .s32 __cudaparm_evaluate_expLength,
.param .u64 __cudaparm_evaluate_x,
.param .u64 __cudaparm_evaluate_y,
.param .u64 __cudaparm_evaluate_result,
.param .u64 __cudaparm_evaluate_hits)
{
.reg .u32 %r<29>;
.reg .u64 %rd<70>;
.reg .f32 %f<6>;
.reg .f64 %fd<17>;
.reg .pred %p<11>;
.local .align 8 .b8 __cuda___cuda_local_var_14779_9_non_const_stack_048[800];
.loc 14 13 0
$LDWbegin_evaluate:
ld.param.s32 %r1, [__cudaparm_evaluate_expLength];
mul.lo.s32 %r2, %r1, 2;
cvt.s32.u16 %r3, %tid.x;
mov.u32 %r4, 0;
setp.le.s32 %p1, %r2, %r4;
@%p1 bra $Lt_0_6402;
add.s32 %r5, %r2, 1;
shr.s32 %r6, %r5, 31;
mov.s32 %r7, 1;
and.b32 %r8, %r6, %r7;
add.s32 %r9, %r8, %r5;
shr.s32 %r10, %r9, 1;
cvt.s64.s32 %rd1, %r3;
mul.wide.s32 %rd2, %r3, 8;
ld.param.u64 %rd3, [__cudaparm_evaluate_expression];
mov.s32 %r11, 0;
mov.s32 %r12, -1;
mov.u64 %rd4, __cuda___cuda_local_var_14779_9_non_const_stack_048;
mov.s32 %r13, %r10;
$Lt_0_5378:
//<loop> Loop body line 13, nesting depth: 1, estimated iterations: unknown
.loc 14 26 0
ld.global.s8 %r14, [%rd3+0];
mov.u32 %r15, 120;
setp.eq.s32 %p2, %r14, %r15;
@%p2 bra $Lt_0_258;
mov.u32 %r16, 121;
setp.eq.s32 %p3, %r14, %r16;
@%p3 bra $Lt_0_770;
mov.u32 %r17, 43;
setp.eq.s32 %p4, %r14, %r17;
@%p4 bra $Lt_0_1026;
mov.u32 %r18, 42;
setp.eq.s32 %p5, %r14, %r18;
@%p5 bra $Lt_0_1282;
mov.u32 %r19, 45;
setp.eq.s32 %p6, %r14, %r19;
@%p6 bra $Lt_0_1538;
mov.u32 %r20, 47;
setp.eq.s32 %p7, %r14, %r20;
@%p7 bra $Lt_0_1794;
bra.uni $Lt_0_514;
$Lt_0_258:
.loc 14 2 0
cvt.u64.u32 %rd5, %r12;
cvt.u32.u64 %r21, %rd5;
add.s32 %r22, %r21, 1;
mov.s32 %r12, %r22;
.loc 14 3 0
ld.param.u64 %rd6, [__cudaparm_evaluate_x];
add.u64 %rd7, %rd6, %rd2;
ld.global.f64 %fd1, [%rd7+0];
cvt.s64.s32 %rd8, %r22;
mul.wide.s32 %rd9, %r22, 8;
add.u64 %rd10, %rd4, %rd9;
st.local.f64 [%rd10+0], %fd1;
.loc 14 30 0
bra.uni $Lt_0_514;
$Lt_0_770:
.loc 14 2 0
cvt.u64.u32 %rd11, %r12;
cvt.u32.u64 %r23, %rd11;
add.s32 %r24, %r23, 1;
mov.s32 %r12, %r24;
.loc 14 3 0
ld.param.u64 %rd12, [__cudaparm_evaluate_y];
add.u64 %rd13, %rd12, %rd2;
ld.global.f64 %fd2, [%rd13+0];
cvt.s64.s32 %rd14, %r24;
mul.wide.s32 %rd15, %r24, 8;
add.u64 %rd16, %rd4, %rd15;
st.local.f64 [%rd16+0], %fd2;
.loc 14 34 0
bra.uni $Lt_0_514;
$Lt_0_1026:
.loc 14 37 0
cvt.s64.s32 %rd17, %r12;
mul.wide.s32 %rd18, %r12, 8;
add.u64 %rd19, %rd4, %rd18;
ld.local.f64 %fd3, [%rd19+0];
.loc 14 7 0
cvt.u64.u32 %rd20, %r12;
cvt.u32.u64 %r25, %rd20;
sub.s32 %r26, %r25, 1;
cvt.s64.s32 %rd21, %r26;
mul.wide.s32 %rd22, %r26, 8;
add.u64 %rd23, %rd4, %rd22;
ld.local.f64 %fd4, [%rd23+0];
.loc 14 2 0
cvt.u64.u32 %rd24, %r26;
cvt.u32.u64 %r27, %rd24;
mov.s32 %r12, %r27;
.loc 14 3 0
add.f64 %fd5, %fd4, %fd3;
cvt.s64.s32 %rd25, %r27;
mul.wide.s32 %rd26, %r27, 8;
add.u64 %rd27, %rd4, %rd26;
st.local.f64 [%rd27+0], %fd5;
.loc 14 40 0
bra.uni $Lt_0_514;
$Lt_0_1282:
.loc 14 43 0
cvt.s64.s32 %rd28, %r12;
mul.wide.s32 %rd29, %r12, 8;
add.u64 %rd30, %rd4, %rd29;
ld.local.f64 %fd3, [%rd30+0];
.loc 14 7 0
cvt.u64.u32 %rd20, %r12;
cvt.u32.u64 %r25, %rd20;
sub.s32 %r26, %r25, 1;
cvt.s64.s32 %rd31, %r26;
mul.wide.s32 %rd32, %r26, 8;
add.u64 %rd33, %rd4, %rd32;
ld.local.f64 %fd4, [%rd33+0];
.loc 14 2 0
cvt.u64.u32 %rd34, %r26;
cvt.u32.u64 %r27, %rd34;
mov.s32 %r12, %r27;
.loc 14 3 0
mul.f64 %fd6, %fd4, %fd3;
cvt.s64.s32 %rd35, %r27;
mul.wide.s32 %rd36, %r27, 8;
add.u64 %rd37, %rd4, %rd36;
st.local.f64 [%rd37+0], %fd6;
.loc 14 46 0
bra.uni $Lt_0_514;
$Lt_0_1538:
.loc 14 49 0
cvt.s64.s32 %rd38, %r12;
mul.wide.s32 %rd39, %r12, 8;
add.u64 %rd40, %rd4, %rd39;
ld.local.f64 %fd3, [%rd40+0];
.loc 14 7 0
cvt.u64.u32 %rd20, %r12;
cvt.u32.u64 %r25, %rd20;
sub.s32 %r26, %r25, 1;
cvt.s64.s32 %rd41, %r26;
mul.wide.s32 %rd42, %r26, 8;
add.u64 %rd43, %rd4, %rd42;
ld.local.f64 %fd4, [%rd43+0];
.loc 14 2 0
cvt.u64.u32 %rd44, %r26;
cvt.u32.u64 %r27, %rd44;
mov.s32 %r12, %r27;
.loc 14 3 0
sub.f64 %fd7, %fd4, %fd3;
cvt.s64.s32 %rd45, %r27;
mul.wide.s32 %rd46, %r27, 8;
add.u64 %rd47, %rd4, %rd46;
st.local.f64 [%rd47+0], %fd7;
.loc 14 52 0
bra.uni $Lt_0_514;
$Lt_0_1794:
.loc 14 7 0
cvt.s64.s32 %rd48, %r12;
mul.wide.s32 %rd49, %r12, 8;
add.u64 %rd50, %rd4, %rd49;
ld.local.f64 %fd3, [%rd50+0];
.loc 14 55 0
cvt.u64.u32 %rd20, %r12;
cvt.u32.u64 %r25, %rd20;
sub.s32 %r26, %r25, 1;
cvt.u64.u32 %rd51, %r26;
cvt.u32.u64 %r27, %rd51;
mov.f64 %fd8, 0d0000000000000000; // 0
setp.eq.f64 %p8, %fd3, %fd8;
@!%p8 bra $Lt_0_5634;
.loc 14 2 0
mov.s32 %r12, %r27;
.loc 14 3 0
mov.f64 %fd9, 0d3ff0000000000000; // 1
cvt.s64.s32 %rd52, %r27;
mul.wide.s32 %rd53, %r27, 8;
add.u64 %rd54, %rd4, %rd53;
st.local.f64 [%rd54+0], %fd9;
.loc 14 59 0
bra.uni $Lt_0_514;
$Lt_0_5634:
.loc 14 7 0
cvt.s64.s32 %rd55, %r26;
mul.wide.s32 %rd56, %r26, 8;
add.u64 %rd57, %rd4, %rd56;
ld.local.f64 %fd4, [%rd57+0];
.loc 14 2 0
mov.s32 %r12, %r27;
.loc 14 3 0
cvt.rn.f32.f64 %f1, %fd4;
cvt.rn.f32.f64 %f2, %fd3;
div.full.f32 %f3, %f1, %f2;
cvt.f64.f32 %fd10, %f3;
cvt.s64.s32 %rd58, %r27;
mul.wide.s32 %rd59, %r27, 8;
add.u64 %rd60, %rd4, %rd59;
st.local.f64 [%rd60+0], %fd10;
$Lt_0_514:
.loc 14 64 0
add.s32 %r11, %r11, 2;
add.u64 %rd3, %rd3, 2;
setp.gt.s32 %p9, %r2, %r11;
@%p9 bra $Lt_0_5378;
bra.uni $Lt_0_4866;
$Lt_0_6402:
cvt.s64.s32 %rd61, %r3;
mul.wide.s32 %rd2, %r3, 8;
mov.s32 %r12, -1;
mov.u64 %rd4, __cuda___cuda_local_var_14779_9_non_const_stack_048;
$Lt_0_4866:
.loc 14 69 0
cvt.s64.s32 %rd62, %r12;
mul.wide.s32 %rd63, %r12, 8;
add.u64 %rd64, %rd4, %rd63;
ld.local.f64 %fd11, [%rd64+0];
ld.param.u64 %rd65, [__cudaparm_evaluate_result];
add.u64 %rd66, %rd65, %rd2;
ld.global.f64 %fd12, [%rd66+0];
sub.f64 %fd13, %fd11, %fd12;
abs.f64 %fd14, %fd13;
cvt.rn.f32.f64 %f4, %fd14;
cvt.f64.f32 %fd15, %f4;
ld.param.u64 %rd67, [__cudaparm_evaluate_hits];
add.u64 %rd68, %rd67, %rd2;
st.global.f64 [%rd68+0], %fd15;
.loc 14 77 0
exit;
$LDWend_evaluate:
} // evaluate