Generally the kernel seems well suited to coalesced memory access, so maybe that is what gives the better performance in the second case (though I’m not sure how using registers affects coalesced memory access in the first case). I prepared the PTX files, but I’m totally inexperienced with this kind of stuff…
First ptx (kernel with variables tPos, tVel):
// particles_kernel -- variant that keeps the updated values in registers.
//
// Parameters:
//   vbo, pos, vel : 32-bit addresses, dereferenced below with ld/st.global
//   np            : signed 32-bit element count
//
// Each thread whose index is < np handles one 16-byte (4 x f32) element:
//   1. loads pos[i] and vel[i] with one v4 load each,
//   2. adds the first three vel components to the first three pos components,
//   3. negates a velocity component when the matching new position
//      component is > 3.0 or < -3.0 (done branch-free with set/slct),
//   4. stores pos[i] and vel[i] back and writes {x, y, z, 1.0} to vbo[i].
// Global memory traffic in this listing: two v4 loads and three v4 stores;
// the only branch is the bounds check.
.entry particles_kernel (
.param .u32 __cudaparm_particles_kernel_vbo,
.param .u32 __cudaparm_particles_kernel_pos,
.param .u32 __cudaparm_particles_kernel_vel,
.param .s32 __cudaparm_particles_kernel_np)
{
.reg .u16 %rh<4>;
.reg .u32 %r<31>;
.reg .f32 %f<26>;
.reg .pred %p<3>;
.loc 16 3 0
$LDWbegin_particles_kernel:
// Thread index: %r3 = %ctaid.x * %ntid.x + %tid.x.
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.u32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
// Bounds check: %p1 = (np <= index); such threads jump straight to exit.
ld.param.s32 %r4, [__cudaparm_particles_kernel_np];
setp.le.s32 %p1, %r4, %r3;
@%p1 bra $Lt_0_4098;
.loc 16 11 0
// %r5 = index * 16 = byte offset of this thread's 4 x f32 element.
mul.lo.u32 %r5, %r3, 16;
// %r7 = pos + offset; load the whole element into %f1..%f4.
ld.param.u32 %r6, [__cudaparm_particles_kernel_pos];
add.u32 %r7, %r6, %r5;
ld.global.v4.f32 {%f1,%f2,%f3,%f4}, [%r7+0];
.loc 16 12 0
// %r9 = vel + offset; load the whole element into %f5..%f8.
ld.param.u32 %r8, [__cudaparm_particles_kernel_vel];
add.u32 %r9, %r8, %r5;
ld.global.v4.f32 {%f5,%f6,%f7,%f8}, [%r9+0];
.loc 16 14 0
// New position components: %f9, %f10, %f11 = pos.{0,1,2} + vel.{0,1,2}.
add.f32 %f9, %f1, %f5;
.loc 16 15 0
add.f32 %f10, %f2, %f6;
.loc 16 16 0
add.f32 %f11, %f3, %f7;
.loc 16 11 0
// Component 0 velocity, branch-free:
//   set.* writes 0xFFFFFFFF for true and 0 for false; neg.s32 turns that
//   into 1/0; or.b32 merges the two tests; the final neg.s32 yields -1 when
//   the new position is outside [-3, 3] and 0 otherwise.
//   slct picks its first source when the selector is >= 0, else the second,
//   so %f15 = in range ? %f5 : -%f5.
neg.f32 %f12, %f5;
mov.f32 %f13, 0f40400000; // 3
set.gt.u32.f32 %r10, %f9, %f13;
neg.s32 %r11, %r10;
mov.f32 %f14, 0fc0400000; // -3
set.lt.u32.f32 %r12, %f9, %f14;
neg.s32 %r13, %r12;
or.b32 %r14, %r11, %r13;
neg.s32 %r15, %r14;
slct.f32.s32 %f15, %f5, %f12, %r15;
.loc 16 24 0
// Component 1 velocity: same sequence, %f19 = in range ? %f6 : -%f6.
neg.f32 %f16, %f6;
mov.f32 %f17, 0f40400000; // 3
set.gt.u32.f32 %r16, %f10, %f17;
neg.s32 %r17, %r16;
mov.f32 %f18, 0fc0400000; // -3
set.lt.u32.f32 %r18, %f10, %f18;
neg.s32 %r19, %r18;
or.b32 %r20, %r17, %r19;
neg.s32 %r21, %r20;
slct.f32.s32 %f19, %f6, %f16, %r21;
.loc 16 30 0
// Component 2 velocity: same sequence, %f23 = in range ? %f7 : -%f7.
neg.f32 %f20, %f7;
mov.f32 %f21, 0f40400000; // 3
set.gt.u32.f32 %r22, %f11, %f21;
neg.s32 %r23, %r22;
mov.f32 %f22, 0fc0400000; // -3
set.lt.u32.f32 %r24, %f11, %f22;
neg.s32 %r25, %r24;
or.b32 %r26, %r23, %r25;
neg.s32 %r27, %r26;
slct.f32.s32 %f23, %f7, %f20, %r27;
// Write back both elements with one v4 store each; the fourth component of
// each (%f4, %f8) is stored unchanged.
st.global.v4.f32 [%r7+0], {%f9,%f10,%f11,%f4};
st.global.v4.f32 [%r9+0], {%f15,%f19,%f23,%f8};
.loc 16 40 0
// vbo[index] = {new pos.0, new pos.1, new pos.2, 1.0}, taken from registers.
ld.param.u32 %r28, [__cudaparm_particles_kernel_vbo];
add.u32 %r29, %r28, %r5;
mov.f32 %f24, 0f3f800000; // 1
st.global.v4.f32 [%r29+0], {%f9,%f10,%f11,%f24};
$Lt_0_4098:
.loc 16 43 0
exit;
$LDWend_particles_kernel:
} // particles_kernel
Second ptx:
// particles_kernel -- variant that updates pos/vel directly in global memory.
//
// Parameters:
//   vbo, pos, vel : 32-bit addresses, dereferenced below with ld/st.global
//   np            : signed 32-bit element count
//
// Each thread whose index is < np handles one 16-byte (4 x f32) element:
//   1. loads pos[i] and vel[i] (fourth component discarded), adds the first
//      three components and stores the sums back to pos[i],
//   2. for each of the three components, re-reads the stored position from
//      global memory and, if it is < -3.0 or > 3.0, reads the velocity
//      component from global memory, negates it and stores it back,
//   3. re-reads the position and writes {x, y, z, 1.0} to vbo[i].
// Global memory traffic in this listing: two v4 loads, one v2 store, one
// scalar store, three scalar position re-reads, up to three scalar
// load/negate/store sequences on velocity, up to one extra scalar re-read,
// one v2 re-read and one v4 store. Each component test uses two conditional
// branches.
.entry particles_kernel (
.param .u32 __cudaparm_particles_kernel_vbo,
.param .u32 __cudaparm_particles_kernel_pos,
.param .u32 __cudaparm_particles_kernel_vel,
.param .s32 __cudaparm_particles_kernel_np)
{
.reg .u16 %rh<4>;
.reg .u32 %r<13>;
.reg .f32 %f<30>;
.reg .pred %p<9>;
.loc 16 3 0
$LDWbegin_particles_kernel:
// Thread index: %r3 = %ctaid.x * %ntid.x + %tid.x.
mov.u16 %rh1, %ctaid.x;
mov.u16 %rh2, %ntid.x;
mul.wide.u16 %r1, %rh1, %rh2;
cvt.u32.u16 %r2, %tid.x;
add.u32 %r3, %r2, %r1;
// Bounds check: %p1 = (np <= index); such threads jump straight to exit.
ld.param.s32 %r4, [__cudaparm_particles_kernel_np];
setp.le.s32 %p1, %r4, %r3;
@%p1 bra $Lt_0_5634;
.loc 16 9 0
// %r5 = index * 16 = byte offset; %r7 = &pos[index], %r9 = &vel[index].
mul.lo.u32 %r5, %r3, 16;
ld.param.u32 %r6, [__cudaparm_particles_kernel_pos];
add.u32 %r7, %r6, %r5;
ld.param.u32 %r8, [__cudaparm_particles_kernel_vel];
add.u32 %r9, %r8, %r5;
// v4 loads; '_' is the sink operand, so the fourth component is discarded.
ld.global.v4.f32 {%f1,%f2,%f3,_}, [%r7+0];
ld.global.v4.f32 {%f4,%f5,%f6,_}, [%r9+0];
add.f32 %f7, %f1, %f4;
.loc 16 10 0
add.f32 %f8, %f2, %f5;
// Components 0 and 1 of the new position go back as one v2 store...
st.global.v2.f32 [%r7+0], {%f7,%f8};
.loc 16 11 0
add.f32 %f9, %f3, %f6;
// ...and component 2 as a separate scalar store.
st.global.f32 [%r7+8], %f9;
.loc 16 9 0
// Component 0: re-read the position just stored, then test it.
ld.global.f32 %f10, [%r7+0];
mov.f32 %f11, 0fc0400000; // -3
setp.lt.f32 %p2, %f10, %f11;
@%p2 bra $L_0_4354;
mov.f32 %f12, 0f40400000; // 3
setp.gt.f32 %p3, %f10, %f12;
// Neither < -3 nor > 3: skip the negation.
@!%p3 bra $L_0_4098;
$L_0_4354:
.loc 16 15 0
// Out of range: read-modify-write vel component 0 in global memory.
ld.global.f32 %f13, [%r9+0];
neg.f32 %f14, %f13;
st.global.f32 [%r9+0], %f14;
$L_0_4098:
.loc 16 18 0
// Component 1: same re-read and test at byte offset 4.
ld.global.f32 %f15, [%r7+4];
mov.f32 %f16, 0fc0400000; // -3
setp.lt.f32 %p4, %f15, %f16;
@%p4 bra $L_0_4866;
mov.f32 %f17, 0f40400000; // 3
setp.gt.f32 %p5, %f15, %f17;
@!%p5 bra $L_0_4610;
$L_0_4866:
.loc 16 20 0
// Out of range: read-modify-write vel component 1 in global memory.
ld.global.f32 %f18, [%r9+4];
neg.f32 %f19, %f18;
st.global.f32 [%r9+4], %f19;
$L_0_4610:
.loc 16 23 0
// Component 2: same re-read and test at byte offset 8.
ld.global.f32 %f20, [%r7+8];
mov.f32 %f21, 0fc0400000; // -3
setp.lt.f32 %p6, %f20, %f21;
@%p6 bra $L_0_5378;
mov.f32 %f22, 0f40400000; // 3
setp.gt.f32 %p7, %f20, %f22;
@!%p7 bra $L_0_5122;
$L_0_5378:
.loc 16 25 0
// Out of range: read-modify-write vel component 2 in global memory.
ld.global.f32 %f23, [%r9+8];
neg.f32 %f24, %f23;
st.global.f32 [%r9+8], %f24;
// %f20 is loaded again after the store above, on this path only.
// NOTE(review): presumably because the compiler cannot prove that the
// store through %r9 does not alias [%r7+8] -- confirm against the CUDA source.
ld.global.f32 %f20, [%r7+8];
$L_0_5122:
// Re-read position components 0 and 1 from global memory for the vbo write.
ld.global.v2.f32 {%f25,%f26}, [%r7+0];
.loc 16 28 0
// Component 2 comes from %f20 (loaded at the test above or re-loaded after
// the negation path).
mov.f32 %f27, %f20;
// vbo[index] = {pos.0, pos.1, pos.2, 1.0}.
ld.param.u32 %r10, [__cudaparm_particles_kernel_vbo];
add.u32 %r11, %r10, %r5;
mov.f32 %f28, 0f3f800000; // 1
st.global.v4.f32 [%r11+0], {%f25,%f26,%f27,%f28};
$Lt_0_5634:
.loc 16 30 0
exit;
$LDWend_particles_kernel:
} // particles_kernel