Registers vs global memory

I wonder what exactly causes the performance difference between two versions of the same kernel. The first kernel uses temporary variables (float4 tPos, tVel) that are stored in registers (~408 fps). The second kernel reads the data directly from global memory (~426 fps). My guess is that there are too few operations on the register variables (tPos, tVel) to benefit from their low latency.

The number of particles = 1048576, threads per block = 128, blocks per grid = 8192.
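
For completeness, the launch roughly looks like this (just a sketch; d_vbo, d_pos and d_vel are placeholder names for the device pointers):


// Hypothetical launch with the numbers from above:
// 1048576 particles / 128 threads per block = 8192 blocks.
int np = 1048576;
int threadsPerBlock = 128;
int blocksPerGrid = (np + threadsPerBlock - 1) / threadsPerBlock;   // 8192
particles_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_vbo, d_pos, d_vel, np);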

First kernel:


extern "C"
__global__ void particles_kernel(float4 *vbo, float4* pos, float4* vel, int np)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	if(tid < np)
	{
		float4 tPos = pos[tid];
		float4 tVel = vel[tid];

		tPos.x += tVel.x;
		tPos.y += tVel.y;
		tPos.z += tVel.z;

		if(tPos.x < -3.0f || tPos.x > 3.0f )
		{
			tVel.x = -tVel.x;
		}
		if(tPos.y < -3.0f || tPos.y > 3.0f)
		{
			tVel.y = -tVel.y;
		}
		if(tPos.z < -3.0f || tPos.z > 3.0f)
		{
			tVel.z = -tVel.z;
		}
	
		pos[tid] = tPos;
		vel[tid] = tVel;
		vbo[tid] = make_float4(tPos.x, tPos.y, tPos.z, 1.0f);
	}
}

Second kernel:


extern "C"
__global__ void particles_kernel(float4 *vbo, float4 *pos, float4 *vel, int np)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	if(tid < np)
	{
		pos[tid].x += vel[tid].x;
		pos[tid].y += vel[tid].y;
		pos[tid].z += vel[tid].z;

		if(pos[tid].x < -3.0f || pos[tid].x > 3.0f)
		{
			vel[tid].x = -vel[tid].x;
		}
		if(pos[tid].y < -3.0f || pos[tid].y > 3.0f)
		{
			vel[tid].y = -vel[tid].y;
		}
		if(pos[tid].z < -3.0f || pos[tid].z > 3.0f)
		{
			vel[tid].z = -vel[tid].z;
		}
	
		vbo[tid] = make_float4(pos[tid].x, pos[tid].y, pos[tid].z, 1.0f);
	}
}

Hello

That’s hard to explain (for me, at least ;)). Obtaining the binary data (which is PTX on NVIDIA platforms) might bring some insights. At first I thought the reason could be that fewer registers are available when additional local variables are declared, but since these are the only local variables, that can hardly explain the difference…

bye
Marco

Generally the kernel seems suited to coalesced memory access, so maybe that causes the better performance in the second case (but I’m not sure how using registers affects coalescing in the first case). I prepared the PTX files, but I’m totally inexperienced with this kind of stuff…

First ptx (kernel with variables tPos, tVel):


.entry particles_kernel (
		.param .u32 __cudaparm_particles_kernel_vbo,
		.param .u32 __cudaparm_particles_kernel_pos,
		.param .u32 __cudaparm_particles_kernel_vel,
		.param .s32 __cudaparm_particles_kernel_np)
	{
	.reg .u16 %rh<4>;
	.reg .u32 %r<31>;
	.reg .f32 %f<26>;
	.reg .pred %p<3>;
	.loc	16	3	0
$LDWbegin_particles_kernel:
	mov.u16 	%rh1, %ctaid.x;
	mov.u16 	%rh2, %ntid.x;
	mul.wide.u16 	%r1, %rh1, %rh2;
	cvt.u32.u16 	%r2, %tid.x;
	add.u32 	%r3, %r2, %r1;
	ld.param.s32 	%r4, [__cudaparm_particles_kernel_np];
	setp.le.s32 	%p1, %r4, %r3;
	@%p1 bra 	$Lt_0_4098;
	.loc	16	11	0
	mul.lo.u32 	%r5, %r3, 16;
	ld.param.u32 	%r6, [__cudaparm_particles_kernel_pos];
	add.u32 	%r7, %r6, %r5;
	ld.global.v4.f32 	{%f1,%f2,%f3,%f4}, [%r7+0];
	.loc	16	12	0
	ld.param.u32 	%r8, [__cudaparm_particles_kernel_vel];
	add.u32 	%r9, %r8, %r5;
	ld.global.v4.f32 	{%f5,%f6,%f7,%f8}, [%r9+0];
	.loc	16	14	0
	add.f32 	%f9, %f1, %f5;
	.loc	16	15	0
	add.f32 	%f10, %f2, %f6;
	.loc	16	16	0
	add.f32 	%f11, %f3, %f7;
	.loc	16	11	0
	neg.f32 	%f12, %f5;
	mov.f32 	%f13, 0f40400000;    	// 3
	set.gt.u32.f32 	%r10, %f9, %f13;
	neg.s32 	%r11, %r10;
	mov.f32 	%f14, 0fc0400000;    	// -3
	set.lt.u32.f32 	%r12, %f9, %f14;
	neg.s32 	%r13, %r12;
	or.b32 	%r14, %r11, %r13;
	neg.s32 	%r15, %r14;
	slct.f32.s32 	%f15, %f5, %f12, %r15;
	.loc	16	24	0
	neg.f32 	%f16, %f6;
	mov.f32 	%f17, 0f40400000;    	// 3
	set.gt.u32.f32 	%r16, %f10, %f17;
	neg.s32 	%r17, %r16;
	mov.f32 	%f18, 0fc0400000;    	// -3
	set.lt.u32.f32 	%r18, %f10, %f18;
	neg.s32 	%r19, %r18;
	or.b32 	%r20, %r17, %r19;
	neg.s32 	%r21, %r20;
	slct.f32.s32 	%f19, %f6, %f16, %r21;
	.loc	16	30	0
	neg.f32 	%f20, %f7;
	mov.f32 	%f21, 0f40400000;    	// 3
	set.gt.u32.f32 	%r22, %f11, %f21;
	neg.s32 	%r23, %r22;
	mov.f32 	%f22, 0fc0400000;    	// -3
	set.lt.u32.f32 	%r24, %f11, %f22;
	neg.s32 	%r25, %r24;
	or.b32 	%r26, %r23, %r25;
	neg.s32 	%r27, %r26;
	slct.f32.s32 	%f23, %f7, %f20, %r27;
	st.global.v4.f32 	[%r7+0], {%f9,%f10,%f11,%f4};
	st.global.v4.f32 	[%r9+0], {%f15,%f19,%f23,%f8};
	.loc	16	40	0
	ld.param.u32 	%r28, [__cudaparm_particles_kernel_vbo];
	add.u32 	%r29, %r28, %r5;
	mov.f32 	%f24, 0f3f800000;    	// 1
	st.global.v4.f32 	[%r29+0], {%f9,%f10,%f11,%f24};
$Lt_0_4098:
	.loc	16	43	0
	exit;
$LDWend_particles_kernel:
	} // particles_kernel

Second ptx:


.entry particles_kernel (
		.param .u32 __cudaparm_particles_kernel_vbo,
		.param .u32 __cudaparm_particles_kernel_pos,
		.param .u32 __cudaparm_particles_kernel_vel,
		.param .s32 __cudaparm_particles_kernel_np)
	{
	.reg .u16 %rh<4>;
	.reg .u32 %r<13>;
	.reg .f32 %f<30>;
	.reg .pred %p<9>;
	.loc	16	3	0
$LDWbegin_particles_kernel:
	mov.u16 	%rh1, %ctaid.x;
	mov.u16 	%rh2, %ntid.x;
	mul.wide.u16 	%r1, %rh1, %rh2;
	cvt.u32.u16 	%r2, %tid.x;
	add.u32 	%r3, %r2, %r1;
	ld.param.s32 	%r4, [__cudaparm_particles_kernel_np];
	setp.le.s32 	%p1, %r4, %r3;
	@%p1 bra 	$Lt_0_5634;
	.loc	16	9	0
	mul.lo.u32 	%r5, %r3, 16;
	ld.param.u32 	%r6, [__cudaparm_particles_kernel_pos];
	add.u32 	%r7, %r6, %r5;
	ld.param.u32 	%r8, [__cudaparm_particles_kernel_vel];
	add.u32 	%r9, %r8, %r5;
	ld.global.v4.f32 	{%f1,%f2,%f3,_}, [%r7+0];
	ld.global.v4.f32 	{%f4,%f5,%f6,_}, [%r9+0];
	add.f32 	%f7, %f1, %f4;
	.loc	16	10	0
	add.f32 	%f8, %f2, %f5;
	st.global.v2.f32 	[%r7+0], {%f7,%f8};
	.loc	16	11	0
	add.f32 	%f9, %f3, %f6;
	st.global.f32 	[%r7+8], %f9;
	.loc	16	9	0
	ld.global.f32 	%f10, [%r7+0];
	mov.f32 	%f11, 0fc0400000;    	// -3
	setp.lt.f32 	%p2, %f10, %f11;
	@%p2 bra 	$L_0_4354;
	mov.f32 	%f12, 0f40400000;    	// 3
	setp.gt.f32 	%p3, %f10, %f12;
	@!%p3 bra 	$L_0_4098;
$L_0_4354:
	.loc	16	15	0
	ld.global.f32 	%f13, [%r9+0];
	neg.f32 	%f14, %f13;
	st.global.f32 	[%r9+0], %f14;
$L_0_4098:
	.loc	16	18	0
	ld.global.f32 	%f15, [%r7+4];
	mov.f32 	%f16, 0fc0400000;    	// -3
	setp.lt.f32 	%p4, %f15, %f16;
	@%p4 bra 	$L_0_4866;
	mov.f32 	%f17, 0f40400000;    	// 3
	setp.gt.f32 	%p5, %f15, %f17;
	@!%p5 bra 	$L_0_4610;
$L_0_4866:
	.loc	16	20	0
	ld.global.f32 	%f18, [%r9+4];
	neg.f32 	%f19, %f18;
	st.global.f32 	[%r9+4], %f19;
$L_0_4610:
	.loc	16	23	0
	ld.global.f32 	%f20, [%r7+8];
	mov.f32 	%f21, 0fc0400000;    	// -3
	setp.lt.f32 	%p6, %f20, %f21;
	@%p6 bra 	$L_0_5378;
	mov.f32 	%f22, 0f40400000;    	// 3
	setp.gt.f32 	%p7, %f20, %f22;
	@!%p7 bra 	$L_0_5122;
$L_0_5378:
	.loc	16	25	0
	ld.global.f32 	%f23, [%r9+8];
	neg.f32 	%f24, %f23;
	st.global.f32 	[%r9+8], %f24;
	ld.global.f32 	%f20, [%r7+8];
$L_0_5122:
	ld.global.v2.f32 	{%f25,%f26}, [%r7+0];
	.loc	16	28	0
	mov.f32 	%f27, %f20;
	ld.param.u32 	%r10, [__cudaparm_particles_kernel_vbo];
	add.u32 	%r11, %r10, %r5;
	mov.f32 	%f28, 0f3f800000;    	// 1
	st.global.v4.f32 	[%r11+0], {%f25,%f26,%f27,%f28};
$Lt_0_5634:
	.loc	16	30	0
	exit;
$LDWend_particles_kernel:
	} // particles_kernel

Well, I’m also inexperienced with this, but with this code there is at least something that guesses can be based on :wink: I was mainly wondering whether there has to be a really "significant" difference, and thought that the compiler might have translated both source codes into very similar PTX.

But obviously, it didn’t. The following should be taken with a huge grain of salt: I’m not an expert at this, and everything I say might be inaccurate or plainly wrong. But I’ll try to analyze the differences between the kernels and reconstruct what happens in the blocks where the actual comparisons are made. The first kernel:


	....                                               // tVel.x is stored in f5
	neg.f32 	%f12, %f5;                         // Negate tVel.x and store it in f12
	mov.f32 	%f13, 0f40400000;    	// 3       // Store '3' in f13
	set.gt.u32.f32 	%r10, %f9, %f13;                   // Compare tPos.x and  3, and store the result in r10
	neg.s32 	%r11, %r10;                        // Negate r10 and store it in r11
	mov.f32 	%f14, 0fc0400000;    	// -3      // Store '-3' in f14
	set.lt.u32.f32 	%r12, %f9, %f14;                   // Compare tPos.x and -3, and store the result in r12
	neg.s32 	%r13, %r12;                        // Negate r12 and store it in r13
	or.b32 	%r14, %r11, %r13;                          // Combine the results of the comparisons into r14
	neg.s32 	%r15, %r14;                        // Negate r14 and store it in r15
	slct.f32.s32 	%f15, %f5, %f12, %r15;             // Store either tVel.x or -tVel.x in f15
        ...                                                //     (later, this value, f15, will be stored in global memory)

The second kernel:


	ld.global.f32 	%f10, [%r7+0];                     // Load pos.x from global memory into f10
	mov.f32 	%f11, 0fc0400000;    	// -3	   // Store -3 in f11
	setp.lt.f32 	%p2, %f10, %f11;		   // Create the predicate which states whether pos.x < -3
	@%p2 bra 	$L_0_4354;                         //    (and if this is true, jump to the first label)
	mov.f32 	%f12, 0f40400000;    	// 3	   // Store  3 in f12
	setp.gt.f32 	%p3, %f10, %f12;		   // Create the predicate which states whether pos.x >  3  		      
	@!%p3 bra 	$L_0_4098;			   //    (and if this is false, jump to the second label)
$L_0_4354:                                                 // First label
	.loc	16	15	0                          
	ld.global.f32 	%f13, [%r9+0];                     // Load vel.x from global memory
	neg.f32 	%f14, %f13;                        // Negate it
	st.global.f32 	[%r9+0], %f14;                     // Store vel.x back to global memory
$L_0_4098:                                                 // Second label
        ... next comparison...

  • In the first kernel, it simply does the computation and stores the results in registers, and in the end it has to write the positions and velocities of every particle back to global memory.
  • In the second kernel, it obviously tries very hard (using the predicates and jumps to labels) to avoid writing to global memory.

So the first kernel has to write to global memory three times for each particle, in every case. But with the second kernel, I assume that it manages to avoid many global memory writes in most cases, because most of the particles are NOT leaving the bounding box.
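
If you want to test that guess, one thing that might be worth trying (just an untested sketch, the kernel name is only made up for this example) is to keep the register variables from the first kernel, but write the velocity back only when one of its components was actually flipped:


extern "C"
__global__ void particles_kernel_conditional(float4 *vbo, float4 *pos, float4 *vel, int np)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	if(tid < np)
	{
		float4 tPos = pos[tid];
		float4 tVel = vel[tid];
		bool velChanged = false;

		tPos.x += tVel.x;
		tPos.y += tVel.y;
		tPos.z += tVel.z;

		if(tPos.x < -3.0f || tPos.x > 3.0f) { tVel.x = -tVel.x; velChanged = true; }
		if(tPos.y < -3.0f || tPos.y > 3.0f) { tVel.y = -tVel.y; velChanged = true; }
		if(tPos.z < -3.0f || tPos.z > 3.0f) { tVel.z = -tVel.z; velChanged = true; }

		pos[tid] = tPos;
		if(velChanged)	// only write vel back when a component was actually flipped
		{
			vel[tid] = tVel;
		}
		vbo[tid] = make_float4(tPos.x, tPos.y, tPos.z, 1.0f);
	}
}

If the global memory writes for vel really are the decisive factor, this version should behave more like the second kernel.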

But I have to emphasize this again: These are only just guesses.

A little more experimenting could be worthwhile, but precisely benchmarking (or even profiling) kernels may be difficult. I once started writing a little benchmarking tool (for OpenCL, but the kernels are the same) where it should be possible to let kernels run on predefined input data and precisely measure the execution time, but this is still under construction…
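
In the meantime, a quick measurement on the CUDA side with cudaEvent timestamps around the kernel launch should already give a usable number. A minimal sketch (just a fragment, assuming the device pointers and launch configuration are already set up):


cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
particles_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_vbo, d_pos, d_vel, np);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);           // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);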

bye

EDIT: One ‚true‘ should have been ‚false‘

Thanks for the explanations

I have also tried to get some information that would be helpful for the CUDA Occupancy Calculator, but I don’t know if I’m doing it right.

nvcc -Xptxas -v particles.cu generates the desired information, but I also get errors:
LIBCMT.lib(crt0.obj): error LNK 2019: unresolved external symbol _main referenced in function ___tmainCRTStartup
a.exe : fatal error LNK1120 : 1 unresolved externals

I found this topic: click, so I assume that in my case I should also use “-c”, and that works: I don’t get errors.

nvcc -c -Xptxas -v particles.cu by default generates the information for sm_10, so I also assume that a valid command line in my case should look like this:

nvcc -c -Xptxas -v -arch sm_21 particles.cu

First kernel:

  • 0 bytes stack frame
  • 0 bytes spill stores
  • 0 bytes spill loads
  • used 16 registers, 48 bytes cmem[0]

Second kernel:

  • 0 bytes stack frame
  • 0 bytes spill stores
  • 0 bytes spill loads
  • used 12 registers, 48 bytes cmem[0]

How should I interpret these values? I would have expected the first kernel to use 3 registers (for the variables tid, tPos, tVel) and the second only one (tid).

Hello

I have not yet really used the Occupancy Calculator, only clicked around a little, but I did not really manage to learn how to use it from the documentation.

You’re right, the real number of registers can, as far as I know, only be determined when the PTX is actually compiled. In any case, each of the ‘float4’ local variables takes at least 4 registers, one for each component, but there is not necessarily a 1:1 translation of local variables to registers. The second kernel is not “directly” computing on global memory either: every value that is read from global memory has to be stored in registers first, so, for example, something like
globalMem[a] = globalMem[b] + globalMem[c];
will (for float4 memory) take at least 3*4 registers for the computation itself (although there are no local variables).

But maybe I’ll have to read more in the PTX documentation to give more profound answers here… :o

bye
Marco

I did not look at the assembler code, but here is my assumption!

Every literal is of course saved somewhere, so the following two snippets produce the same register count!
The compiler can optimise things, but not everything, of course. Never trust the compiler, especially when programming close to the hardware.

// first snippet: literal used directly
int x = 1;
if(x < 3)

// second snippet: literal stored in a named variable
int x = 1;
int y = 3;
if(x < y)

Your codes:

Both:

//make_float4 uses 4 registers, or are those 4 values copied by reference?
//only pointers and arrays are passed by reference!
vbo[tid] = make_float4(tPos.x, tPos.y, tPos.z, 1.0f);

Code1


float4 tPos = pos[tid];	//4 registers
float4 tVel = vel[tid];		//4 registers

//register count for the if clauses depends on compiler optimisations!
//3.0f needs to be stored somewhere!
//-3.0f maybe takes the same storage as 3.0f!  1 register?
//the negations -tVel.x, -tVel.y, -tVel.z need to be stored somewhere! 3 registers?


//results in around 16 registers used in total!

Code2

//1 register
int tid = blockIdx.x * blockDim.x + threadIdx.x;

//3 registers
pos[tid].x += vel[tid].x;
pos[tid].y += vel[tid].y;
pos[tid].z += vel[tid].z;

//results in around 12 registers used in total!

Many, or even most, CUDA applications create sub-functions in which values are not passed by reference!
Hence the register count increases.

Pass by reference if possible, especially when using functions in a loop!

Example of bad coding (first snippet) and the fix (second snippet):
Declare loop variables outside of the loop, or use the volatile keyword. But I don’t use volatile, I don’t trust it :smiley:

// loop variables declared inside the loops
for(int i = 0; i < 5; i++)
{
	unsigned int x = i + 3;
	for(int j = 0; j < 5; j++)
	{
		out[i][j] = x;
	}
}

// loop variables hoisted outside the loops
int j = 0;
unsigned int x = 0;

for(int i = 0; i < 5; i++)
{
	x = i + 3;
	for(j = 0; j < 5; j++)
	{
		out[i][j] = x;
	}
}