(Unexpected output) CUDA program for finding the multiples of 3 or 5 and summing them

I have used a reduction kernel to compute the sum of the multiples.
(Sorry, I didn't find a dedicated CUDA topic, so I posted this thread here.)
Can you please help me?
Here is what I have done:```
#include<stdio.h>
#include<cuda.h>
#define MAX 100
#define blocksize 16

__global__ void compute(int*);

// Host driver: stores the multiples of 3 or 5 below MAX in ha, copies ha to the
// device, launches compute() once and prints element 0 of the array copied back.
int main(){
    int ha[MAX],count=0,i;
    int *ga;
    int size = MAX * sizeof(int);

    int blocks = MAX / blocksize;
    if (MAX % blocksize != 0) blocks++;
    printf("check\n");
    cudaError_t err;

    //init ha
    // NOTE(review): only the entries at multiples of 3 or 5 are written below;
    // every other entry of ha (including ha[0]) is left uninitialized.
    // for (int i=0; i < MAX; i++) ha[i] = i;
    for (i=1; i < MAX; i++){
        if (i % 3 == 0 || i % 5 == 0){
            ha[i] = i;
            printf("%d\t", ha[i]);
            count++;
        }
    }
    printf("\nNo of elements in array = %d\n",count);

    if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size)))){
        printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice)))){
        printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Single block of 256 threads, one int of dynamic shared memory per thread.
    compute <<<1, 256, (256*sizeof(int))>>>(ga);
    //compute<<<blocks, blocksize, size >>>(ga);
    cudaDeviceSynchronize();

    if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost))){
        printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    //print results
    printf("the sum of all multiple of 3 and 5 is= %d\t", ha[0]);
    cudaFree(ga);
    getchar();
    return 0;

}

// Block-level sum of ga: each thread copies one element into dynamic shared
// memory, the block folds the array in halves, and thread 0 writes s[0] to
// ga[blockIdx.x]. Expects blockDim.x * sizeof(int) bytes of dynamic shared memory.
__global__ void compute(int *ga){
    //int id = threadIdx.x + (blockDim.x * blockIdx.x);
    int id = threadIdx.x;
    extern __shared__ int s[];
    // NOTE(review): threads with id >= MAX keep this value, so each of them
    // contributes MAX to the final sum.
    s[id] = MAX;
    __syncthreads();
    if (id<MAX)
    {
        s[id] = ga[id];
    }
    __syncthreads();
    //if (s[id] % 3 == 0 || s[id] % 5 == 0)
    // s[id] = id;

    //applying the reeduction
    if (blockDim.x >= 1024){
        if (id < 512){
            s[id] = s[id] + s[id + 512];
        }
        __syncthreads();
    }

    if (blockDim.x >= 512){
        if (id < 256)
        {
            s[id] = s[id] + s[id + 256];
        }
        __syncthreads();
    }
    if (blockDim.x >= 256){
        if (id < 128)
        {
            s[id] = s[id] + s[id + 128];
        }
        __syncthreads();
    }
    if (blockDim.x >= 128){
        if (id < 64)
        {
            s[id] = s[id] + s[id + 64];
        }
        __syncthreads();
    }
    //if this is the last warp
    // NOTE(review): the steps below depend on each other but have no barrier
    // between them; they rely on the 32 threads running in lockstep.
    if (id < 32){
        if (blockDim.x >= 64)
            s[id] = s[id] + s[id + 32];
        if (blockDim.x >= 32)
            s[id] = s[id] + s[id + 16];
        if (blockDim.x >= 16)
            s[id] = s[id] + s[id + 8];
        if (blockDim.x >= 8)
            s[id] = s[id] + s[id + 4];
        if (blockDim.x >= 4)
            s[id] = s[id] + s[id + 2];
        if (blockDim.x >= 2)
            s[id] = s[id] + s[id + 1];
    }
    //thread zero will store the sum of this block i.e. s[0];
    if (id == 0)
    {
        ga[blockIdx.x] = s[0];
    }

}```

At least two errors here:

  1. You have to initialize your memory to 0.

        // Fill every slot: multiples of 3 or 5 keep their value, all others are 0.
        for (i = 0; i < MAX; i++) //Start at 0
        {
            if (i % 3 == 0 || i % 5 == 0)
            {
                ha[i] = i;
                printf("%d\t", ha[i]);
                count++;
            }
            else
            {
                ha[i] = 0; // Set to 0 otherwise
            }
        }

  2. You have to initialize your memory to 0 - again!
    In the kernel, the line
    s[id] = MAX;
    should be
    s[id] = 0;

You should add a note at
https://devtalk.nvidia.com/default/topic/971958/cuda-program-for-finding-mulptiple-of-3-or-5-and-sum-of-these-multiples/
and
Cuda Program for finding mulptiple of 3 or 5 and sum of these multiples - Stack Overflow
if the problem is solved now.

Hello marco, the links you gave are actually my own posts, and they got no replies.
Anyway, here is my edited code; please tell me what is wrong with the kernel. It gives an output of 1133 instead of ~2318,
i.e. about half of the expected sum (as if only half of the elements were added).

//in main()
....
	int j=0;
	// Clear the whole array first.
	for (int i=0; i< MAX; i++)
	{
		ha[i] = 0;
		//j++;
	}
	//int j=0;
	// Pack the multiples of 3 or 5 into the front of ha; j is the next free slot.
	for (i=1; i< MAX; i++)
	{

		if (i % 3 == 0 || i % 5 == 0)
		{
			ha[j] = i;
			//printf("%d",j);
			printf("%d\t", ha[j]);
			count++;
			j++;
		}

	}
....

//kernel:
__global__ void compute(int*);

// Host driver (second attempt): packs the multiples of 3 or 5 below MAX into the
// front of ha, launches compute() over `blocks` blocks of `blocksize` threads and
// prints element 0 of the array copied back.
int  main(){

	int ha[MAX],count=0,i;
	int *ga;
	int size = MAX*sizeof(int);

	int j=0;
	// Clear the whole array first.
	for (int i=0; i< MAX; i++)
	{
		ha[i] = 0;
		//j++;
	}
	//int j=0;
	// Pack the multiples into ha[0..count-1]; j is the next free slot.
	for (i=1; i< MAX; i++)
	{

		if (i % 3 == 0 || i % 5 == 0)
		{
			ha[j] = i;
			//printf("%d",j);
			printf("%d\t", ha[j]);
			count++;
			j++;
		}

	}
	printf("\nNo of elements in array = %d\n",count);
	// Ceiling division: enough blocks to cover `count` elements.
	int blocks = count/ blocksize;
	if (count % blocksize != 0) blocks++;
	printf("check\n");

	cudaError_t err;
	if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size)))){
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice)))){
		printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}


//	compute <<<2, 64, (256*sizeof(int))>>>(ga);
	compute<<<blocks, blocksize, size >>>(ga);
	cudaDeviceSynchronize();

	if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost))){
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	//print results
	// NOTE(review): only ha[0] is printed although several blocks are launched.
	printf("the sum of all multiple of 3 and 5 is= %d\t", ha[0]);
	cudaFree(ga);
	getchar();
	return 0;
}

// Sum kernel (second attempt): copies ga into dynamic shared memory, folds the
// array in halves and has the thread with id == 0 write s[0] to ga[blockIdx.x].
// NOTE(review): id is the global thread index here, yet it indexes the per-block
// shared array and drives the `id < 32` / `id == 0` tests; as written, only
// thread 0 of block 0 ever reaches the final store.
__global__ void compute(int *ga){
	int id = threadIdx.x + blockDim.x * blockIdx.x;
//	int id = threadIdx.x;
	extern __shared__ int s[];
	s[id] = 0;
	__syncthreads();
	if (id<MAX)
	{
		s[id] = ga[id];
	}
	__syncthreads();

	//applying the reeduction
	if (blockDim.x>= 1024){
		if (id < 512){
			s[id] = s[id] + s[id + 512];
		}
		__syncthreads();
	}

	if (blockDim.x>= 512){
		if (id < 256)
		{
			s[id] = s[id] + s[id + 256];
		}
		__syncthreads();
	}
	if (blockDim.x>= 256){
		if (id < 128)
		{
			s[id] = s[id] + s[id + 128];
		}
		__syncthreads();
	}
	if (blockDim.x>= 128){
		if (id < 64)
		{
			s[id] = s[id] + s[id + 64];
		}
		__syncthreads();
	}

	if (id < 32){

		if (blockDim.x>= 64)
			s[id] = s[id] + s[id + 32];
		if (blockDim.x>= 32)
			s[id] = s[id] + s[id + 16];
		if (blockDim.x>= 16)
			s[id] = s[id] + s[id + 8];
		if (blockDim.x>= 8)
			s[id] = s[id] + s[id + 4];
		if (blockDim.x>= 4)
			s[id] = s[id] + s[id + 2];
		if (blockDim.x>= 2)
			s[id] = s[id] + s[id + 1];
	}
	//thread zero will store the sum of this block i.e. s[0];
	if (id == 0)
	{
		ga[blockIdx.x] = s[0];
	}
}

I know that you posted them. I wondered where you got the code from, and googled for applying the reeduction - easy to find, due to the typo.

However, I’m not sure what you changed, but with the changes that I mentioned, the code should work. Here is what I tested (with additional, VERY elaborate debug output, maybe you consider them helpful)


#include<stdio.h>
#include<cuda.h>
#define MAX 100
#define blocksize 16

__global__ void compute(int*);

// Host driver: builds ha so that ha[i] == i for multiples of 3 or 5 and 0
// otherwise, copies it to the device, runs compute() in a single block of 256
// threads and prints the sum the kernel leaves in element 0.
// Exits with EXIT_FAILURE if any CUDA call or the kernel itself reports an error.
int main(){
    int ha[MAX], count = 0, i;
    int *ga;
    int size = MAX * sizeof(int);
    cudaError_t err;

    printf("check\n");

    //init ha: non-multiples are stored as 0 so the kernel can add every
    //element without filtering.
    for (i = 0; i < MAX; i++){
        if (i % 3 == 0 || i % 5 == 0){
            ha[i] = i;
            printf("%d\t", ha[i]);
            count++;
        }
        else
        {
            ha[i] = 0;
        }
    }
    printf("\nNo of elements in array = %d\n", count);

    if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size)))){
        printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice)))){
        printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // One block of 256 threads, one int of dynamic shared memory per thread.
    compute<<<1, 256, (256 * sizeof(int))>>>(ga);
    // A kernel launch returns nothing: configuration errors show up through
    // cudaGetLastError(), execution errors at the next synchronizing call.
    if (cudaSuccess != (err = cudaGetLastError())){
        printf("Error in kernel launch  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    if (cudaSuccess != (err = cudaDeviceSynchronize())){
        printf("Error in kernel execution  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost))){
        printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
    //print results
    printf("the sum of all multiple of 3 and 5 is= %d\t", ha[0]);
    cudaFree(ga);
    getchar();
    return 0;
}

// Sums ga[0 .. MAX-1] inside one thread block and stores the block's total in
// ga[blockIdx.x].
//
// Launch requirements:
//   - blockDim.x is a power of two and at least MAX (each thread loads at most
//     one element, indexed by threadIdx.x);
//   - dynamic shared memory of blockDim.x * sizeof(int) bytes.
// The printf calls are debug output only.
__global__ void compute(int *ga){
    int id = threadIdx.x;
    extern __shared__ int s[];

    // Threads past the end of the data contribute 0 to the sum.
    s[id] = (id < MAX) ? ga[id] : 0;
    __syncthreads();

    //applying the reduction: halve the active range each step. The barrier
    //sits outside the `if` so every thread of the block reaches it, and the
    //`id < stride` guard keeps all reads inside the blockDim.x-element array.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1){
        if (id < stride)
        {
            printf("at %d have %d, adding %d\n", id, s[id], s[id + stride]);
            s[id] = s[id] + s[id + stride];
        }
        __syncthreads();
    }

    //thread zero will store the sum of this block i.e. s[0];
    if (id == 0)
    {
        ga[blockIdx.x] = s[0];
    }
}

This is my code.
Thanks, it helps me a lot. I am close to my answer now, but I have a few questions; could you please answer them?
Outputs for above code according to blocksize and MAX:
Now I am launching my kernel like this:
**compute<<<blocks, blocksize, (blocksize * sizeof(int)) >>>(ga);**

**MAX | blocksize | Output: elements in array | Output: sum of multiples**

100 | 16 | 47 | 2378
1000 | 16 | 467 | 15360
10000 | 16 | 467 | 15360
10000 | 64 | 4667 | 16233
10000 | 512 | 4667 | 60945

After changing blocksize to anything other than 16, these results seem wrong, or change according to the block size (I don't know the reason).
**Is this because of hardware limitations? If yes, how should I launch my kernel for MAX = 100000?**

my device detail :
Found 1 devices
Device 0: GeForce 210 with Compute Capability 1.2
Maximum number of threads per block : 512
Maximum x-dimension of a block : 512
Maximum y-dimension of a block : 512
Maximum z-dimension of a block : 64
Maximum x-dimension of a grid : 65535
Maximum y-dimension of a grid : 65535
Maximum z-dimension of a grid : 1
Maximum shared memory per thread block in bytes : 16384
Total constant memory on the device in bytes : 65536
Warp size in threads : 32
Maximum pitch in bytes allowed for memory copies : 2147483647
Maximum number of 32-bit registers per thread block : 16384
Clock frequency in kilohertz : 1238000
Alignment requirement : 256
Number of multiprocessors on the device : 2
Whether there is a run time limit on kernels : 1
Device is integrated with host memory : 0
Device can map host memory into CUDA address space : 1
Compute mode : 0
Maximum 1D texture width : 8192
Maximum 2D texture width : 65536
Maximum 2D texture height : 32768
Maximum 3D texture width : 2048
Maximum 3D texture height : 2048
Maximum 3D texture depth : 2048
Maximum 2D layered texture width : 8192
Maximum 2D layered texture height : 8192
Maximum layers in a 2D layered texture : 512
Alignment requirement for surfaces : 256
Device can execute multiple kernels concurrently : 0
Device has ECC support enabled : 0
PCI bus ID of the device : 1
PCI device ID of the device : 0
Device is using TCC driver model : 0
Peak memory clock frequency in kilohertz : 533000
Global memory bus width in bits : 64
Size of L2 cache in bytes : 0
Maximum resident threads per multiprocessor : 1024
Number of asynchronous engines : 1
Device shares a unified address space with the host : 0
Maximum 1D layered texture width : 8192
Maximum layers in a 1D layered texture : 512
PCI domain ID of the device : 0

thanks for the help.

Solution: we need to use atomicAdd() to store the result in the kernel (in the `if (id == 0)` branch), so that the partial sums of all blocks are accumulated into a single total.

I’m pretty sure that you took the kernel code from the „reduction“ sample from NVIDIA.

But it’s good to hear that it’s solved now.