I have used reduction kernel to compute the sum of multiples :
(Sorry, I didnt found special CUDA topic so I posted this thread here)
Can you please help me!
Here is what i have done:-```
#include<stdio.h>
#include<cuda.h>
#define MAX 100
#define blocksize 16
global void compute(int*);
int main(){
int ha[MAX],count=0,i;
int ga;
int size = MAXsizeof(int);
int blocks = MAX / blocksize;
if (MAX % blocksize != 0) blocks++;
printf("check
");
cudaError_t err;
//init ha
// for (int i=0; i < MAX; i++) ha** = i;
for (i=1; i < MAX; i++){
if (i % 3 == 0 || i % 5 == 0){
ha** = i;
printf("%d “, ha**);
count++;
}
} printf(”
No of elements in array = %d
",count);
if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size)))){
printf("error in cuaMalloc %s", (char *)cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice)))){
printf("Error in cudaMemcpy H_T_D %s ", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
compute <<<1, 256, (256*sizeof(int))>>>(ga);
//compute<<<blocks, blocksize, size >>>(ga);
cudaDeviceSynchronize();
if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost))){
printf("Error in cudaMemcpy D_T_H %s ", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
//print results
printf("the sum of all multiple of 3 and 5 is= %d ", ha[0]);
cudaFree(ga);
getchar();
return 0;
}
global void compute(int *ga){
//int id = threadIdx.x + (blockDim.x * blockIdx.x);
int id = threadIdx.x;
extern shared int s[];
s[id] = MAX;
__syncthreads();
if (id<MAX)
{
s[id] = ga[id];
}
__syncthreads();
//if (s[id] % 3 == 0 || s[id] % 5 == 0)
// s[id] = id;
//applying the reeduction
if (blockDim.x >= 1024){
if (id < 512){
s[id] = s[id] + s[id + 512];
}
__syncthreads();
}
if (blockDim.x >= 512){
if (id < 256)
{
s[id] = s[id] + s[id + 256];
}
__syncthreads();
}
if (blockDim.x >= 256){
if (id < 128)
{
s[id] = s[id] + s[id + 128];
}
__syncthreads();
}
if (blockDim.x >= 128){
if (id < 64)
{
s[id] = s[id] + s[id + 64];
}
__syncthreads();
}
//if this is the last warp
if (id < 32){
if (blockDim.x >= 64)
s[id] = s[id] + s[id + 32];
if (blockDim.x >= 32)
s[id] = s[id] + s[id + 16];
if (blockDim.x >= 16)
s[id] = s[id] + s[id + 8];
if (blockDim.x >= 8)
s[id] = s[id] + s[id + 4];
if (blockDim.x >= 4)
s[id] = s[id] + s[id + 2];
if (blockDim.x >= 2)
s[id] = s[id] + s[id + 1];
}
//thread zero will store min of this block i.e. s[0];
if (id == 0)
{
ga[blockIdx.x] = s[0];
}
}```