Addition of prime numbers in CUDA

Hello, :slight_smile:

I want to add all prime numbers from 1 to 10,000, but my program only shows correct output up to 2000 numbers. If I increase SIZE beyond 2000, it no longer shows the correct output. I don’t know why that is. Is it due to some configuration of the graphics card, or something else?
I am using a GeForce 210 (compute capability 1.2) with CUDA Toolkit 6.5 (IDE: Visual Studio 2013).

Here is my program:


#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<windows.h>
#define SIZE 2500

__global__ void vectoreAdd(int *d_a, int *d_b)
{
	int i = threadIdx.x + (blockIdx.x*blockDim.x);

	if (i<SIZE)
	for (i = 2; i<SIZE; i++)
	{
		int counter = 0;
		for (int j = 2; j<d_a[i]; j++)
		{
			if (d_a[i] % j == 0)
			{
				counter = 1; break;
			}
		}
		if (counter == 0)
		{
			d_b[i] = d_a[i];
		}
	}
}
int main()
{
	int *a, *b, sum = 0,count=-1;      //Declaration of host and device  
	int *d_a, *d_b;
	cudaEvent_t start, stop;
	float elapseTime;
	int blocks, block_size = 512;

	a = (int *)malloc(SIZE*sizeof(int));
	b = (int *)malloc(SIZE*sizeof(int));


	cudaMalloc((void**)&d_a, SIZE * sizeof(int));
	cudaMalloc((void**)&d_b, SIZE * sizeof(int));


	for (int i = 1; i<SIZE; i++)
	{
		a[i] = i;
		b[i] = 0;

	}

	cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);

	blocks = SIZE / block_size;
	if (SIZE% block_size != 0)
		blocks++;

	//Calculate time of execution
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	//start recording 
	cudaEventRecord(start, 0);


	vectoreAdd << < blocks, block_size >> >(d_a, d_b);

	//stop recording
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	//Copy results from GPU to CPU
	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	printf("
Total elapsed time on GPU=%lf", elapseTime);

	cudaThreadSynchronize();

	cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

	for (int m = 0; m < SIZE; m++)
	{
		if (b[m] != 0)
		{
			printf("
 prime no is:%d", b[m]);
			count = count + 1;
		}
	}
	printf("

 Total prime no: %d", count);
		for (int j = 1; j<SIZE; j++)
	{
		sum = sum + b[j];
	}

	printf("
 
sum of all prime no upto %d is:%d", SIZE, sum);

	free(a);
	free(b);

	cudaFree(d_a);
	cudaFree(d_b);

	getchar();

	return 0;
}

Can you please help me to sort it out? :slight_smile:

I tried it with 2500 and 10000, and compared it to a Java program, and it worked for me.

What does it print when you start it with 2500?

What does it print when you add


    printf("

 Total prime no: %d", count);
    for (int j = 1; j<SIZE; j++)
    {
        printf("Adding %d
", b[j]); // <----------------- some debug output
        sum = sum + b[j];
    }

(or when you run it in a debugger, if possible) ?

Hi Marco,

I want to make the above program more optimized, so I decided to add the summation code in the kernel itself (meaning the summation is also done in parallel on the GPU, instead of on the CPU). But it is showing me the wrong output (i.e. 0). Can you please tell me my mistake in this?
This is my modified code:


#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<windows.h>

#define SIZE 10

__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
	int i = threadIdx.x + (blockIdx.x*blockDim.x);
	int *sum = 0;

	if (i<SIZE)
	for (i = 2; i<SIZE; i++)
	{
		int counter = 0;
		for (int j = 2; j<d_a[i]; j++)
		{
			if (d_a[i] % j == 0)
			{
				counter = 1;
				break;
			}
		}

		if (counter == 0)
		{
			//printf("\n %d", d_a[i]);
			d_b[i] = d_a[i];
			sum = sum + d_a[i];
			//d_c = d_c + d_a[i];
			//d_c = sum;
		}
	/*	for (int k = 0; k<SIZE; k++)
		{
			//if (d_b[k] != 0)
				sum = sum + d_b[k];
				d_c = sum;
		}*/
	}
}
int main()
{

	int *a, *b, *c=0 ;      //Declaration of host and device  
	int *d_a, *d_b, *d_c;
	cudaEvent_t start, stop;
	float elapseTime;
	int blocks, block_size = 512;

	a = (int *)malloc(SIZE*sizeof(int));
	b = (int *)malloc(SIZE*sizeof(int));
	c = (int *)malloc(SIZE*sizeof(int));

	cudaMalloc((void**)&d_a, SIZE * sizeof(int));
	cudaMalloc((void**)&d_b, SIZE * sizeof(int));
	cudaMalloc((void**)&d_c, SIZE * sizeof(int));



	for (int i = 1; i<SIZE; i++)
	{
		a[i] = i;
		b[i] = 0;
		c = 0;
	}

	cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);


	blocks = SIZE / block_size;
	if (SIZE% block_size != 0)
		blocks++;

	//Calculate time of execution
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	//start recording 
	cudaEventRecord(start, 0);


	vectoreAdd << < blocks, block_size >> >(d_a, d_b, d_c);

	//stop recording
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	//Copy results from GPU to CPU
	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	printf("
Total elapsed time on GPU=%lf", elapseTime);

	cudaThreadSynchronize();

	
	cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
	
	for (int m = 0; m < SIZE; m++)
	{
		if (b[m] != 0)
		{
			printf("

 prime no is:%d", b[m]);
		}
	}
	for (int i = 1; i < SIZE; i++)
	{
		if (b[i] != 0)
		{
			printf("\n\n	b[%d]=%d", i, b[i]);
		}
	}//sum = sum + b[j];
	printf("
 
sum of all prime no upto %d is:%d", SIZE, c);

	free(a);free(b);free(c);
	cudaFree(d_a);cudaFree(d_b);cudaFree(d_c);
	getchar();

	return 0;

}
Thanks :)

*** Edit ***

reply to #2

1. When I tried it with SIZE 2500, it shows "summation is = 0".
2. After adding your code

    printf("\n\n Total prime no: %d", count);
    for (int j = 1; j<SIZE; j++)
    {
        printf("Adding %d\n", b[j]); // <----------------- some debug output
        sum = sum + b[j];
    }

it shows the wrong output. I tried this for SIZE 10; the output was 34.


Can you learn to use code tags for your long code?
It is an insult to readers not to try simple improvements to your postings.

This will not work. You cannot compute the sum like this in the kernel. You have to consider what the kernel actually is, and how it is executed.

You can imagine it like that:

The kernel is executed by many threads in parallel. When you have a block size of 100, then the kernel will be executed by 100 threads. At the same time!

And if these 100 threads are all writing into the (global!) d_c variable, then the final value of this variable will basically be random garbage. It will almost certainly not contain the real sum.

(You could solve this with „atomics“, but these require a certain compute capability, and, more importantly: they would destroy all parallelism in this case and cause very low performance).
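
For illustration, a minimal sketch of that atomic variant could look like the following. This is not code from this thread: d_in and d_sum are assumed names, d_sum is a single int in device memory that the host sets to 0 before the launch, and atomicAdd on a global int needs the code to be compiled for at least sm_11 (the GeForce 210 is sm_12).

// Sketch only (assumed names): d_in holds the numbers 0..SIZE-1, d_sum is ONE
// int in device memory, zeroed by the host before the launch.
__global__ void sumPrimesAtomic(const int *d_in, int *d_sum)
{
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < SIZE && d_in[i] >= 2)
	{
		int isPrime = 1;
		for (int j = 2; j < d_in[i]; j++)
		{
			if (d_in[i] % j == 0) { isPrime = 0; break; }
		}
		if (isPrime)
			atomicAdd(d_sum, d_in[i]);   // concurrent additions are serialized by the hardware
	}
}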

If you want to compute the sum of all elements of a (global) array, then you can use a reduction. This question is likely from one of your colleagues: JCuda - Byte-Welt - Die Welt des Programmierens , but there are many examples of different reductions online.
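
To give a rough idea of the pattern, a sketch along the lines of those common reduction examples could look like this (d_in, d_blockSums and n are assumed names, and blockDim.x is assumed to be a power of two):

// Sketch of a standard block-wise shared-memory sum reduction (assumed names,
// not code from this thread). Each block writes one partial sum; the host (or
// a tiny follow-up kernel) adds the per-block results afterwards.
// Launch example: reduceSum<<<blocks, blockSize, blockSize*sizeof(int)>>>(d_b, d_blockSums, SIZE);
__global__ void reduceSum(const int *d_in, int *d_blockSums, int n)
{
	extern __shared__ int sdata[];
	int tid = threadIdx.x;
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	sdata[tid] = (i < n) ? d_in[i] : 0;   // pad out-of-range elements with 0
	__syncthreads();

	// halve the number of active threads in every step
	for (int s = blockDim.x / 2; s > 0; s >>= 1)
	{
		if (tid < s)
			sdata[tid] += sdata[tid + s];
		__syncthreads();
	}

	if (tid == 0)
		d_blockSums[blockIdx.x] = sdata[0];
}

Afterwards the handful of per-block sums can be copied back and added up on the host.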

I tagged this thread with CUDA, I guess, because it’s showing me here: "tags for this thread is: CUDA" -_-

Can you please tell me what I should do to use code tags?

Rule for every forum everywhere and everything in life:
you can look at how others are doing it, e.g. quoting/citing other postings to see how they are built.

write

[code]
 code-lines 
[/code]

to get

 code-lines 

A general search for ‘forum code tags’ finds something, too:
BB Code List - Daily Writing Tips Forum
https://www.phpbb.com/community/faq.php?mode=bbcode
and so on

reply to #5 @marco

I used the reduction logic in my code by referring to this link: How to find the sum of array in CUDA by reduction - Stack Overflow
But it is giving some errors. I am not getting my mistake; could you please help me out? :slight_smile:



#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>

#define SIZE 10
#define N 100

__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
	__shared__ int sdata[256];

	int i = threadIdx.x + (blockIdx.x*blockDim.x);

	sdata[threadIdx.x] = d_a[i];
	__syncthreads();

	if (i<SIZE)

	for (i = 2; i<SIZE; i++)
	{
		int counter = 0;
		for (int j = 2; j<d_a[i]; j++)
		{
			if (d_a[i] % j == 0)
			{
				counter = 1; break;
			}
		}
		if (counter == 0)
		{
			d_b[i] = d_a[i];
		}

	}
	// do reduction in shared mem
	for (int s = 1; s < blockDim.x; s *= 2)
	{
		int index = 2 * s * threadIdx.x;

		if (index < blockDim.x)
		{
			sdata[index] += sdata[index + s];
		}
		__syncthreads();
	}

	// write result for this block to global mem
	if (threadIdx.x == 0)
		atomicAdd(d_c, sdata[0]);
}

}
int main()
{
	clock_t tic = clock();
	int *a, *b, *summation=0, sum = 0,count=-1;       //declare summation as double/long if needed
	int *d_a, *d_b, *d_c;

	//int blocks, block_size = 512;

	int size = N * sizeof(int); 
	
	a = (int *)malloc(SIZE*sizeof(int));
	b = (int *)malloc(SIZE*sizeof(int));
	summation = (int *)malloc(SIZE*sizeof(int));
	

	cudaMalloc((void**)&d_a, SIZE * sizeof(int));
	cudaMalloc((void**)&d_b, SIZE * sizeof(int));
	cudaMalloc((void**)&d_c, SIZE * sizeof(int));


	for (int i = 1; i<SIZE; i++)
	{
		a[i] = i;
		b[i] = 0;
	
	}

	cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	/*blocks = SIZE / block_size;
	if (SIZE% block_size != 0)
		blocks++;   */

	dim3 blocksize(256); // create 1D threadblock
	dim3 gridsize(N / blocksize.x);  //create 1D grid

	vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);


	//cudaThreadSynchronize();

	cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost); 
	cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

	for (int m = 0; m < SIZE; m++)
	{
		if (b[m] != 0)
		{
			printf("
 prime no is:%d", b[m]);
			count = count + 1;
		}
	}
	printf("

 Total prime no. are: %d", count);
/*		for (int j = 1; j<SIZE; j++)
	{
		sum = sum + b[j];
	}*/

	printf("
 
sum of all prime no upto %d is:%d", SIZE, summation);
	
	clock_t toc = clock();
	printf("

Elapsed: %f seconds
", (double)(toc - tic) / CLOCKS_PER_SEC);

	free(a);	free(b);	free(summation);
	cudaFree(d_a);		cudaFree(d_b);		cudaFree(d_c);
	
	getchar();	return 0;
} 

*** Edit ***

Thank you so much, I will keep it in mind from now onwards…!! :slight_smile:

It looks like you tried to do the „prime detection“ and the reduction in the same kernel.

I’ll have to look at this in more detail, and think a bit more about whether this can really give the right results. (Again: Keep in mind that this is executed by all threads in parallel!).
But thinking about this is actually your job :wink:

As a first step, I’d recommend to create ONE kernel for the „prime detection“, and ONE kernel that does the reduction.
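
The host side of such a two-kernel approach could then roughly follow this pattern. This is only a sketch: primeKernel stands for the prime-detection kernel (filling d_b with the prime values and 0 elsewhere), reduceSum stands for a separate reduction kernel like the one sketched earlier, and d_blockSums/blockSums are made-up names.

	// Sketch of the two-kernel flow (assumed names, not a drop-in main()):
	int blockSize = 256;
	int blocks = (SIZE + blockSize - 1) / blockSize;

	int *d_blockSums;
	cudaMalloc((void**)&d_blockSums, blocks * sizeof(int));

	// 1) prime detection: d_b[i] = i if i is prime, 0 otherwise
	primeKernel<<<blocks, blockSize>>>(d_a, d_b);

	// 2) reduction: one partial sum per block
	reduceSum<<<blocks, blockSize, blockSize * sizeof(int)>>>(d_b, d_blockSums, SIZE);

	// 3) add the few per-block results on the host
	int *blockSums = (int*)malloc(blocks * sizeof(int));
	cudaMemcpy(blockSums, d_blockSums, blocks * sizeof(int), cudaMemcpyDeviceToHost);
	int total = 0;
	for (int k = 0; k < blocks; k++)
		total += blockSums[k];
	printf("\nsum of all primes below %d: %d\n", SIZE, total);

Summing the handful of per-block results on the CPU keeps both kernels simple and avoids atomics entirely.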

I also thought about it and wrote another program… :lol:
But it is showing a garbage value…






#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>

#define SIZE 10

__global__ void prime(int *d_a, int *d_b)
{
	int i = threadIdx.x + (blockIdx.x*blockDim.x);

	if (i<SIZE)

	for (i = 2; i<SIZE; i++)
	{
		int counter = 0;
		for (int j = 2; j<d_a[i]; j++)
		{
			if (d_a[i] % j == 0)
			{
				counter = 1; break;
			}
		}
		if (counter == 0)
		{
			d_b[i] = d_a[i];
		}

	}
}

__global__ void add(int *d_c)
{
	int i = threadIdx.x + (blockDim.x * blockIdx.x);
	int id = threadIdx.x;
	extern __shared__ int s[];
	s[id] = 0;
	__syncthreads();
	if (id<SIZE)
	{
		s[id] = d_c[i];
	}
	__syncthreads();
	

	//applying the reduction
	if (blockDim.x >= 2048){
		if (id < 1024){
			s[id] = s[id] + s[id + 512];
		}
		__syncthreads();
	}
	if (blockDim.x >= 1024){
		if (id < 512){
			s[id] = s[id] + s[id + 512];
		}
		__syncthreads();
	}

	if (blockDim.x >= 512){
		if (id < 256)
		{
			s[id] = s[id] + s[id + 256];
		}
		__syncthreads();
	}
	if (blockDim.x >= 256){
		if (id < 128)
		{
			s[id] = s[id] + s[id + 128];
		}
		__syncthreads();
	}
	if (blockDim.x >= 128){
		if (id < 64)
		{
			s[id] = s[id] + s[id + 64];
		}
		__syncthreads();
	}
	//if this is the last warp
	if (id < 32){
		if (blockDim.x >= 64)
		{

			s[id] = s[id] + s[id + 32];
		}
		if (blockDim.x >= 32)
		{

			s[id] = s[id] + s[id + 16];
		}
		if (blockDim.x >= 16)
		{

			s[id] = s[id] + s[id + 8];
		}
		if (blockDim.x >= 8)
		{

			s[id] = s[id] + s[id + 4];
		}
		if (blockDim.x >= 4)
		{

			s[id] = s[id] + s[id + 2];
		}
		if (blockDim.x >= 2)
		{

			s[id] = s[id] + s[id + 1];
		}
	}
	//thread zero will store min of this block i.e. s[0];
	if (id == 0)
	{
		//ga[blockIdx.x] = s[0];
		atomicAdd(d_c, s[0]);
	}
}




int main()
{
	//clock_t tic = clock();
	int *a, *b,sum = 0, count = -1;  
	int cd[SIZE];//Declaration of host and 	device
		int *d_a, *d_b, *d_c;
		cudaError_t err;
	cudaEvent_t start, stop;
	float elapseTime;
	int blocks, block_size = 512;

	a = (int *)malloc(SIZE*sizeof(int));
	b = (int *)malloc(SIZE*sizeof(int));
	//cd = (int *)malloc(SIZE*sizeof(int));

	cudaMalloc((void**)&d_a, SIZE * sizeof(int));
	cudaMalloc((void**)&d_b, SIZE * sizeof(int));
	
	if (cudaSuccess != (err = (cudaMalloc((void**)&d_c, SIZE * sizeof(int))))){
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	for (int i = 1; i<SIZE; i++)
	{
		a[i] = i;
		b[i] = 0;
		cd[i] = 0;
	}

	cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
	if (cudaSuccess != (err = (cudaMemcpy(d_c, cd, SIZE*sizeof(int), cudaMemcpyHostToDevice)))){
		printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	blocks = SIZE / block_size;
	if (SIZE% block_size != 0)
	blocks++;

	
	//Calculate time of execution
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	//start recording
	cudaEventRecord(start, 0);


	prime << < blocks, block_size >> >(d_a, d_b);
//	cudaThreadSynchronize();

	add << < blocks, block_size >> >(d_c);
	//stop recording
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	//Copy results from GPU to CPU
	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);	cudaEventDestroy(stop);



	//cudaThreadSynchronize();

	cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost);
	if (cudaSuccess != (err = cudaMemcpy(cd, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost))){
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}


	for (int m = 0; m < SIZE; m++)
	{
		if (b[m] != 0)
		{
			printf("
 prime no is:%d", b[m]);
			cd[m] = b[m];
			count = count + 1;
		}
	}
	printf("

 Total prime no: %d", count);
	/*for (int j = 1; j<SIZE; j++)
	{
		sum = sum + b[j];
	}*/
	
	//printf("
 
sum of all prime no upto %d is:%d", SIZE, c);
	printf("

 summation : %d	
", cd[0]);



	printf("

Total elapsed time on GPU=%lf", elapseTime);
	//clock_t toc = clock();
	//printf("

Elapsed: %f seconds
", (double)(toc - tic) /CLOCKS_PER_SEC);

	free(a);
	free(b);
	free(cd);
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	getchar();


	return 0;
}


Actually, I want to give the output of the first kernel to the second kernel as input, but I doubt whether that is really happening in this code or not. :stuck_out_tongue_winking_eye:

I am not responsible for debugging your code

Usually I’m trying to help everybody. When you have a specific question, I will try to help you and answer it. (Although I’m not a CUDA expert.)

But you obviously have no idea what you are doing there. The current program clearly prints
Error in cudaMemcpy D_T_H an illegal memory access was encountered
and now you have to figure out what is wrong there.
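
One general technique that helps with narrowing such errors down is to check the error status right after every kernel launch. A sketch only, reusing the names from the code above:

	// Hedged sketch: minimal error checking after a kernel launch. An
	// "illegal memory access" inside a kernel is typically only reported by a
	// LATER CUDA call, so checking right after the launch localizes it.
	prime << < blocks, block_size >> >(d_a, d_b);
	cudaError_t launchErr = cudaGetLastError();        // errors from the launch itself
	cudaError_t syncErr   = cudaDeviceSynchronize();   // errors raised while the kernel runs
	if (launchErr != cudaSuccess || syncErr != cudaSuccess)
	{
		printf("prime kernel failed: %s / %s\n",
			cudaGetErrorString(launchErr), cudaGetErrorString(syncErr));
	}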

This is difficult, because the code is a copy-and-paste mess (and that’s another reason why I don’t see why I should do this for you…). There are dozens of errors in the code that will prevent it from running sooner or later, and I cannot fix them for you.

So before I’ll try to help you any further: Clean up the code, and try to understand what the code actually does.

You will not have any success with “trial-and-error” here.

I resolved the error in the code, and here is the CUDA code for the addition of prime numbers from 1 to 20,000:




#include<stdio.h>
#include<cuda.h>
#define SIZE 20000

__global__ void prime(int *ga, int *gb)				//ga=sum, gb=prime 
{
	int i = threadIdx.x + (blockDim.x * blockIdx.x);
	int k = threadIdx.x;

	int flag, j;
	if (i < SIZE)
	{
		flag = 0;
		for (j = 2; j <= i / 2; j++)
		{
			if (ga[i] % j == 0)
			{
				flag = 1;
				break;
			}
		}
		if (flag == 0)
		{
			gb[i] = ga[i];
		}
	}

	extern __shared__ int arr[];
	arr[k] = 0;
	__syncthreads();
	if (k<SIZE)
	{
		arr[k] = gb[i];
	}
	__syncthreads();

	if (blockDim.x >= 1024){
		if (k < 512){
			arr[k] = arr[k] + arr[k + 512];}
		__syncthreads();
	}

	if (blockDim.x >= 512)
	{
		if (k < 256)
		{arr[k] = arr[k] + arr[k + 256];}
		__syncthreads();
	}
	if (blockDim.x >= 256)
	{
		if (k < 128)
		{arr[k] = arr[k] + arr[k+ 128];}
		__syncthreads();
	}
	if (blockDim.x >= 128)
	{
		if (k < 64)
		{arr[k] = arr[k] + arr[k + 64];}
		__syncthreads();
	}
	
	if (k < 32){
		if (blockDim.x >= 64)
		{arr[k] = arr[k] + arr[k + 32];}
		
		if (blockDim.x >= 32)
		{arr[k] = arr[k] + arr[k + 16];}
		
		if (blockDim.x >= 16)
		{arr[k] = arr[k] + arr[k + 8];}
		
		if (blockDim.x >= 8)
		{arr[k] = arr[k] + arr[k + 4];}
		
		if (blockDim.x >= 4)
		{arr[k] = arr[k] + arr[k + 2];}
		
		if (blockDim.x >= 2)
		{	arr[k] = arr[k] + arr[k + 1];}
	}

	if (k == 0)
	{
		atomicAdd(ga, arr[0]);
	}
}

int  main()
{
	int ha[SIZE], cpu1[SIZE], cpu2[SIZE], count = 0, i, flag, sum = 0;
	int *ga, *gb, hb[SIZE];
	int size = SIZE*sizeof(int);

	const int blocksize = 512;
	int sh_size = (blocksize * sizeof(int));

	int blocks = SIZE / blocksize;
	if (SIZE % blocksize != 0) blocks++;

	for (int k = 0; k < SIZE; k++)			//start from 2
	{
		cpu1[k] = k;
		ha[k] = k;
		cpu2[k] = 0;
	}

	for (i = 2; i < SIZE; i++)
	{
		flag = 0;
		for (int j = 2; j <= i / 2; j++)
		{
			if (cpu1[i] % j == 0)
			{
				flag = 1; break;
			}
		}
		if (flag == 0)
		{
			if (cpu1[i] != 0)
				cpu2[i] = cpu1[i];

			sum = sum + cpu2[i];
			//printf("%d	", cpu2[i]);
			count++;
		}
	}
	printf("

Total prime no. count on CPU = %d
", count);

	printf("
Addition on CPU  : %d

", sum);

	cudaError_t err;
	cudaEvent_t start, stop;
	float elapseTime;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	if (cudaSuccess != (err = (cudaMalloc((void**)&ga, size))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = (cudaMalloc((void**)&gb, size))))
	{
		printf("error in cuaMalloc  %s", (char *)cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	if (cudaSuccess != (err = (cudaMemcpy(ga, ha, size, cudaMemcpyHostToDevice))))
	{
		printf("Error in cudaMemcpy H_T_D  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	cudaEventRecord(start, 0);

	prime << <blocks, blocksize, sh_size >> >(ga, gb);
	
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	cudaEventElapsedTime(&elapseTime, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	if (cudaSuccess != (err = cudaMemcpy(hb, gb, size, cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	if (cudaSuccess != (err = cudaMemcpy(ha, ga, size, cudaMemcpyDeviceToHost)))
	{
		printf("Error in cudaMemcpy D_T_H  %s ", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
	
	for (int y = 2; y < SIZE; y++)
	{
		if (hb[y] != 0)
		printf("	 %d	", hb[y]);				//prime=hb
	}
	printf("

Total prime no. count on CPU = %d", count);
	printf("

Summation on GPU  : %d	
", ha[0]);		//ha=sum						
	printf("
Total elapsed time on GPU=%lfmillisecond
 ", elapseTime);
	cudaFree(ga);
	getchar();
	return 0;
}

 

Good to hear that you managed to resolve this.

(Although I assume that this was mainly intended for “practicing” and for becoming familiar with CUDA - I don’t think that you’ll see a speedup by doing this on the GPU)