Thanks for the quick reply.

From what I understand from the posts you mentioned, it seems JCuda is not that simple. For example, in C the workflow is as follows:

- Get some data on host (CPU).
- Allocate memory on the device (GPU).
- Copy data on the device.
- Launch kernels on the device using `<<<numBlocks, numThreads>>>`.
- copy results to host.

So far I can do steps 1, 2 and 3 in Java. You mentioned something about .cu files. Does that mean I can take the kernels I wrote for C-based CUDA and use them from Java?

I am including a small CUDA C program that generates prime numbers. Can you please tell me how to run this kernel from Java via the .cu procedure, or by any other means?

Personally, I think CUDA in C was much more straightforward; it read like a simple C program. JCuda is a bit of a tangle, but using it has its own advantages.

```
#include <stdlib.h>
#include <cmath>
#include <cstdio>
#include <iostream>
using namespace std;
// Launch configuration: number of thread blocks in the grid.
int numBlocks = 16;
// Launch configuration: number of threads in each block.
int numThreads = 128;
// Total number of threads launched; the kernel gives each one its own
// contiguous slice of the sieve array.
int p = numThreads * numBlocks;
//const int p = 100;
// Number of entries in the sieve array (entry i stands for the value i).
const int n = 10 * 1000 * 1000;
/*
 * Returns the first index of slice `id` when `n` elements are split into
 * `p` contiguous slices: floor(id * n / p).
 *
 * id - slice number (callers in this file pass 0 .. p, using id + 1 to find
 *      the end of a slice)
 * p  - total number of slices, must be non-zero
 * n  - total number of elements being partitioned
 *
 * The product id * n is formed in 64-bit arithmetic: with the values used in
 * this file (id up to 2047, n close to 10^7) it is about 2 * 10^10, which
 * does not fit in a 32-bit int. The quotient is at most n whenever id <= p,
 * so narrowing the result back to int is safe for those calls.
 */
__device__ static int block_low(int id, int p, int n)
{
    const long long scaled = (long long)id * (long long)n;
    return (int)(scaled / p);
}
/*
 * Returns the last index (inclusive) of slice `id` when `n` elements are
 * split into `p` contiguous slices: one less than the first index of the
 * following slice.
 */
__device__ static int block_high(int id, int p, int n)
{
    const int next_slice_start = block_low(id + 1, p, n);
    return next_slice_start - 1;
}
/*
 * Returns the number of elements in slice `id` when `n` elements are split
 * into `p` contiguous slices: the distance between the start of this slice
 * and the start of the next one.
 */
__device__ static int block_size(int id, int p, int n)
{
    const int this_slice_start = block_low(id, p, n);
    const int next_slice_start = block_low(id + 1, p, n);
    return next_slice_start - this_slice_start;
}
/*
 * Sieve of Eratosthenes kernel: sets sieve[v] = 1 for every composite v in
 * [4, sieve_size). Entries for primes (and for 0 and 1) are left untouched.
 *
 * sieve      - device array of sieve_size ints; entry v stands for value v.
 *              MUST be zero-filled before the launch.
 * sieve_size - number of entries in `sieve`.
 * p          - number of slices the array is split into; thread `tid` owns
 *              slice `tid` and only ever writes inside it.
 *
 * Launch layout: 1-D grid of 1-D blocks with gridDim.x * blockDim.x >= p.
 * Threads with a global index >= p do nothing. No shared memory is used.
 *
 * Concurrency note: every thread reads sieve[prime] for small candidates
 * while the thread owning that entry may still be writing it. A stale 0 for
 * a composite candidate only causes redundant marking (multiples of a
 * composite are composite). Primes are never written, so they always read 0.
 * The result is therefore correct without inter-thread synchronisation.
 */
__global__ static void Sieve(int* sieve,int sieve_size,int p)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // Surplus threads (launch larger than p) own no slice.
    if (tid >= p) return;

    // Inclusive index range this thread is responsible for. The whole array
    // is partitioned, so the last entry (sieve_size - 1) is covered too.
    const int low_value  = block_low(tid, p, sieve_size);
    const int high_value = block_high(tid, p, sieve_size);

    // Candidates whose square lies inside the array are sufficient. The
    // square is formed in 64 bits so it cannot overflow for large arrays.
    for (int prime = 2; (long long)prime * prime < sieve_size; ++prime)
    {
        // Already known composite: its multiples are handled by a smaller
        // prime factor.
        if (sieve[prime] != 0) continue;

        // First index to mark: the larger of prime^2 and the smallest
        // multiple of prime that is >= low_value. This is an ABSOLUTE index
        // into the global array, not an offset from low_value.
        long long first = (long long)prime * prime;
        if (first < low_value)
        {
            const int remainder = low_value % prime;
            first = (remainder == 0)
                        ? (long long)low_value
                        : (long long)low_value + (prime - remainder);
        }

        for (long long i = first; i <= high_value; i += prime) sieve[i] = 1;
    }
} //End Function
// Prints a readable message and terminates the program if a CUDA runtime
// call did not return cudaSuccess. `what` names the step that was attempted.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: allocates and zeroes the sieve on the device, runs the Sieve
 * kernel with numBlocks x numThreads threads, copies the result back and
 * prints every value in [2, n) whose sieve entry is still 0 (i.e. the primes).
 * Output is paused with getchar() whenever the loop counter is a multiple
 * of 20.
 */
int main()
{
    // Refuse configurations where the first slice is too short to contain
    // every sieving candidate up to sqrt(n).
    const int bl0_size = (n-1)/p;
    if (2+bl0_size < (int) sqrt((double) n))
    {
        std::cout<<"\nToo Many Blocks";
        getchar();
        return 0;
    }//End If

    const size_t sieve_bytes = sizeof(int) * n;
    int *host_sieve = new int[n];
    int *device_sieve = 0;

    checkCuda(cudaMalloc((void**) &device_sieve, sieve_bytes), "cudaMalloc");
    // cudaMalloc does not clear memory; the kernel treats 0 as "not marked",
    // so the device array has to be zeroed explicitly before the launch.
    checkCuda(cudaMemset(device_sieve, 0, sieve_bytes), "cudaMemset");

    Sieve<<<numBlocks,numThreads>>>(device_sieve,n,p);
    // Launch-configuration errors are only visible through cudaGetLastError;
    // faults inside the kernel surface at the following synchronisation.
    checkCuda(cudaGetLastError(), "Sieve launch");
    checkCuda(cudaDeviceSynchronize(), "Sieve execution");

    // The copy overwrites the whole host buffer, so it needs no prior fill.
    checkCuda(cudaMemcpy(host_sieve, device_sieve, sieve_bytes,
                         cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    checkCuda(cudaFree(device_sieve), "cudaFree");

    for (int i = 2; i<n; i++)
    {
        if (host_sieve[i] == 0)
        {std::cout<<i<<" ";}
        if (i%20 == 0) getchar();
    }
    std::cout<<"\nDONE\n";

    // Allocated with new[], so it must be released with delete[].
    delete[] host_sieve;
    getchar();
    return 0;
}//End Main
```