Hi,
I would like to do the segmented scan.
input: [93, 90, 37, 97, 91, 27, 59, 0, 52, 18, 68, 1, …,86, 9, 83, 78, 1, 39, 24, 2, 29, 18, 57, 5, 73]
flag: [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,…, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
I want to compute the prefix sum over each segment of 5 elements, but my attempt does not produce the correct result.
My code is below; can anyone help me correct it?
import java.util.Arrays;
import java.util.Date;
import java.util.Random;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcudpp.CUDPPAlgorithm;
import jcuda.jcudpp.CUDPPConfiguration;
import jcuda.jcudpp.CUDPPDatatype;
import jcuda.jcudpp.CUDPPHandle;
import jcuda.jcudpp.CUDPPOperator;
import jcuda.jcudpp.CUDPPOption;
import jcuda.jcudpp.JCudpp;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaMemcpyKind;
public class JCudppPrefixSum {
// Program main
public static int[] prefixSum(int[] h_idata, int[] flag){
int numElements = h_idata.length;
int memSize = Sizeof.INT * numElements;
// allocate device memory
Pointer d_idata = new Pointer();
Pointer d_iflags = new Pointer();
JCuda.cudaMalloc(d_idata, memSize);
JCuda.cudaMalloc(d_iflags, memSize);
// copy host memory to device
JCuda.cudaMemcpy(d_idata, Pointer.to(h_idata), memSize,
cudaMemcpyKind.cudaMemcpyHostToDevice);
JCuda.cudaMemcpy(d_iflags, Pointer.to(d_iflags), memSize,
cudaMemcpyKind.cudaMemcpyHostToDevice);
// allocate device memory for result
Pointer d_odata = new Pointer();
JCuda.cudaMalloc(d_odata, memSize);
CUDPPConfiguration config = new CUDPPConfiguration();
config.op = CUDPPOperator.CUDPP_ADD;
config.datatype = CUDPPDatatype.CUDPP_INT;
config.algorithm = CUDPPAlgorithm.CUDPP_SEGMENTED_SCAN;
config.options = CUDPPOption.CUDPP_OPTION_FORWARD |
CUDPPOption.CUDPP_OPTION_EXCLUSIVE;
CUDPPHandle scanplan = new CUDPPHandle();
JCudpp.cudppPlan(scanplan, config, numElements, 1, 0);
// Run the scan
JCudpp.cudppSegmentedScan(scanplan, d_odata, d_idata, d_iflags, numElements);
// allocate mem for the result on host side
int h_odata[] = new int[numElements];
// copy result from device to host
JCuda.cudaMemcpy(Pointer.to(h_odata), d_odata, memSize,
cudaMemcpyKind.cudaMemcpyDeviceToHost);
JCudpp.cudppDestroyPlan(scanplan);
System.out.println(Arrays.toString(h_odata));
JCuda.cudaFree(d_idata);
JCuda.cudaFree(d_odata);
JCuda.cudaFree(d_iflags);
return h_odata;
}
public static void main(String args[])
{
int numElements = 50;
// allocate host memory
int h_idata[] = new int[numElements];
int flag[] = new int[numElements];
for(int i=0; i<numElements; i++){
if(i%5 ==0)
flag** = 1;
else
flag** = 0;
}
// initialize the memory
h_idata = createRandomIntData(numElements);
System.out.println(Arrays.toString(h_idata));
System.out.println(Arrays.toString(flag));
prefixSum(h_idata, flag);
}
private static int[] createRandomIntData(int n)
{
Random random = new Random((new Date()).getTime());
int x[] = new int[n];
for (int i = 0; i < n; i++)
{
x** = random.nextInt(100);
}
return x;
}
}
Regards,
Lemon