Hi,
I am using GPU to accelerate a programme written in JAVA. In the programme, it use „ExecutorService“ to create multi-threads. My idea is that each thread launches an GPU kernel and use streams to make them run concurrently ( all the threads share the same context).
However, there were errors when using asynchronous memory copy. Then, I wrote a simple programme and tried to figure it out. But it did not work.
The following are the codes I used. Some codes are from https://forum.byte-welt.net/archive/index.php/t-4082.html.
import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
import static jcuda.runtime.JCuda.cudaSetDevice;
import static jcuda.runtime.JCuda.cudaFree;
import static jcuda.runtime.JCuda.cudaFreeHost;
import static jcuda.runtime.JCuda.cudaHostAlloc;
import static jcuda.runtime.JCuda.cudaHostAllocWriteCombined;
import static jcuda.runtime.JCuda.cudaMalloc;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.JCuda.cudaMemcpyAsync;
import static jcuda.runtime.JCuda.cudaStreamCreate;
import static jcuda.runtime.JCuda.cudaStreamDestroy;
import static jcuda.runtime.JCuda.cudaStreamSynchronize;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToHost;
import static jcuda.driver.JCudaDriver.*;
import jcuda.Pointer;
import jcuda.runtime.cudaEvent_t;
import jcuda.runtime.*;
import java.util.Arrays;
import java.io.*;
import java.util.Iterator;
import java.util.concurrent.*;
import jcuda.*;
import jcuda.driver.*;
import java.nio.FloatBuffer;
import java.nio.ByteOrder;
import java.util.Locale;
public class JCudaVectorAdd
{
static ExecutorService masterExecutor;
static ExecutorService mapExecutor;
public static void main(String args[]) throws IOException
{
JCudaVectorAdd obj=new JCudaVectorAdd();
obj.run();
return ;
}
public void run()
{
this.masterExecutor= Executors.newSingleThreadExecutor();
this.mapExecutor=Executors.newFixedThreadPool(1);
for(int i=0;i<1;i++)
{
Masterjob masterjob=new Masterjob();
Future<Integer> reduceResult=masterExecutor.submit(masterjob);
while(true)
{
try
{
if( reduceResult.isDone())
break;
}
catch( Throwable ex)
{
}
}
}
masterExecutor.shutdownNow();
mapExecutor.shutdownNow();
}
private class Masterjob implements Callable<Integer>
{
private Masterjob()
{
}
@Override
public Integer call()
{
final CountDownLatch runningjobs=new CountDownLatch(1);
try
{
for(int i=0;i<1;i++)
{
mapExecutor.submit(new ReadMapReduceJob(runningjobs));
}
runningjobs.await();
return 0;
}
catch(Throwable ex)
{
return 0;
}
}
}
private class ReadMapReduceJob implements Runnable
{
final CountDownLatch runningjobs;
private ReadMapReduceJob(CountDownLatch runningjobs)
{
this.runningjobs=runningjobs;
}
@Override
public void run()
{
try{
JCudaDriver.setExceptionsEnabled(true);
cuInit(0);
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
for(int time=0;time<100;time++)
{
int numElements = 100000;
Pointer A=new Pointer();
JCudaDriver.cuMemAllocHost(A,numElements*Sizeof.FLOAT);
FloatBuffer aa= A.getByteBuffer(0,numElements*Sizeof.FLOAT).order(ByteOrder.nativeOrder()).asFloatBuffer();
aa.position(0);
float [] expecteds=new float[numElements];
Arrays.fill(expecteds,3.33f);
aa.put(expecteds,0,numElements);
cudaStream_t stream_new=new cudaStream_t();
cudaStreamCreate(stream_new);
Pointer deviceInputA=new Pointer();
cudaMalloc(deviceInputA, numElements*Sizeof.FLOAT);
cudaMemcpyAsync(deviceInputA,A,numElements*Sizeof.FLOAT, cudaMemcpyHostToDevice,stream_new);
//cudaMemcpy(deviceInputA,A,numElements*Sizeof.FLOAT,cudaMemcpyHostToDevice);
Pointer hostOutput=new Pointer();
JCudaDriver.cuMemAllocHost(hostOutput,numElements*Sizeof.FLOAT);
cudaMemcpyAsync(hostOutput,deviceInputA,numElements*Sizeof.FLOAT, cudaMemcpyDeviceToHost,stream_new);
//cudaMemcpy(hostOutput,deviceInputA,numElements*Sizeof.FLOAT,cudaMemcpyDeviceToHost);
FloatBuffer cc= hostOutput.getByteBuffer(0,numElements*Sizeof.FLOAT).order(ByteOrder.nativeOrder()).asFloatBuffer();
float host_output[]=new float[numElements];
cc.get(host_output);
cc.rewind();
boolean equal = Arrays.equals(expecteds, host_output);
System.out.println("Equal? "+equal);
cudaFreeHost(hostOutput);
cudaFreeHost(A);
cudaFree(deviceInputA);
cudaStreamDestroy(stream_new);
}
}
catch( Throwable ex)
{
}
finally
{
runningjobs.countDown();
}
}
}
}
I used CUDA 7.5 and Java 8 to compile and run. The results are right and wrong.
Equal? false
Equal? false
Equal? false
Equal? false
Equal? false
Equal? false
Equal? false
Equal? false
Equal? true
.....
If I uncomment cudaMemcpy() and comment cudaMemcpyAsync(). The results are right.
Thanks in advance