I am using JCublas and calling cublasDsyrk.
It normally works fine, but after about an hour (I am doing a lot of processing) I get an EXCEPTION_ACCESS_VIOLATION.
It seems to happen only when I have big matrices - about 500 x 150,000. This would mean about 75m doubles, which at 8 bytes each is about 600 MB.
Is there an obvious memory limit or something else I should be aware of? I am using a GeForce GT 640, which I think is fairly low spec.
My GPU specs are:
name=GeForce GT 640
totalGlobalMem=4294967296
sharedMemPerBlock=49152
regsPerBlock=65536
warpSize=32
memPitch=2147483647
maxThreadsPerBlock=1024
maxThreadsDim=[1024, 1024, 64]
maxGridSize=[2147483647, 65535, 65535]
clockRate=797000
totalConstMem=65536
major=3
minor=0
textureAlignment=512
texturePitchAlignment=32
deviceOverlap=1
multiProcessorCount=2
kernelExecTimeoutEnabled=1
integrated=0
canMapHostMemory=1
computeMode=cudaComputeModeDefault
maxTexture1D=65536
maxTexture1DMipmap=16384
maxTexture1DLinear=134217728
maxTexture2D=[65536, 65536]
maxTexture2DMipmap=[16384, 16384]
maxTexture2DLinear=[65000, 65000, 1048544]
maxTexture2DGather=[16384, 16384]
maxTexture3D=[4096, 4096, 4096]
maxTexture3DAlt=[2048, 2048, 16384]
maxTextureCubemap=16384
maxTexture1DLayered=[16384, 2048]
maxTexture2DLayered=[16384, 16384, 2048]
maxTextureCubemapLayered=[16384, 2046]
maxSurface1D=65536
maxSurface2D=[65536, 32768]
maxSurface3D=[65536, 32768, 2048]
maxSurface1DLayered=[65536, 2048]
maxSurface2DLayered=[65536, 32768, 2048]
maxSurfaceCubemap=32768
maxSurfaceCubemapLayered=[32768, 2046]
surfaceAlignment=512
concurrentKernels=1
ECCEnabled=0
pciBusID=1
pciDeviceID=0
pciDomainID=0
tccDriver=0
asyncEngineCount=1
unifiedAddressing=1
memoryClockRate=891000
memoryBusWidth=128
l2CacheSize=262144
maxThreadsPerMultiProcessor=2048
streamPrioritiesSupported=0
globalL1CacheSupported=0
localL1CacheSupported=1
sharedMemPerMultiprocessor=49152
regsPerMultiprocessor=65536
managedMemory=1
isMultiGpuBoard=0
multiGpuBoardGroupID=0
*** Edit ***
// Computes C = A^T * A on the GPU with cublasDsyrk and copies the result back into dh_C.
// As passed to cublasDsyrk below, A is read as an nRuns x nX column-major matrix (lda = nRuns)
// and C is nX x nX (ldc = nX); only the upper triangle of C is written by the routine.
//
// Every CUDA / cuBLAS call returns a status code. Ignoring it lets a failed allocation or copy
// go unnoticed until a later call touches an invalid device pointer, so each status is checked
// here. Both cudaSuccess and CUBLAS_STATUS_SUCCESS have the numeric value 0.

// Element counts are validated in int (Java arrays and cublasSetVector take int counts);
// Math.multiplyExact throws ArithmeticException instead of silently wrapping around.
final int aCount = Math.multiplyExact(nRuns, nX);
final int cCount = Math.multiplyExact(nX, nX);
// Byte sizes are computed in long so that count * 8 cannot overflow a 32-bit int.
final long aBytes = (long) aCount * Sizeof.DOUBLE;
final long cBytes = (long) cCount * Sizeof.DOUBLE;

double dh_A[] = new double[aCount];
double dh_C[] = new double[cCount];
// create dh_A array

Pointer d_A = new Pointer();
Pointer d_C = new Pointer();
cublasHandle handle = new cublasHandle();
int status = cublasCreate(handle);
if (status != 0) {
    throw new IllegalStateException("cublasCreate failed with status " + status);
}
try {
    status = cudaMalloc(d_A, aBytes);
    if (status != 0) {
        throw new IllegalStateException("cudaMalloc of " + aBytes + " bytes for d_A failed with status " + status);
    }
    status = cudaMalloc(d_C, cBytes);
    if (status != 0) {
        throw new IllegalStateException("cudaMalloc of " + cBytes + " bytes for d_C failed with status " + status);
    }

    status = cublasSetVector(aCount, Sizeof.DOUBLE, Pointer.to(dh_A), 1, d_A, 1);
    if (status != 0) {
        throw new IllegalStateException("cublasSetVector for d_A failed with status " + status);
    }
    // dh_C is freshly allocated (all zeros). Uploading it clears the part of d_C that
    // cublasDsyrk leaves untouched, so the lower triangle read back below is well defined.
    status = cublasSetVector(cCount, Sizeof.DOUBLE, Pointer.to(dh_C), 1, d_C, 1);
    if (status != 0) {
        throw new IllegalStateException("cublasSetVector for d_C failed with status " + status);
    }

    Pointer pAlpha = Pointer.to(new double[]{1.0d});
    Pointer pBeta = Pointer.to(new double[]{0.0d});
    status = cublasDsyrk(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, nX, nRuns, pAlpha, d_A, nRuns, pBeta, d_C, nX);
    if (status != 0) {
        throw new IllegalStateException("cublasDsyrk failed with status " + status);
    }

    // The copy back to the host is also where an error raised while the kernel was
    // executing would be reported, so its status matters as much as the launch status.
    status = cublasGetVector(cCount, Sizeof.DOUBLE, d_C, 1, Pointer.to(dh_C), 1);
    if (status != 0) {
        throw new IllegalStateException("cublasGetVector for d_C failed with status " + status);
    }
} finally {
    // Always release device memory and the handle, even when a step above failed;
    // otherwise repeated failures leak GPU memory over a long-running process.
    // Cleanup statuses are intentionally not thrown so they cannot mask the original error.
    cudaFree(d_A);
    cudaFree(d_C);
    cublasDestroy(handle);
}