I am doing a lot of processing and it is taking days, even with CUDA, so maybe I ought to get a more powerful GPU card. I also did some research and it seems there are faster CUBLAS GEMM versions, and maybe I could take advantage of a tuned GEMM in my SYRK implementation. I do everything in double precision. But it seems from my own tests that there isn't much room for improvement — and I need it 100x faster!

Would be interested if anybody can report performance with the following code together with details of what GPU they are using. Might be useful for others looking at purchasing a GPU card.

Thanks.

```
package datamining;
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
import java.util.concurrent.*;
import java.util.Random;
import java.util.*;
import jcuda.*;
import jcuda.runtime.*;
import static jcuda.runtime.JCuda.*;
import jcuda.jcublas.JCublas;
import static jcuda.jcublas.JCublas2.*;
import static jcuda.jcublas.cublasOperation.*;
import static jcuda.jcublas.cublasFillMode.*;
import static jcuda.driver.JCudaDriver.*;
import jcuda.jcublas.cublasHandle;
import jcuda.driver.*;
import java.util.Arrays;
import jcuda.jcublas.JCublas2;
//import jcuda.utils.KernelLauncher;
/**
*
* @author Nigel
*/
public class TestCuda {
private static int numTasks = Runtime.getRuntime().availableProcessors();
private TestCuda() {
super();
}
public static void main(String[] args) {
System.getProperties().list(System.out);
Pointer pointer = new Pointer();
JCuda.cudaMalloc(pointer, 4);
JCuda.cudaFree(pointer);
int[] nDevices = new int[1];
CUdevice[] devices;
JCudaDriver.setExceptionsEnabled(true);
JCuda.setExceptionsEnabled(true);
JCublas2.setExceptionsEnabled(true);
JCudaDriver.cuInit(0);
int count = JCudaDriver.cuDeviceGetCount(nDevices);
if (nDevices[0] == 0) {
System.out.println("No GPU devices found" + count + " " + nDevices[0]);
return;
}
System.out.println("Total number of devices: " + nDevices[0]);
devices = new CUdevice[nDevices[0]];
for (int dev = 0; dev < nDevices[0]; dev++) {
devices[dev] = new CUdevice();
int ireturn = JCudaDriver.cuDeviceGet(devices[dev], dev);
int[] major = new int[1];
int[] minor = new int[1];
ireturn = JCudaDriver.cuDeviceComputeCapability(major, minor, devices[dev]);
System.out.println("Version: " + String.format("%d.%d", major[0], minor[0]));
cudaDeviceProp deviceProp = new cudaDeviceProp();
JCuda.cudaGetDeviceProperties(deviceProp, dev);
System.out.println(deviceProp.toFormattedString());
}
long current;
int total = 40000 * 200 * 200;
int nCols = 200;
int nIts = 5;
while (nCols < 1000) {
int nRows = total / (nCols * nCols);
double dh_A[] = createRandomDoubleData(nRows * nCols);
double dh_B[] = new double[nRows * nCols];
System.arraycopy(dh_A, 0, dh_B, 0, nRows * nCols);
double dh_C[] = createRandomDoubleData(nCols * nCols);
System.out.println("rows " + nRows + " iterations " + nIts + " numTasks " + numTasks + " nCols " + nCols);
Pointer d_A = new Pointer();
Pointer d_B = new Pointer();
Pointer d_C = new Pointer();
JCublas.cublasAlloc(nRows * nCols, Sizeof.DOUBLE, d_A);
JCublas.cublasAlloc(nCols * nCols, Sizeof.DOUBLE, d_C);
current = System.currentTimeMillis();
for (int j = 0; j < nIts; j++) {
JCublas.cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_A), 1, d_A, 1);
JCublas.cublasDsyrk('U', 'T', nCols, nRows, 1.0d, d_A, nRows, 0.0d, d_C, nCols);
JCublas.cublasGetVector(nCols * nCols, Sizeof.DOUBLE, d_C, 1, Pointer.to(dh_C), 1);
}
current = System.currentTimeMillis() - current;
System.out.println("CUBLAS DSYRK " + current);
JCublas.cublasFree(d_A);
JCublas.cublasFree(d_C);
//
current = System.currentTimeMillis();
JCublas.cublasAlloc(nRows * nCols, Sizeof.DOUBLE, d_A);
JCublas.cublasAlloc(nRows * nCols, Sizeof.DOUBLE, d_B);
JCublas.cublasAlloc(nCols * nCols, Sizeof.DOUBLE, d_C);
for (int j = 0; j < nIts; j++) {
JCublas.cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_A), 1, d_A, 1);
JCublas.cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_B), 1, d_B, 1);
JCublas.cublasSetVector(nCols * nCols, Sizeof.DOUBLE, Pointer.to(dh_C), 1, d_C, 1);
JCublas.cublasDgemm('T', 'N', nCols, nCols, nRows, 1.0d, d_A, nRows, d_B, nRows, 0.0d, d_C, nCols);
JCublas.cublasGetVector(nCols * nCols, Sizeof.DOUBLE, d_C, 1, Pointer.to(dh_C), 1);
}
current = System.currentTimeMillis() - current;
System.out.println("CUBLAS DGEMM " + current);
JCublas.cublasFree(d_A);
JCublas.cublasFree(d_B);
JCublas.cublasFree(d_C);
//
try {
cublasHandle handle = new cublasHandle();
cublasCreate(handle);
Pointer pAlpha = Pointer.to(new double[]{1.0d});
Pointer pBeta = Pointer.to(new double[]{0.0d});
cudaMalloc(d_A, nRows * nCols * Sizeof.DOUBLE);
cudaMalloc(d_C, nCols * nCols * Sizeof.DOUBLE);
current = System.currentTimeMillis();
for (int j = 0; j < nIts; j++) {
cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_A), 1, d_A, 1);
cublasSetVector(nCols * nCols, Sizeof.DOUBLE, Pointer.to(dh_C), 1, d_C, 1);
cublasDsyrk(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, nCols, nRows, pAlpha, d_A, nRows, pBeta, d_C, nCols);
cublasGetVector(nCols * nCols, Sizeof.DOUBLE, d_C, 1, Pointer.to(dh_C), 1);
}
current = System.currentTimeMillis() - current;
System.out.println("CUBLAS2 DSYRK " + current);
cudaFree(d_A);
cudaFree(d_C);
//
cudaMalloc(d_A, nRows * nCols * Sizeof.DOUBLE);
cudaMalloc(d_B, nRows * nCols * Sizeof.DOUBLE);
cudaMalloc(d_C, nCols * nCols * Sizeof.DOUBLE);
current = System.currentTimeMillis();
for (int j = 0; j < nIts; j++) {
cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_A), 1, d_A, 1);
cublasSetVector(nRows * nCols, Sizeof.DOUBLE, Pointer.to(dh_B), 1, d_B, 1);
cublasSetVector(nCols * nCols, Sizeof.DOUBLE, Pointer.to(dh_C), 1, d_C, 1);
cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, nCols, nCols, nRows, pAlpha, d_A, nRows, d_B, nRows, pBeta, d_C, nCols);
cublasGetVector(nCols * nCols, Sizeof.DOUBLE, d_C, 1, Pointer.to(dh_C), 1);
}
current = System.currentTimeMillis() - current;
System.out.println("CUBLAS2 DGEMM " + current);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasDestroy(handle);
} catch (Exception e) {
System.out.println(e);
}
nCols += 1;
}
JCublas.cublasShutdown();
}
private static double[] createRandomDoubleData(int n) {
Random random = new Random();
double x[] = new double[n];
for (int i = 0; i < n; i++) {
x** = random.nextDouble();
}
return x;
}
}
```

*** Edit ***

My impression is that with the dimensions I have, data transfer is not the issue: data transfer scales as nRows x nCols, while processing scales as nRows x nCols x nCols.

For example I get the following (note the jump in times when nCols goes from 208 to 209 — 208 is a multiple of 16):

```
Device properties:
name=GeForce GT 640
totalGlobalMem=4294967296
sharedMemPerBlock=49152
regsPerBlock=65536
warpSize=32
memPitch=2147483647
maxThreadsPerBlock=1024
maxThreadsDim=[1024, 1024, 64]
maxGridSize=[2147483647, 65535, 65535]
clockRate=797000
totalConstMem=65536
major=3
minor=0
textureAlignment=512
texturePitchAlignment=32
deviceOverlap=1
multiProcessorCount=2
kernelExecTimeoutEnabled=1
integrated=0
canMapHostMemory=1
computeMode=cudaComputeModeDefault
maxTexture1D=65536
maxTexture1DMipmap=16384
maxTexture1DLinear=134217728
maxTexture2D=[65536, 65536]
maxTexture2DMipmap=[16384, 16384]
maxTexture2DLinear=[65000, 65000, 1048544]
maxTexture2DGather=[16384, 16384]
maxTexture3D=[4096, 4096, 4096]
maxTexture3DAlt=[2048, 2048, 16384]
maxTextureCubemap=16384
maxTexture1DLayered=[16384, 2048]
maxTexture2DLayered=[16384, 16384, 2048]
maxTextureCubemapLayered=[16384, 2046]
maxSurface1D=65536
maxSurface2D=[65536, 32768]
maxSurface3D=[65536, 32768, 2048]
maxSurface1DLayered=[65536, 2048]
maxSurface2DLayered=[65536, 32768, 2048]
maxSurfaceCubemap=32768
maxSurfaceCubemapLayered=[32768, 2046]
surfaceAlignment=512
concurrentKernels=1
ECCEnabled=0
pciBusID=1
pciDeviceID=0
pciDomainID=0
tccDriver=0
asyncEngineCount=1
unifiedAddressing=1
memoryClockRate=891000
memoryBusWidth=128
l2CacheSize=262144
maxThreadsPerMultiProcessor=2048
streamPrioritiesSupported=0
globalL1CacheSupported=0
localL1CacheSupported=1
sharedMemPerMultiprocessor=49152
regsPerMultiprocessor=65536
managedMemory=1
isMultiGpuBoard=0
multiGpuBoardGroupID=0
rows 40000 iterations 5 numTasks 8 nCols 200
CUBLAS DSYRK 768
CUBLAS DGEMM 1219
CUBLAS2 DSYRK 765
CUBLAS2 DGEMM 1218
rows 39602 iterations 5 numTasks 8 nCols 201
CUBLAS DSYRK 746
CUBLAS DGEMM 1297
CUBLAS2 DSYRK 747
CUBLAS2 DGEMM 1261
rows 39211 iterations 5 numTasks 8 nCols 202
CUBLAS DSYRK 741
CUBLAS DGEMM 1296
CUBLAS2 DSYRK 740
CUBLAS2 DGEMM 1262
rows 38826 iterations 5 numTasks 8 nCols 203
CUBLAS DSYRK 742
CUBLAS DGEMM 1274
CUBLAS2 DSYRK 743
CUBLAS2 DGEMM 1190
rows 38446 iterations 5 numTasks 8 nCols 204
CUBLAS DSYRK 733
CUBLAS DGEMM 1190
CUBLAS2 DSYRK 731
CUBLAS2 DGEMM 1168
rows 38072 iterations 5 numTasks 8 nCols 205
CUBLAS DSYRK 734
CUBLAS DGEMM 1209
CUBLAS2 DSYRK 732
CUBLAS2 DGEMM 1155
rows 37703 iterations 5 numTasks 8 nCols 206
CUBLAS DSYRK 735
CUBLAS DGEMM 1209
CUBLAS2 DSYRK 735
CUBLAS2 DGEMM 1176
rows 37340 iterations 5 numTasks 8 nCols 207
CUBLAS DSYRK 733
CUBLAS DGEMM 1193
CUBLAS2 DSYRK 733
CUBLAS2 DGEMM 1151
rows 36982 iterations 5 numTasks 8 nCols 208
CUBLAS DSYRK 723
CUBLAS DGEMM 1174
CUBLAS2 DSYRK 729
CUBLAS2 DGEMM 1152
rows 36629 iterations 5 numTasks 8 nCols 209
CUBLAS DSYRK 762
CUBLAS DGEMM 1366
CUBLAS2 DSYRK 762
CUBLAS2 DGEMM 1280
```

*** Edit ***

My back of the envelope makes this around 12.6 GFlops, if you include multiply and add operations.

1,600,000,000 multiply/adds repeated 5 times takes 1.2 secs.

So each run takes 1.2 / 5 = 0.24 secs, giving 1.6e9 / 0.24 ≈ 6.4 billion multiply/adds per second, which is about 12.6 GFlops if you count the multiply and the add as separate floating-point operations.

Seems very far short of other benchmarks I have seen.

*** Edit ***

A GTX Titan (not the Titan X) reportedly gives 1306 GFlops on DGEMM — see the "Titan's Compute Performance (aka Ph.D Lust)" section of the review "NVIDIA's GeForce GTX Titan Review, Part 2: Titan's Performance Unveiled".

If correct, that is the kind of performance I need. But I'm not going to go out and spend $1000 without some due diligence.