Performance Optimization

Hi,

I'm programming LSTM neural networks for my bachelor's thesis.
For this I'm using JCuda. My code is not as fast as I would like it to be, and I'm wondering if anybody could give me tips on how to optimize its performance.

Here's the code I'm using (it's a bit much, so you might want to just look at the stuff after the spoiler):
[SPOILER]```package Matrix;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.JCudaDriver;
import jcuda.jcublas.JCublas;
import jcuda.runtime.JCuda;
import jcuda.vec.VecFloat;

public class GPUOp {

public static CUdeviceptr fNormalize(int n,CUdeviceptr d_In,boolean clean) {
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(n, Sizeof.FLOAT, d_Out);
	
	VecFloat.exp(n, d_Out, d_In); // x = e^x
	VecFloat.scalarDiv(n, d_Out, 1f, d_Out); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out, 1f, d_Out); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out, 1f, d_Out); // x = 1/(1+e^(-x))
	
	if (clean)
		JCudaDriver.cuMemFree(d_In);
	return d_Out;
}

public static CUdeviceptr gNormalize(int n,CUdeviceptr d_In,boolean clean) {
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(n, Sizeof.FLOAT, d_Out);
	
	VecFloat.exp(n, d_Out, d_In); // x = e^x
	VecFloat.scalarDiv(n, d_Out, 1f, d_Out); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out, 1f, d_Out); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out, 2f, d_Out); // x = 2/(1+e^(-x))
	VecFloat.subScalar(n, d_Out, d_Out, 1f); // x = (2/(1+e^(-x)))-1
	
	if (clean)
		JCudaDriver.cuMemFree(d_In);
	return d_Out;
}

public static CUdeviceptr GateBulk(CuDeviceHolder env, Mat2 weights,int neurons) {
	CuDeviceHolder d_weights = new CuDeviceHolder(weights);
	CUdeviceptr result = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, result);
	JCublas.cublasSgemm('n', 'n', neurons, 1, neurons, 1.0f, d_weights.get(), neurons, env.get(), neurons, 0.0f, result, neurons);
	d_weights.free();		
	result = fNormalize(neurons,result,true);
	return result;
}

public static CUdeviceptr GateBulk(CuDeviceHolder env, Mat2 weights,int neurons,int input) {
	CuDeviceHolder d_weights = new CuDeviceHolder(weights);		
	CUdeviceptr result = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, result);
	JCublas.cublasSgemm('n', 'n', neurons, 1, input, 1.0f, d_weights.get(), neurons, env.get(), input, 0.0f, result, neurons);
	d_weights.free();
	result = fNormalize(neurons,result,true);
	return result;
}

public static void doCalc(Mat2 result,Mat2 outputWeights, Mat2 inputInGateWeights, Mat2 inputOutGateWeights, Mat2 inputChangeGateWeights, Mat2 inputValues, Mat2 outGateWeights,
		Mat2 changeGateWeights, Mat2 inGateWeights, Mat2 internalNeuronValues,Mat2 externalNeuronValues) {
			
	int neurons = internalNeuronValues.elements();
	int input = inputValues.elements();
	int output = result.elements();
	
	//
	CuDeviceHolder d_ENV = new CuDeviceHolder(externalNeuronValues);
	CUdeviceptr d_yOut = GateBulk(d_ENV,outGateWeights,neurons);
	CUdeviceptr d_yIn = GateBulk(d_ENV,inGateWeights,neurons);
	CUdeviceptr d_yCh = GateBulk(d_ENV,changeGateWeights,neurons);
			
	CuDeviceHolder d_I = new CuDeviceHolder(inputValues);
	
	CUdeviceptr d_yOut2 = GateBulk(d_I,inputOutGateWeights,neurons,input);
	VecFloat.add(neurons,d_yOut,d_yOut,d_yOut2);		
	JCudaDriver.cuMemFree(d_yOut2);
	CUdeviceptr d_yIn2 = GateBulk(d_I,inputInGateWeights,neurons,input);
	VecFloat.add(neurons,d_yIn,d_yIn,d_yIn2);
	JCudaDriver.cuMemFree(d_yIn2);
	CUdeviceptr d_yCh2 = GateBulk(d_I,inputInGateWeights,neurons,input);
	VecFloat.add(neurons,d_yCh,d_yCh,d_yCh2);
	JCudaDriver.cuMemFree(d_yCh2);
	d_I.free();
	//
	
	//
	CUdeviceptr d_Ch = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, d_Ch);
	VecFloat.mul(neurons, d_Ch, d_yIn, d_yCh);
	JCudaDriver.cuMemFree(d_yIn);
	JCudaDriver.cuMemFree(d_yCh);
	//
	
	//
	CuDeviceHolder d_INV = new CuDeviceHolder(internalNeuronValues);
	VecFloat.add(neurons, d_INV.get(), d_INV.get(), d_Ch);
	JCudaDriver.cuMemFree(d_Ch);
	//
	
	//
	CUdeviceptr d_nI = fNormalize(neurons,d_INV.get(),false);
	float[] h_INV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, d_INV.get(), 1, Pointer.to(h_INV), 1);
	internalNeuronValues.set(h_INV);
	d_INV.free();
	//
	
	//
	VecFloat.mul(neurons,d_ENV.get(), d_yOut, d_nI);
	JCudaDriver.cuMemFree(d_yOut);
	JCudaDriver.cuMemFree(d_nI);
	//
	
	//
	CuDeviceHolder d_OW = new CuDeviceHolder(outputWeights);
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
	JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, d_OW.get(), output, d_ENV.get(), neurons, 0.0f, d_Out, output);
	//
	
	//
	float[] h_ENV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, d_ENV.get(), 1, Pointer.to(h_ENV), 1);
	externalNeuronValues.set(h_ENV);
	d_ENV.free();
	d_OW.free();
	//
	
	float[] h_Out = new float[output];
	JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
	result.set(h_Out);
	JCudaDriver.cuMemFree(d_Out);
	
	
}

}```[/SPOILER]
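For reference, the two normalization functions above compute the standard logistic sigmoid and a rescaled variant of it (the second one is equivalent to a tanh of half the argument):

$$f(x) = \frac{1}{1+e^{-x}}, \qquad g(x) = \frac{2}{1+e^{-x}} - 1 = \tanh\left(\frac{x}{2}\right)$$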

Now the important thing I would like to ask is which function calls cost a lot of performance.
The only functions I am using are

```
JCublas.cublasSgemm();
JCublas.cublasGetVector();
new CUdeviceptr();
JCudaDriver.cuMemFree();
VecFloat.exp();
VecFloat.scalarDiv();
VecFloat.scalarAdd();
VecFloat.subScalar();
```

Which of these functions are performance intensive?

I would be able to cut back a lot on

```
JCublas.cublasAlloc();
new CUdeviceptr();
JCudaDriver.cuMemFree();
```

but I don't think I can do anything about the rest.
If you're wondering which sizes the VecFloat and Sgemm calls
have to handle, the maximum should be around 1000 or 1000x1000 elements.

cheers
Noodles

Yes, I already considered writing a few words about this in the other thread, but … there, fixing the bug was obviously the more pressing issue.

I noticed several things.

The first is NOT related to performance: You are using cublasAlloc and cublasSetVector a lot. These are basically just small wrappers around JCudaDriver.cuMemAlloc and JCudaDriver.cuMemcpy. At other places, you are already using cuMemFree. It might be more “consistent” to use the same methods in both cases, but one could argue about this - it may not be “important”.

What may be important are the frequent allocations and de-allocations that you are doing. The new CUdeviceptr() is essentially “free”, but the real CUBLAS alloc/free calls may be a bit costly. A related test that I did a while ago can be found at the bottom of this post: https://forum.byte-welt.net/byte-welt-projekte-projects/jcuda/17980-jcublas-dsyrk-dgemm-benchmark-3.html#post127945 (I’ll write a few more words about this below.)

What certainly is important regarding performance: the memory copies! I twitched a little when I saw the “GateBulk” method, where you copy the whole matrix to the device in each call! This is really expensive. Looking at a few lines of code from your “main” method:

        CUdeviceptr d_yOut2 = GateBulk(d_I,inputOutGateWeights,neurons,input);
        VecFloat.add(neurons,d_yOut,d_yOut,d_yOut2);       
        JCudaDriver.cuMemFree(d_yOut2);
        CUdeviceptr d_yIn2 = GateBulk(d_I,inputInGateWeights,neurons,input);
        VecFloat.add(neurons,d_yIn,d_yIn,d_yIn2);
        JCudaDriver.cuMemFree(d_yIn2);
        CUdeviceptr d_yCh2 = GateBulk(d_I,inputInGateWeights,neurons,input);
        VecFloat.add(neurons,d_yCh,d_yCh,d_yCh2);
        JCudaDriver.cuMemFree(d_yCh2);

This is done frequently. Note that, during each call to “GateBulk”, you are copying the whole “inputInGateWeights” matrix to the device. This will really slow things down.

You already have the “CuDeviceHolder” class, which could probably be declared (and dedicatedly be used) as something like a “DeviceMat2” class: A direct representation of a Mat2, but on the device. (With many degrees of freedom for the implementation, but just to give the idea…).
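A rough, untested sketch of what I mean (using the Mat2 accessors from your code, and the old JCublas helpers for brevity):

```java
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUdeviceptr;
import jcuda.jcublas.JCublas;

// A Mat2 that lives on the device: allocated and copied ONCE,
// then reused for as many Sgemm/GateBulk calls as necessary.
public class DeviceMat2 {
    private final CUdeviceptr pointer = new CUdeviceptr();
    private final int elements;

    public DeviceMat2(Mat2 host) {
        elements = host.elements();
        JCublas.cublasAlloc(elements, Sizeof.FLOAT, pointer);
        JCublas.cublasSetVector(elements, Sizeof.FLOAT,
            Pointer.to(host.data()), 1, pointer, 1);
    }

    public CUdeviceptr get() {
        return pointer;
    }

    // Copy the current device contents back into the given host matrix
    public void copyTo(Mat2 host) {
        float[] data = new float[elements];
        JCublas.cublasGetVector(elements, Sizeof.FLOAT,
            pointer, 1, Pointer.to(data), 1);
        host.set(data);
    }

    public void free() {
        JCublas.cublasFree(pointer);
    }
}
```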

The first approach would then basically be to change the above code snippet to

        // Allocate and copy the matrix to the device only ONCE!
        CuDeviceHolder weightsOnDevice = new CuDeviceHolder(inputOutGateWeights);

        CUdeviceptr d_yOut2 = GateBulk(d_I,weightsOnDevice ,neurons,input); // Here, pass in the "weightsOnDevice"
        VecFloat.add(neurons,d_yOut,d_yOut,d_yOut2);       
        JCudaDriver.cuMemFree(d_yOut2);

        CUdeviceptr d_yIn2 = GateBulk(d_I,weightsOnDevice ,neurons,input); // Here, pass in the "weightsOnDevice"
        VecFloat.add(neurons,d_yIn,d_yIn,d_yIn2);
        JCudaDriver.cuMemFree(d_yIn2);

        CUdeviceptr d_yCh2 = GateBulk(d_I,weightsOnDevice ,neurons,input); // Here, pass in the "weightsOnDevice"
        VecFloat.add(neurons,d_yCh,d_yCh,d_yCh2);
        JCudaDriver.cuMemFree(d_yCh2);

        // Free it when no longer needed
        weightsOnDevice.free();

Note: The scope of this “weightsOnDevice” could probably be larger - maybe from the start of the whole method until the end! Again, I could not read and understand your whole code.

The key point is: Avoid memory copies whenever possible.

This should already bring a significant speedup.

A next step could be to avoid some of the memory allocations/frees. I assume that most of your vectors will have the same size all the time. So patterns like these

        CUdeviceptr d_yOut2 = GateBulk(d_I,weightsOnDevice ,neurons,input);
        VecFloat.add(neurons,d_yOut,d_yOut,d_yOut2);       
        JCudaDriver.cuMemFree(d_yOut2);

        CUdeviceptr d_yIn2 = GateBulk(d_I,weightsOnDevice ,neurons,input);
        VecFloat.add(neurons,d_yIn,d_yIn,d_yIn2);
        JCudaDriver.cuMemFree(d_yIn2);

could probably be changed to something roughly like this:

        CUdeviceptr deviceDataUsedAllTheTime = allocateThisOnce();

        // Let this method write into the given "deviceDataUsedAllTheTime" 
        // to avoid unnecessary allocations/frees 
        GateBulk(d_I,weightsOnDevice ,neurons,input, deviceDataUsedAllTheTime );
        VecFloat.add(neurons,d_yOut,d_yOut,deviceDataUsedAllTheTime );       

        GateBulk(d_I,weightsOnDevice ,neurons,input, deviceDataUsedAllTheTime); 
        VecFloat.add(neurons,d_yIn,d_yIn,deviceDataUsedAllTheTime );

        ...
 
        // At the end:
        free(deviceDataUsedAllTheTime);

(Again, note that these are just hints. You’ll have to think about some details of the implementation. But the basic idea should be clear)
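To make the GateBulk part of this concrete, one possible (untested) variant that writes into a caller-provided buffer could look like this; the normalization is done in place, mirroring the steps of your fNormalize:

```java
// d_Result must have been allocated once by the caller,
// with space for "neurons" float values.
public static void GateBulk(CuDeviceHolder env, CuDeviceHolder weights,
        int neurons, int input, CUdeviceptr d_Result) {
    JCublas.cublasSgemm('n', 'n', neurons, 1, input, 1.0f,
        weights.get(), neurons, env.get(), input, 0.0f, d_Result, neurons);
    // In-place sigmoid, same steps as in fNormalize:
    VecFloat.exp(neurons, d_Result, d_Result);           // x = e^x
    VecFloat.scalarDiv(neurons, d_Result, 1f, d_Result); // x = e^(-x)
    VecFloat.scalarAdd(neurons, d_Result, 1f, d_Result); // x = 1+e^(-x)
    VecFloat.scalarDiv(neurons, d_Result, 1f, d_Result); // x = 1/(1+e^(-x))
}
```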

Hi,

so first of all, just to clear things up: I haven't actually tried or changed anything yet;
I'll do that soon.

The GateBulk method should be invoked on a different weight matrix each time it is called (if it is not, then I made a small mistake), meaning I can't actually reuse a matrix on the device without changing the idea of the computation.

Another question arose during the day: assuming I load all of these matrices of sizes

neurons x neurons: 3x
inputs x neurons: 3x
neurons x outputs: 1x
neurons x 1: 2x

into the graphics card's memory a total of 100 times, for maximum values of neurons (=300), inputs (=288), outputs (=11), would this cause problems with memory? Or are these still small numbers?

I might want to explain my computation a bit further. I am running genetic algorithms to evolve neural networks; the plan is to use a population of up to 1000 neural nets. Each of these nets runs about 1000 simulations of the “problem” and is then evaluated based on its performance. One of these simulations can have up to approximately 1000 calls of the doCalc() function presented in the above post. So for the evaluation of one generation, 1000 × 1000 × 1000 = 1,000,000,000 calls to doCalc() must be made. Now this is an incredibly high number, and as long as the doCalc function takes more than one second, this is a problem.

Now, with a GPU cluster at my university, I can bring the 1,000,000,000 sequential calls down to 1,000,000 with multi-threading, and possibly even down to 1,000 if I multi-thread each test on the GPU, which is probably possible. So if I assume a runtime of doCalc() of exactly 1 second and run this genetic algorithm for 24 hours, that would mean I am able to go through 86.4 generations, which is an extremely small number. I expect to need upwards of 1000 generations just for one test on one problem, and I will have to do this for maybe 10 problems. Meaning it would take me about 115 days to do all of my testing, so you can see why I want to speed this up.

cheers
Noodles

Sorry, I see that my “pattern matching” failed there to some extent:

CUdeviceptr d_yOut2 = GateBulk(d_I,inputOutGateWeights,neurons,input);
CUdeviceptr d_yIn2  = GateBulk(d_I,inputInGateWeights, neurons,input);
CUdeviceptr d_yCh2  = GateBulk(d_I,inputInGateWeights, neurons,input);

(And according to your description (and guessing from the variable names), the last call should probably receive the inputChangeGateWeights and not the inputInGateWeights.)

Nevertheless, the general recommendation was to avoid copying data to the device. Although it is not clear from where the “doCalc” method is called and where these “weights” matrices come from, skimming over the code strongly suggests (to me) that there are many, many unnecessary host->device copies involved.

neurons x neurons: 3x
inputs x neurons: 3x
neurons x outputs: 1x
neurons x 1: 2x

into the graphics card's memory a total of 100 times, for maximum values of neurons (=300), inputs (=288), outputs (=11)

Estimate:

300 * 300 * 3 + 288 * 300 * 3 + 300 * 11 + 300 * 2 = 533100

Each of these being a float (4 bytes), this would be ~2.1 MB, and all this 100 times would be < 220 MB, and thus no problem. But increasing this from 300 to 3000 neurons (which affects the size quadratically) would break everything, so one should at least keep the possible question of scalability in mind. (Particularly, I could imagine that people expect at least a short paragraph in the thesis: “This currently fits into memory. When the number of neurons should be increased, we could still try to solve it with (…some handwaving here…)”.)

But back to the “doCalc” function: From what you described so far, the pattern sounds roughly like this (pseudocode, of course!):

for (int p=0; p<population; p++)
{
    for (int s=0; s<simulations; s++)
    {
        for (int i=0; i<simulationSteps; i++)
        {
            Mat2 weightsA = ...;
            Mat2 weightsB = ...;
            Mat2 weightsC = ...;
            Mat2 weightsD = ...;
            doCalc(weightsA, ... weightsD);
        }
    }
}

Now, each call to “doCalc” copies the data from the host to the device (either directly, or implicitly, in the “GateBulk” calls). So one could think about changing this to

for (int p=0; p<population; p++)
{
    for (int s=0; s<simulations; s++)
    {
        Mat2 weightsA = ...;
        Mat2 weightsB = ...;
        Mat2 weightsC = ...;
        Mat2 weightsD = ...;

        CuDeviceHolder deviceWeightsA = ...;
        CuDeviceHolder deviceWeightsB = ...;
        CuDeviceHolder deviceWeightsC = ...;
        CuDeviceHolder deviceWeightsD = ...;

        for (int i=0; i<simulationSteps; i++)
        {
            doCalc(deviceWeightsA, ... deviceWeightsD);

            maybeUpdateIfNecessary(deviceWeightsA);
            maybeUpdateIfNecessary(deviceWeightsB);
            maybeUpdateIfNecessary(deviceWeightsC);
            maybeUpdateIfNecessary(deviceWeightsD);
        }

        deviceWeightsA.free();
        deviceWeightsB.free();
        deviceWeightsC.free();
        deviceWeightsD.free();

    }
}

Of course, all this is just suggestive. It will change the interface of the “doCalc” method, which may not be desired (but my gut feeling is that you don’t care about this very much right now).

I can only emphasize: Avoid memory copies.

(You could also use Page-Locked memory for a higher bandwidth, but I think that there are easier options right now).
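(For completeness, a rough sketch of what page-locked memory could look like with the runtime API; the pinned buffer is then accessible from Java via getByteBuffer:)

```java
// Allocate page-locked ("pinned") host memory for n floats. Copies
// from/to pinned memory reach a higher bandwidth than copies from
// pageable Java arrays, which need an internal staging copy.
int n = 1000;
Pointer hostData = new Pointer();
JCuda.cudaHostAlloc(hostData, n * Sizeof.FLOAT, JCuda.cudaHostAllocDefault);

// The buffer can be accessed from the Java side like this:
FloatBuffer buffer = hostData.getByteBuffer(0, n * Sizeof.FLOAT)
    .order(ByteOrder.nativeOrder()).asFloatBuffer();

// ... use it as the host side of copies, then release it:
JCuda.cudaFreeHost(hostData);
```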

Hi Marco,

thanks for the help, the code runs a lot faster now, probably like 1,000,000 times faster;
my problem now is RAM :D.

Just to show off the changes, and maybe have some people give me more advice:
basically, I let every Mat2 have an internal representation on the GPU:

[SPOILER]```package Matrix;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.JCudaDriver;
import jcuda.jcublas.JCublas;
import jcuda.runtime.JCuda;
import jcuda.vec.VecFloat;

public class GPUOp {
public static void fNormalize(Mat2 d_Out,int n,Mat2 d_In) {
	VecFloat.exp(n, d_Out.getDevice().get(), d_In.getDevice().get()); // x = e^x
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1/(1+e^(-x))
}

public static void gNormalize(Mat2 d_Out,int n,Mat2 d_In) {		
	VecFloat.exp(n, d_Out.getDevice().get(), d_In.getDevice().get()); // x = e^x
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 2f, d_Out.getDevice().get()); // x = 2/(1+e^(-x))
	VecFloat.subScalar(n, d_Out.getDevice().get(), d_Out.getDevice().get(), 1f); // x = (2/(1+e^(-x)))-1
}

public static void GateBulk(Mat2 d_Out,Mat2 env, Mat2 weights,int neurons) {
	JCublas.cublasSgemm('n', 'n', neurons, 1, neurons, 1.0f, weights.getDevice().get(), neurons, env.getDevice().get(), neurons, 0.0f, d_Out.getDevice().get(), neurons);		
	fNormalize(d_Out,neurons,d_Out);
}

public static void GateBulk(Mat2 d_Out,Mat2 env, Mat2 weights,int neurons,int input) {
	JCublas.cublasSgemm('n', 'n', neurons, 1, input, 1.0f, weights.getDevice().get(), neurons, env.getDevice().get(), input, 0.0f, d_Out.getDevice().get(), neurons);
	fNormalize(d_Out,neurons,d_Out);
}

public static void doCalcIns(Mat2 in1,Mat2 in2,Mat2 in3,Mat2 inputValues,Mat2 inputOutGateWeights,Mat2 inputInGateWeights,Mat2 inputChangeGateWeights) {
	int neurons = inputOutGateWeights.cols();
	int input = inputValues.elements();
	
	GateBulk(in1,inputValues,inputOutGateWeights,neurons,input);
	GateBulk(in2,inputValues,inputInGateWeights,neurons,input);
	GateBulk(in3,inputValues,inputChangeGateWeights,neurons,input);
}

public static void doCalc(Mat2 result,Mat2 outputWeights, Mat2 in1, Mat2 in2, Mat2 in3, Mat2 outGateWeights,
		Mat2 changeGateWeights, Mat2 inGateWeights, Mat2 internalNeuronValues,Mat2 externalNeuronValues,Mat2 calc1,Mat2 calc2,Mat2 calc3) {
			
	int neurons = internalNeuronValues.elements();
	int output = result.elements();
	
	/*CUdeviceptr calc1 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc1);
	CUdeviceptr calc2 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc2);
	CUdeviceptr calc3 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc3);
	CUdeviceptr calc4 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc4);*/
	
	//
	GateBulk(calc1,externalNeuronValues,outGateWeights,neurons);
	//calc1 used d_yOut
	GateBulk(calc2,externalNeuronValues,inGateWeights,neurons);
	//calc2 used d_yIn
	GateBulk(calc3,externalNeuronValues,changeGateWeights,neurons);
	//calc3 used d_yCh
	
	VecFloat.add(neurons,calc1.getDevice().get(),calc1.getDevice().get(),in1.getDevice().get());
	VecFloat.add(neurons,calc2.getDevice().get(),calc2.getDevice().get(),in2.getDevice().get());
	VecFloat.add(neurons,calc3.getDevice().get(),calc3.getDevice().get(),in3.getDevice().get());
	
	//
	
	//
	VecFloat.mul(neurons, calc2.getDevice().get(), calc2.getDevice().get(), calc3.getDevice().get());
	//calc3 free
	//
	
	//
	VecFloat.add(neurons, internalNeuronValues.getDevice().get(), internalNeuronValues.getDevice().get(), calc2.getDevice().get());
	//calc2 free
	//
	
	//
	fNormalize(calc2,neurons,internalNeuronValues);
	//calc2 used
	float[] h_INV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, internalNeuronValues.getDevice().get(), 1, Pointer.to(h_INV), 1);
	internalNeuronValues.set(h_INV);
	//
	
	//
	VecFloat.mul(neurons,externalNeuronValues.getDevice().get(), calc1.getDevice().get(), calc2.getDevice().get());
	//calc1 free
	//calc2 free
	//
	
	//
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
	JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, outputWeights.getDevice().get(), output, externalNeuronValues.getDevice().get(), neurons, 0.0f, d_Out, output);
	//
	
	//
	float[] h_ENV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, externalNeuronValues.getDevice().get(), 1, Pointer.to(h_ENV), 1);
	externalNeuronValues.set(h_ENV);
	//
	
	float[] h_Out = new float[output];
	JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
	result.set(h_Out);
	JCudaDriver.cuMemFree(d_Out);	
}

public static void init() {
	JCuda.setExceptionsEnabled(true);
	JCudaDriver.setExceptionsEnabled(true);
	JCublas.setExceptionsEnabled(true);
	JCublas.cublasInit();
	VecFloat.init();
}

public static void close() {
	JCublas.cublasShutdown();
	VecFloat.shutdown();
}

public static void print(CUdeviceptr ptr,int size) {
	float[] h_ENV = new float[size];
	JCublas.cublasGetVector(size, Sizeof.FLOAT, ptr, 1, Pointer.to(h_ENV), 1);
	System.out.println("######################");
	for (int i=0;i<h_ENV.length;i++) {
		System.out.println(h_ENV[i]);
	}
	System.out.println("######################");
}

}```[/SPOILER]


PS: Thanks for pointing out that small error with `inputChangeGateWeights` vs. `inputInGateWeights`.

Thanks for all the help
cheers
Noodles

Although I doubt some of the 0’s in “1.000.000”, memory copies are often the real bottleneck.

I’m not sure what exactly the Mat2 class looks like now. If you just “added” the device memory, then this certainly raises some questions about the memory management. One could consider some sophisticated infrastructure to handle this more generically, and to avoid errors. Basically, some data structure that keeps track of the host and the device memory, and whether one of them was “changed”, and automatically does an update when one side was “changed” and the other one is accessed. But that’s not entirely trivial.
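Very roughly, such a structure could look like this (only a sketch of the idea, ignoring thread-safety and error handling):

```java
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUdeviceptr;
import jcuda.jcublas.JCublas;

// A matrix that keeps a host copy and a device copy, tracks which
// side was modified last, and synchronizes lazily on access.
public class SyncedMat2 {
    private final float[] hostData;
    private final CUdeviceptr deviceData = new CUdeviceptr();
    private boolean hostModified = false;   // device copy is stale
    private boolean deviceModified = false; // host copy is stale

    public SyncedMat2(int elements) {
        hostData = new float[elements];
        JCublas.cublasAlloc(elements, Sizeof.FLOAT, deviceData);
    }

    // Access for reading AND writing on the host side
    public float[] getHostData() {
        if (deviceModified) { // download pending device changes first
            JCublas.cublasGetVector(hostData.length, Sizeof.FLOAT,
                deviceData, 1, Pointer.to(hostData), 1);
            deviceModified = false;
        }
        hostModified = true;
        return hostData;
    }

    // Access for reading AND writing on the device side
    public CUdeviceptr getDeviceData() {
        if (hostModified) { // upload pending host changes first
            JCublas.cublasSetVector(hostData.length, Sizeof.FLOAT,
                Pointer.to(hostData), 1, deviceData, 1);
            hostModified = false;
        }
        deviceModified = true;
        return deviceData;
    }

    public void free() {
        JCublas.cublasFree(deviceData);
    }
}
```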

However, of course, with the proposed pattern, you always need a clear idea about

at which point in time, which memory has to be on the device

Depending on the application, you can’t keep all the matrices on the device all the time. But obviously, the goal is then to

  1. copy the data to the device only ONCE
  2. do EVERYTHING that this data will ever be needed for on the device
  3. free the data ONCE

The necessity to “free” the device memory can’t be avoided, and forgetting it may cause “memory leaks”. All this can’t be handled by the garbage collector… (As already said in the other thread: JCuda/JCublas is a very thin layer around CUDA. And with great power comes great responsibility ;-)).
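One simple way to at least not lose the free() when an exception is thrown is the usual try/finally pattern (sketched here with your CuDeviceHolder):

```java
CuDeviceHolder weightsOnDevice = new CuDeviceHolder(weights);
try {
    // ... everything that uses weightsOnDevice ...
} finally {
    weightsOnDevice.free(); // runs even if an exception was thrown above
}
```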

If the code is “compact” and easy to test (that is: if it’s possible to just unpack it into some directory, open it in Eclipse, and run a single main() without nifty arguments, external data, dependencies etc.), then you could also send it to jcuda AT jcuda DOT org, and maybe I’d allocate some time to have a short look. (BTW: I assume that I could respond in German as well.)

Hi,

my changes were minor, and I have big overheads taking care of the freeing and allocation;
also, I can subdivide the computations into chunks to not overload the GPU.

I am having more issues: I am sometimes getting
CUBLAS_STATUS_MAPPING_ERROR
when I try to retrieve data at the end of doCalc().

This only happens when I put a lot of stuff onto the GPU,
but I don't have allocation problems, so where do these mapping errors come from?

Any ideas?
BTW, I am heavily multi-threading, and the problem does not arise if I don't multi-thread.

PS: about your offer I might come back to you about that ;).

cheers
Noodles

The “usual” explanation for CUBLAS_STATUS_MAPPING_ERROR is that the watchdog timer has been triggered. You may already have found the corresponding NVIDIA forum threads, but: Are there any computations involved that take longer than 2 seconds for a single kernel?
Otherwise: Is the behavior reproducible?
Does it only happen when a certain matrix size is exceeded?

Hi,

it seems to be reproducible, in the sense that if I start the program with the wrong settings, the error occurs; but it seems to occur a bit randomly.

There should not be a computation that takes longer than 2 seconds, unless the thread is paused, which in theory I guess could make a computation take longer than 2 seconds…, but I'm not sure.

It happens if I compute too many neural nets of certain maximum sizes; actually it's just 2, and this seems to be a ridiculously small number, since the neural nets are not big: they have a maximum of 100 neurons, 800 inputs, 30 outputs.

So I have no idea why this is happening now when it didn't happen before.
BTW, after CUBLAS_STATUS_MAPPING_ERROR is thrown, a lot of CUBLAS_ERROR_LAUNCH (or similar) errors are thrown, but I guess this is to be expected after CUDA died?

cheers
Noodles

Hi,

the problem seems to stem from this code snippet,
[SPOILER]```public static void getOutput(Mat2 result, Mat2 outputWeights, Mat2 externalNeuronValues) {
	int neurons = externalNeuronValues.elements();
	int output = result.elements();
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
	JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, outputWeights.getDevice().get(), output, externalNeuronValues.getDevice().get(), neurons, 0.0f, d_Out, output);
	float[] h_Out = new float[output];
	JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
	result.set(h_Out);
	JCudaDriver.cuMemFree(d_Out);
}```[/SPOILER]

specifically in the cublasGetVector() call.

*** Edit ***

Hi,

never mind, I had large errors in my code: pretty much all Sgemm calls had wrong dimensional inputs, or they were skewed in very awkward ways.
I don't even know why no errors were thrown; I was basically doing computations like (a x b) * (c x a), and this was working without transposing. I don't have the slightest idea why.

Well then, on to my next problem or question: handling allocation problems. I want to run my code on a cluster at my university, and I don't want to have to find out by hand with which inputs it still runs without allocation errors.
I would like the system to find this out on its own. I can find out how much memory one of my brains will need; now I need a way to find out how much memory is left on the GPU, so that I can compute the maximum number to run simultaneously, with the system taking care of over-allocation.

Is this possible?

JCudaDriver#cuMemGetInfo should be relevant here (see http://www.jcuda.org/jcuda/doc/jcuda/driver/JCudaDriver.html). More detailed information about a particular device can be obtained with JCudaDriver#cuDeviceGetAttribute. The http://jcuda.org/samples/JCudaDeviceQuery.java sample already lists (most of) these.
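Usage is straightforward, for example:

```java
long[] free = new long[1];
long[] total = new long[1];
JCudaDriver.cuMemGetInfo(free, total);
System.out.println("Free: " + free[0] + " of " + total[0] + " bytes");
```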

When using `public static int cuMemGetInfo(long[] free, long[] total)`, do I have to think about certain block sizes? Like, if I had a float[] of size 653, would it maybe automatically allocate 656 because of something like memory blocks? And if so, could someone point me in the direction of the documentation on this?

cheers
Noodles

*** Edit ***

Just to make sure: if multiple host threads call doCalc() asynchronously, that should not produce problems, right? The CUBLAS documentation says the whole library is thread-safe, so I was expecting JCublas to be the same. Am I wrong? I was skimming the forums and stumbled onto a thread about passing contexts when using multiple host threads. So could someone clear up my confusion by explaining when contexts have to be passed and when they don't?

I’m not entirely sure how you intended to use cuMemGetInfo. If you have a class that contains 100 device pointers, each with sizes between 10 and 100 bytes, then really computing how much device memory these occupy (and allocating the “maximum possible number” of these objects) may be fiddly. It’s not unlikely that there is some overhead, at least. For example, allocating 10 bytes might remove 12 of the “free” bytes. But admittedly, I don’t think that it makes much sense to compute this at this level. Rather something like: “Do I still have enough memory to allocate this 1000x1000 matrix?” (where 100 bytes more or less won’t matter).

Regarding the threading: JCublas should be thread-safe to the same extent as CUBLAS. Manual context management for multiple host threads is mainly relevant when using the Driver API. There, each context is connected to one host thread. So, roughly speaking, when allocating memory in one thread, using the driver API, you may not be able to use this memory from a different thread, unless you made sure that everything stays in the same context. The runtime libraries (like CUBLAS) are a bit simpler in this regard.
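(For illustration, a sketch of manual context handling with the driver API; whether you need this depends on which parts of the API you use. One context is created once, and each worker thread makes it current before issuing driver API calls:)

```java
// Once, in the main thread:
JCudaDriver.cuInit(0);
CUdevice device = new CUdevice();
JCudaDriver.cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
JCudaDriver.cuCtxCreate(context, 0, device);

// In each worker thread, before any driver API call:
JCudaDriver.cuCtxSetCurrent(context);
```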

Hi,

I noticed I still have a problem with CUBLAS_STATUS_MAPPING_ERROR.

[SPOILER]```public static void getOutput(Mat2 result, Mat2 outputWeights, Mat2 externalNeuronValues) {
	long time = 0;
	try {
		time = System.currentTimeMillis();
		int neurons = externalNeuronValues.elements();
		int output = result.elements();
		CUdeviceptr d_Out = new CUdeviceptr();
		JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
		JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, outputWeights.getDevice().get(), output, externalNeuronValues.getDevice().get(), neurons, 0.0f, d_Out, output);
		float[] h_Out = new float[output];
		JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
		result.set(h_Out);
		JCublas.cublasFree(d_Out);
	} catch (Exception e) {
		System.out.println(System.currentTimeMillis() - time);
		System.out.println(result.rows() + "," + result.cols() + ";" + outputWeights.rows() + "," + outputWeights.cols() + ";" + externalNeuronValues.rows() + "," + externalNeuronValues.cols());
		result.print();
		outputWeights.print();
		externalNeuronValues.print();
		e.printStackTrace();
	}
}```[/SPOILER]

It happens in this code every time I call JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
the computation does not take more than 2 seconds, it actually takes about 2 milliseconds.

And this error only appears if I have at least 2 host threads trying to run on the GPU.
With two threads the error seems to appear relatively randomly, but eventually it always appears, since I'm in a while(true) loop.

Any help?

cheers
Noodles

Sorry, it’s hard (or impossible) to guess what might be wrong there. Can this, in any way, be reproduced in a smaller setup?

Hi,

I tried to reproduce it in a smaller setup, but I wasn't able to do so.

Any tips on how I might be able to debug this?

I believe some calculation is breaking CUDA, and then the problem occurs when I want to getVector or setVector.
If I could first ask CUDA whether it is still working, every time I want to get or set, that would be great for finding the error.

cheers
Noodles

*** Edit ***

Hi,

OK, I was able to reproduce the error in a “smaller setup”:

[SPOILER]```import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import Matrix.GPUOp;
import Matrix.Mat2;

public class Test implements Runnable{
private ExecutorService executor = Executors.newFixedThreadPool(50);

public static Test test = new Test();

public static boolean run = true;

public static int started = 0;
public static int finished = 0;

public static void main(String[] args) {
	test.act();
}

public void act() {
	GPUOp.init();
	while (run) {
		executor.execute(new Test());
		started++;
		System.out.println(started+","+finished);
		synchronized (Test.class) {
			while (finished<started-200) {
				try {
					// wait on the monitor that is actually held (Test.class);
					// plain wait() here would throw an IllegalMonitorStateException
					Test.class.wait();
				} catch (Exception e) {}
			}
		}
	}
	GPUOp.close();
}

public void run() {
	try {
		Thread.sleep((long)(Math.random()*2000));
	} catch (Exception e) {}
	int numNeurons = (int)(Math.random()*100)+1;
	int numInputs = (int)(Math.random()*800)+1;
	int numOutputs = (int)(Math.random()*30)+1;
	Mat2 result = new Mat2(numOutputs,1);
	Mat2 outputWeights = new Mat2(numOutputs,numNeurons);
	Mat2 externalNeuronValues = new Mat2(numNeurons,1);
	
	Mat2 in1 = new Mat2(numNeurons,1);
	Mat2 in2 = new Mat2(numNeurons,1);
	Mat2 in3 = new Mat2(numNeurons,1);
	Mat2 outGateWeights = new Mat2(numNeurons,numNeurons);
	Mat2 changeGateWeights = new Mat2(numNeurons,numNeurons);
	Mat2 inGateWeights = new Mat2(numNeurons,numNeurons);
	Mat2 internalNeuronValues = new Mat2(numNeurons,1);
	Mat2 calc1 = new Mat2(numNeurons,1);
	Mat2 calc2 = new Mat2(numNeurons,1);
	Mat2 calc3 = new Mat2(numNeurons,1);
	Mat2 inputValues = new Mat2(numInputs,1);
	Mat2 inputOutGateWeights = new Mat2(numNeurons,numInputs);
	Mat2 inputInGateWeights = new Mat2(numNeurons,numInputs);
	Mat2 inputChangeGateWeights = new Mat2(numNeurons,numInputs);
	
	result.randomize(-1, 1);
	outputWeights.randomize(-1, 1);
	externalNeuronValues.randomize(-1, 1);
	in1.randomize(-1, 1);
	in2.randomize(-1, 1);
	in3.randomize(-1, 1);
	outGateWeights.randomize(-1, 1);
	changeGateWeights.randomize(-1, 1);
	inGateWeights.randomize(-1, 1);
	internalNeuronValues.randomize(-1, 1);
	calc1.randomize(-1, 1);
	calc2.randomize(-1, 1);
	calc3.randomize(-1, 1);
	inputValues.randomize(-1, 1);
	inputOutGateWeights.randomize(-1, 1);
	inputInGateWeights.randomize(-1, 1);
	inputChangeGateWeights.randomize(-1, 1);
	
	result.GPUinit();
	outputWeights.GPUinit();
	externalNeuronValues.GPUinit();
	in1.GPUinit();
	in2.GPUinit();
	in3.GPUinit();
	outGateWeights.GPUinit();
	changeGateWeights.GPUinit();
	inGateWeights.GPUinit();
	internalNeuronValues.GPUinit();
	calc1.GPUinit();
	calc2.GPUinit();
	calc3.GPUinit();
	inputValues.GPUinit();
	inputOutGateWeights.GPUinit();
	inputInGateWeights.GPUinit();
	inputChangeGateWeights.GPUinit();
	
	//GPUOp.doCalcIns(in1, in2, in3, inputValues, inputOutGateWeights, inputInGateWeights, inputChangeGateWeights);
	
	GPUOp.doCalc(in1, in2, in3, outGateWeights, changeGateWeights, inGateWeights, internalNeuronValues, externalNeuronValues, calc1, calc2, calc3);
	
	GPUOp.getOutput(result, outputWeights, externalNeuronValues);
	
	result.GPUfree();
	outputWeights.GPUfree();
	externalNeuronValues.GPUfree();
	in1.GPUfree();
	in2.GPUfree();
	in3.GPUfree();
	outGateWeights.GPUfree();
	changeGateWeights.GPUfree();
	inGateWeights.GPUfree();
	internalNeuronValues.GPUfree();
	calc1.GPUfree();
	calc2.GPUfree();
	calc3.GPUfree();
	inputValues.GPUfree();
	inputOutGateWeights.GPUfree();
	inputInGateWeights.GPUfree();
	inputChangeGateWeights.GPUfree();
	
	test.finished();
}

public void finished() {
	synchronized (Test.class) {
		finished++;
		// notify the monitor that act() is waiting on
		Test.class.notifyAll();
	}
}

}```[/SPOILER]

[SPOILER]```package Matrix;

import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.JCudaDriver;
import jcuda.jcublas.JCublas;
import jcuda.runtime.JCuda;
import jcuda.vec.VecFloat;

public class GPUOp {
public static void fNormalize(Mat2 d_Out,int n,Mat2 d_In) {
	VecFloat.exp(n, d_Out.getDevice().get(), d_In.getDevice().get()); // x = e^x
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1/(1+e^(-x))
}

public static void gNormalize(Mat2 d_Out,int n,Mat2 d_In) {		
	VecFloat.exp(n, d_Out.getDevice().get(), d_In.getDevice().get()); // x = e^x
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = e^(-x)
	VecFloat.scalarAdd(n, d_Out.getDevice().get(), 1f, d_Out.getDevice().get()); // x = 1+e^(-x)
	VecFloat.scalarDiv(n, d_Out.getDevice().get(), 2f, d_Out.getDevice().get()); // x = 2/(1+e^(-x))
	VecFloat.subScalar(n, d_Out.getDevice().get(), d_Out.getDevice().get(), 1f); // x = (2/(1+e^(-x)))-1
}

public static void GateBulk(Mat2 d_Out,Mat2 env, Mat2 weights,int neurons) {
	JCublas.cublasSgemm('n', 'n', neurons, 1, neurons, 1.0f, weights.getDevice().get(), neurons, env.getDevice().get(), neurons, 0.0f, d_Out.getDevice().get(), neurons);		
	fNormalize(d_Out,neurons,d_Out);
}

public static void GateBulk(Mat2 d_Out,Mat2 env, Mat2 weights,int neurons,int input) {
	JCublas.cublasSgemm('n', 'n', neurons, 1, input, 1.0f, weights.getDevice().get(), neurons, env.getDevice().get(), input, 0.0f, d_Out.getDevice().get(), neurons);
	fNormalize(d_Out,neurons,d_Out);
}

public static void doCalcIns(Mat2 in1,Mat2 in2,Mat2 in3,Mat2 inputValues,Mat2 inputOutGateWeights,Mat2 inputInGateWeights,Mat2 inputChangeGateWeights) {
	int neurons = inputOutGateWeights.cols();
	int input = inputValues.elements();
	
	GateBulk(in1,inputValues,inputOutGateWeights,neurons,input);
	GateBulk(in2,inputValues,inputInGateWeights,neurons,input);
	GateBulk(in3,inputValues,inputChangeGateWeights,neurons,input);
}

public static void doCalc(Mat2 in1, Mat2 in2, Mat2 in3, Mat2 outGateWeights,
		Mat2 changeGateWeights, Mat2 inGateWeights, Mat2 internalNeuronValues,Mat2 externalNeuronValues,Mat2 calc1,Mat2 calc2,Mat2 calc3) {
			
	int neurons = internalNeuronValues.elements();
	
	/*CUdeviceptr calc1 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc1);
	CUdeviceptr calc2 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc2);
	CUdeviceptr calc3 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc3);
	CUdeviceptr calc4 = new CUdeviceptr();
	JCublas.cublasAlloc(neurons, Sizeof.FLOAT, calc4);*/
	
	//
	GateBulk(calc1,externalNeuronValues,outGateWeights,neurons);
	//calc1 used d_yOut
	GateBulk(calc2,externalNeuronValues,inGateWeights,neurons);
	//calc2 used d_yIn
	GateBulk(calc3,externalNeuronValues,changeGateWeights,neurons);
	//calc3 used d_yCh
	
	VecFloat.add(neurons,calc1.getDevice().get(),calc1.getDevice().get(),in1.getDevice().get());
	VecFloat.add(neurons,calc2.getDevice().get(),calc2.getDevice().get(),in2.getDevice().get());
	VecFloat.add(neurons,calc3.getDevice().get(),calc3.getDevice().get(),in3.getDevice().get());
	
	//
	
	//
	VecFloat.mul(neurons, calc2.getDevice().get(), calc2.getDevice().get(), calc3.getDevice().get());
	//calc3 free
	//
	
	//
	VecFloat.add(neurons, internalNeuronValues.getDevice().get(), internalNeuronValues.getDevice().get(), calc2.getDevice().get());
	//calc2 free
	//
	
	//
	fNormalize(calc2,neurons,internalNeuronValues);
	//calc2 used
	/*float[] h_INV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, internalNeuronValues.getDevice().get(), 1, Pointer.to(h_INV), 1);
	internalNeuronValues.set(h_INV);*/
	//
	
	//
	VecFloat.mul(neurons,externalNeuronValues.getDevice().get(), calc1.getDevice().get(), calc2.getDevice().get());
	//calc1 free
	//calc2 free
	//
	
	/*//
	CUdeviceptr d_Out = new CUdeviceptr();
	JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
	JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, outputWeights.getDevice().get(), output, externalNeuronValues.getDevice().get(), neurons, 0.0f, d_Out, output);
	//*/
	
	//
	/*float[] h_ENV = new float[neurons];
	JCublas.cublasGetVector(neurons, Sizeof.FLOAT, externalNeuronValues.getDevice().get(), 1, Pointer.to(h_ENV), 1);
	externalNeuronValues.set(h_ENV);*/
	//
	
	/*float[] h_Out = new float[output];
	JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
	result.set(h_Out);
	JCudaDriver.cuMemFree(d_Out);*/
}

public static void getOutput(Mat2 result,Mat2 outputWeights,Mat2 externalNeuronValues) {
	synchronized (GPUOp.class) {
		long time = 0;
		try {
			time = System.currentTimeMillis();
			int neurons = externalNeuronValues.elements();
			int output = result.elements();
			CUdeviceptr d_Out = new CUdeviceptr();
			JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
			JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f, outputWeights.getDevice().get(), output, externalNeuronValues.getDevice().get(), neurons, 0.0f, d_Out, output);
			float[] h_Out = new float[output];			
			JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
			result.set(h_Out);
			JCublas.cublasFree(d_Out);
		} catch (Exception e) {System.out.println(System.currentTimeMillis()-time);System.out.println(result.rows()+","+result.cols()+";"+outputWeights.rows()+","+outputWeights.cols()+";"+externalNeuronValues.rows()+","+externalNeuronValues.cols());result.print();outputWeights.print();externalNeuronValues.print();e.printStackTrace();}		
	}
}

public static void init() {
	JCuda.setExceptionsEnabled(true);
	JCudaDriver.setExceptionsEnabled(true);
	JCublas.setExceptionsEnabled(true);
	JCublas.cublasInit();
	VecFloat.init();
}

public static void close() {
	JCublas.cublasShutdown();
	VecFloat.shutdown();
}

public static void print(CUdeviceptr ptr,int size) {
	float[] h_ENV = new float[size];
	JCublas.cublasGetVector(size, Sizeof.FLOAT, ptr, 1, Pointer.to(h_ENV), 1);
	System.out.println("######################");
	for (int i=0;i<h_ENV.length;i++) {
		System.out.println(h_ENV[i]);
	}
	System.out.println("######################");
}

}```[/SPOILER]

[SPOILER]```package Matrix;

import Util.Random;

public class Mat2 {

private int rows;
private int cols;
private float[] data;
private int elements;
private boolean rowFirst = false;

private CuDeviceHolder ptr;

public Mat2(int rows,int cols) {
	this.rows = rows;
	this.cols = cols;
	elements = this.rows*this.cols;
	data = new float[elements];
}

public Mat2(Mat2 mat) {
	this.rows = mat.rows;
	this.cols = mat.cols;
	elements = this.rows*this.cols;
	data = new float[mat.elements];
	for (int i=0;i<data.length;i++) {
		data[i]=mat.data[i];
	}
	this.rowFirst=mat.rowFirst;
}

public void set(float[] data) {
	this.data = data;
}

public void set(int row,int col,float val) {
	data[loc(row,col)] = val;
}

public float get(int row,int col) {
	return data[loc(row,col)];
}

public int rows() {
	return rows;
}

public int cols() {
	return cols;
}

public float[] data() {
	return data;
}

public int elements() {
	return elements;
}

/*public Mat2 add(Mat2 second) {
	Mat2 result = new Mat2(rows,cols);
	if (!rowFirst) result.colFirst();
	result.data = GPUOp.add(data, second.data);
	return result;
}

public Mat2 mul(Mat2 second) {
	Mat2 result = new Mat2(rows,second.cols);
	if (!rowFirst) result.colFirst();
	result.data = GPUOp.mul(data, second.data,rows,cols,second.rows,second.cols);
	return result;
}*/

public Mat2 sk(float skalar) {
	return null;
}

public Mat2 copy() {
	Mat2 result = new Mat2(rows,cols);
	if (!rowFirst) result.colFirst();
	for (int r=0;r<rows;r++) {
		for (int c=0;c<cols;c++) {
			result.set(r, c, get(r,c));
		}
	}
	return result;
}

public int loc(int row,int col) {
	if (rowFirst) {
		if (row>=rows) throw new RuntimeException("row "+row);
		if (col>=cols) throw new RuntimeException("col "+col);
		return (cols*row)+col;
	} else {
		if (row>=rows) throw new RuntimeException("row "+row);
		if (col>=cols) throw new RuntimeException("col "+col);
		return (rows*col)+row;
	}
}

public boolean colFirst() {
	if (!rowFirst) return false;
	rowFirst = false;
	return true;
}

public boolean rowFirst() {
	if (rowFirst) return false;
	rowFirst = true;
	return true;
}

/*@Override
public String toString() {
	String result = "";
	for (int r=0;r<rows;r++) {
		if (r!=0) result+="\n";
		for (int c=0;c<cols;c++) {
			if (c!=0) result+=",";
			result += get(r,c);
		}
	}
	return result;
}*/

public void randomize(float min, float max) {
	for (int i=0;i<data.length;i++) {
		data[i]=Random.getRandom().getFloat(min, max);
	}
}

public void id() {
	for (int i=0;i<(rows<cols?rows:cols);i++) {
		set(i,i,1f);
	}
}

public void removeRow(int i) {
	rows -=1;
	elements = rows*cols;
	float[] newData = new float[elements];
	for (int row=0;row<rows+1;row++) {
		for (int col=0;col<cols;col++) {
			if (row<i) {
				try {
				newData[(rows*col)+row]=data[((rows+1)*col)+row];
				}catch (Exception e) {e.printStackTrace();System.out.println(rows+","+cols+","+elements+","+row+","+col+","+i);}
			}
			if (row>i) {
				newData[(rows*col)+row-1]=data[((rows+1)*col)+row];
			}
		}
	}
	data = newData;
}

public void removeCol(int i) {
	cols -=1;
	elements = rows*cols;
	float[] newData = new float[elements];
	for (int j=0;j<data.length;j++) {
		if (j < rows*i){
			newData[j]=data[j];
		} else if (j >= rows+rows*i){
			newData[j-rows]=data[j];
		}			
	}
	data = newData;
}

public void addRow(int i,float min,float max) {
	rows+=1;
	elements = rows*cols;
	float[] newData = new float[elements];
	for (int row=0;row<rows;row++) {
		for (int col=0;col<cols;col++) {
			if (row<i) {
				try {
					newData[(rows*col)+row]=data[((rows-1)*col)+row];
				}catch (Exception e) {e.printStackTrace();System.out.println(rows+","+cols+","+elements+","+row+","+col+","+i);}
			}
			if (row==i) {
				newData[(rows*col)+row]=Random.getRandom().getFloat(min, max);
			}
			if (row>i) {
				newData[(rows*col)+row]=data[((rows-1)*col)+row-1];
			}
		}
	}
	data=newData;
}

public void addCol(int i,float min,float max) {
	cols+=1;
	elements = rows*cols;
	float[] newData = new float[elements];
	for (int j=0;j<newData.length;j++) {
		if (j < rows*i){
			newData[j]=data[j];
		} else if (j >= rows+rows*i){
			newData[j]=data[j-rows];
		} else {
			newData[j]=Random.getRandom().getFloat(min, max);
		}
	}
	data = newData;
}

public void mutate(float min,float max) {
	int r=Random.getRandom().getInt(0, elements-1);
	data[r]=data[r]+Random.getRandom().getFloat(min, max);
	/*if (data[r]>5) data[r]=5;
	if (data[r]<-5) data[r]=-5;*/
}

public void print() {
	System.out.println("#####################################");
	for (int i=0;i<rows;i++) {
		for (int j=0;j<cols;j++) {
			if (j>0) System.out.print(",");
			System.out.print(get(i,j));
		}
		System.out.println();
	}
	System.out.println("#####################################");
}

public CuDeviceHolder getDevice() {
	return ptr;
}

public void GPUinit() {
	if (ptr!=null) System.out.println("++++++++++++++++++++++++++++++++++++++");
	ptr= new CuDeviceHolder(this);
}

public void GPUfree() {
	ptr.free();
	ptr=null;
}

}```[/SPOILER]

Now then: in the first class given (Test), the line I have commented out produces the problem. The way I have given the example, it actually works; but if I comment the line back in, then somewhere an error is produced. It's not easy to see where, since there are a lot of threads starting and ending and such.

Must I try to simplify the problem further?

cheers
Noodles

(I’ll try to allocate some time to check this - hoping that it can be tested easily)

Forgot the Random class

[SPOILER]```package Util;

public class Random {
private static Random random = null;

private java.util.Random r = new java.util.Random(System.currentTimeMillis());

private Random() {}

public static Random getRandom() {
	if (random==null) random = new Random();
	return random;
}

public int getInt() {
	return r.nextInt();
}

public float getFloat() {
	return r.nextFloat();
}

public int getInt(int min, int max) {
	return (int)((r.nextDouble()*((max+1)-min))+min);
}

public float getFloat(float min,float max) {
	return (float)((r.nextDouble()*(max-min))+min);
}

}```[/SPOILER]

Just a short note: The CUBLAS API should be thread safe in this regard. But the Vec (VecFloat) classes attach to one particular context. I’ll have to allocate more time for this, sorry.

Another short note: As mentioned above, the Vec classes are attached to one context. In contrast to that, the runtime libraries (like CUBLAS) always attach to a magical “primary context”. Unfortunately, the context handling functions that are involved here are NOT part of the public CUDA API. Some points are summarized in https://devtalk.nvidia.com/default/topic/519087/cuda-context-and-threading/ (going beyond the basic descriptions of contexts in the CUDA programming guide). But I’ll have to read through this once more, and think about how this could affect “version 0.0.2” of the vector utility classes.

At the highest level, and without having thought through the implications and possible implementations, I see two options:

  • Trying to make the vector library “thread-safe” (i.e. context aware)
  • Making the user of the vector library responsible for proper context handling

The latter would defeat some of the purpose of the vector library. It should be easy to use, and it should “just work”, like the runtime libraries. However, the runtime libraries (like CUBLAS) also went through some refactoring in this regard. The (actually “deprecated”) CUBLAS/JCublas interface does not offer any facilities here. But the CUBLAS_v2/JCublas2 interface knows the concept of the cublasHandle (see the cuBLAS documentation), which basically IS a “context”.
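For completeness, creating such a handle with JCublas2 looks roughly like this:

```java
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;

cublasHandle handle = new cublasHandle();
JCublas2.cublasCreate(handle);
// ... pass the handle to every call, e.g. JCublas2.cublasSgemm(handle, ...) ...
JCublas2.cublasDestroy(handle);
```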


Regardless of this broad question (which will not be answered within a few days):

  • I am NOT sure whether this is the reason for the problems that you are observing. It is not unlikely, but I don’t know it for sure.
  • You are using the vector library. It is version 0.0.1, changes have to be expected (there was this “TODO” right from the beginning), and I did not consider the case where someone just throws 50 threads at some problem (“to make it faster”). But you have to make progress here: I’m sure you have deadlines and must produce results, so the pressing question for you is how to get it running. Unfortunately, I don’t know a trivial workaround, and even if I knew one, it would still not be guaranteed to solve the problem.

I’ll try to do some tests during the weekend.

(I’ll basically try to find a “simple” solution (even if it is “not nice”, and will not make it into version 0.0.2 of the vector library), so that we first of all know for sure whether this context thingy is the reason for the current behavior, and (IF it is), you can proceed with your current work - but I can’t make any promises here right now)