Hi,
this is really going over my head,
I just made my classes a bit more readable by removing commented-out lines and such, and made some of my functions easier to use, and the problem was fixed. I just went over my code and can't see any difference, but the error is gone.
Also, I don't quite understand why, but the “faster” approach of using multiple threads isn't actually faster; it's about three times as slow, so I'm sticking to computing things sequentially. Anyway, I thank you for all your help, and here's the version that for some reason works.
[SPOILER]
```java
package Matrix;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.CUdeviceptr;
import jcuda.driver.JCudaDriver;
import jcuda.jcublas.JCublas;
import jcuda.runtime.JCuda;
import jcuda.vec.VecFloat;
public class GPUOp {
/**
 * Writes the logistic sigmoid {@code 1 / (1 + e^(-x))} of the input vector
 * into the output vector, element by element, on the device.
 *
 * <p>The number of processed elements is {@code d_In.rows()}.
 * NOTE(review): other methods in this class size their vector operations with
 * {@code elements()}; presumably the input is always a single column here --
 * confirm against the callers.
 *
 * @param d_Out matrix whose device buffer receives the result (may be the same
 *              object as {@code d_In}, as the gate methods do)
 * @param d_In  matrix whose device buffer supplies the input values
 */
public static void fNormalize(Mat2 d_Out,Mat2 d_In) {
    final int count = d_In.rows();
    final Pointer out = d_Out.getDevice().get();
    final Pointer in = d_In.getDevice().get();

    // out = e^x
    VecFloat.exp(count, out, in);
    // out = 1 / e^x = e^(-x)
    VecFloat.scalarDiv(count, out, 1f, out);
    // out = 1 + e^(-x)
    VecFloat.scalarAdd(count, out, 1f, out);
    // out = 1 / (1 + e^(-x))
    VecFloat.scalarDiv(count, out, 1f, out);
}
/**
 * Writes {@code 2 / (1 + e^(-x)) - 1} of the input vector into the output
 * vector, element by element, on the device.
 *
 * @param d_Out matrix whose device buffer receives the result
 * @param n     number of elements to process
 * @param d_In  matrix whose device buffer supplies the input values
 */
public static void gNormalize(Mat2 d_Out,int n,Mat2 d_In) {
    final Pointer out = d_Out.getDevice().get();
    final Pointer in = d_In.getDevice().get();

    // out = e^x
    VecFloat.exp(n, out, in);
    // out = 1 / e^x = e^(-x)
    VecFloat.scalarDiv(n, out, 1f, out);
    // out = 1 + e^(-x)
    VecFloat.scalarAdd(n, out, 1f, out);
    // out = 2 / (1 + e^(-x))
    VecFloat.scalarDiv(n, out, 2f, out);
    // out = 2 / (1 + e^(-x)) - 1
    VecFloat.subScalar(n, out, out, 1f);
}
/**
 * Computes {@code d_Out = fNormalize(weights * env)} on the device.
 *
 * <p>The product is a single SGEMM call in which {@code env} is used as a
 * one-column matrix and {@code weights} as a square matrix; both dimensions
 * are taken from {@code env.rows()}.
 *
 * @param d_Out   matrix whose device buffer receives the gate values
 * @param env     neuron values that are multiplied by the weights
 * @param weights neuron-to-neuron weights
 */
public static void GateBulkNeuron(Mat2 d_Out,Mat2 env, Mat2 weights) {
    final int dim = env.rows();
    final Pointer matrix = weights.getDevice().get();
    final Pointer vector = env.getDevice().get();
    final Pointer product = d_Out.getDevice().get();

    // product(dim x 1) = 1.0 * matrix(dim x dim) * vector(dim x 1) + 0.0 * product
    JCublas.cublasSgemm(
        'n', 'n',
        dim, 1, dim,
        1.0f,
        matrix, dim,
        vector, dim,
        0.0f,
        product, dim);

    // Squash the raw product in place.
    fNormalize(d_Out, d_Out);
}
/**
 * Computes {@code d_Out = fNormalize(weights * input)} on the device.
 *
 * <p>The product is a single SGEMM call in which {@code input} is used as a
 * one-column matrix with {@code weights.cols()} rows, and the result has
 * {@code weights.rows()} rows.
 *
 * @param d_Out   matrix whose device buffer receives the gate values
 * @param input   input values that are multiplied by the weights
 * @param weights input-to-neuron weights
 */
public static void GateBulkInput(Mat2 d_Out,Mat2 input, Mat2 weights) {
    final int outRows = weights.rows();
    final int inRows = weights.cols();
    final Pointer matrix = weights.getDevice().get();
    final Pointer vector = input.getDevice().get();
    final Pointer product = d_Out.getDevice().get();

    // product(outRows x 1) = 1.0 * matrix(outRows x inRows) * vector(inRows x 1) + 0.0 * product
    JCublas.cublasSgemm(
        'n', 'n',
        outRows, 1, inRows,
        1.0f,
        matrix, outRows,
        vector, inRows,
        0.0f,
        product, outRows);

    // Squash the raw product in place.
    fNormalize(d_Out, d_Out);
}
/**
 * Computes the three input-driven gate vectors by running
 * {@link #GateBulkInput} once per weight matrix on the same input values.
 *
 * @param in1                    receives the result for {@code inputOutGateWeights}
 * @param in2                    receives the result for {@code inputInGateWeights}
 * @param in3                    receives the result for {@code inputChangeGateWeights}
 * @param inputValues            input values shared by all three products
 * @param inputOutGateWeights    weights used for {@code in1}
 * @param inputInGateWeights     weights used for {@code in2}
 * @param inputChangeGateWeights weights used for {@code in3}
 */
public static void doCalcIns(
        Mat2 in1,
        Mat2 in2,
        Mat2 in3,
        Mat2 inputValues,
        Mat2 inputOutGateWeights,
        Mat2 inputInGateWeights,
        Mat2 inputChangeGateWeights) {
    // out gate
    GateBulkInput(in1, inputValues, inputOutGateWeights);
    // in gate
    GateBulkInput(in2, inputValues, inputInGateWeights);
    // change gate
    GateBulkInput(in3, inputValues, inputChangeGateWeights);
}
/**
 * Performs one update of the internal and external neuron values on the device.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>{@code calc1/calc2/calc3 = GateBulkNeuron(externalNeuronValues, out/in/change weights)}</li>
 *   <li>{@code calc1 += in1}, {@code calc2 += in2}, {@code calc3 += in3}</li>
 *   <li>{@code calc2 = calc2 * calc3} (element-wise)</li>
 *   <li>{@code internalNeuronValues += calc2}</li>
 *   <li>{@code calc2 = fNormalize(internalNeuronValues)}</li>
 *   <li>{@code externalNeuronValues = calc1 * calc2} (element-wise)</li>
 * </ol>
 *
 * <p>{@code calc1}, {@code calc2} and {@code calc3} are scratch buffers that are
 * overwritten; {@code internalNeuronValues} and {@code externalNeuronValues}
 * are updated in place.
 *
 * @param in1                  input contribution added to the out-gate values
 * @param in2                  input contribution added to the in-gate values
 * @param in3                  input contribution added to the change-gate values
 * @param outGateWeights       weights used for {@code calc1}
 * @param changeGateWeights    weights used for {@code calc3}
 * @param inGateWeights        weights used for {@code calc2}
 * @param internalNeuronValues accumulated internal values (updated in place)
 * @param externalNeuronValues neuron outputs (read first, then overwritten)
 * @param calc1                scratch buffer
 * @param calc2                scratch buffer
 * @param calc3                scratch buffer
 */
public static void doCalc(Mat2 in1, Mat2 in2, Mat2 in3, Mat2 outGateWeights,
        Mat2 changeGateWeights, Mat2 inGateWeights, Mat2 internalNeuronValues, Mat2 externalNeuronValues,
        Mat2 calc1, Mat2 calc2, Mat2 calc3) {
    final int count = internalNeuronValues.elements();

    // Gate values driven by the current neuron outputs.
    GateBulkNeuron(calc1, externalNeuronValues, outGateWeights);    // out gate
    GateBulkNeuron(calc2, externalNeuronValues, inGateWeights);     // in gate
    GateBulkNeuron(calc3, externalNeuronValues, changeGateWeights); // change gate

    final Pointer outSum = calc1.getDevice().get();
    final Pointer inSum = calc2.getDevice().get();
    final Pointer changeSum = calc3.getDevice().get();
    final Pointer internal = internalNeuronValues.getDevice().get();

    // Add the input-driven contributions to each gate.
    VecFloat.add(count, outSum, outSum, in1.getDevice().get());
    VecFloat.add(count, inSum, inSum, in2.getDevice().get());
    VecFloat.add(count, changeSum, changeSum, in3.getDevice().get());

    // inSum = inSum * changeSum; calc3 is no longer needed afterwards.
    VecFloat.mul(count, inSum, inSum, changeSum);

    // Accumulate into the internal values; calc2 is free for reuse afterwards.
    VecFloat.add(count, internal, internal, inSum);

    // calc2 = fNormalize(internal values)
    fNormalize(calc2, internalNeuronValues);

    // New neuron outputs = out gate * squashed internal values.
    VecFloat.mul(count, externalNeuronValues.getDevice().get(), outSum, inSum);
}
/**
 * Computes {@code outputWeights * externalNeuronValues} on the device and
 * stores the product in {@code result} through {@code result.set(float[])}.
 *
 * <p>A temporary device buffer of {@code result.elements()} floats is allocated
 * for the product. It is released in a {@code finally} block so that it is not
 * leaked when the multiplication, the device-to-host copy or
 * {@code result.set} throws (JCublas exceptions are enabled by {@link #init()}).
 *
 * @param result               receives the product as a host array
 * @param outputWeights        weights applied to the neuron outputs
 * @param externalNeuronValues neuron outputs
 */
public static void getOutput(Mat2 result,Mat2 outputWeights,Mat2 externalNeuronValues) {
    int neurons = externalNeuronValues.elements();
    int output = result.elements();

    CUdeviceptr d_Out = new CUdeviceptr();
    JCublas.cublasAlloc(output, Sizeof.FLOAT, d_Out);
    try {
        // d_Out(output x 1) = 1.0 * outputWeights(output x neurons) * externalNeuronValues(neurons x 1)
        JCublas.cublasSgemm('n', 'n', output, 1, neurons, 1.0f,
                outputWeights.getDevice().get(), output,
                externalNeuronValues.getDevice().get(), neurons,
                0.0f, d_Out, output);

        // Copy the product back to the host and hand it to the result matrix.
        float[] h_Out = new float[output];
        JCublas.cublasGetVector(output, Sizeof.FLOAT, d_Out, 1, Pointer.to(h_Out), 1);
        result.set(h_Out);
    } finally {
        // Always release the temporary device buffer.
        JCublas.cublasFree(d_Out);
    }
}
/**
 * Initializes the GPU libraries used by this class.
 *
 * <p>Turns on exceptions for JCuda, JCudaDriver and JCublas, then initializes
 * JCublas followed by the VecFloat vector library.
 */
public static void init() {
    final boolean throwOnError = true;

    // Enable exceptions in every layer before any library is initialized.
    JCuda.setExceptionsEnabled(throwOnError);
    JCudaDriver.setExceptionsEnabled(throwOnError);
    JCublas.setExceptionsEnabled(throwOnError);

    // Bring up the libraries.
    JCublas.cublasInit();
    VecFloat.init();
}
/**
 * Shuts down the GPU libraries started by {@link #init()}.
 *
 * <p>JCublas is shut down first, then VecFloat. The VecFloat shutdown runs in a
 * {@code finally} block so it still happens when the JCublas shutdown throws
 * (exceptions are enabled by {@link #init()}).
 */
public static void close() {
    try {
        JCublas.cublasShutdown();
    } finally {
        VecFloat.shutdown();
    }
}
public static void print(CUdeviceptr ptr,int size) {
float[] h_ENV = new float[size];
JCublas.cublasGetVector(size, Sizeof.FLOAT, ptr, 1, Pointer.to(h_ENV), 1);
System.out.println("######################");
for (int i=0;i<h_ENV.length;i++) {
System.out.println(h_ENV**);
}
System.out.println("######################");
}
}
```
[/SPOILER]
PS: Don't worry about my deadlines; I don't have any yet. I postponed setting deadlines until I have a working version, because the actual thesis isn't about whether this program works or not, just about using the program to analyse things.
cheers
Noodles