Hi again,
I know it is very overreach for debugging my program. I have tried to debug for days and nights but in vain and driving me crazy.
I have double checked the program logic but there is no different.
May I again invite your kindly help for investigating the problem?
The original code from C:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <cublas.h>
#include <cuda_runtime.h>
int* cudaSgetrf(unsigned int n, float *dA);
void cudaSgetri(unsigned int n, float *dA, int *pivots);
void cudaStrtri(unsigned int n, float *dA);
void cudaInvertMatrix(unsigned int n, float *A);
int main(int argc, char** argv){
float orgData[3][3] = {
{1.0, 5.0, 1.0},
{7.0, 9.0, 13.0},
{2.0, 8.0, 6.0}
};
int i,j;
//unsigned int row = 3;
float *matrixIn1D = malloc(9* sizeof(float));
for (i=0; i<3; i++){
for (j=0; j < 3; j++) {
matrixIn1D[i+j*3]=orgData**[j];
//printf("%4.2f %4.2f %s",matrixIn1D[i*j], orgData**[j], "
");
}
}
for (i=0; i<9;i++){
//printf("%4.2f %s",matrixIn1D**, "
");
}
/*
for (i=0; i<3; i++){
for (j=0; j < 3; j++) {
matrixIn1D[i*j]=orgData**[j];
printf("%4.2f %4.2f %s",matrixIn1D[i*j], orgData**[j], "
");
}
}
*/
cudaInvertMatrix(3, matrixIn1D);
for (i=0; i<3; i++){
for (j=0; j < 3; j++) {
orgData**[j]=matrixIn1D[i+j*3];
//printf("%f %s",matrixIn1D[i*j],"
");
//printf("%f %s",orgData**[j],"
");
}
}
}
/*
** cudaInvertMatrix
** Inverts a square matrix in place
** n - matrix dimension
** A - pointer to array of floats representing the matrix in column-major order
*/
void cudaInvertMatrix(unsigned int n, float *A) {
int *pivots;
float *dA;
cublasInit();
cublasAlloc(n * n, sizeof(float), (void**)&dA);
cublasSetMatrix(n, n, sizeof(float), A, n, dA, n);
// Perform LU factorization
pivots = cudaSgetrf(n, dA);
// Perform inversion on factorized matrix
cudaSgetri(n, dA, pivots);
cublasGetMatrix(n, n, sizeof(float), dA, n, A, n);
cublasFree(dA);
cublasShutdown();
}
/*
** cudaSgetrf
** Performs an in-place LU factorization on a square matrix
** Uses the unblocked BLAS2 approach
*/
int * cudaSgetrf(unsigned int n, float *dA) {
int i, pivot, *pivots;
float *offset, factor;
pivots = (int *) calloc(n, sizeof(int));
for(i = 0; i < n; ++i)
pivots** = i;
for(i = 0; i < n - 1; i++) {
offset = dA + i*n + i;
pivot = i - 1 + cublasIsamax(n - i, offset, 1);
if(pivot != i) {
pivots** = pivot;
cublasSswap(n, dA + pivot, n, dA + i, n);
}
cublasGetVector(1, sizeof(float), offset, 1, &factor, 1);
cublasSscal(n - i - 1, 1 / factor, offset + 1, 1);
cublasSger(n - i - 1, n - i - 1, -1.0f, offset + 1, 1, offset + n, n, offset + n + 1, n);
}
return pivots;
}
/*
** cudaSgetri
** Computes the inverse of an LU-factorized square matrix
*/
void cudaSgetri(unsigned int n, float *dA, int *pivots) {
int i;
float *dWork, *offset;
// Perform inv(U)
cudaStrtri(n, dA);
// Solve inv(A)*L = inv(U)
cublasAlloc(n - 1, sizeof(float), (void**)&dWork);
for(i = n - 1; i > 0; --i) {
offset = dA + (i - 1)*n + i;
cudaMemcpy(dWork, offset, (n - 1) * sizeof(float), cudaMemcpyDeviceToDevice);
cublasSscal(n - i, 0, offset, 1);
cublasSgemv('n', n, n - i, -1.0f, dA + i*n, n, dWork, 1, 1.0f, dA + (i-1)*n, 1);
}
cublasFree(dWork);
// Pivot back to original order
for(i = n - 1; i >= 0; --i)
if(i != pivots**)
cublasSswap(n, dA + i*n, 1, dA + pivots***n, 1);
}
/*
** cudaStrtri
** Computes the inverse of an upper triangular matrix in place
** Uses the unblocked BLAS2 approach
*/
void cudaStrtri(unsigned int n, float *dA){
int i;
float factor, *offset;
for(i = 0; i < n; ++i) {
offset = dA + i*n;
cublasGetVector(1, sizeof(float), offset + i, 1, &factor, 1);
factor = 1 / factor;
cublasSetVector(1, sizeof(float), &factor, 1, offset + i, 1);
cublasStrmv('u', 'n', 'n', i, dA, n, offset, 1);
cublasSscal(i, -1 * factor, offset, 1);
}
}
My translated java code, it can output the result but it is not correct
import jcuda.*;
import jcuda.jcublas.JCublas;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaMemcpyKind;
import jcuda.utils.Print;
public class TestInverseCUDA {
public static void main(String[] args) {
float[][] x ={{1.0f, 5.0f, 1.0f},{7.0f, 9.0f, 13.0f},{2.0f, 8.0f, 6.0f}};
System.out.println(Print.toString2D(x));
float[] y = rowMajorToColumnMajor(x);
//float []z= matrix21Darray(x);
cudaInvertMatrix(3, y);
//x = columnMajorToRowMajor(matrix21Darray(x), 2);
//System.out.println(Print.toString2D(columnMajorToRowMajor(y, 2)));
//System.out.println(Print.toString2D(x));
System.out.println(Print.toString2D(columnMajorToRowMajor(y, 3)));
}
public static float[] matrix21Darray(float[][] in){
int size = in.length*in[0].length;
float[] temp = new float[size];
int k=0;
for (int i=0; i< in.length; i++ ){
for (int j=0; j <in[0].length; j++){
temp[k]=in**[j];
k++;
}
}
return temp;
}
public static float[] rowMajorToColumnMajor(float[][] in) {
float[] x = new float[in.length * in[0].length];
for (int i = 0; i < in.length; i++) {
for (int j = 0; j < in[0].length; j++) {
x[i + j * in.length] = in**[j];
}
}
return x;
}
/**
*
* @param in For converting data back to row major format
* @param row number of row
* @return 2d data array
*/
public static float[][] columnMajorToRowMajor(float[] in, int row) {
int col = in.length / row;
float[][] x = new float[row][col];
for (int j = 0; j < row; j++) {
for (int i = 0; i < col; i++) {
x[j]** = in[i * row + j];
}
}
return x;
}
//main function
public static void cudaInvertMatrix(int n, float[] A) {
//int *pivots;
int[] pivots;
//float *dA;
Pointer dA = new Pointer();
JCublas.cublasInit();
JCublas.cublasAlloc(n * n, Sizeof.FLOAT, dA);
JCublas.cublasSetMatrix(n, n, Sizeof.FLOAT, Pointer.to(A), n, dA, n);
// Perform LU factorization
pivots = cudaSgetrf(n, dA);
// Perform inversion on factorized matrix
cudaSgetri(n, dA, pivots);
JCublas.cublasGetMatrix(n, n, Sizeof.FLOAT, dA, n, Pointer.to(A), n);
JCublas.cublasFree(dA);
JCublas.cublasShutdown();
}
/*** cudaSgetrf
** Performs an in-place LU factorization on a square matrix
**Uses the unblocked BLAS2 approach
*/
private static int[] cudaSgetrf(int n, Pointer dA) {
//int i, pivot, *pivots;
//float *offset, factor;
int i, pivot;
//pivots = (int *) calloc(n, sizeof(int));
int[] pivots = new int[n];
float[] factor = new float[1];
factor[0]=0;
Pointer offset;
for (i = 0; i < n; ++i) {
pivots** = i;
}
for (i = 0; i < n - 1; i++) {
offset = dA.withByteOffset((i * n + i) + Sizeof.FLOAT);
pivot = i - 1 + JCublas.cublasIsamax(n - i, offset, 1);
if (pivot != i) {
pivots** = pivot;
JCublas.cublasSswap(n, dA.withByteOffset(pivot * Sizeof.FLOAT), n, dA.withByteOffset(i * Sizeof.FLOAT), n);
}
JCublas.cublasGetVector(1, Sizeof.FLOAT, offset, 1, Pointer.to(factor), 1);
JCublas.cublasSscal(n - i - 1, 1 / factor[0], offset.withByteOffset(1 * Sizeof.FLOAT), 1);
JCublas.cublasSger(n - i - 1, n - i - 1, -1.0f, offset.withByteOffset(1 * Sizeof.FLOAT), 1, offset.withByteOffset(n * Sizeof.FLOAT), n, offset.withByteOffset((n + 1) * Sizeof.FLOAT), n);
}
return pivots;
}
/*** cudaSgetri
** Computes the inverse of an LU-factorized square matrix
*/
private static void cudaSgetri(int n, Pointer dA, int[] pivots) {
//float *dWork, *offset;
Pointer dWork = new Pointer(), offset = new Pointer();
// Perform inv(U)
cudaStrtri(n, dA);
// Solve inv(A)*L = inv(U)
JCublas.cublasAlloc(n - 1, Sizeof.FLOAT, dWork);
for (int i = n - 1; i > 0; --i) {
offset = dA.withByteOffset(((i - 1) * n + i) * Sizeof.FLOAT);
JCuda.cudaMemcpy(dWork, offset, (n - 1) * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToDevice);
JCublas.cublasSscal(n - i, 0, offset, 1);
JCublas.cublasSgemv('n', n, n - i, -1.0f, dA.withByteOffset((i * n) * Sizeof.FLOAT), n, dWork, 1, 1.0f, dA.withByteOffset(((i - 1) * n) * Sizeof.FLOAT), 1);
}
JCublas.cublasFree(dWork);
// Pivot back to original order
for (int i = n - 1; i >= 0; --i) {
if (i != pivots**) {
JCublas.cublasSswap(n, dA.withByteOffset((i * n) * Sizeof.FLOAT), 1, dA.withByteOffset((pivots** * n) * Sizeof.FLOAT), 1);
}
}
}
/*** cudaStrtri
** Computes the inverse of an upper triangular matrix in place
** Uses the unblocked BLAS2 approach
*/
private static void cudaStrtri(int n, Pointer dA) {
float[] factor = new float[1];
factor[0]=0;
Pointer offset = new Pointer();
for (int i = 0; i < n; ++i) {
offset = dA.withByteOffset((i * n) * Sizeof.FLOAT);
JCublas.cublasGetVector(1, Sizeof.FLOAT, offset.withByteOffset(i * Sizeof.FLOAT), 1, Pointer.to(factor), 1);
factor[0] = 1 / factor[0];
JCublas.cublasSetVector(1, Sizeof.FLOAT, Pointer.to(factor), 1, offset.withByteOffset(i * Sizeof.FLOAT), 1);
JCublas.cublasStrmv('u', 'n', 'n', i, dA, n, offset, 1);
JCublas.cublasSscal(i, -1 * factor[0], offset, 1);
}
}
}