From 2760479cd87f17bd1d27c5ec4d5cf84b72e45657 Mon Sep 17 00:00:00 2001
From: lriha <lubomir.riha@vsb.cz>
Date: Thu, 11 Apr 2019 01:58:48 +0200
Subject: [PATCH] Updated testing and memory evaluation

---
 CSparse/Demo/Makefile           |   8 +-
 CSparse/Demo/cs_demo.c          |   2 +-
 CSparse/Lib/Makefile            |   2 +-
 CSparse/Source/cs_cholsol.c     | 213 +++++++++++++++++++++++++++-----
 CSparse/Source/cs_lsolve.c      |  66 ++++++++++
 CSparse/Source/cs_lsolve_gpu.cu |  16 +--
 6 files changed, 259 insertions(+), 48 deletions(-)

diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index d3c84d7..de4872f 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -8,11 +8,11 @@ CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
 all: lib cs_demo1 cs_demo2 cs_demo3
 #	- ./cs_demo2 < ../Matrix/t1
 # 	- ./cs_demo2 < ../Matrix/FEM-2S
-# 	- ./cs_demo2 < ../Matrix/bcsstk26-v2
-# 	- ./cs_demo2 < ../Matrix/bcsstk25
-# 	- ./cs_demo2 < ../Matrix/bcsstk17
+	- ./cs_demo2 < ../Matrix/bcsstk26-v2
+	- ./cs_demo2 < ../Matrix/bcsstk25
+	- ./cs_demo2 < ../Matrix/bcsstk17
 # 	- ./cs_demo2 < ../Matrix/ship_001
-	- ./cs_demo2 < ../Matrix/bcsstk01
+# 	- ./cs_demo2 < ../Matrix/bcsstk01
 # 	- ./cs_demo2 < ../Matrix/crystm02
 #  	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 
diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c
index abcd210..95d225e 100644
--- a/CSparse/Demo/cs_demo.c
+++ b/CSparse/Demo/cs_demo.c
@@ -83,7 +83,7 @@ problem *get_problem (FILE *f, double tol)
     problem *Prob ;
     Prob = cs_calloc (1, sizeof (problem)) ;
     if (!Prob) return (NULL) ;
-    T = cs_load (f) ;                   /* load triplet matrix T from a file */
+    T = cs_load2 (f) ;                   /* load triplet matrix T from a file */
     Prob->A = A = cs_compress (T) ;     /* A = compressed-column form of T */
     cs_spfree (T) ;                     /* clear T */
     if (!cs_dupl (A)) return (free_problem (Prob)) ; /* sum up duplicates */
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index 1a847e5..0dea956 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -16,7 +16,7 @@
 
 LIBRARY = libcsparse
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
-CC = nvcc -dc -g -G -O0 --cudart shared -DDEBUG
+CC = nvcc -dc -g -G -O0 --cudart shared #-DDEBUG
 I = -I../Include
 RANLIB = ranlib
 ARCHIVE = $(AR) $(ARFLAGS)
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index aeafe9e..980d4e1 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -1,6 +1,9 @@
 #include "cs.h"
 #include "cuda_runtime.h"
+#include <time.h>
 
+static double tic (void) { return (clock () / (double) CLOCKS_PER_SEC) ; }
+static double toc (double t) { double s = tic () ; return (CS_MAX (0, s-t)) ; }
 
 /* x=A\b where A is symmetric positive definite; b overwritten with solution */
 csi cs_cholsol_cpu (csi order, const cs *A, double *b)
@@ -43,6 +46,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     printf("\n--------------------------------------------------------------------------------------------------------------------\n");
     printf("Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
  
+    double t;
     double GPUmem_sizeGB        = 32.0; // GB
     double cube_size            = cbrt((double)n/3.0);
     double n_rhs_ratio          = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) );  
@@ -51,8 +55,11 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
     int    blocks               = total_num_of_LSC_perGPU;
 
-    blocks = 2
-    n_rhs  = 2; 
+
+
+
+    // blocks = 10;
+    // n_rhs  = 30; 
 
 
     printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
@@ -63,9 +70,33 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n);
     printf("\n");
 
+
     S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
     N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
     x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
+
+    double tot_size_MB =  (double)(((N->L->n)+1) *         sizeof(int))     / 1024.0 / 1024.0 + 
+                          (double)((N->L->nzmax) *         sizeof(int))     / 1024.0 / 1024.0 + 
+                          (double)((N->L->nzmax) *         sizeof(double))  / 1024.0 / 1024.0 + 
+                          (double)(n * n_rhs *             sizeof(double))  / 1024.0 / 1024.0; 
+                          // (double)(0.5*n_rhs * n_rhs *     sizeof(double))  / 1024.0 / 1024.0;
+
+    double max_num_of_LSC_processed_per_GPU = GPUmem_sizeGB / tot_size_MB * 1024.0;
+
+    blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU);
+
+    printf("Input data memory requiroments to calculate LSC on GPU\n");
+    printf(" - Lp   size: %f MB\n", (double)(((N->L->n)+1) *         sizeof(int))     / 1024.0 / 1024.0);
+    printf(" - Li   size: %f MB\n", (double)((N->L->nzmax) *         sizeof(int))     / 1024.0 / 1024.0);
+    printf(" - Lx   size: %f MB\n", (double)((N->L->nzmax) *         sizeof(double))  / 1024.0 / 1024.0);
+    printf(" - RHSs size: %f MB\n", (double)(n * n_rhs *         sizeof(double))  / 1024.0 / 1024.0);
+    // printf(" - LSC  size: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
+    printf(" - Tot  size: %f MB\n", tot_size_MB);
+    printf("\nTotal number of LSC that can be processes at the same time based on GPU memory size\n");
+    printf(" - GPU memory size: %f GB\n", GPUmem_sizeGB);
+    printf(" - Max num of LSC : %f \n"  , max_num_of_LSC_processed_per_GPU);
+    printf("\n");
+
     ok = (S && N && x) ;
     
     if (ok)
@@ -73,7 +104,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
                 
         // *** GPU code start 
-        // - cs_lsolve (N->L, x) ;           /* x = L\x */
 
         // Copy RHS vector to multiple columns 
         double *rhs_t;
@@ -99,17 +129,18 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
 #define FULL_MEM
 #ifdef  FULL_MEM
-    {
+    
         int GPU_mem = 0; 
 
         int num_of_arrays = blocks;
 
         // create intermediate host array for storage of device row-pointers
-        int**    h_array_Lp = (int**)   malloc(num_of_arrays * sizeof(int*));
-        int**    h_array_Li = (int**)   malloc(num_of_arrays * sizeof(int*));
-        double** h_array_Lx = (double**)malloc(num_of_arrays * sizeof(double*));
-        double** h_array_x  = (double**)malloc(num_of_arrays * sizeof(double*));
-        
+        int**    h_array_Lp  = (int**)   malloc(num_of_arrays * sizeof(int*));
+        int**    h_array_Li  = (int**)   malloc(num_of_arrays * sizeof(int*));
+        double** h_array_Lx  = (double**)malloc(num_of_arrays * sizeof(double*));
+        double** h_array_x   = (double**)malloc(num_of_arrays * sizeof(double*));
+        double** x_gpu_array = (double**)malloc(num_of_arrays * sizeof(double*));
+
         // create top-level device array pointer
         int**    d_array_Lp;
         int**    d_array_Li;
@@ -121,8 +152,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cudaMalloc((void**)&d_array_Lx, num_of_arrays * sizeof(double*));
         cudaMalloc((void**)&d_array_x , num_of_arrays * sizeof(double*));
         
-
-
         size_t free_byte ;
         size_t total_byte ;
         cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
@@ -137,16 +166,19 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     
         int i; 
         for(i = 0 ; i < num_of_arrays ; i++){
+
+            x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ;
+
             cudaMalloc(&h_array_Lp[i],           ((N->L->n)+1) * sizeof(int));
             cudaMalloc(&h_array_Li[i],           (N->L->nzmax) * sizeof(int));
             cudaMalloc(&h_array_Lx[i],           (N->L->nzmax) * sizeof(double));
             cudaMalloc(&h_array_x [i],               n * n_rhs * sizeof(double));
 
-            printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int))     / 1024.0 / 1024.0);
-            printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int))     / 1024.0 / 1024.0);
-            printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double))  / 1024.0 / 1024.0);
-            printf(" x size: %f MB\n",     (double)(n * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
-            printf("LSCsize: %f MB\n",     (double)(0.5*n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
+            // printf(" - Lp size: %f MB; ", (double)(((N->L->n)+1) *         sizeof(int))     / 1024.0 / 1024.0);
+            // printf(   "Li size: %f MB; ", (double)((N->L->nzmax) *         sizeof(int))     / 1024.0 / 1024.0);
+            // printf(   "Lx size: %f MB; ", (double)((N->L->nzmax) *         sizeof(double))  / 1024.0 / 1024.0);
+            // printf(    "x size: %f MB; ",     (double)(n * n_rhs *         sizeof(double))  / 1024.0 / 1024.0);
+            // printf(   "LSCsize: %f MB\n",     (double)(0.5*n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
 
             cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int),    cudaMemcpyHostToDevice);
             cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int),    cudaMemcpyHostToDevice);
@@ -160,7 +192,8 @@ csi cs_cholsol (csi order, const cs *A, double *b)
             double total_db = (double)total_byte ;
             double used_db = total_db - free_db ;
 
-            printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+            if (i < 2 || i > (num_of_arrays - 3))
+                printf("%d : GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", i, used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
 
         }
         // fixup top level device array pointer to point to array of device row-pointers
@@ -169,21 +202,138 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice);
         cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice);
 
+        printf("================================================================================================\n");
+        blocks = 1; 
+        cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+        printf("------------------------------------------------------------------------------------------------\n");
+
+        for (i = 40; i < num_of_arrays; i = i + 40) {
+            blocks = i; 
+            cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+            cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+            printf("------------------------------------------------------------------------------------------------\n");
+        }
+
+        blocks = num_of_arrays;
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+        printf("================================================================================================\n");
 
+        printf("================================================================================================\n");
+        printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n");
+        int    *d_Lp;
+        int    *d_Li;
+        double *d_Lx;
+        double *d_x;
+
+        cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
+        cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
+        cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
+        cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
+
+        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
+
+        blocks = num_of_arrays;
+        cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+
+        printf("------------------------------------------------------------------------------------------------\n");
+        printf(" - For max num of LSC\n");
+
+        blocks = total_num_of_LSC_perGPU; 
+        cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        printf("================================================================================================\n");
 
-        double** x_gpu_array  = (double**)malloc(num_of_arrays * sizeof(double*));
+
+        // int    *d_Lp = &d_array_Lp[0];
+        // int    *d_Li = &d_array_Li[0];
+        // double *d_Lx = &d_array_Lx[0];
+        // double *d_x  = &d_array_x [0];
+
+        // cs_lsolve_gpu_trans  (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
+        // cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
+
+        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+
+        printf("\n");
+
+        t = tic () ;
         for(i = 0 ; i < num_of_arrays ; i++) {
-            x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ;
             cudaMemcpy(x_gpu_array[i], h_array_x [i], n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
         }
+        printf ("Mem copy GPU to CPU all LSCs : %8.2f ms \n", 1000.0 * toc (t)) ;
 
-        double *x_gpu = x_gpu_array[1]; 
+        // CPU code verification - lsolve for multiple RSH 
+        t = tic () ;
+        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
+        printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
 
-    }
+        t = tic () ;
+        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
+        printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ;
+
+        int l;
+        printf("\n");
+        for(l = 0 ; l < num_of_arrays ; l++) {
+            double *x_gpu = x_gpu_array[l]; 
+
+
+//             // *** Debug - check with per element output 
+// #ifdef DEBUG
+//             {
+//                 int i; 
+//                 int r; 
+//                 cs_lsolve  (N->L, x) ;          /* x = L\x */        
+//                 cs_ltsolve (N->L, x) ;          /* x = L'\x */
+//                 for (i=0; i<n; i++) {
+//                     printf("cpu: %f\t gpu:\t", x[i]); 
+//                     for (r = 0; r < n_rhs; r++) {
+//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+//                             printf("OK\t");
+//                         else
+//                             printf("Er\t"); 
+//                         //  printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+//                         printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+//                     }
+//                     printf("\n");
+//                 }
+//                 printf("\n");
+//             }
+// #else
+        
+//             {
+//                 int i; 
+//                 int r; 
+//                 int errors = 0;
+//                 for (i=0; i<n; i++) {
+//                     for (r = 0; r < n_rhs; r++) {
+//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+//                             errors++; 
+//                         }
+//                     }
+//                 }
+//                 if (errors == 0) 
+//                     printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
+//                 else
+//                     printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
+            
+//             }
+
+// #endif
+
+        
+        }
+
+
+    
 #else 
-    {
+    
         // *** Vesion with 
         // Copy Chol. factor and RHSs from CPU to GPU 
         int    *d_Lp;
@@ -208,10 +358,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         double *x_gpu;
         x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
         cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
-    }
-#endif
-
-
+    
         // CPU code verification - lsolve for multiple RSH 
         cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
 	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
@@ -221,10 +368,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         {
         int i; 
         int r; 
-        
-        cs_lsolve  (N->L, x) ;          /* x = L\x */
-        cs_ltsolve (N->L, x) ;          /* x = L'\x */
-        
+ 
         for (i=0; i<n; i++) {
             printf("cpu: %f\t gpu:\t", x[i]); 
             for (r = 0; r < n_rhs; r++) {
@@ -239,14 +383,15 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         printf("\n");
         }
-#else 
-        cs_lsolve  (N->L, x) ;          /* x = L\x */        
-        cs_ltsolve (N->L, x) ;          /* x = L'\x */
 #endif
 
-        // *** END - GPU code  
+#endif
 
+        // *** END - GPU code  
 
+        cs_lsolve  (N->L, x) ;          /* x = L\x */        
+        
+        cs_ltsolve (N->L, x) ;          /* x = L'\x */
 
         cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
     }
diff --git a/CSparse/Source/cs_lsolve.c b/CSparse/Source/cs_lsolve.c
index 1e92434..cda8178 100644
--- a/CSparse/Source/cs_lsolve.c
+++ b/CSparse/Source/cs_lsolve.c
@@ -21,6 +21,71 @@ csi cs_lsolve (const cs *L, double *x)
 }
 
 
+csi cs_lsolve_mrhs_par2 (const cs *L, double *x, csi n_rhs)
+{
+    csi r, p, j, n, *Lp, *Li ;
+    double *Lx ;
+    if (!CS_CSC (L) || !x) return (0) ;                     /* check inputs */
+    n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
+    
+    
+    for (j = 0 ; j < n ; j++)
+    {
+        #pragma omp simd 
+        for (r = 0; r < n_rhs; r++)
+        {
+            csi addr = j * n_rhs + r; 
+            x [addr] /= Lx [Lp [j]] ;
+        }
+
+        for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+        {
+            #pragma omp simd 
+            for (r = 0; r < n_rhs; r++)
+            {
+                csi addr = j * n_rhs + r;
+                x [Li [p] * n_rhs + r] -= Lx [p] * x [addr] ;
+                //printf("XPU: %f", x [Li [p]]);
+            }
+        }
+    }
+    
+    return (1) ;
+}
+
+csi cs_lsolve_mrhs_par1 (const cs *L, double *x, csi n_rhs)
+{
+    csi r, p, j, n, *Lp, *Li ;
+    double *Lx ;
+    if (!CS_CSC (L) || !x) return (0) ;                     /* check inputs */
+    n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
+    
+    #pragma omp parallel for simd
+    for (r = 0; r < n_rhs; r++) 
+    {
+        for (j = 0 ; j < n ; j++)
+        {
+            // for (r = 0; r < n_rhs; r++)
+            // {
+                csi addr = j * n_rhs + r; 
+                x [addr] /= Lx [Lp [j]] ;
+            // }
+
+            for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+            {
+                // for (r = 0; r < n_rhs; r++)
+                // {
+                    csi addr = j * n_rhs + r;
+                    x [Li [p] * n_rhs + r] -= Lx [p] * x [addr] ;
+                    //printf("XPU: %f", x [Li [p]]);
+                // }
+            }
+        }
+    }    
+    
+    return (1) ;
+}
+
 csi cs_lsolve_mrhs (const cs *L, double *x, csi n_rhs)
 {
     csi r, p, j, n, *Lp, *Li ;
@@ -28,6 +93,7 @@ csi cs_lsolve_mrhs (const cs *L, double *x, csi n_rhs)
     if (!CS_CSC (L) || !x) return (0) ;                     /* check inputs */
     n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
     
+    
     for (j = 0 ; j < n ; j++)
     {
         for (r = 0; r < n_rhs; r++)
diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu
index 966045e..0367765 100644
--- a/CSparse/Source/cs_lsolve_gpu.cu
+++ b/CSparse/Source/cs_lsolve_gpu.cu
@@ -209,7 +209,7 @@ extern "C" {
       if (threads > n_rhs) threads = n_rhs;
       if (threads > MAX_THREADS) threads = MAX_THREADS; 
 
-      printf("Running kernel ** cs_lsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+      printf("\nRunning kernel 'cs_lsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
       
       cudaEvent_t start, stop;
       cudaEventCreate(&start);
@@ -223,7 +223,7 @@ extern "C" {
       float milliseconds = 0;
       cudaEventElapsedTime(&milliseconds, start, stop);
 
-      printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+      printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
 
     }
 }
@@ -236,7 +236,7 @@ extern "C" {
       if (threads > n_rhs) threads = n_rhs;
       if (threads > MAX_THREADS) threads = MAX_THREADS; 
 
-      printf("Running kernel ** cs_lsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+      printf("\nRunning kernel 'cs_lsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
       
       cudaEvent_t start, stop;
       cudaEventCreate(&start);
@@ -250,7 +250,7 @@ extern "C" {
       float milliseconds = 0;
       cudaEventElapsedTime(&milliseconds, start, stop);
 
-      printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+      printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
 
     }
 }
@@ -263,7 +263,7 @@ extern "C" {
       if (threads > n_rhs) threads = n_rhs;
       if (threads > MAX_THREADS) threads = MAX_THREADS; 
       
-      printf("Running kernel ** cs_ltsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+      printf("\nRunning kernel 'cs_ltsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
 
       cudaEvent_t start, stop;
       cudaEventCreate(&start);
@@ -277,7 +277,7 @@ extern "C" {
       float milliseconds = 0;
       cudaEventElapsedTime(&milliseconds, start, stop);
 
-      printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+      printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
 
     }
 }
@@ -290,7 +290,7 @@ extern "C" {
       if (threads > n_rhs) threads = n_rhs;
       if (threads > MAX_THREADS) threads = MAX_THREADS; 
       
-      printf("Running kernel ** cs_ltsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+      printf("\nRunning kernel 'cs_ltsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
 
       cudaEvent_t start, stop;
       cudaEventCreate(&start);
@@ -304,7 +304,7 @@ extern "C" {
       float milliseconds = 0;
       cudaEventElapsedTime(&milliseconds, start, stop);
 
-      printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+      printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
 
     }
 }
-- 
GitLab