From 39f726caa9e5d329b88acfaafede2a16f68b6c32 Mon Sep 17 00:00:00 2001
From: lriha <lubomir.riha@vsb.cz>
Date: Tue, 7 May 2019 15:10:52 +0200
Subject: [PATCH] Working version - optimal kernel does not give correct results

---
 CSparse/Demo/Makefile           |   8 +-
 CSparse/Lib/Makefile            |   2 +-
 CSparse/Source/cs_cholsol.c     | 166 +++++++++++++++-----------------
 CSparse/Source/cs_lsolve_gpu.cu |  76 ++++++++++-----
 4 files changed, 134 insertions(+), 118 deletions(-)
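
Notes: besides small Makefile tweaks (commented-out demo inputs, -O removed from
the library CF flags), the patch adds debug sizing and per-block RHS
re-initialisation to cs_cholsol.c and introduces a shared-memory variant of the
forward-solve kernel (cs_lsolve_gpu_kernel_trans_multi_opt) in cs_lsolve_gpu.cu;
as the subject line says, the optimised kernel does not yet give correct results.

For reference, a host-side sketch of the pointer-table setup the *_multi kernels
consume. It reuses names from the patch (h_array_x, d_array_x, rhs_t, blocks, n,
n_rhs); build_rhs_table and h_table_out are hypothetical helpers, and error
checking is omitted, so treat it as illustrative rather than the patch's exact
code:

    #include <cuda_runtime.h>
    #include <stdlib.h>

    /* Build one device RHS buffer per block, keep the device pointers in a
     * host table, and mirror that table on the device so a kernel can index
     * it with blockIdx.x. */
    static double **build_rhs_table(int blocks, int n, int n_rhs,
                                    const double *rhs_t, double ***h_table_out)
    {
        double **h_array_x = (double**) malloc(blocks * sizeof(double*));
        for (int j = 0; j < blocks; j++) {
            cudaMalloc(&h_array_x[j], n * n_rhs * sizeof(double));
            cudaMemcpy(h_array_x[j], rhs_t, n * n_rhs * sizeof(double),
                       cudaMemcpyHostToDevice);   /* per-block copy of the RHS */
        }
        double **d_array_x;                        /* device table of device pointers */
        cudaMalloc(&d_array_x, blocks * sizeof(double*));
        cudaMemcpy(d_array_x, h_array_x, blocks * sizeof(double*),
                   cudaMemcpyHostToDevice);
        *h_table_out = h_array_x;
        return d_array_x;
    }

And a minimal sketch of the transposed multi-RHS forward solve that the renamed
cs_lsolve_gpu_kernel_trans_multi implements: one block per factor, one
right-hand side per thread, with the RHS block stored as x[row * n_rhs + rhs].
lsolve_mrhs_sketch is a simplified reconstruction, not the exact kernel body:

    __global__ void lsolve_mrhs_sketch(int n, const int **Lp_a, const int **Li_a,
                                       const double **Lx_a, double **x_a, int n_rhs)
    {
        const int    *Lp = Lp_a[blockIdx.x];   /* this block's factor, CSC layout */
        const int    *Li = Li_a[blockIdx.x];
        const double *Lx = Lx_a[blockIdx.x];
        double       *x  = x_a [blockIdx.x];   /* this block's RHS buffer */

        for (int r = threadIdx.x; r < n_rhs; r += blockDim.x) {
            for (int j = 0; j < n; j++) {                    /* forward substitution */
                double xj = x[j * n_rhs + r] / Lx[Lp[j]];    /* divide by diagonal */
                x[j * n_rhs + r] = xj;
                for (int p = Lp[j] + 1; p < Lp[j + 1]; p++)  /* update rows below j */
                    x[Li[p] * n_rhs + r] -= Lx[p] * xj;
            }
        }
    }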

diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 6ec3f2c..be1842b 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -11,17 +11,17 @@ all: lib cs_demo1 cs_demo2 cs_demo3
 # 	- ./cs_demo2 < ../Matrix/10_E.txt
 # 	- ./cs_demo2 < ../Matrix/10_Etrans
 # 	- ./cs_demo2 < ../Matrix/10_Ecol.txt
-# 	- ./cs_demo2 < ../Matrix/5_E.txt
-	- ./cs_demo2 < ../Matrix/10_E_orig.txt
+# 	- ./cs_demo2 < ../Matrix/5_E.txt 
+	- ./cs_demo2 < ../Matrix/10_E_orig.txt 
 # 	- ./cs_demo2 < ../Matrix/15_E_orig.txt
 # 	- ./cs_demo2 < ../Matrix/20_E.txt
-#	- ./cs_demo2 < ../Matrix/t1
+# 	- ./cs_demo2 < ../Matrix/t1
 # 	- ./cs_demo2 < ../Matrix/FEM-2S
 # 	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 # 	- ./cs_demo2 < ../Matrix/bcsstk25
 # 	- ./cs_demo2 < ../Matrix/bcsstk17
 # 	- ./cs_demo2 < ../Matrix/ship_001
-# 	- ./cs_demo2 < ../Matrix/bcsstk01
+# 	- ./cs_demo2 < ../Matrix/bcsstk01-v2
 # 	- ./cs_demo2 < ../Matrix/crystm02
 #  	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index 07d839e..fa1df87 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -15,7 +15,7 @@
 # CSparse/Lib.  It does not install it for system-wide usage.
 
 LIBRARY = libcsparse
-CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
+CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) #-O
 CC = nvcc -dc -g -O3 # -G -O0 --cudart shared #-DDEBUG
 I = -I../Include
 RANLIB = ranlib
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index e9626e6..3bc37ff 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -56,12 +56,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int    blocks               = total_num_of_LSC_perGPU;
 
 
-
-
-    // blocks = 10;
-    // n_rhs  = 30; 
-
-
     printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
     printf(" - LSC size         = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
     printf(" - LSC size (symm.) = %f MB \n", LSCsize); 
@@ -85,6 +79,13 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
     blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU);
 
+
+    // *** For debugging 
+    blocks = 3;
+    n_rhs  = 2; 
+    // END 
+
+
     printf("Input data memory requiroments to calculate LSC on GPU\n");
     printf(" - Lp   size: %f MB\n", (double)(((N->L->n)+1) *         sizeof(int))     / 1024.0 / 1024.0);
     printf(" - Li   size: %f MB\n", (double)((N->L->nzmax) *         sizeof(int))     / 1024.0 / 1024.0);
@@ -135,6 +136,8 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         int GPU_mem = 0; 
 
         int num_of_arrays = blocks;
+        // int num_of_arrays = 1 ; //blocks;
+
 
         // create intermediate host array for storage of device row-pointers
         int**    h_array_Lp  = (int**)   malloc(num_of_arrays * sizeof(int*));
@@ -206,61 +209,34 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
         printf("================================================================================================\n");
         blocks = 1; 
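+        // refresh each block's device-side RHS (h_array_x[j] stores a device pointer) before the timed solves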
+        int j = 0; 
+        for(j = 0 ; j < blocks ; j++){
+            cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+        }
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         printf("------------------------------------------------------------------------------------------------\n");
 
         for (i = 80; i < num_of_arrays; i = i * 2) {
             blocks = i; 
+            int j = 0; 
+            for(j = 0 ; j < blocks ; j++){
+                cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+            }
             cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
             cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
             printf("------------------------------------------------------------------------------------------------\n");
         }
 
         blocks = num_of_arrays;
+        for(j = 0 ; j < blocks ; j++){
+            cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+        }
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         printf("================================================================================================\n");
 
-        // printf("================================================================================================\n");
-        // printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n");
-        // int    *d_Lp;
-        // int    *d_Li;
-        // double *d_Lx;
-        // double *d_x;
-
-        // cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
-        // cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
-        // cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
-        // cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
-
-        // cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
-
-        // blocks = num_of_arrays;
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-
-        // printf("------------------------------------------------------------------------------------------------\n");
-        // printf(" - For max num of LSC\n");
-
-        // blocks = total_num_of_LSC_perGPU; 
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // printf("================================================================================================\n");
-
-        // int    *d_Lp = &d_array_Lp[0];
-        // int    *d_Li = &d_array_Li[0];
-        // double *d_Lx = &d_array_Lx[0];
-        // double *d_x  = &d_array_x [0];
 
-        // cs_lsolve_gpu_trans  (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
 
         printf("\n");
 
@@ -273,70 +249,75 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         // CPU code verification - lsolve for multiple RSH 
         t = tic () ;
         cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
-        printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
+        printf ("CPU code 'cs_lsolveolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
 
         t = tic () ;
         cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
         printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ;
 
+
+
+
         int l;
         printf("\n");
         for(l = 0 ; l < num_of_arrays ; l++) {
             double *x_gpu = x_gpu_array[l]; 
 
 
-//             // *** Debug - check with per element output 
-// #ifdef DEBUG
-//             {
-//                 int i; 
-//                 int r; 
-//                 cs_lsolve  (N->L, x) ;          /* x = L\x */        
-//                 cs_ltsolve (N->L, x) ;          /* x = L'\x */
-//                 for (i=0; i<n; i++) {
-//                     printf("cpu: %f\t gpu:\t", x[i]); 
-//                     for (r = 0; r < n_rhs; r++) {
-//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
-//                             printf("OK\t");
-//                         else
-//                             printf("Er\t"); 
-//                         //  printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
-//                         printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
-//                     }
-//                     printf("\n");
-//                 }
-//                 printf("\n");
-//             }
-// #else
+            // *** Debug - check with per element output 
+    // #define DEBUG
+    #ifdef DEBUG
+            {
+                int i; 
+                int r; 
+                cs_lsolve  (N->L, x) ;          // x = L\x         
+                cs_ltsolve (N->L, x) ;          // x = L'\x 
+                for (i=0; i<n; i++) {
+                    printf("cpu: %f\t gpu:\t", x[i]); 
+                    for (r = 0; r < n_rhs; r++) {
+                        if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+                            printf("OK ");
+                        else
+                        // printf("Er "); 
+                        // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                        // printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+                        printf("%f\t", x_gpu[i*n_rhs + r] );
+
+                    }
+                    printf("\n");
+                }
+                printf("\n");
+            }
+    #else
         
-//             {
-//                 int i; 
-//                 int r; 
-//                 int errors = 0;
-//                 for (i=0; i<n; i++) {
-//                     for (r = 0; r < n_rhs; r++) {
-//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
-//                             errors++; 
-//                         }
-//                     }
-//                 }
-//                 if (errors == 0) 
-//                     printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
-//                 else
-//                     printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
+            {
+                int i; 
+                int r; 
+                int errors = 0;
+                for (i=0; i<n; i++) {
+                    for (r = 0; r < n_rhs; r++) {
+                        if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+                            errors++; 
+                        }
+                    }
+                }
+                if (errors == 0) 
+                    printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
+                else
+                    printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
             
-//             }
+            }
 
-// #endif
+    #endif
 
         
         }
-
-
-    
-#else 
+   
+#else // define FULL_MEM
     
        // *** Version with 
         // Copy Chol. factor and RHSs from CPU to GPU 
+        
         int    *d_Lp;
         int    *d_Li;
         double *d_Lx;
@@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
 	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
  
+
         // *** Debug - check with per element output 
-#ifdef DEBUG
+    # define DEBUG
+    #ifdef DEBUG
         {
         int i; 
         int r; 
@@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         printf("\n");
         }
-#endif
+    #endif
+
+#endif // define FULL_MEM
 
-#endif
 
         // *** END - GPU code  
 
diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu
index 3f2be67..5ff38ce 100644
--- a/CSparse/Source/cs_lsolve_gpu.cu
+++ b/CSparse/Source/cs_lsolve_gpu.cu
@@ -79,62 +79,94 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub
 
 
 __global__
-void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
+void cs_lsolve_gpu_kernel_trans_multi_opt (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
 {
     
     const int    * __restrict__ const Lp = Lp_a[blockIdx.x];
     const int    * __restrict__ const Li = Li_a[blockIdx.x];
     const double * __restrict__ const Lx = Lx_a[blockIdx.x];
-          double *x  =  x_a[blockIdx.x];
-
+          double *x                      =  x_a[blockIdx.x];
 
     int tid  = threadIdx.x; // + blockIdx.x * blockDim.x;
     int g_id = 0;
 
+    int i, p, j; 
+    int addr; 
+    int rhs_offset; 
+    int Li_p; 
+    double Lx_p;
+    int inc, off;
+
+    
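+    // shared-memory staging buffers for this block's factor; the fixed sizes
+    // assume n+2 <= 4096 column pointers and at most 1024 nonzeros in any column of L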
+    __shared__ int    Lp_sh [4096];
+    __shared__ int    Li_sh [1024];
+    __shared__ double Lx_sh [1024];
+
+    for (i = 0; i < (n+2); i = i + blockDim.x) {
+      if ( (i + tid) < (n+2) ) {
+        Lp_sh[i + tid] = Lp[i + tid]; 
+        // printf("Lp_sh[%d] = %d \n", i+tid, Lp_sh[i + tid]);
+      }
+    }
+
+    __syncthreads(); 
+
+
     for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x)
     {
         // printf("g_id = %d \n", g_id);
         if ( g_id + tid < n_rhs ) {
 
-            int p, j;
-            
-            int rhs_offset = g_id + tid;
+            rhs_offset = g_id + tid;
 
             for (j = 0 ; j < n ; j++)
             {
                 
-                int addr = j * n_rhs + rhs_offset;
+                inc = (Lp_sh [j+1] - Lp_sh [j]);
+                // off =  Lp_sh [j]; 
+                // if (tid == 1 ) printf("Inc: %d off: %d \n", inc, off);
+                __syncthreads(); 
+
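+                // stage column j's row indices and values in shared memory;
+                // only threads with tid < inc load, so inc must not exceed blockDim.x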
+                if ( tid < inc ) {
+                  Li_sh [tid] = Li [Lp_sh [j] + tid];
+                  Lx_sh [tid] = Lx [Lp_sh [j] + tid];
+                  // printf("Li_sh[%d] = %d \n", tid, Li_sh[tid]);
+                  // printf("Lx_sh[%d] = %f \n", tid, Lx_sh[tid]);
+                }
 
-                x [addr]  = x [addr] / Lx [Lp [j]] ;
-                
-                for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                __syncthreads(); 
+
+                // if (tid == 0 )printf("Lx[%d] = %f and Lx_sh[%d] = %f \n", off, Lx [off], 0, Lx_sh[0]);
+
+                   addr  = j * n_rhs + rhs_offset;
+                x [addr] = x [addr] / Lx_sh [0] ;
+
+                // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                for (p = 1 ; p < inc ; p++)
                 {
-                    
-                    int    Li_p = Li [p]; 
-                    double Lx_p = Lx [p];
+                    Li_p = Li_sh [p]; 
+                    Lx_p = Lx_sh [p];
 
                     x [Li_p * n_rhs + rhs_offset] = x [Li_p * n_rhs + rhs_offset] - Lx_p * x [addr] ;
+
+                    // x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ;
                     
                     // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
-                
                 }
-
-            }
-               
+            }      
+        __syncthreads(); 
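+        // note: this barrier (and those above) is executed only by threads
+        // that satisfy "g_id + tid < n_rhs"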
         }
-
     }
-
 }
-
+ 
 __global__
-void cs_lsolve_gpu_kernel_trans_multi_good (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
+void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
 {
     
     const int    * __restrict__ const Lp = Lp_a[blockIdx.x];
     const int    * __restrict__ const Li = Li_a[blockIdx.x];
     const double * __restrict__ const Lx = Lx_a[blockIdx.x];
-          double *x  =  x_a[blockIdx.x];
+          double *x                      =  x_a[blockIdx.x];
 
 
     int tid  = threadIdx.x; // + blockIdx.x * blockDim.x;
-- 
GitLab