Working version - optimal kernel doesnot give correct results

39f726ca · Lubomir Riha · 0bd977f8 · 39f726ca · 39f726ca · 39f726ca
Commit 39f726ca authored 6 years ago by Lubomir Riha
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -11,17 +11,17 @@ all: lib cs_demo1 cs_demo2 cs_demo3
 # 	- ./cs_demo2 < ../Matrix/10_E.txt
 # 	- ./cs_demo2 < ../Matrix/10_Etrans
 # 	- ./cs_demo2 < ../Matrix/10_Ecol.txt
-# 	- ./cs_demo2 < ../Matrix/5_E.txt
+# 	- ./cs_demo2 < ../Matrix/5_E.txt 
-	- ./cs_demo2 < ../Matrix/10_E_orig.txt
+	- ./cs_demo2 < ../Matrix/10_E_orig.txt 
 # 	- ./cs_demo2 < ../Matrix/15_E_orig.txt
 # 	- ./cs_demo2 < ../Matrix/20_E.txt
-#	- ./cs_demo2 < ../Matrix/t1
+# 	- ./cs_demo2 < ../Matrix/t1
 # 	- ./cs_demo2 < ../Matrix/FEM-2S
 # 	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 # 	- ./cs_demo2 < ../Matrix/bcsstk25
 # 	- ./cs_demo2 < ../Matrix/bcsstk17
 # 	- ./cs_demo2 < ../Matrix/ship_001
-# 	- ./cs_demo2 < ../Matrix/bcsstk01
+# 	- ./cs_demo2 < ../Matrix/bcsstk01-v2
 # 	- ./cs_demo2 < ../Matrix/crystm02
 #  	- ./cs_demo2 < ../Matrix/bcsstk26-v2

--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -15,7 +15,7 @@
 # CSparse/Lib.  It does not install it for system-wide usage.
 LIBRARY = libcsparse
-CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
+CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) #-O
 CC = nvcc -dc -g -O3 # -G -O0 --cudart shared #-DDEBUG
 I = -I../Include
 RANLIB = ranlib

--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -56,12 +56,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
    int    blocks               = total_num_of_LSC_perGPU;
-    // blocks = 10;
-    // n_rhs  = 30; 
    printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
    printf(" - LSC size         = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
    printf(" - LSC size (symm.) = %f MB \n", LSCsize); 
@@ -85,6 +79,13 @@ csi cs_cholsol (csi order, const cs *A, double *b)
    blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU);
+    // *** For debugging 
+    blocks = 3;
+    n_rhs  = 2; 
+    // END 
    printf("Input data memory requiroments to calculate LSC on GPU\n");
    printf(" - Lp   size: %f MB\n", (double)(((N->L->n)+1) *         sizeof(int))     / 1024.0 / 1024.0);
    printf(" - Li   size: %f MB\n", (double)((N->L->nzmax) *         sizeof(int))     / 1024.0 / 1024.0);
@@ -135,6 +136,8 @@ csi cs_cholsol (csi order, const cs *A, double *b)
        int GPU_mem = 0; 
        int num_of_arrays = blocks;
+        // int num_of_arrays = 1 ; //blocks;
        // create intermediate host array for storage of device row-pointers
        int**    h_array_Lp  = (int**)   malloc(num_of_arrays * sizeof(int*));
@@ -206,61 +209,34 @@ csi cs_cholsol (csi order, const cs *A, double *b)
        printf("================================================================================================\n");
        blocks = 1; 
+        int j = 0; 
+        for(j = 0 ; j < blocks ; j++){
+            cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+        }
        cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
        cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
        printf("------------------------------------------------------------------------------------------------\n");
        for (i = 80; i < num_of_arrays; i = i * 2) {
            blocks = i; 
+            int j = 0; 
+            for(j = 0 ; j < blocks ; j++){
+                cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+            }
            cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
            cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
            printf("------------------------------------------------------------------------------------------------\n");
        }
        blocks = num_of_arrays;
+        for(j = 0 ; j < blocks ; j++){
+            cudaMemcpy(h_array_x [j] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+        }
        cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
        cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
        printf("================================================================================================\n");
-        // printf("================================================================================================\n");
-        // printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n");
-        // int    *d_Lp;
-        // int    *d_Li;
-        // double *d_Lx;
-        // double *d_x;
-        // cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
-        // cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
-        // cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
-        // cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
-        // cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
-        // cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
-        // blocks = num_of_arrays;
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // printf("------------------------------------------------------------------------------------------------\n");
-        // printf(" - For max num of LSC\n");
-        // blocks = total_num_of_LSC_perGPU; 
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // printf("================================================================================================\n");
-        // int    *d_Lp = &d_array_Lp[0];
-        // int    *d_Li = &d_array_Li[0];
-        // double *d_Lx = &d_array_Lx[0];
-        // double *d_x  = &d_array_x [0];
-        // cs_lsolve_gpu_trans  (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-        // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
        printf("\n");
@@ -273,70 +249,75 @@ csi cs_cholsol (csi order, const cs *A, double *b)
        // CPU code verification - lsolve for multiple RSH 
        t = tic () ;
        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
-        printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
+        printf ("CPU code 'cs_lsolveolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
        t = tic () ;
        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
        printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ;
        int l;
        printf("\n");
        for(l = 0 ; l < num_of_arrays ; l++) {
            double *x_gpu = x_gpu_array[l]; 
-//             // *** Debug - check with per element output 
+            // *** Debug - check with per element output 
-// #ifdef DEBUG
+    // #define DEBUG
-//             {
+    #ifdef DEBUG
-//                 int i; 
+            {
-//                 int r; 
+                int i; 
-//                 cs_lsolve  (N->L, x) ;          /* x = L\x */        
+                int r; 
-//                 cs_ltsolve (N->L, x) ;          /* x = L'\x */
+                cs_lsolve  (N->L, x) ;          // x = L\x         
-//                 for (i=0; i<n; i++) {
+                cs_ltsolve (N->L, x) ;          // x = L'\x 
-//                     printf("cpu: %f\t gpu:\t", x[i]); 
+                for (i=0; i<n; i++) {
-//                     for (r = 0; r < n_rhs; r++) {
+                    printf("cpu: %f\t gpu:\t", x[i]); 
-//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+                    for (r = 0; r < n_rhs; r++) {
-//                             printf("OK\t");
+                        if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
-//                         else
+                            printf("OK ");
-//                             printf("Er\t"); 
+                        else
-//                         //  printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                        // printf("Er "); 
-//                         printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+                        // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
-//                     }
+                        // printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
-//                     printf("\n");
+                        printf("%f\t", x_gpu[i*n_rhs + r] );
-//                 }
-//                 printf("\n");
+                    }
-//             }
+                    printf("\n");
-// #else
+                }
+                printf("\n");
+            }
+    #else
-//             {
+            {
-//                 int i; 
+                int i; 
-//                 int r; 
+                int r; 
-//                 int errors = 0;
+                int errors = 0;
-//                 for (i=0; i<n; i++) {
+                for (i=0; i<n; i++) {
-//                     for (r = 0; r < n_rhs; r++) {
+                    for (r = 0; r < n_rhs; r++) {
-//                         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+                        if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
-//                             errors++; 
+                            errors++; 
-//                         }
+                        }
-//                     }
+                    }
-//                 }
+                }
-//                 if (errors == 0) 
+                if (errors == 0) 
-//                     printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
+                    printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
-//                 else
+                else
-//                     printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
+                    printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
-//             }
+            }
-// #endif
+    #endif
        }
+#else // define FULL_MEM
-#else 
        // *** Vesion with 
        // Copy Chol. factor and RHSs from CPU to GPU 
        int    *d_Lp;
        int    *d_Li;
        double *d_Lx;
@@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
 	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
        // *** Debug - check with per element output 
-#ifdef DEBUG
+    # define DEBUG
+    #ifdef DEBUG
        {
        int i; 
        int r; 
@@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
        }
        printf("\n");
        }
-#endif
+    #endif
+#endif // define FULL_MEM
-#endif
        // *** END - GPU code  

--- a/CSparse/Source/cs_lsolve_gpu.cu
+++ b/CSparse/Source/cs_lsolve_gpu.cu
@@ -79,62 +79,94 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub
 __global__
-void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
+void cs_lsolve_gpu_kernel_trans_multi_opt (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
 {
    const int    * __restrict__ const Lp = Lp_a[blockIdx.x];
    const int    * __restrict__ const Li = Li_a[blockIdx.x];
    const double * __restrict__ const Lx = Lx_a[blockIdx.x];
-          double *x  =  x_a[blockIdx.x];
+          double *x                      =  x_a[blockIdx.x];
    int tid  = threadIdx.x; // + blockIdx.x * blockDim.x;
    int g_id = 0;
+    int i, p, j; 
+    int addr; 
+    int rhs_offset; 
+    int Li_p; 
+    double Lx_p;
+    int inc, off;
+    __shared__ int    Lp_sh [4096];
+    __shared__ int    Li_sh [1024];
+    __shared__ double Lx_sh [1024];
+    for (i = 0; i < (n+2); i = i + blockDim.x) {
+      if ( (i + tid) < (n+2) ) {
+        Lp_sh[i + tid] = Lp[i + tid]; 
+        // printf("Lp_sh[%d] = %d \n", i+tid, Lp_sh[i + tid]);
+      }
+    }
+    __syncthreads(); 
    for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x)
    {
        // printf("g_id = %d \n", g_id);
        if ( g_id + tid < n_rhs ) {
-            int p, j;
+            rhs_offset = g_id + tid;
-            int rhs_offset = g_id + tid;
            for (j = 0 ; j < n ; j++)
            {
-                int addr = j * n_rhs + rhs_offset;
+                inc = (Lp_sh [j+1] - Lp_sh [j]);
+                // off =  Lp_sh [j]; 
+                // if (tid == 1 ) printf("Inc: %d off: %d \n", inc, off);
+                __syncthreads(); 
+                if ( tid < inc ) {
+                  Li_sh [tid] = Li [Lp_sh [j] + tid];
+                  Lx_sh [tid] = Lx [Lp_sh [j] + tid];
+                  // printf("Li_sh[%d] = %d \n", tid, Li_sh[tid]);
+                  // printf("Lx_sh[%d] = %f \n", tid, Lx_sh[tid]);
+                }
-                x [addr]  = x [addr] / Lx [Lp [j]] ;
+                __syncthreads(); 
-                for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                // if (tid == 0 )printf("Lx[%d] = %f and Lx_sh[%d] = %f \n", off, Lx [off], 0, Lx_sh[0]);
+                   addr  = j * n_rhs + rhs_offset;
+                x [addr] = x [addr] / Lx_sh [0] ;
+                // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                for (p = 1 ; p < inc ; p++)
                {
+                    Li_p = Li_sh [p]; 
-                    int    Li_p = Li [p]; 
+                    Lx_p = Lx_sh [p];
-                    double Lx_p = Lx [p];
                    x [Li_p * n_rhs + rhs_offset] = x [Li_p * n_rhs + rhs_offset] - Lx_p * x [addr] ;
+                    // x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ;
                    // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
                }
+            }      
-            }
+        __syncthreads(); 
        }
    }
 }
 __global__
-void cs_lsolve_gpu_kernel_trans_multi_good (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
+void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
 {
    const int    * __restrict__ const Lp = Lp_a[blockIdx.x];
    const int    * __restrict__ const Li = Li_a[blockIdx.x];
    const double * __restrict__ const Lx = Lx_a[blockIdx.x];
-          double *x  =  x_a[blockIdx.x];
+          double *x                      =  x_a[blockIdx.x];
    int tid  = threadIdx.x; // + blockIdx.x * blockDim.x;