From 0e844bb28c85e8b7a87d482cc3996d0d7d612db0 Mon Sep 17 00:00:00 2001
From: lriha <lubomir.riha@vsb.cz>
Date: Wed, 10 Apr 2019 12:01:24 +0200
Subject: [PATCH] updated for automatic LSC size calculation and ported to DGX2

---
 CSparse/Demo/Makefile       |  4 +++-
 CSparse/Lib/Makefile        |  2 +-
 CSparse/Source/cs_cholsol.c | 32 ++++++++++++++++++++++++--------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 0a41b93..55010dd 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -10,6 +10,8 @@ all: lib cs_demo1 cs_demo2 cs_demo3
 # 	- ./cs_demo2 < ../Matrix/FEM-2S
 	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 	- ./cs_demo2 < ../Matrix/bcsstk25
+	- ./cs_demo2 < ../Matrix/bcsstk17
+	- ./cs_demo2 < ../Matrix/ship_001
 # 	- ./cs_demo2 < ../Matrix/bcsstk01
 # 	- ./cs_demo2 < ../Matrix/crystm02
 #  	- ./cs_demo2 < ../Matrix/bcsstk26-v2
@@ -20,7 +22,7 @@ lib:
 cs_demo1: lib cs_demo1.c Makefile
 	$(CC) $(CF) $(I) -o cs_demo1 cs_demo1.c $(CS)
 
-cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile
+cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile 
 	$(CC) $(CF) $(I) -o cs_demo2 cs_demo2.c cs_demo.c $(CS)
 
 cs_demo3: lib cs_demo3.c cs_demo.c cs_demo.h Makefile
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index c421f77..aa4afc8 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -16,7 +16,7 @@
 
 LIBRARY = libcsparse
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
-CC = nvcc -dc # -g -G -O0 --cudart shared # -DDEBUG
+CC = nvcc -dc -g -G -O0 --cudart shared # -DDEBUG
 I = -I../Include
 RANLIB = ranlib
 ARCHIVE = $(AR) $(ARFLAGS)
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index de981fd..69d6d6b 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -34,10 +34,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 {
     
     printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
-
-    int n_rhs_ratio = 6;
+    double GPUmem_sizeGB = 32.0;
+    double n_rhs_ratio = 6.0;
     int n_rhs; 
-    int blocks=13;//0; // cca 30 GB
+    int blocks=1000;//0; // cca 30 GB
 
     double *x ;
     css *S ;
@@ -45,7 +45,23 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     csi n, ok ;
     if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
     n = A->n ;
-    n_rhs = n / n_rhs_ratio; 
+
+    double cube_size = cbrt((double)n/3.0);
+    n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) );  
+    printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
+    n_rhs = (int)((double)n / n_rhs_ratio); 
+    // n_rhs = 1000;
+    printf(" - LSC size         = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
+    double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double);
+    printf(" - LSC size (symm.) = %f MB \n", LSCsize); 
+    printf(" - number of RHS for this matrix is : %d \n", n_rhs );
+    int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
+    printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU );
+    blocks = total_num_of_LSC_perGPU;
+    printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n);
+
+
+
     S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
     N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
     x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
@@ -80,7 +96,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         // END - Copy RHS vector to multiple columns 
 
-
+/*
         int GPU_mem = 0; 
 
         int num_of_arrays = blocks;
@@ -126,7 +142,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
             printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int))     / 1024.0 / 1024.0);
             printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double))  / 1024.0 / 1024.0);
             printf(" x size: %f MB\n",     (double)(n * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
-            printf("LSCsize: %f MB\n",     (double)(n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
+            printf("LSCsize: %f MB\n",     (double)(0.5*n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
 
             cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int),    cudaMemcpyHostToDevice);
             cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int),    cudaMemcpyHostToDevice);
@@ -151,7 +167,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
-
+*/
 
 
         // Copy Chol. factor and RHSs from CPU to GPU 
@@ -180,7 +196,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
         // CPU code verification - lsolve for multiple RSH 
         cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
-        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
+	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
 
         // int i; 
         // int r; 
-- 
GitLab