From 0e844bb28c85e8b7a87d482cc3996d0d7d612db0 Mon Sep 17 00:00:00 2001 From: lriha <lubomir.riha@vsb.cz> Date: Wed, 10 Apr 2019 12:01:24 +0200 Subject: [PATCH] updated for automatic LSC size calculation and ported to DGX2 --- CSparse/Demo/Makefile | 4 +++- CSparse/Lib/Makefile | 2 +- CSparse/Source/cs_cholsol.c | 32 ++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile index 0a41b93..55010dd 100644 --- a/CSparse/Demo/Makefile +++ b/CSparse/Demo/Makefile @@ -10,6 +10,8 @@ all: lib cs_demo1 cs_demo2 cs_demo3 # - ./cs_demo2 < ../Matrix/FEM-2S - ./cs_demo2 < ../Matrix/bcsstk26-v2 - ./cs_demo2 < ../Matrix/bcsstk25 + - ./cs_demo2 < ../Matrix/bcsstk17 + - ./cs_demo2 < ../Matrix/ship_001 # - ./cs_demo2 < ../Matrix/bcsstk01 # - ./cs_demo2 < ../Matrix/crystm02 # - ./cs_demo2 < ../Matrix/bcsstk26-v2 @@ -20,7 +22,7 @@ lib: cs_demo1: lib cs_demo1.c Makefile $(CC) $(CF) $(I) -o cs_demo1 cs_demo1.c $(CS) -cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile +cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile $(CC) $(CF) $(I) -o cs_demo2 cs_demo2.c cs_demo.c $(CS) cs_demo3: lib cs_demo3.c cs_demo.c cs_demo.h Makefile diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile index c421f77..aa4afc8 100644 --- a/CSparse/Lib/Makefile +++ b/CSparse/Lib/Makefile @@ -16,7 +16,7 @@ LIBRARY = libcsparse CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O -CC = nvcc -dc # -g -G -O0 --cudart shared # -DDEBUG +CC = nvcc -dc -g -G -O0 --cudart shared # -DDEBUG I = -I../Include RANLIB = ranlib ARCHIVE = $(AR) $(ARFLAGS) diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index de981fd..69d6d6b 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -34,10 +34,10 @@ csi cs_cholsol (csi order, const cs *A, double *b) { printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. 
\n"); - - int n_rhs_ratio = 6; + double GPUmem_sizeGB = 32.0; + double n_rhs_ratio = 6.0; int n_rhs; - int blocks=13;//0; // cca 30 GB + int blocks=1000;//0; // cca 30 GB double *x ; css *S ; @@ -45,7 +45,23 @@ csi cs_cholsol (csi order, const cs *A, double *b) csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; - n_rhs = n / n_rhs_ratio; + + double cube_size = cbrt((double)n/3.0); + n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) ); + printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size); + n_rhs = (int)((double)n / n_rhs_ratio); + // n_rhs = 1000; + printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs); + double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double); + printf(" - LSC size (symm.) = %f MB \n", LSCsize); + printf(" - number of RHS for this matrix is : %d \n", n_rhs ); + int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0; + printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU ); + blocks = total_num_of_LSC_perGPU; + printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n); + + + S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ @@ -80,7 +96,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) } // END - Copy RHS vector to multiple columns - +/* int GPU_mem = 0; int num_of_arrays = blocks; @@ -126,7 +142,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0); printf(" x size: %f MB\n", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0); - printf("LSCsize: %f MB\n", (double)(n_rhs * 
n_rhs * sizeof(double)) / 1024.0 / 1024.0); + printf("LSCsize: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0); cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int), cudaMemcpyHostToDevice); @@ -151,7 +167,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); - +*/ // Copy Chol. factor and RHSs from CPU to GPU @@ -180,7 +196,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) // CPU code verification - lsolve for multiple RSH cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ - cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ + cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ // int i; // int r; -- GitLab