Skip to content
Snippets Groups Projects
Commit 0e844bb2 authored by Lubomir Riha
Browse files

updated for automatic LSC size calculation and ported to DGX2

parent 1bcadd5d
No related branches found
No related tags found
No related merge requests found
......@@ -10,6 +10,8 @@ all: lib cs_demo1 cs_demo2 cs_demo3
# - ./cs_demo2 < ../Matrix/FEM-2S
- ./cs_demo2 < ../Matrix/bcsstk26-v2
- ./cs_demo2 < ../Matrix/bcsstk25
- ./cs_demo2 < ../Matrix/bcsstk17
- ./cs_demo2 < ../Matrix/ship_001
# - ./cs_demo2 < ../Matrix/bcsstk01
# - ./cs_demo2 < ../Matrix/crystm02
# - ./cs_demo2 < ../Matrix/bcsstk26-v2
......@@ -20,7 +22,7 @@ lib:
cs_demo1: lib cs_demo1.c Makefile
$(CC) $(CF) $(I) -o cs_demo1 cs_demo1.c $(CS)
cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile
cs_demo2: lib cs_demo2.c cs_demo.c cs_demo.h Makefile
$(CC) $(CF) $(I) -o cs_demo2 cs_demo2.c cs_demo.c $(CS)
cs_demo3: lib cs_demo3.c cs_demo.c cs_demo.h Makefile
......
......@@ -16,7 +16,7 @@
LIBRARY = libcsparse
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
CC = nvcc -dc # -g -G -O0 --cudart shared # -DDEBUG
CC = nvcc -dc -g -G -O0 --cudart shared # -DDEBUG
I = -I../Include
RANLIB = ranlib
ARCHIVE = $(AR) $(ARFLAGS)
......
......@@ -34,10 +34,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
{
printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
int n_rhs_ratio = 6;
double GPUmem_sizeGB = 32.0;
double n_rhs_ratio = 6.0;
int n_rhs;
int blocks=13;//0; // cca 30 GB
int blocks=1000;//0; // cca 30 GB
double *x ;
css *S ;
......@@ -45,7 +45,23 @@ csi cs_cholsol (csi order, const cs *A, double *b)
csi n, ok ;
if (!CS_CSC (A) || !b) return (0) ; /* check inputs */
n = A->n ;
n_rhs = n / n_rhs_ratio;
double cube_size = cbrt((double)n/3.0);
n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) );
printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
n_rhs = (int)((double)n / n_rhs_ratio);
// n_rhs = 1000;
printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double);
printf(" - LSC size (symm.) = %f MB \n", LSCsize);
printf(" - number of RHS for this matrix is : %d \n", n_rhs );
int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU );
blocks = total_num_of_LSC_perGPU;
printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n);
S = cs_schol (order, A) ; /* ordering and symbolic analysis */
N = cs_chol (A, S) ; /* numeric Cholesky factorization */
x = cs_malloc (n, sizeof (double)) ; /* get workspace */
......@@ -80,7 +96,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
}
// END - Copy RHS vector to multiple columns
/*
int GPU_mem = 0;
int num_of_arrays = blocks;
......@@ -126,7 +142,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0);
printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0);
printf(" x size: %f MB\n", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0);
printf("LSCsize: %f MB\n", (double)(n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0);
printf("LSCsize: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0);
cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int), cudaMemcpyHostToDevice);
......@@ -151,7 +167,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
*/
// Copy Chol. factor and RHSs from CPU to GPU
......@@ -180,7 +196,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
// CPU code verification - lsolve for multiple RHS
cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
// int i;
// int r;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment