diff --git a/CSparse/Demo/cs_demo2.c b/CSparse/Demo/cs_demo2.c index fb91eb5676937ba261daefcbe8d931b4c01004fe..0be84cb1c1a708e73dc4515a9b2e1d83bc61d3ad 100644 --- a/CSparse/Demo/cs_demo2.c +++ b/CSparse/Demo/cs_demo2.c @@ -51,10 +51,8 @@ int main (void) // demo2 (Prob) ; // free_problem (Prob) ; - problem *Prob = get_problem (stdin, 1e-14) ; - cs *A, *C ; double *b, *x, *resid, t, tol ; csi k, m, n, ok, order, nb, ns, *r, *s, *rr, sprank ; @@ -75,8 +73,7 @@ int main (void) { ns += ((r [k+1] == r [k]+1) && (s [k+1] == s [k]+1)) ; } - printf ("blocks: %g singletons: %g structural rank: %g\n", - (double) nb, (double) ns, (double) sprank) ; + printf ("blocks: %g singletons: %g structural rank: %g\n", (double) nb, (double) ns, (double) sprank) ; cs_dfree (D) ; // for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */ @@ -102,7 +99,7 @@ int main (void) // printf ("time: %8.2f ", toc (t)) ; // print_resid (ok, C, x, b, resid) ; /* print residual */ // } - // if (!Prob->sym) return (1) ; + if (!Prob->sym) return (1) ; for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */ { if (!order && m > 1000) continue ; diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index 69d6d6bc864060ea37d51da782aa94f188ff406f..c77f2ef01543ef235740609a81918c88043ae2e6 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -31,13 +31,7 @@ csi cs_cholsol_cpu (csi order, const cs *A, double *b) csi cs_cholsol (csi order, const cs *A, double *b) -{ - - printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n"); - double GPUmem_sizeGB = 32.0; - double n_rhs_ratio = 6.0; - int n_rhs; - int blocks=1000;//0; // cca 30 GB +{ double *x ; css *S ; @@ -45,22 +39,29 @@ csi cs_cholsol (csi order, const cs *A, double *b) csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; + + printf("\n--------------------------------------------------------------------------------------------------------------------\n"); + printf("Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n"); + + double GPUmem_sizeGB = 32.0; // GB + double cube_size = cbrt((double)n/3.0); + double n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) ); + int n_rhs = (int)((double)n / n_rhs_ratio); + double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double); + int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0; + int blocks = total_num_of_LSC_perGPU; + + blocks = 1; + n_rhs = 1; + - double cube_size = cbrt((double)n/3.0); - n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) ); printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size); - n_rhs = (int)((double)n / n_rhs_ratio); - // n_rhs = 1000; printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs); - double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double); printf(" - LSC size (symm.) = %f MB \n", LSCsize); printf(" - number of RHS for this matrix is : %d \n", n_rhs ); - int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0; printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU ); - blocks = total_num_of_LSC_perGPU; printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n); - - + printf("\n"); S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */