From 2760479cd87f17bd1d27c5ec4d5cf84b72e45657 Mon Sep 17 00:00:00 2001 From: lriha <lubomir.riha@vsb.cz> Date: Thu, 11 Apr 2019 01:58:48 +0200 Subject: [PATCH] Updated testing and memory evaluation --- CSparse/Demo/Makefile | 8 +- CSparse/Demo/cs_demo.c | 2 +- CSparse/Lib/Makefile | 2 +- CSparse/Source/cs_cholsol.c | 213 +++++++++++++++++++++++++++----- CSparse/Source/cs_lsolve.c | 66 ++++++++++ CSparse/Source/cs_lsolve_gpu.cu | 16 +-- 6 files changed, 259 insertions(+), 48 deletions(-) diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile index d3c84d7..de4872f 100644 --- a/CSparse/Demo/Makefile +++ b/CSparse/Demo/Makefile @@ -8,11 +8,11 @@ CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS) all: lib cs_demo1 cs_demo2 cs_demo3 # - ./cs_demo2 < ../Matrix/t1 # - ./cs_demo2 < ../Matrix/FEM-2S -# - ./cs_demo2 < ../Matrix/bcsstk26-v2 -# - ./cs_demo2 < ../Matrix/bcsstk25 -# - ./cs_demo2 < ../Matrix/bcsstk17 + - ./cs_demo2 < ../Matrix/bcsstk26-v2 + - ./cs_demo2 < ../Matrix/bcsstk25 + - ./cs_demo2 < ../Matrix/bcsstk17 # - ./cs_demo2 < ../Matrix/ship_001 - - ./cs_demo2 < ../Matrix/bcsstk01 +# - ./cs_demo2 < ../Matrix/bcsstk01 # - ./cs_demo2 < ../Matrix/crystm02 # - ./cs_demo2 < ../Matrix/bcsstk26-v2 diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c index abcd210..95d225e 100644 --- a/CSparse/Demo/cs_demo.c +++ b/CSparse/Demo/cs_demo.c @@ -83,7 +83,7 @@ problem *get_problem (FILE *f, double tol) problem *Prob ; Prob = cs_calloc (1, sizeof (problem)) ; if (!Prob) return (NULL) ; - T = cs_load (f) ; /* load triplet matrix T from a file */ + T = cs_load2 (f) ; /* load triplet matrix T from a file */ Prob->A = A = cs_compress (T) ; /* A = compressed-column form of T */ cs_spfree (T) ; /* clear T */ if (!cs_dupl (A)) return (free_problem (Prob)) ; /* sum up duplicates */ diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile index 1a847e5..0dea956 100644 --- a/CSparse/Lib/Makefile +++ b/CSparse/Lib/Makefile @@ -16,7 +16,7 @@ LIBRARY = libcsparse CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O -CC = nvcc -dc -g -G -O0 --cudart shared -DDEBUG +CC = nvcc -dc -g -G -O0 --cudart shared #-DDEBUG I = -I../Include RANLIB = ranlib ARCHIVE = $(AR) $(ARFLAGS) diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index aeafe9e..980d4e1 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -1,6 +1,9 @@ #include "cs.h" #include "cuda_runtime.h" +#include <time.h> +static double tic (void) { return (clock () / (double) CLOCKS_PER_SEC) ; } +static double toc (double t) { double s = tic () ; return (CS_MAX (0, s-t)) ; } /* x=A\b where A is symmetric positive definite; b overwritten with solution */ csi cs_cholsol_cpu (csi order, const cs *A, double *b) @@ -43,6 +46,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) printf("\n--------------------------------------------------------------------------------------------------------------------\n"); printf("Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. 
\n"); + double t; double GPUmem_sizeGB = 32.0; // GB double cube_size = cbrt((double)n/3.0); double n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) ); @@ -51,8 +55,11 @@ csi cs_cholsol (csi order, const cs *A, double *b) int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0; int blocks = total_num_of_LSC_perGPU; - blocks = 2 - n_rhs = 2; + + + + // blocks = 10; + // n_rhs = 30; printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size); @@ -63,9 +70,33 @@ csi cs_cholsol (csi order, const cs *A, double *b) printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n); printf("\n"); + S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ + + double tot_size_MB = (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0 + + (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0 + + (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0 + + (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0; + // (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0; + + double max_num_of_LSC_processed_per_GPU = GPUmem_sizeGB / tot_size_MB * 1024.0; + + blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU); + + printf("Input data memory requiroments to calculate LSC on GPU\n"); + printf(" - Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0); + printf(" - Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); + printf(" - Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0); + printf(" - RHSs size: %f MB\n", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0); + // printf(" - LSC size: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0); + printf(" - Tot size: %f MB\n", tot_size_MB); + printf("\nTotal number of LSC that can be processes at the same time based on GPU memory size\n"); + printf(" - GPU memory size: %f GB\n", GPUmem_sizeGB); + printf(" - Max num of LSC : %f \n" , max_num_of_LSC_processed_per_GPU); + printf("\n"); + ok = (S && N && x) ; if (ok) @@ -73,7 +104,6 @@ csi cs_cholsol (csi order, const cs *A, double *b) cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ // *** GPU code start - // - cs_lsolve (N->L, x) ; /* x = L\x */ // Copy RHS vector to multiple columns double *rhs_t; @@ -99,17 +129,18 @@ csi cs_cholsol (csi order, const cs *A, double *b) #define FULL_MEM #ifdef FULL_MEM - { + int GPU_mem = 0; int num_of_arrays = blocks; // create intermediate host array for storage of device row-pointers - int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*)); - int** h_array_Li = (int**) malloc(num_of_arrays * sizeof(int*)); - double** h_array_Lx = (double**)malloc(num_of_arrays * sizeof(double*)); - double** h_array_x = (double**)malloc(num_of_arrays * sizeof(double*)); - + int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*)); + int** h_array_Li = (int**) malloc(num_of_arrays * sizeof(int*)); + double** h_array_Lx = (double**)malloc(num_of_arrays * sizeof(double*)); + double** h_array_x = (double**)malloc(num_of_arrays * sizeof(double*)); + double** x_gpu_array = (double**)malloc(num_of_arrays * sizeof(double*)); + // create top-level device array pointer int** d_array_Lp; int** d_array_Li; @@ -121,8 +152,6 @@ csi cs_cholsol (csi order, const cs *A, double *b) cudaMalloc((void**)&d_array_Lx, num_of_arrays * 
sizeof(double*)); cudaMalloc((void**)&d_array_x , num_of_arrays * sizeof(double*)); - - size_t free_byte ; size_t total_byte ; cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; @@ -137,16 +166,19 @@ csi cs_cholsol (csi order, const cs *A, double *b) int i; for(i = 0 ; i < num_of_arrays ; i++){ + + x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ; + cudaMalloc(&h_array_Lp[i], ((N->L->n)+1) * sizeof(int)); cudaMalloc(&h_array_Li[i], (N->L->nzmax) * sizeof(int)); cudaMalloc(&h_array_Lx[i], (N->L->nzmax) * sizeof(double)); cudaMalloc(&h_array_x [i], n * n_rhs * sizeof(double)); - printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0); - printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); - printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0); - printf(" x size: %f MB\n", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0); - printf("LSCsize: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0); + // printf(" - Lp size: %f MB; ", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0); + // printf( "Li size: %f MB; ", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); + // printf( "Lx size: %f MB; ", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0); + // printf( "x size: %f MB; ", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0); + // printf( "LSCsize: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0); cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int), cudaMemcpyHostToDevice); @@ -160,7 +192,8 @@ csi cs_cholsol (csi order, const cs *A, double *b) double total_db = (double)total_byte ; double used_db = total_db - free_db ; - printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); + if (i < 2 || i > (num_of_arrays - 3)) + printf("%d : GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", i, used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); } // fixup top level device array pointer to point to array of device row-pointers @@ -169,21 +202,138 @@ csi cs_cholsol (csi order, const cs *A, double *b) cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice); cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice); + printf("================================================================================================\n"); + blocks = 1; + cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); + printf("------------------------------------------------------------------------------------------------\n"); + + for (i = 40; i < num_of_arrays; i = i + 40) { + blocks = i; + cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); + printf("------------------------------------------------------------------------------------------------\n"); + } + + blocks = num_of_arrays; cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, 
d_array_x, n_rhs, n_rhs, blocks); + printf("================================================================================================\n"); + printf("================================================================================================\n"); + printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n"); + int *d_Lp; + int *d_Li; + double *d_Lx; + double *d_x; + + cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); + cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); + cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); + cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); + + cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; + + blocks = num_of_arrays; + cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + + printf("------------------------------------------------------------------------------------------------\n"); + printf(" - For max num of LSC\n"); + + blocks = total_num_of_LSC_perGPU; + cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + printf("================================================================================================\n"); - double** x_gpu_array = (double**)malloc(num_of_arrays * sizeof(double*)); + + // int *d_Lp = &d_array_Lp[0]; + // int *d_Li = &d_array_Li[0]; + // double *d_Lx = &d_array_Lx[0]; + // double *d_x = &d_array_x [0]; + + // cs_lsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks); + // cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks); + + // cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + + printf("\n"); + + t = tic () ; for(i = 0 ; i < num_of_arrays ; i++) { - x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ; cudaMemcpy(x_gpu_array[i], h_array_x [i], n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); } + printf ("Mem copy GPU to CPU all LSCs : %8.2f ms \n", 1000.0 * toc (t)) ; - double *x_gpu = x_gpu_array[1]; + // CPU code verification - lsolve for multiple RSH + t = tic () ; + cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ + printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ; - } + t = tic () ; + cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ + printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ; + + int l; + printf("\n"); + for(l = 0 ; l < num_of_arrays ; l++) { + double *x_gpu = x_gpu_array[l]; + + +// // *** Debug - check with per element output +// #ifdef DEBUG +// { +// int i; +// int r; +// cs_lsolve (N->L, x) ; /* x = L\x */ +// cs_ltsolve (N->L, x) ; /* x = L'\x */ +// for (i=0; i<n; i++) { +// printf("cpu: %f\t gpu:\t", x[i]); +// for (r = 0; r < n_rhs; r++) { +// if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 ) +// printf("OK\t"); +// else +// printf("Er\t"); +// // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); +// printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] ); +// } 
+// printf("\n"); +// } +// printf("\n"); +// } +// #else + +// { +// int i; +// int r; +// int errors = 0; +// for (i=0; i<n; i++) { +// for (r = 0; r < n_rhs; r++) { +// if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) { +// errors++; +// } +// } +// } +// if (errors == 0) +// printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l); +// else +// printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l); + +// } + +// #endif + + + } + + + #else - { + // *** Vesion with // Copy Chol. factor and RHSs from CPU to GPU int *d_Lp; @@ -208,10 +358,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) double *x_gpu; x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); - } -#endif - - + // CPU code verification - lsolve for multiple RSH cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ @@ -221,10 +368,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) { int i; int r; - - cs_lsolve (N->L, x) ; /* x = L\x */ - cs_ltsolve (N->L, x) ; /* x = L'\x */ - + for (i=0; i<n; i++) { printf("cpu: %f\t gpu:\t", x[i]); for (r = 0; r < n_rhs; r++) { @@ -239,14 +383,15 @@ csi cs_cholsol (csi order, const cs *A, double *b) } printf("\n"); } -#else - cs_lsolve (N->L, x) ; /* x = L\x */ - cs_ltsolve (N->L, x) ; /* x = L'\x */ #endif - // *** END - GPU code +#endif + // *** END - GPU code + cs_lsolve (N->L, x) ; /* x = L\x */ + + cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } diff --git a/CSparse/Source/cs_lsolve.c b/CSparse/Source/cs_lsolve.c index 1e92434..cda8178 100644 --- a/CSparse/Source/cs_lsolve.c +++ b/CSparse/Source/cs_lsolve.c @@ -21,6 +21,71 @@ csi cs_lsolve (const cs *L, double *x) } +csi cs_lsolve_mrhs_par2 (const cs *L, double *x, csi n_rhs) +{ + csi r, p, j, n, *Lp, *Li ; + double *Lx ; + if (!CS_CSC (L) || !x) return (0) ; /* check inputs */ + n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ; + + + for (j = 0 ; j < n ; j++) + { + #pragma omp simd + for (r = 0; r < n_rhs; r++) + { + csi addr = j * n_rhs + r; + x [addr] /= Lx [Lp [j]] ; + } + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + #pragma omp simd + for (r = 0; r < n_rhs; r++) + { + csi addr = j * n_rhs + r; + x [Li [p] * n_rhs + r] -= Lx [p] * x [addr] ; + //printf("XPU: %f", x [Li [p]]); + } + } + } + + return (1) ; +} + +csi cs_lsolve_mrhs_par1 (const cs *L, double *x, csi n_rhs) +{ + csi r, p, j, n, *Lp, *Li ; + double *Lx ; + if (!CS_CSC (L) || !x) return (0) ; /* check inputs */ + n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ; + + #pragma omp parallel for simd + for (r = 0; r < n_rhs; r++) + { + for (j = 0 ; j < n ; j++) + { + // for (r = 0; r < n_rhs; r++) + // { + csi addr = j * n_rhs + r; + x [addr] /= Lx [Lp [j]] ; + // } + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + // for (r = 0; r < n_rhs; r++) + // { + csi addr = j * n_rhs + r; + x [Li [p] * n_rhs + r] -= Lx [p] * x [addr] ; + //printf("XPU: %f", x [Li [p]]); + // } + } + } + } + + return (1) ; +} + csi cs_lsolve_mrhs (const cs *L, double *x, csi n_rhs) { csi r, p, j, n, *Lp, *Li ; @@ -28,6 +93,7 @@ csi cs_lsolve_mrhs (const cs *L, double *x, csi n_rhs) if (!CS_CSC (L) || !x) return (0) ; /* check inputs */ n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ; + for (j = 0 ; j < n ; j++) { for (r = 0; r < n_rhs; r++) diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index 966045e..0367765 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ 
b/CSparse/Source/cs_lsolve_gpu.cu @@ -209,7 +209,7 @@ extern "C" { if (threads > n_rhs) threads = n_rhs; if (threads > MAX_THREADS) threads = MAX_THREADS; - printf("Running kernel ** cs_lsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); + printf("\nRunning kernel 'cs_lsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -223,7 +223,7 @@ extern "C" { float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); + printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); } } @@ -236,7 +236,7 @@ extern "C" { if (threads > n_rhs) threads = n_rhs; if (threads > MAX_THREADS) threads = MAX_THREADS; - printf("Running kernel ** cs_lsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); + printf("\nRunning kernel 'cs_lsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -250,7 +250,7 @@ extern "C" { float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); + printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); } } @@ -263,7 +263,7 @@ extern "C" { if (threads > n_rhs) threads = n_rhs; if (threads > MAX_THREADS) threads = MAX_THREADS; - printf("Running kernel ** cs_ltsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); + printf("\nRunning kernel 'cs_ltsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -277,7 +277,7 @@ extern "C" { float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); + printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); } } @@ -290,7 +290,7 @@ extern "C" { if (threads > n_rhs) threads = n_rhs; if (threads > MAX_THREADS) threads = MAX_THREADS; - printf("Running kernel ** cs_ltsolve_gpu_trans ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); + printf("\nRunning kernel 'cs_ltsolve_gpu_trans' with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads)); cudaEvent_t start, stop; cudaEventCreate(&start); @@ -304,7 +304,7 @@ extern "C" { float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); + printf(" - GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds); } } -- GitLab
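
Reviewer note (not part of the patch): the changes in cs_cholsol.c and cs_lsolve.c hinge on two things that are easy to miss when reading the diff — the right-hand sides are stored interleaved as X[j*n_rhs + r] so that consecutive threads touch consecutive memory, and the number of CUDA blocks is derived from how many local Schur complement (LSC) working sets fit in GPU memory. Below is a minimal, self-contained CPU sketch of both. The helper names lsolve_mrhs, ltsolve_mrhs, and lsc_blocks_that_fit, and the 2x2 example in main, are illustrative only (they are not CSparse API); the loop structure mirrors cs_lsolve_mrhs / cs_ltsolve_mrhs from the patch and assumes the usual CSparse convention that the diagonal entry is stored first in each column of L.

#include <stdio.h>

/* X = L \ X for n_rhs right-hand sides stored interleaved: X[j*n_rhs + r].
 * Lp/Li/Lx is the CSC lower-triangular factor, diagonal first in each column. */
static void lsolve_mrhs(int n, const int *Lp, const int *Li, const double *Lx,
                        double *X, int n_rhs)
{
    for (int j = 0; j < n; j++)
    {
        for (int r = 0; r < n_rhs; r++)               /* divide by diagonal  */
            X[j * n_rhs + r] /= Lx[Lp[j]];
        for (int p = Lp[j] + 1; p < Lp[j + 1]; p++)    /* update rows below j */
            for (int r = 0; r < n_rhs; r++)
                X[Li[p] * n_rhs + r] -= Lx[p] * X[j * n_rhs + r];
    }
}

/* X = L' \ X, the transposed (backward) solve on the same interleaved layout. */
static void ltsolve_mrhs(int n, const int *Lp, const int *Li, const double *Lx,
                         double *X, int n_rhs)
{
    for (int j = n - 1; j >= 0; j--)
    {
        for (int p = Lp[j] + 1; p < Lp[j + 1]; p++)    /* gather from rows below j */
            for (int r = 0; r < n_rhs; r++)
                X[j * n_rhs + r] -= Lx[p] * X[Li[p] * n_rhs + r];
        for (int r = 0; r < n_rhs; r++)                /* divide by diagonal  */
            X[j * n_rhs + r] /= Lx[Lp[j]];
    }
}

/* Rough count of LSC working sets that fit on the GPU: one copy of Lp/Li/Lx
 * plus an n x n_rhs RHS block per LSC, with 10% headroom, as in the patch. */
static int lsc_blocks_that_fit(int n, int nzmax, int n_rhs, double gpu_mem_GB)
{
    double bytes = (double)(n + 1) * sizeof(int)        /* Lp  */
                 + (double)nzmax   * sizeof(int)        /* Li  */
                 + (double)nzmax   * sizeof(double)     /* Lx  */
                 + (double)n * n_rhs * sizeof(double);  /* RHS block */
    double per_lsc_MB = bytes / 1024.0 / 1024.0;
    return (int)(0.9 * (gpu_mem_GB * 1024.0 / per_lsc_MB));
}

/* Tiny usage example: L = [[2,0],[1,1]] in CSC, so A = L*L' = [[4,2],[2,2]].
 * Solving A*x = [4,3] should give x = [0.5, 1] for both interleaved RHS. */
int main(void)
{
    int    Lp[] = {0, 2, 3};
    int    Li[] = {0, 1, 1};
    double Lx[] = {2.0, 1.0, 1.0};
    double X[]  = {4.0, 4.0, 3.0, 3.0};   /* two identical RHS, interleaved */

    lsolve_mrhs (2, Lp, Li, Lx, X, 2);
    ltsolve_mrhs(2, Lp, Li, Lx, X, 2);
    printf("x = [%g, %g] (both RHS)\n", X[0], X[2]);
    printf("LSCs that fit in 32 GB: %d\n", lsc_blocks_that_fit(2, 3, 2, 32.0));
    return 0;
}

The same division of labour is what the GPU kernels exploit: each CUDA block owns one LSC's copy of the factor and RHS block (the h_array_* / d_array_* pointer arrays), and the r-loop over right-hand sides maps onto threads, which is why the interleaved X[j*n_rhs + r] layout gives coalesced accesses.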