From 39f726caa9e5d329b88acfaafede2a16f68b6c32 Mon Sep 17 00:00:00 2001
From: lriha <lubomir.riha@vsb.cz>
Date: Tue, 7 May 2019 15:10:52 +0200
Subject: [PATCH] Working version - optimal kernel does not give correct
 results

---
 CSparse/Demo/Makefile           |   8 +-
 CSparse/Lib/Makefile            |   2 +-
 CSparse/Source/cs_cholsol.c     | 166 +++++++++++++++-----------------
 CSparse/Source/cs_lsolve_gpu.cu |  76 ++++++++++-----
 4 files changed, 134 insertions(+), 118 deletions(-)

diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 6ec3f2c..be1842b 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -11,17 +11,17 @@ all: lib cs_demo1 cs_demo2 cs_demo3
 #	- ./cs_demo2 < ../Matrix/10_E.txt
 #	- ./cs_demo2 < ../Matrix/10_Etrans
 #	- ./cs_demo2 < ../Matrix/10_Ecol.txt
-#	- ./cs_demo2 < ../Matrix/5_E.txt
-	- ./cs_demo2 < ../Matrix/10_E_orig.txt
+#	- ./cs_demo2 < ../Matrix/5_E.txt
+	- ./cs_demo2 < ../Matrix/10_E_orig.txt
 #	- ./cs_demo2 < ../Matrix/15_E_orig.txt
 #	- ./cs_demo2 < ../Matrix/20_E.txt
-#	- ./cs_demo2 < ../Matrix/t1
+#	- ./cs_demo2 < ../Matrix/t1
 #	- ./cs_demo2 < ../Matrix/FEM-2S
 #	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 #	- ./cs_demo2 < ../Matrix/bcsstk25
 #	- ./cs_demo2 < ../Matrix/bcsstk17
 #	- ./cs_demo2 < ../Matrix/ship_001
-#	- ./cs_demo2 < ../Matrix/bcsstk01
+#	- ./cs_demo2 < ../Matrix/bcsstk01-v2
 #	- ./cs_demo2 < ../Matrix/crystm02
 #	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index 07d839e..fa1df87 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -15,7 +15,7 @@
 # CSparse/Lib.  It does not install it for system-wide usage.
 
 LIBRARY = libcsparse
-CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
+CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) #-O
 CC = nvcc -dc -g -O3 # -G -O0 --cudart shared #-DDEBUG
 I = -I../Include
 RANLIB = ranlib
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index e9626e6..3bc37ff 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -56,12 +56,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
     int blocks = total_num_of_LSC_perGPU;
 
-
-
-    // blocks = 10;
-    // n_rhs  = 30;
-
-
     printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
     printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
     printf(" - LSC size (symm.) = %f MB \n", LSCsize);
@@ -85,6 +79,13 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
     blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU);
 
+
+    // *** For debugging
+    blocks = 3;
+    n_rhs  = 2;
+    // END
+
+
     printf("Input data memory requirements to calculate LSC on GPU\n");
     printf(" - Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0);
     printf(" - Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0);
@@ -135,6 +136,8 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int GPU_mem = 0;
 
     int num_of_arrays = blocks;
+    // int num_of_arrays = 1 ; //blocks;
+
     // create intermediate host array for storage of device row-pointers
     int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*));
 
@@ -206,61 +209,34 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     printf("================================================================================================\n");
 
     blocks = 1;
+    int j = 0;
+    for(j = 0 ; j < blocks ; j++){
+        cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+    }
     cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
     cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
     printf("------------------------------------------------------------------------------------------------\n");
 
     for (i = 80; i < num_of_arrays; i = i * 2) {
         blocks = i;
+        int j = 0;
+        for(j = 0 ; j < blocks ; j++){
+            cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+        }
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         printf("------------------------------------------------------------------------------------------------\n");
     }
 
     blocks = num_of_arrays;
+    for(j = 0 ; j < blocks ; j++){
+        cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
+    }
     cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
     cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
     printf("================================================================================================\n");
 
-    // printf("================================================================================================\n");
-    // printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n");
-    // int *d_Lp;
-    // int *d_Li;
-    // double *d_Lx;
-    // double *d_x;
-
-    // cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) );
-    // cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) );
-    // cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
-    // cudaMalloc( &d_x , n * n_rhs * sizeof(double) );
-
-    // cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
-    // cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
-    // cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
-    // cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ;
-
-    // blocks = num_of_arrays;
-    // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-    // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-
-    // printf("------------------------------------------------------------------------------------------------\n");
-    // printf(" - For max num of LSC\n");
-
-    // blocks = total_num_of_LSC_perGPU;
-    // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-    // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-    // printf("================================================================================================\n");
-
-    // int *d_Lp    = &d_array_Lp[0];
-    // int *d_Li    = &d_array_Li[0];
-    // double *d_Lx = &d_array_Lx[0];
-    // double *d_x  = &d_array_x [0];
-
-    // cs_lsolve_gpu_trans  (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-    // cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
-
-    // cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-    // cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
 
     printf("\n");
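
[Supplementary note, not part of the original patch] The cudaMemcpy loops added in the hunk above re-upload the reference right-hand sides into every block's device buffer before each timed run, so every launch starts from identical input; h_array_x is a host-side table whose entries are device pointers, which is why it can be the destination of a cudaMemcpyHostToDevice. A minimal sketch of this double-indirection setup follows; the helper name make_rhs_table and its signature are illustrative assumptions, only the buffer names mirror the patch:

    #include <stdlib.h>
    #include <cuda_runtime.h>

    // Builds 'blocks' device RHS buffers, each initialized from the same host
    // array rhs_t (n*n_rhs doubles), and returns a device-resident table of
    // the per-block pointers.
    static double **make_rhs_table(int blocks, int n, int n_rhs,
                                   const double *rhs_t, double ***h_table_out)
    {
        double **h_array_x = (double**) malloc(blocks * sizeof(double*));
        for (int j = 0; j < blocks; j++) {
            cudaMalloc(&h_array_x[j], n * n_rhs * sizeof(double));
            cudaMemcpy(h_array_x[j], rhs_t, n * n_rhs * sizeof(double),
                       cudaMemcpyHostToDevice);          // identical RHS per block
        }
        double **d_array_x;                               // device copy of the table
        cudaMalloc(&d_array_x, blocks * sizeof(double*));
        cudaMemcpy(d_array_x, h_array_x, blocks * sizeof(double*),
                   cudaMemcpyHostToDevice);
        *h_table_out = h_array_x;    // kept so the host can re-upload or free later
        return d_array_x;
    }

A kernel launched with <<<blocks, threads>>> then selects its own buffer with double *x = x_a[blockIdx.x], exactly as the kernels in cs_lsolve_gpu.cu below do.
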
@@ -273,70 +249,75 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     // CPU code verification - lsolve for multiple RHS
     t = tic () ;
     cs_lsolve_mrhs (N->L, rhs_t, n_rhs);        /* X = L\X */
     printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
 
     t = tic () ;
     cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);       /* X = L\X */
     printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ;
 
+
+
+    int l;
     printf("\n");
     for(l = 0 ; l < num_of_arrays ; l++)
     {
         double *x_gpu = x_gpu_array[l];
 
-// // *** Debug - check with per element output
-// #ifdef DEBUG
-// {
-//     int i;
-//     int r;
-//     cs_lsolve (N->L, x) ;                       /* x = L\x */
-//     cs_ltsolve (N->L, x) ;                      /* x = L'\x */
-//     for (i=0; i<n; i++) {
-//         printf("cpu: %f\t gpu:\t", x[i]);
-//         for (r = 0; r < n_rhs; r++) {
-//             if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
-//                 printf("OK\t");
-//             else
-//                 printf("Er\t");
-//             // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
-//             printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
-//         }
-//         printf("\n");
-//     }
-//     printf("\n");
-// }
-// #else
+        // *** Debug - check with per element output
+        // #define DEBUG
+        #ifdef DEBUG
+        {
+            int i;
+            int r;
+            cs_lsolve (N->L, x) ;                   // x = L\x
+            cs_ltsolve (N->L, x) ;                  // x = L'\x
+            for (i=0; i<n; i++) {
+                printf("cpu: %f\t gpu:\t", x[i]);
+                for (r = 0; r < n_rhs; r++) {
+                    if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+                        printf("OK ");
+                    else
+                        // printf("Er ");
+                        // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                        // printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+                        printf("%f\t", x_gpu[i*n_rhs + r] );
+
+                }
+                printf("\n");
+            }
+            printf("\n");
+        }
+        #else
 
-// {
-//     int i;
-//     int r;
-//     int errors = 0;
-//     for (i=0; i<n; i++) {
-//         for (r = 0; r < n_rhs; r++) {
-//             if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
-//                 errors++;
-//             }
-//         }
-//     }
-//     if (errors == 0)
-//         printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
-//     else
-//         printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
+        {
+            int i;
+            int r;
+            int errors = 0;
+            for (i=0; i<n; i++) {
+                for (r = 0; r < n_rhs; r++) {
+                    if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+                        errors++;
+                    }
+                }
+            }
+            if (errors == 0)
+                printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
+            else
+                printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
 
-// }
+        }
 
-// #endif
+        #endif
     }
 
-
-
-
-#else
+
+#else // define FULL_MEM
 
     // *** Version with
     // Copy Chol. factor and RHSs from CPU to GPU
+
     int *d_Lp;
     int *d_Li;
     double *d_Lx;
@@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);       /* X = L\X */
     cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);       /* X = L\X */
 
+
     // *** Debug - check with per element output
-#ifdef DEBUG
+    # define DEBUG
+    #ifdef DEBUG
     {
         int i;
         int r;
@@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         printf("\n");
     }
-#endif
+    #endif
+
+#endif // define FULL_MEM
 
-
-#endif
 
     // *** END - GPU code
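
[Supplementary note, not part of the original patch] cs_lsolve_mrhs and cs_ltsolve_mrhs serve as the CPU reference above but are not shown in this patch. For context, here is a minimal sketch of the forward-solve semantics the GPU kernels mirror, assuming CSparse's compressed-sparse-column factor (Lp, Li, Lx) with the diagonal entry first in each column, and right-hand sides stored transposed so that entry (row i, rhs r) lives at x[i*n_rhs + r]. This is a plausible reconstruction for illustration, not the actual CSparse source:

    /* Minimal sketch: forward solve x = L\x for n_rhs right-hand sides
       stored transposed (entry (i, r) at x[i*n_rhs + r]).  L is in
       compressed-sparse-column form; CSparse guarantees the diagonal
       entry comes first in each column of its Cholesky factor. */
    static void lsolve_mrhs_sketch (int n, const int *Lp, const int *Li,
                                    const double *Lx, double *x, int n_rhs)
    {
        for (int j = 0; j < n; j++)                     /* each column of L */
        {
            for (int r = 0; r < n_rhs; r++)             /* divide row j by   */
                x[j * n_rhs + r] /= Lx[Lp[j]];          /* the diagonal      */
            for (int p = Lp[j] + 1; p < Lp[j + 1]; p++) /* subtract update   */
                for (int r = 0; r < n_rhs; r++)         /* from rows below j */
                    x[Li[p] * n_rhs + r] -= Lx[p] * x[j * n_rhs + r];
        }
    }

The backward solve (cs_ltsolve_mrhs) is the analogous loop over columns in reverse order, applying L transposed.
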
\n", l); -// } + } -// #endif + #endif } - - - -#else + +#else // define FULL_MEM // *** Vesion with // Copy Chol. factor and RHSs from CPU to GPU + int *d_Lp; int *d_Li; double *d_Lx; @@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b) cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ + // *** Debug - check with per element output -#ifdef DEBUG + # define DEBUG + #ifdef DEBUG { int i; int r; @@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b) } printf("\n"); } -#endif + #endif + +#endif // define FULL_MEM -#endif // *** END - GPU code diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index 3f2be67..5ff38ce 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ b/CSparse/Source/cs_lsolve_gpu.cu @@ -79,62 +79,94 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub __global__ -void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs) +void cs_lsolve_gpu_kernel_trans_multi_opt (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs) { const int * __restrict__ const Lp = Lp_a[blockIdx.x]; const int * __restrict__ const Li = Li_a[blockIdx.x]; const double * __restrict__ const Lx = Lx_a[blockIdx.x]; - double *x = x_a[blockIdx.x]; - + double *x = x_a[blockIdx.x]; int tid = threadIdx.x; // + blockIdx.x * blockDim.x; int g_id = 0; + int i, p, j; + int addr; + int rhs_offset; + int Li_p; + double Lx_p; + int inc, off; + + + __shared__ int Lp_sh [4096]; + __shared__ int Li_sh [1024]; + __shared__ double Lx_sh [1024]; + + for (i = 0; i < (n+2); i = i + blockDim.x) { + if ( (i + tid) < (n+2) ) { + Lp_sh[i + tid] = Lp[i + tid]; + // printf("Lp_sh[%d] = %d \n", i+tid, Lp_sh[i + tid]); + } + } + + __syncthreads(); + + for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x) { // printf("g_id = %d \n", g_id); if ( g_id + tid < n_rhs ) { - int p, j; - - int rhs_offset = g_id + tid; + rhs_offset = g_id + tid; for (j = 0 ; j < n ; j++) { - int addr = j * n_rhs + rhs_offset; + inc = (Lp_sh [j+1] - Lp_sh [j]); + // off = Lp_sh [j]; + // if (tid == 1 ) printf("Inc: %d off: %d \n", inc, off); + __syncthreads(); + + if ( tid < inc ) { + Li_sh [tid] = Li [Lp_sh [j] + tid]; + Lx_sh [tid] = Lx [Lp_sh [j] + tid]; + // printf("Li_sh[%d] = %d \n", tid, Li_sh[tid]); + // printf("Lx_sh[%d] = %f \n", tid, Lx_sh[tid]); + } - x [addr] = x [addr] / Lx [Lp [j]] ; - - for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + __syncthreads(); + + // if (tid == 0 )printf("Lx[%d] = %f and Lx_sh[%d] = %f \n", off, Lx [off], 0, Lx_sh[0]); + + addr = j * n_rhs + rhs_offset; + x [addr] = x [addr] / Lx_sh [0] ; + + // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + for (p = 1 ; p < inc ; p++) { - - int Li_p = Li [p]; - double Lx_p = Lx [p]; + Li_p = Li_sh [p]; + Lx_p = Lx_sh [p]; x [Li_p * n_rhs + rhs_offset] = x [Li_p * n_rhs + rhs_offset] - Lx_p * x [addr] ; + + // x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ; // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); - } - - } - + } + __syncthreads(); } - } - } - + __global__ -void cs_lsolve_gpu_kernel_trans_multi_good (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ 
 
 __global__
-void cs_lsolve_gpu_kernel_trans_multi_good (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
+void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
 {
     const int * __restrict__ const Lp    = Lp_a[blockIdx.x];
     const int * __restrict__ const Li    = Li_a[blockIdx.x];
     const double * __restrict__ const Lx = Lx_a[blockIdx.x];
-    double *x = x_a[blockIdx.x];
+    double *x                            = x_a[blockIdx.x];
 
     int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
-- 
GitLab
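
[Supplementary note, not part of the original patch] None of the cudaMalloc/cudaMemcpy calls in this patch check their return status, which makes a silently failed allocation or copy hard to distinguish from a genuinely wrong kernel result. A conventional error-checking wrapper, assumed here rather than taken from this repository, is a cheap safeguard:

    #include <stdio.h>
    #include <cuda_runtime.h>

    /* Wrap every CUDA runtime call; reports the first failing call and its
       location instead of letting the error surface later as bad data. */
    #define CUDA_CHECK(call)                                              \
        do {                                                              \
            cudaError_t err_ = (call);                                    \
            if (err_ != cudaSuccess) {                                    \
                fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                        cudaGetErrorString(err_), __FILE__, __LINE__);    \
            }                                                             \
        } while (0)

    /* Example use with a copy from this patch:
       CUDA_CHECK(cudaMemcpy(h_array_x[j], rhs_t,
                             n * n_rhs * sizeof(double),
                             cudaMemcpyHostToDevice)); */
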