Skip to content
Snippets Groups Projects
Commit 39f726ca authored by Lubomir Riha's avatar Lubomir Riha
Browse files

Working version - optimal kernel doesnot give correct results

parent 0bd977f8
No related branches found
No related tags found
No related merge requests found
...@@ -11,17 +11,17 @@ all: lib cs_demo1 cs_demo2 cs_demo3 ...@@ -11,17 +11,17 @@ all: lib cs_demo1 cs_demo2 cs_demo3
# - ./cs_demo2 < ../Matrix/10_E.txt # - ./cs_demo2 < ../Matrix/10_E.txt
# - ./cs_demo2 < ../Matrix/10_Etrans # - ./cs_demo2 < ../Matrix/10_Etrans
# - ./cs_demo2 < ../Matrix/10_Ecol.txt # - ./cs_demo2 < ../Matrix/10_Ecol.txt
# - ./cs_demo2 < ../Matrix/5_E.txt # - ./cs_demo2 < ../Matrix/5_E.txt
- ./cs_demo2 < ../Matrix/10_E_orig.txt - ./cs_demo2 < ../Matrix/10_E_orig.txt
# - ./cs_demo2 < ../Matrix/15_E_orig.txt # - ./cs_demo2 < ../Matrix/15_E_orig.txt
# - ./cs_demo2 < ../Matrix/20_E.txt # - ./cs_demo2 < ../Matrix/20_E.txt
# - ./cs_demo2 < ../Matrix/t1 # - ./cs_demo2 < ../Matrix/t1
# - ./cs_demo2 < ../Matrix/FEM-2S # - ./cs_demo2 < ../Matrix/FEM-2S
# - ./cs_demo2 < ../Matrix/bcsstk26-v2 # - ./cs_demo2 < ../Matrix/bcsstk26-v2
# - ./cs_demo2 < ../Matrix/bcsstk25 # - ./cs_demo2 < ../Matrix/bcsstk25
# - ./cs_demo2 < ../Matrix/bcsstk17 # - ./cs_demo2 < ../Matrix/bcsstk17
# - ./cs_demo2 < ../Matrix/ship_001 # - ./cs_demo2 < ../Matrix/ship_001
# - ./cs_demo2 < ../Matrix/bcsstk01 # - ./cs_demo2 < ../Matrix/bcsstk01-v2
# - ./cs_demo2 < ../Matrix/crystm02 # - ./cs_demo2 < ../Matrix/crystm02
# - ./cs_demo2 < ../Matrix/bcsstk26-v2 # - ./cs_demo2 < ../Matrix/bcsstk26-v2
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# CSparse/Lib. It does not install it for system-wide usage. # CSparse/Lib. It does not install it for system-wide usage.
LIBRARY = libcsparse LIBRARY = libcsparse
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) #-O
CC = nvcc -dc -g -O3 # -G -O0 --cudart shared #-DDEBUG CC = nvcc -dc -g -O3 # -G -O0 --cudart shared #-DDEBUG
I = -I../Include I = -I../Include
RANLIB = ranlib RANLIB = ranlib
......
...@@ -56,12 +56,6 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -56,12 +56,6 @@ csi cs_cholsol (csi order, const cs *A, double *b)
int blocks = total_num_of_LSC_perGPU; int blocks = total_num_of_LSC_perGPU;
// blocks = 10;
// n_rhs = 30;
printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size); printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs); printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
printf(" - LSC size (symm.) = %f MB \n", LSCsize); printf(" - LSC size (symm.) = %f MB \n", LSCsize);
...@@ -85,6 +79,13 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -85,6 +79,13 @@ csi cs_cholsol (csi order, const cs *A, double *b)
blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU); blocks = (int)(0.9 * max_num_of_LSC_processed_per_GPU);
// *** For debugging
blocks = 3;
n_rhs = 2;
// END
printf("Input data memory requiroments to calculate LSC on GPU\n"); printf("Input data memory requiroments to calculate LSC on GPU\n");
printf(" - Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0); printf(" - Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0);
printf(" - Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); printf(" - Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0);
...@@ -135,6 +136,8 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -135,6 +136,8 @@ csi cs_cholsol (csi order, const cs *A, double *b)
int GPU_mem = 0; int GPU_mem = 0;
int num_of_arrays = blocks; int num_of_arrays = blocks;
// int num_of_arrays = 1 ; //blocks;
// create intermediate host array for storage of device row-pointers // create intermediate host array for storage of device row-pointers
int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*)); int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*));
...@@ -206,61 +209,34 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -206,61 +209,34 @@ csi cs_cholsol (csi order, const cs *A, double *b)
printf("================================================================================================\n"); printf("================================================================================================\n");
blocks = 1; blocks = 1;
int j = 0;
for(j = 0 ; j < blocks ; j++){
cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
}
cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
printf("------------------------------------------------------------------------------------------------\n"); printf("------------------------------------------------------------------------------------------------\n");
for (i = 80; i < num_of_arrays; i = i * 2) { for (i = 80; i < num_of_arrays; i = i * 2) {
blocks = i; blocks = i;
int j = 0;
for(j = 0 ; j < blocks ; j++){
cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
}
cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
printf("------------------------------------------------------------------------------------------------\n"); printf("------------------------------------------------------------------------------------------------\n");
} }
blocks = num_of_arrays; blocks = num_of_arrays;
for(j = 0 ; j < blocks ; j++){
cudaMemcpy(h_array_x [j] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
}
cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
printf("================================================================================================\n"); printf("================================================================================================\n");
// printf("================================================================================================\n");
// printf("Simple kernel with single input matrix and RHS - all blocks work on identical data\n");
// int *d_Lp;
// int *d_Li;
// double *d_Lx;
// double *d_x;
// cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) );
// cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) );
// cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
// cudaMalloc( &d_x , n * n_rhs * sizeof(double) );
// cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
// cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
// cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
// cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ;
// blocks = num_of_arrays;
// cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// printf("------------------------------------------------------------------------------------------------\n");
// printf(" - For max num of LSC\n");
// blocks = total_num_of_LSC_perGPU;
// cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// printf("================================================================================================\n");
// int *d_Lp = &d_array_Lp[0];
// int *d_Li = &d_array_Li[0];
// double *d_Lx = &d_array_Lx[0];
// double *d_x = &d_array_x [0];
// cs_lsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
// cs_ltsolve_gpu_trans (n, d_array_Lp[0], d_array_Li[0], d_array_Lx[0], d_array_x [0], n_rhs, n_rhs, blocks);
// cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
printf("\n"); printf("\n");
...@@ -273,70 +249,75 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -273,70 +249,75 @@ csi cs_cholsol (csi order, const cs *A, double *b)
// CPU code verification - lsolve for multiple RSH // CPU code verification - lsolve for multiple RSH
t = tic () ; t = tic () ;
cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
printf ("CPU code 'cs_lsolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ; printf ("CPU code 'cs_lsolveolve_mrhs' sequential time for ONE LSC : %8.2f ms \n", 1000.0 * toc (t)) ;
t = tic () ; t = tic () ;
cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ; printf ("CPU code 'cs_ltsolve_mrhs' sequential time for ONE LSC: %8.2f ms \n", 1000.0 * toc (t)) ;
int l; int l;
printf("\n"); printf("\n");
for(l = 0 ; l < num_of_arrays ; l++) { for(l = 0 ; l < num_of_arrays ; l++) {
double *x_gpu = x_gpu_array[l]; double *x_gpu = x_gpu_array[l];
// // *** Debug - check with per element output // *** Debug - check with per element output
// #ifdef DEBUG // #define DEBUG
// { #ifdef DEBUG
// int i; {
// int r; int i;
// cs_lsolve (N->L, x) ; /* x = L\x */ int r;
// cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_lsolve (N->L, x) ; // x = L\x
// for (i=0; i<n; i++) { cs_ltsolve (N->L, x) ; // x = L'\x
// printf("cpu: %f\t gpu:\t", x[i]); for (i=0; i<n; i++) {
// for (r = 0; r < n_rhs; r++) { printf("cpu: %f\t gpu:\t", x[i]);
// if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 ) for (r = 0; r < n_rhs; r++) {
// printf("OK\t"); if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
// else printf("OK ");
// printf("Er\t"); else
// // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); // printf("Er ");
// printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] ); // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
// } // printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
// printf("\n"); printf("%f\t", x_gpu[i*n_rhs + r] );
// }
// printf("\n"); }
// } printf("\n");
// #else }
printf("\n");
}
#else
// { {
// int i; int i;
// int r; int r;
// int errors = 0; int errors = 0;
// for (i=0; i<n; i++) { for (i=0; i<n; i++) {
// for (r = 0; r < n_rhs; r++) { for (r = 0; r < n_rhs; r++) {
// if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) { if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
// errors++; errors++;
// } }
// } }
// } }
// if (errors == 0) if (errors == 0)
// printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l); printf("LSC[%d] - No errors - identical result for CPU and GPU.\n", l);
// else else
// printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l); printf("LSC[%d] - Error - different result between CPU and GPU results. \n", l);
// } }
// #endif #endif
} }
#else // define FULL_MEM
#else
// *** Vesion with // *** Vesion with
// Copy Chol. factor and RHSs from CPU to GPU // Copy Chol. factor and RHSs from CPU to GPU
int *d_Lp; int *d_Lp;
int *d_Li; int *d_Li;
double *d_Lx; double *d_Lx;
...@@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -364,8 +345,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
// *** Debug - check with per element output // *** Debug - check with per element output
#ifdef DEBUG # define DEBUG
#ifdef DEBUG
{ {
int i; int i;
int r; int r;
...@@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b) ...@@ -384,9 +367,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
} }
printf("\n"); printf("\n");
} }
#endif #endif
#endif // define FULL_MEM
#endif
// *** END - GPU code // *** END - GPU code
......
...@@ -79,62 +79,94 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub ...@@ -79,62 +79,94 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub
__global__ __global__
void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs) void cs_lsolve_gpu_kernel_trans_multi_opt (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
{ {
const int * __restrict__ const Lp = Lp_a[blockIdx.x]; const int * __restrict__ const Lp = Lp_a[blockIdx.x];
const int * __restrict__ const Li = Li_a[blockIdx.x]; const int * __restrict__ const Li = Li_a[blockIdx.x];
const double * __restrict__ const Lx = Lx_a[blockIdx.x]; const double * __restrict__ const Lx = Lx_a[blockIdx.x];
double *x = x_a[blockIdx.x]; double *x = x_a[blockIdx.x];
int tid = threadIdx.x; // + blockIdx.x * blockDim.x; int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
int g_id = 0; int g_id = 0;
int i, p, j;
int addr;
int rhs_offset;
int Li_p;
double Lx_p;
int inc, off;
__shared__ int Lp_sh [4096];
__shared__ int Li_sh [1024];
__shared__ double Lx_sh [1024];
for (i = 0; i < (n+2); i = i + blockDim.x) {
if ( (i + tid) < (n+2) ) {
Lp_sh[i + tid] = Lp[i + tid];
// printf("Lp_sh[%d] = %d \n", i+tid, Lp_sh[i + tid]);
}
}
__syncthreads();
for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x) for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x)
{ {
// printf("g_id = %d \n", g_id); // printf("g_id = %d \n", g_id);
if ( g_id + tid < n_rhs ) { if ( g_id + tid < n_rhs ) {
int p, j; rhs_offset = g_id + tid;
int rhs_offset = g_id + tid;
for (j = 0 ; j < n ; j++) for (j = 0 ; j < n ; j++)
{ {
int addr = j * n_rhs + rhs_offset; inc = (Lp_sh [j+1] - Lp_sh [j]);
// off = Lp_sh [j];
// if (tid == 1 ) printf("Inc: %d off: %d \n", inc, off);
__syncthreads();
if ( tid < inc ) {
Li_sh [tid] = Li [Lp_sh [j] + tid];
Lx_sh [tid] = Lx [Lp_sh [j] + tid];
// printf("Li_sh[%d] = %d \n", tid, Li_sh[tid]);
// printf("Lx_sh[%d] = %f \n", tid, Lx_sh[tid]);
}
x [addr] = x [addr] / Lx [Lp [j]] ; __syncthreads();
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) // if (tid == 0 )printf("Lx[%d] = %f and Lx_sh[%d] = %f \n", off, Lx [off], 0, Lx_sh[0]);
addr = j * n_rhs + rhs_offset;
x [addr] = x [addr] / Lx_sh [0] ;
// for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
for (p = 1 ; p < inc ; p++)
{ {
Li_p = Li_sh [p];
int Li_p = Li [p]; Lx_p = Lx_sh [p];
double Lx_p = Lx [p];
x [Li_p * n_rhs + rhs_offset] = x [Li_p * n_rhs + rhs_offset] - Lx_p * x [addr] ; x [Li_p * n_rhs + rhs_offset] = x [Li_p * n_rhs + rhs_offset] - Lx_p * x [addr] ;
// x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ;
// printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
} }
}
} __syncthreads();
} }
} }
} }
__global__ __global__
void cs_lsolve_gpu_kernel_trans_multi_good (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs) void cs_lsolve_gpu_kernel_trans_multi (int n, const int ** __restrict__ const Lp_a, const int ** __restrict__ const Li_a, const double ** __restrict__ const Lx_a, double **x_a, int n_rhs)
{ {
const int * __restrict__ const Lp = Lp_a[blockIdx.x]; const int * __restrict__ const Lp = Lp_a[blockIdx.x];
const int * __restrict__ const Li = Li_a[blockIdx.x]; const int * __restrict__ const Li = Li_a[blockIdx.x];
const double * __restrict__ const Lx = Lx_a[blockIdx.x]; const double * __restrict__ const Lx = Lx_a[blockIdx.x];
double *x = x_a[blockIdx.x]; double *x = x_a[blockIdx.x];
int tid = threadIdx.x; // + blockIdx.x * blockDim.x; int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment