diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index 96103f7614286dd576cb12b47aa247d921a67d54..de981fd78ceb120871f9d43c0b09e03e03335010 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -29,6 +29,7 @@ csi cs_cholsol_cpu (csi order, const cs *A, double *b)
 }
 
+
 csi cs_cholsol (csi order, const cs *A, double *b)
 {
@@ -36,7 +37,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int n_rhs_ratio = 6;
     int n_rhs;
-    int blocks=600; // cca 30 GB
+    int blocks = 13;                        // was 600 (cca 30 GB)
 
     double *x ;
     css *S ;
@@ -49,6 +50,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
     x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
     ok = (S && N && x) ;
+
     if (ok)
     {
         cs_ipvec (S->pinv, b, x, n) ;       /* x = P*b */
@@ -79,6 +81,79 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         // END - Copy RHS vector to multiple columns
 
+        int num_of_arrays = blocks;         // one copy of L and of the RHS block per CUDA block
+
+        // intermediate host arrays holding the per-block device pointers
+        int    **h_array_Lp = (int **)    malloc(num_of_arrays * sizeof(int *));
+        int    **h_array_Li = (int **)    malloc(num_of_arrays * sizeof(int *));
+        double **h_array_Lx = (double **) malloc(num_of_arrays * sizeof(double *));
+        double **h_array_x  = (double **) malloc(num_of_arrays * sizeof(double *));
+
+        // top-level device pointer tables, indexed by blockIdx.x in the kernels
+        int    **d_array_Lp;
+        int    **d_array_Li;
+        double **d_array_Lx;
+        double **d_array_x ;
+
+        cudaMalloc((void **) &d_array_Lp, num_of_arrays * sizeof(int *));
+        cudaMalloc((void **) &d_array_Li, num_of_arrays * sizeof(int *));
+        cudaMalloc((void **) &d_array_Lx, num_of_arrays * sizeof(double *));
+        cudaMalloc((void **) &d_array_x , num_of_arrays * sizeof(double *));
+
+        size_t free_byte ;
+        size_t total_byte ;
+        cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+        double free_db  = (double) free_byte ;
+        double total_db = (double) total_byte ;
+        double used_db  = total_db - free_db ;
+
+        printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
+               used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+        // allocate each per-block device buffer, then copy the host data to it
+        int i;
+        for (i = 0 ; i < num_of_arrays ; i++)
+        {
+            cudaMalloc(&h_array_Lp[i], ((N->L->n)+1) * sizeof(int));
+            cudaMalloc(&h_array_Li[i], (N->L->nzmax) * sizeof(int));
+            cudaMalloc(&h_array_Lx[i], (N->L->nzmax) * sizeof(double));
+            cudaMalloc(&h_array_x [i], n * n_rhs     * sizeof(double));
+
+            printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int))    / 1024.0 / 1024.0);
+            printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int))    / 1024.0 / 1024.0);
+            printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0);
+            printf(" x size: %f MB\n", (double)(n * n_rhs     * sizeof(double)) / 1024.0 / 1024.0);
+            printf("LSCsize: %f MB\n", (double)(n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0);
+
+            cudaMemcpy(h_array_Lp[i], N->L->p, ((N->L->n)+1) * sizeof(int),    cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_Li[i], N->L->i, (N->L->nzmax) * sizeof(int),    cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_Lx[i], N->L->x, (N->L->nzmax) * sizeof(double), cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_x [i], rhs_t,   n * n_rhs     * sizeof(double), cudaMemcpyHostToDevice);
+
+            size_t free_byte ;
+            size_t total_byte ;
+            cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+            double free_db  = (double) free_byte ;
+            double total_db = (double) total_byte ;
+            double used_db  = total_db - free_db ;
+
+            printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
+                   used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+        }
+
+        // fix up the top-level device tables to point to the per-block device buffers
+        cudaMemcpy(d_array_Lp, h_array_Lp, num_of_arrays * sizeof(int *),    cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_Li, h_array_Li, num_of_arrays * sizeof(int *),    cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double *), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double *), cudaMemcpyHostToDevice);
+
+        cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+
+
         // Copy Chol. factor and RHSs from CPU to GPU
         int *d_Lp;
         int *d_Li;
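The block above is the standard array-of-device-pointers idiom: each `h_array_*[i]` receives its own `cudaMalloc`'d buffer holding a full copy of the factor (`Lp`, `Li`, `Lx`) and of the right-hand-side block, and the host arrays of those device pointers are then mirrored into device-resident tables (`d_array_*`) that the kernels index by `blockIdx.x`. None of the CUDA calls are checked, however, and with `blocks` copies of a multi-gigabyte working set a failed `cudaMalloc` would only surface as garbage results. A minimal sketch of the same setup with error checking — the `CUDA_CHECK` macro and `device_table` helper are illustrative, not part of CSparse:

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t e_ = (call);                                       \
            if (e_ != cudaSuccess) {                                       \
                fprintf (stderr, "CUDA error '%s' at %s:%d\n",             \
                         cudaGetErrorString (e_), __FILE__, __LINE__);     \
                exit (EXIT_FAILURE);                                       \
            }                                                              \
        } while (0)

    /* Allocate `count` device buffers of `bytes` each, fill every one with
     * the same host data `src`, and return a device-resident table of the
     * buffer pointers.  The host-side copy of the table is returned through
     * h_tab_out so the buffers can be freed later. */
    static void **device_table (int count, size_t bytes, const void *src,
                                void ***h_tab_out)
    {
        void **h_tab = (void **) malloc (count * sizeof (void *));
        void **d_tab = NULL;
        int i;
        for (i = 0; i < count; i++)
        {
            CUDA_CHECK (cudaMalloc (&h_tab[i], bytes));
            CUDA_CHECK (cudaMemcpy (h_tab[i], src, bytes,
                                    cudaMemcpyHostToDevice));
        }
        CUDA_CHECK (cudaMalloc ((void **) &d_tab, count * sizeof (void *)));
        CUDA_CHECK (cudaMemcpy (d_tab, h_tab, count * sizeof (void *),
                                cudaMemcpyHostToDevice));
        *h_tab_out = h_tab;
        return d_tab;
    }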
@@ -95,24 +170,138 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
         cudaMemcpy( d_x , rhs_t   , n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
-
-        // cudaEvent_t start, stop;
-        // cudaEventCreate(&start);
-        // cudaEventCreate(&stop);
-
-        // Run Solve on GPU
-        // cudaEventRecord(start);
         cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cudaEventRecord(stop);
+        // Transfer data back to CPU if needed
+        double *x_gpu;
+        x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
+        // cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
 
-        // cudaEventSynchronize(stop);
-        // float milliseconds = 0;
-        // cudaEventElapsedTime(&milliseconds, start, stop);
+        // CPU code verification - lsolve for multiple RHS
+        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs) ;      /* X = L\X  */
+        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs) ;      /* X = L'\X */
 
-        // printf("GPU solve time for %d RHSs is: %f n", n_rhs, milliseconds);
+        // int i;
+        // int r;
+        // int errors = 0;
+        // for (i=0; i<n; i++) {
+        //     for (r = 0; r < n_rhs; r++) {
+        //         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+        //             printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+        //             errors++;
+        //         }
+        //     }
+        // }
+        // printf("\n\n %d different elements between CPU and GPU. \n\n", errors);
+
+        // *** Debug - check with per-element output
+#ifdef DEBUG
+        int r;                              /* i is declared above */
+        cs_lsolve (N->L, x) ;               /* x = L\x  */
+        cs_ltsolve (N->L, x) ;              /* x = L'\x */
+        for (i=0; i<n; i++) {
+            printf("cpu: %f\t gpu:\t", x[i]);
+            for (r = 0; r < n_rhs; r++) {
+                if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+                    printf("OK\t");
+                else
+                    printf("Er\t");
+                // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+            }
+            printf("\n");
+        }
+        printf("\n");
+#else
+        cs_lsolve (N->L, x) ;               /* x = L\x  */
+        cs_ltsolve (N->L, x) ;              /* x = L'\x */
+#endif
+        // *** END - GPU code
+
+        cs_pvec (S->pinv, x, b, n) ;        /* b = P'*x */
+    }
+    cs_free (x) ;
+    cs_sfree (S) ;
+    cs_nfree (N) ;
+    return (ok) ;
+}
+
+
+csi cs_cholsol_single (csi order, const cs *A, double *b)
+{
+    printf("\n *** Running GPU version - kernel 1 with transposed RHS and coalesced memory access. \n");
+
+    int n_rhs_ratio = 6;
+    int n_rhs;
+    int blocks = 600;                       // cca 30 GB
+
+    double *x ;
+    css *S ;
+    csn *N ;
+    csi n, ok ;
+    if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
+    n = A->n ;
+    n_rhs = n / n_rhs_ratio;
+    S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
+    N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
+    x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
+    ok = (S && N && x) ;
+    if (ok)
+    {
+        cs_ipvec (S->pinv, b, x, n) ;       /* x = P*b */
+
+        // *** GPU code start
+        // - cs_lsolve (N->L, x) ;          /* x = L\x */
+
+        // Copy RHS vector to multiple columns
+        double *rhs_t;
+        rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;
+
+        int ri;
+        int ni;
+        for (ni = 0; ni < n; ni++) {
+#ifdef DEBUG
+            // printf("rhs[%d] %f - ", ni, x[ni]);
+#endif
+            for (ri = 0; ri < n_rhs; ri++) {
+                rhs_t[ni*n_rhs+ri] = x[ni];
+#ifdef DEBUG
+                // printf(" ri %d addr: %d %f - ", ri, ni*n_rhs+ri, rhs_t[ni*n_rhs+ri]);
+#endif
+            }
+#ifdef DEBUG
+            // printf("\n");
+#endif
+        }
+        // END - Copy RHS vector to multiple columns
+
+        // Copy Chol. factor and RHSs from CPU to GPU
+        int *d_Lp;
+        int *d_Li;
+        double *d_Lx;
+        double *d_x;
+
+        cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int)    );
+        cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int)    );
+        cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
+        cudaMalloc( &d_x , n * n_rhs     * sizeof(double) );
+
+        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_x , rhs_t   , n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
+
+        cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
 
         // Transfer data back to CPU if needed
         double *x_gpu;
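The CPU reference path above relies on `cs_lsolve_mrhs`/`cs_ltsolve_mrhs`, which are not part of this diff (nor of stock CSparse). Under the transposed layout used throughout — entry (row `j`, right-hand side `r`) stored at `x[j*n_rhs + r]`, so the `n_rhs` values of one matrix row are contiguous — a multi-RHS forward solve presumably looks like the following sketch, which mirrors the indexing of `cs_lsolve_gpu_kernel_trans_multi` on the host:

    #include "cs.h"

    /* Sketch: solve L*X = B for n_rhs right-hand sides stored interleaved,
     * X(j,r) at x [j*n_rhs + r]; same loop structure as CSparse cs_lsolve. */
    csi cs_lsolve_mrhs_sketch (const cs *L, double *x, csi n_rhs)
    {
        csi p, j, r, n, *Lp, *Li ;
        double *Lx ;
        if (!CS_CSC (L) || !x) return (0) ;
        n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
        for (j = 0 ; j < n ; j++)
        {
            for (r = 0 ; r < n_rhs ; r++)
                x [j*n_rhs + r] /= Lx [Lp [j]] ;      /* divide by diagonal */
            for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
                for (r = 0 ; r < n_rhs ; r++)         /* update rows below j */
                    x [Li [p]*n_rhs + r] -= Lx [p] * x [j*n_rhs + r] ;
        }
        return (1) ;
    }

The backward solve (`cs_ltsolve_mrhs`) would run `j` from `n-1` down to `0` and accumulate into row `j` instead, exactly as the GPU kernel `cs_ltsolve_gpu_kernel_trans_multi` does below.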
\n"); + + int n_rhs_ratio = 6; + int n_rhs; + int blocks=600; // cca 30 GB + + double *x ; + css *S ; + csn *N ; + csi n, ok ; + if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ + n = A->n ; + n_rhs = n / n_rhs_ratio; + S = cs_schol (order, A) ; /* ordering and symbolic analysis */ + N = cs_chol (A, S) ; /* numeric Cholesky factorization */ + x = cs_malloc (n, sizeof (double)) ; /* get workspace */ + ok = (S && N && x) ; + if (ok) + { + cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ + + // *** GPU code start + // - cs_lsolve (N->L, x) ; /* x = L\x */ + + // Copy RHS vector to multiple columns + double *rhs_t; + rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; + + int ri; + int ni; + for (ni = 0; ni < n; ni++) { +#ifdef DEBUG + // printf("rhs[%d] %f - ",ni,x[ni]); +#endif + for (ri = 0; ri < n_rhs; ri++) { + rhs_t[ni*n_rhs+ri] = x[ni]; +#ifdef DEBUG + // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); +#endif + } +#ifdef DEBUG + // printf("\n"); +#endif + } + // END - Copy RHS vector to multiple columns + + + // Copy Chol. factor and RHSs from CPU to GPU + int *d_Lp; + int *d_Li; + double *d_Lx; + double *d_x; + + cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); + cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); + cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); + cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); + + cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; + + cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); // Transfer data back to CPU if needed double *x_gpu; diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index 6de33e4ead46b9196bc61f8f201db1640946278c..966045ef35ae8c40109f15901e2839463b2509dd 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ b/CSparse/Source/cs_lsolve_gpu.cu @@ -78,6 +78,52 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub } +__global__ +void cs_lsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const double **Lx_a, double **x_a, int n_rhs) +{ + + const int *Lp = Lp_a[blockIdx.x]; + const int *Li = Li_a[blockIdx.x]; + const double *Lx = Lx_a[blockIdx.x]; + double *x = x_a[blockIdx.x]; + + + int tid = threadIdx.x; // + blockIdx.x * blockDim.x; + int g_id = 0; + + for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x) + { + // printf("g_id = %d \n", g_id); + if ( g_id + tid < n_rhs ) { + + int p, j; + + int rhs_offset = g_id + tid; + + for (j = 0 ; j < n ; j++) + { + + int addr = j * n_rhs + rhs_offset; + + x [addr] = x [addr] / Lx [Lp [j]] ; + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + + x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ; + + // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); + + } + + } + + } + + } + +} + __global__ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs) { @@ -115,6 +161,47 @@ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const dou } +__global__ +void cs_ltsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const 
@@ -115,6 +161,47 @@ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const dou
 }
 
+__global__
+void cs_ltsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const double **Lx_a, double **x_a, int n_rhs)
+{
+    const int    *Lp = Lp_a[blockIdx.x];
+    const int    *Li = Li_a[blockIdx.x];
+    const double *Lx = Lx_a[blockIdx.x];
+    double       *x  = x_a [blockIdx.x];
+
+    int tid  = threadIdx.x;     // + blockIdx.x * blockDim.x;
+    int g_id = 0;
+
+    for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x)
+    {
+        // printf("g_id = %d \n", g_id);
+        if ( g_id + tid < n_rhs )
+        {
+            int p, j;
+            int rhs_offset = g_id + tid;
+
+            for (j = n-1 ; j >= 0 ; j--)
+            {
+                int addr = j * n_rhs + rhs_offset;
+
+                for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                {
+                    x [addr] = x [addr] - Lx [p] * x [Li [p] * n_rhs + rhs_offset] ;
+                    // printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
+                }
+                x [addr] = x [addr] / Lx [Lp [j]] ;
+            }
+        }
+    }
+}
+
 extern "C" {
     void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
     {
@@ -142,6 +229,33 @@ extern "C" {
 }
 
+extern "C" {
+    void cs_lsolve_gpu_trans_multi (int n, const int **Lp, const int **Li, const double **Lx, double **x, int n_rhs, int threads, int blocks)
+    {
+        if (threads > n_rhs) threads = n_rhs;
+        if (threads > MAX_THREADS) threads = MAX_THREADS;
+
+        printf("Running kernel ** cs_lsolve_gpu_trans_multi ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+
+        cudaEventRecord(start);
+        cs_lsolve_gpu_kernel_trans_multi <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
+        cudaEventRecord(stop);
+
+        cudaEventSynchronize(stop);
+        float milliseconds = 0;
+        cudaEventElapsedTime(&milliseconds, start, stop);
+
+        printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+    }
+}
+
 extern "C" {
     void cs_ltsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
     {
@@ -169,6 +283,51 @@ extern "C" {
 }
 
+extern "C" {
+    void cs_ltsolve_gpu_trans_multi (int n, const int **Lp, const int **Li, const double **Lx, double **x, int n_rhs, int threads, int blocks)
+    {
+        if (threads > n_rhs) threads = n_rhs;
+        if (threads > MAX_THREADS) threads = MAX_THREADS;
+
+        printf("Running kernel ** cs_ltsolve_gpu_trans_multi ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+
+        cudaEventRecord(start);
+        cs_ltsolve_gpu_kernel_trans_multi <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
+        cudaEventRecord(stop);
+
+        cudaEventSynchronize(stop);
+        float milliseconds = 0;
+        cudaEventElapsedTime(&milliseconds, start, stop);
+
+        printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+    }
+}
+
+
 // *** OLD functions
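One gap worth noting: nothing in this patch releases the GPU working set — the per-block buffers behind `h_array_*`, the pointer tables `d_array_*`, the single-copy buffers `d_Lp`/`d_Li`/`d_Lx`/`d_x`, or the host buffers `rhs_t` and `x_gpu` — so every call to `cs_cholsol` leaks roughly `blocks` copies of the factor. A teardown sketch for the multi-copy path, assuming the `h_array_*` host tables are still in scope at the end of `cs_cholsol`:

    /* Sketch: free the per-block device buffers, then the device pointer
     * tables, then the host-side tables and work arrays. */
    for (i = 0; i < num_of_arrays; i++)
    {
        cudaFree (h_array_Lp[i]);
        cudaFree (h_array_Li[i]);
        cudaFree (h_array_Lx[i]);
        cudaFree (h_array_x [i]);
    }
    cudaFree (d_array_Lp); cudaFree (d_array_Li);
    cudaFree (d_array_Lx); cudaFree (d_array_x );
    free (h_array_Lp); free (h_array_Li);
    free (h_array_Lx); free (h_array_x );
    cs_free (rhs_t);
    cs_free (x_gpu);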