#include "cs.h" #include "cuda_runtime.h" /* x=A\b where A is symmetric positive definite; b overwritten with solution */ csi cs_cholsol_cpu (csi order, const cs *A, double *b) { double *x ; css *S ; csn *N ; csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ ok = (S && N && x) ; if (ok) { cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ cs_lsolve (N->L, x) ; /* x = L\x */ cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } cs_free (x) ; cs_sfree (S) ; cs_nfree (N) ; return (ok) ; } csi cs_cholsol (csi order, const cs *A, double *b) { printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n"); double GPUmem_sizeGB = 32.0; double n_rhs_ratio = 6.0; int n_rhs; int blocks=1000;//0; // cca 30 GB double *x ; css *S ; csn *N ; csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; double cube_size = cbrt((double)n/3.0); n_rhs_ratio = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) ); printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size); n_rhs = (int)((double)n / n_rhs_ratio); // n_rhs = 1000; printf(" - LSC size = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs); double LSCsize = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double); printf(" - LSC size (symm.) = %f MB \n", LSCsize); printf(" - number of RHS for this matrix is : %d \n", n_rhs ); int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0; printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU ); blocks = total_num_of_LSC_perGPU; printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n); S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ ok = (S && N && x) ; if (ok) { cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ // *** GPU code start // - cs_lsolve (N->L, x) ; /* x = L\x */ // Copy RHS vector to multiple columns double *rhs_t; rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; int ri; int ni; for (ni = 0; ni < n; ni++) { #ifdef DEBUG // printf("rhs[%d] %f - ",ni,x[ni]); #endif for (ri = 0; ri < n_rhs; ri++) { rhs_t[ni*n_rhs+ri] = x[ni]; #ifdef DEBUG // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); #endif } #ifdef DEBUG // printf("\n"); #endif } // END - Copy RHS vector to multiple columns /* int GPU_mem = 0; int num_of_arrays = blocks; // create intermediate host array for storage of device row-pointers int** h_array_Lp = (int**) malloc(num_of_arrays * sizeof(int*)); int** h_array_Li = (int**) malloc(num_of_arrays * sizeof(int*)); double** h_array_Lx = (double**)malloc(num_of_arrays * sizeof(double*)); double** h_array_x = (double**)malloc(num_of_arrays * sizeof(double*)); // create top-level device array pointer int** d_array_Lp; int** d_array_Li; double** d_array_Lx; double** d_array_x ; cudaMalloc((void**)&d_array_Lp, num_of_arrays * sizeof(int*)); cudaMalloc((void**)&d_array_Li, num_of_arrays * sizeof(int*)); cudaMalloc((void**)&d_array_Lx, num_of_arrays * sizeof(double*)); cudaMalloc((void**)&d_array_x , num_of_arrays * sizeof(double*)); size_t free_byte ; size_t total_byte ; cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; double free_db = (double)free_byte ; double total_db = (double)total_byte ; double used_db = total_db - free_db ; printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); // allocate each device row-pointer, then copy host data to it int i; for(i = 0 ; i < num_of_arrays ; i++){ cudaMalloc(&h_array_Lp[i], ((N->L->n)+1) * sizeof(int)); cudaMalloc(&h_array_Li[i], (N->L->nzmax) * sizeof(int)); cudaMalloc(&h_array_Lx[i], (N->L->nzmax) * sizeof(double)); cudaMalloc(&h_array_x [i], n * n_rhs * sizeof(double)); printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int)) / 1024.0 / 1024.0); printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int)) / 1024.0 / 1024.0); printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0); printf(" x size: %f MB\n", (double)(n * n_rhs * sizeof(double)) / 1024.0 / 1024.0); printf("LSCsize: %f MB\n", (double)(0.5*n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0); cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(h_array_Lx[i] , N->L->x , (N->L->nzmax) * sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(h_array_x [i] , rhs_t , n * n_rhs * sizeof(double), cudaMemcpyHostToDevice); size_t free_byte ; size_t total_byte ; cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ; double free_db = (double)free_byte ; double total_db = (double)total_byte ; double used_db = total_db - free_db ; printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); } // fixup top level device array pointer to point to array of device row-pointers cudaMemcpy(d_array_Lp, h_array_Lp, num_of_arrays * sizeof(int*) , cudaMemcpyHostToDevice); cudaMemcpy(d_array_Li, h_array_Li, num_of_arrays * sizeof(int*) , cudaMemcpyHostToDevice); cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice); cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice); cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks); */ // Copy Chol. factor and RHSs from CPU to GPU int *d_Lp; int *d_Li; double *d_Lx; double *d_x; cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); // Transfer data back to CPU if needed double *x_gpu; x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; // cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); // CPU code verification - lsolve for multiple RSH cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ // int i; // int r; // int errors = 0; // for (i=0; i<n; i++) { // for (r = 0; r < n_rhs; r++) { // if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) { // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); // errors++; // } // } // } // printf("\n\n %d different elements between CPU and GPU. \n\n", errors); // *** Debug - check with per element output #ifdef DEBUG // int i; // int r; cs_lsolve (N->L, x) ; /* x = L\x */ cs_ltsolve (N->L, x) ; /* x = L'\x */ for (i=0; i<n; i++) { printf("cpu: %f\t gpu:\t", x[i]); for (r = 0; r < n_rhs; r++) { if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 ) printf("OK\t"); else printf("Er\t"); // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] ); } printf("\n"); } printf("\n"); #else cs_lsolve (N->L, x) ; /* x = L\x */ cs_ltsolve (N->L, x) ; /* x = L'\x */ #endif // *** END - GPU code cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } cs_free (x) ; cs_sfree (S) ; cs_nfree (N) ; return (ok) ; } csi cs_cholsol_single (csi order, const cs *A, double *b) { printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n"); int n_rhs_ratio = 6; int n_rhs; int blocks=600; // cca 30 GB double *x ; css *S ; csn *N ; csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; n_rhs = n / n_rhs_ratio; S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ ok = (S && N && x) ; if (ok) { cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ // *** GPU code start // - cs_lsolve (N->L, x) ; /* x = L\x */ // Copy RHS vector to multiple columns double *rhs_t; rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; int ri; int ni; for (ni = 0; ni < n; ni++) { #ifdef DEBUG // printf("rhs[%d] %f - ",ni,x[ni]); #endif for (ri = 0; ri < n_rhs; ri++) { rhs_t[ni*n_rhs+ri] = x[ni]; #ifdef DEBUG // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); #endif } #ifdef DEBUG // printf("\n"); #endif } // END - Copy RHS vector to multiple columns // Copy Chol. factor and RHSs from CPU to GPU int *d_Lp; int *d_Li; double *d_Lx; double *d_x; cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); // Transfer data back to CPU if needed double *x_gpu; x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); // CPU code verification - lsolve for multiple RSH cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */ // int i; // int r; // int errors = 0; // for (i=0; i<n; i++) { // for (r = 0; r < n_rhs; r++) { // if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) { // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); // errors++; // } // } // } // printf("\n\n %d different elements between CPU and GPU. \n\n", errors); // *** Debug - check with per element output #ifdef DEBUG // int i; // int r; cs_lsolve (N->L, x) ; /* x = L\x */ cs_ltsolve (N->L, x) ; /* x = L'\x */ for (i=0; i<n; i++) { printf("cpu: %f\t gpu:\t", x[i]); for (r = 0; r < n_rhs; r++) { if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 ) printf("OK\t"); else printf("Er\t"); // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] ); printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] ); } printf("\n"); } printf("\n"); #else cs_lsolve (N->L, x) ; /* x = L\x */ cs_ltsolve (N->L, x) ; /* x = L'\x */ #endif // *** END - GPU code cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } cs_free (x) ; cs_sfree (S) ; cs_nfree (N) ; return (ok) ; } csi cs_cholsolx (csi order, const cs *A, double *b) { printf("\n *** Running GPU version with - kernel 1 without transposed RHS - non coalesced memory access. \n"); int n_rhs = 1000; double *x ; css *S ; csn *N ; csi n, ok ; if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ n = A->n ; S = cs_schol (order, A) ; /* ordering and symbolic analysis */ N = cs_chol (A, S) ; /* numeric Cholesky factorization */ x = cs_malloc (n, sizeof (double)) ; /* get workspace */ ok = (S && N && x) ; if (ok) { cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ double *rhs_t; rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; int ri; int ni; for (ni = 0; ni < n; ni++) { printf("rhs[%d] %f - ",ni,x[ni]); for (ri = 0; ri < n_rhs; ri++) { rhs_t[ni*n_rhs+ri] = x[ni]; printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); } printf("\n"); } int *d_Lp; int *d_Li; double *d_Lx; double *d_x; //cudaSetDevice(1); cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; int rr; for (rr = 0; rr < n_rhs; rr++) { cudaMemcpy( d_x + rr * n , x , n * sizeof(double) , cudaMemcpyHostToDevice) ; } cs_lsolve_gpu (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1); double *x_gpu; x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); cs_lsolve (N->L, x) ; /* x = L\x */ int i; int r; for (i=0; i<n; i++) { printf("cpu: %f\t gpu:\t", x[i]); for (r = 0; r < n_rhs; r++) { printf("%f\t", x_gpu[n*r + i] -x[i] ); } printf("\n"); } printf("\n"); cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } cs_free (x) ; cs_sfree (S) ; cs_nfree (N) ; return (ok) ; }