-
Lubomir Riha authoredLubomir Riha authored
cs_cholsol.c 8.15 KiB
#include "cs.h"
#include "cuda_runtime.h"
/* x=A\b where A is symmetric positive definite; b overwritten with solution */
csi cs_cholsol_cpu (csi order, const cs *A, double *b)
{
double *x ;
css *S ;
csn *N ;
csi n, ok ;
if (!CS_CSC (A) || !b) return (0) ; /* check inputs */
n = A->n ;
S = cs_schol (order, A) ; /* ordering and symbolic analysis */
N = cs_chol (A, S) ; /* numeric Cholesky factorization */
x = cs_malloc (n, sizeof (double)) ; /* get workspace */
ok = (S && N && x) ;
if (ok)
{
cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */
cs_lsolve (N->L, x) ; /* x = L\x */
cs_ltsolve (N->L, x) ; /* x = L'\x */
cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */
}
cs_free (x) ;
cs_sfree (S) ;
cs_nfree (N) ;
return (ok) ;
}
csi cs_cholsol (csi order, const cs *A, double *b)
{
printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
int n_rhs_ratio = 6;
int n_rhs;
int blocks=600; // cca 30 GB
double *x ;
css *S ;
csn *N ;
csi n, ok ;
if (!CS_CSC (A) || !b) return (0) ; /* check inputs */
n = A->n ;
n_rhs = n / n_rhs_ratio;
S = cs_schol (order, A) ; /* ordering and symbolic analysis */
N = cs_chol (A, S) ; /* numeric Cholesky factorization */
x = cs_malloc (n, sizeof (double)) ; /* get workspace */
ok = (S && N && x) ;
if (ok)
{
cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */
// *** GPU code start
// - cs_lsolve (N->L, x) ; /* x = L\x */
// Copy RHS vector to multiple columns
double *rhs_t;
rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;
int ri;
int ni;
for (ni = 0; ni < n; ni++) {
#ifdef DEBUG
// printf("rhs[%d] %f - ",ni,x[ni]);
#endif
for (ri = 0; ri < n_rhs; ri++) {
rhs_t[ni*n_rhs+ri] = x[ni];
#ifdef DEBUG
// printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]);
#endif
}
#ifdef DEBUG
// printf("\n");
#endif
}
// END - Copy RHS vector to multiple columns
// Copy Chol. factor and RHSs from CPU to GPU
int *d_Lp;
int *d_Li;
double *d_Lx;
double *d_x;
cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc( &d_x , n * n_rhs * sizeof(double) );
cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ;
// cudaEvent_t start, stop;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// Run Solve on GPU
// cudaEventRecord(start);
cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
// cudaEventRecord(stop);
// cudaEventSynchronize(stop);
// float milliseconds = 0;
// cudaEventElapsedTime(&milliseconds, start, stop);
// printf("GPU solve time for %d RHSs is: %f n", n_rhs, milliseconds);
// Transfer data back to CPU if needed
double *x_gpu;
x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
// CPU code verification - lsolve for multiple RSH
cs_lsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
cs_ltsolve_mrhs (N->L, rhs_t, n_rhs); /* X = L\X */
// int i;
// int r;
// int errors = 0;
// for (i=0; i<n; i++) {
// for (r = 0; r < n_rhs; r++) {
// if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
// printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
// errors++;
// }
// }
// }
// printf("\n\n %d different elements between CPU and GPU. \n\n", errors);
// *** Debug - check with per element output
#ifdef DEBUG
// int i;
// int r;
cs_lsolve (N->L, x) ; /* x = L\x */
cs_ltsolve (N->L, x) ; /* x = L'\x */
for (i=0; i<n; i++) {
printf("cpu: %f\t gpu:\t", x[i]);
for (r = 0; r < n_rhs; r++) {
if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
printf("OK\t");
else
printf("Er\t");
// printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
}
printf("\n");
}
printf("\n");
#else
cs_lsolve (N->L, x) ; /* x = L\x */
cs_ltsolve (N->L, x) ; /* x = L'\x */
#endif
// *** END - GPU code
cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */
}
cs_free (x) ;
cs_sfree (S) ;
cs_nfree (N) ;
return (ok) ;
}
csi cs_cholsolx (csi order, const cs *A, double *b)
{
printf("\n *** Running GPU version with - kernel 1 without transposed RHS - non coalesced memory access. \n");
int n_rhs = 1000;
double *x ;
css *S ;
csn *N ;
csi n, ok ;
if (!CS_CSC (A) || !b) return (0) ; /* check inputs */
n = A->n ;
S = cs_schol (order, A) ; /* ordering and symbolic analysis */
N = cs_chol (A, S) ; /* numeric Cholesky factorization */
x = cs_malloc (n, sizeof (double)) ; /* get workspace */
ok = (S && N && x) ;
if (ok)
{
cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */
double *rhs_t;
rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;
int ri;
int ni;
for (ni = 0; ni < n; ni++) {
printf("rhs[%d] %f - ",ni,x[ni]);
for (ri = 0; ri < n_rhs; ri++) {
rhs_t[ni*n_rhs+ri] = x[ni];
printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]);
}
printf("\n");
}
int *d_Lp;
int *d_Li;
double *d_Lx;
double *d_x;
//cudaSetDevice(1);
cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc( &d_x , n * n_rhs * sizeof(double) );
cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
int rr;
for (rr = 0; rr < n_rhs; rr++) {
cudaMemcpy( d_x + rr * n , x , n * sizeof(double) , cudaMemcpyHostToDevice) ;
}
cs_lsolve_gpu (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
double *x_gpu;
x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
cs_lsolve (N->L, x) ; /* x = L\x */
int i;
int r;
for (i=0; i<n; i++) {
printf("cpu: %f\t gpu:\t", x[i]);
for (r = 0; r < n_rhs; r++) {
printf("%f\t", x_gpu[n*r + i] -x[i] );
}
printf("\n");
}
printf("\n");
cs_ltsolve (N->L, x) ; /* x = L'\x */
cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */
}
cs_free (x) ;
cs_sfree (S) ;
cs_nfree (N) ;
return (ok) ;
}