cs_cholsol.c

#include "cs.h"
#include "cuda_runtime.h"


/* x=A\b where A is symmetric positive definite; b overwritten with solution */
csi cs_cholsol_cpu (csi order, const cs *A, double *b)
{
    double *x ;
    css *S ;
    csn *N ;
    csi n, ok ;
    if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
    n = A->n ;
    S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
    N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
    x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
    ok = (S && N && x) ;
    if (ok)
    {
        cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
        cs_lsolve (N->L, x) ;           /* x = L\x */
        cs_ltsolve (N->L, x) ;          /* x = L'\x */
        cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
    }
    cs_free (x) ;
    cs_sfree (S) ;
    cs_nfree (N) ;
    return (ok) ;
}


csi cs_cholsol (csi order, const cs *A, double *b)
{
    
    printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");

    int n_rhs_ratio = 6;
    int n_rhs; 
    int blocks=600; // cca 30 GB

    double *x ;
    css *S ;
    csn *N ;
    csi n, ok ;
    if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
    n = A->n ;
    n_rhs = n / n_rhs_ratio; 
    S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
    N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
    x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
    ok = (S && N && x) ;
    if (ok)
    {
        cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
                
        // *** GPU code start 
        // - cs_lsolve (N->L, x) ;           /* x = L\x */

        // Copy RHS vector to multiple columns 
        double *rhs_t;
        rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   

        int ri;
        int ni;
        for (ni = 0; ni < n; ni++) {
#ifdef DEBUG
            // printf("rhs[%d] %f - ",ni,x[ni]); 
#endif
            for (ri = 0; ri < n_rhs; ri++) {
                rhs_t[ni*n_rhs+ri] = x[ni];
#ifdef DEBUG
                // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
#endif
            }
#ifdef DEBUG
            // printf("\n");
#endif 
        }
        // END - Copy RHS vector to multiple columns 


        // Copy Chol. factor and RHSs from CPU to GPU 
        int    *d_Lp;
        int    *d_Li;
        double *d_Lx;
        double *d_x;

        cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
        cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
        cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
        cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );

        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;


        // cudaEvent_t start, stop;
        // cudaEventCreate(&start);
        // cudaEventCreate(&stop);

        // Run Solve on GPU 
        // cudaEventRecord(start);
        cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
        cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
        // cudaEventRecord(stop);


        // cudaEventSynchronize(stop);
        // float milliseconds = 0;
        // cudaEventElapsedTime(&milliseconds, start, stop);

        // printf("GPU solve time for %d RHSs is: %f n", n_rhs, milliseconds);


        // Transfer data back to CPU if needed 
        double *x_gpu;
        x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
        cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );

        // CPU code verification - lsolve for multiple RSH 
        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */

        // int i; 
        // int r; 
        // int errors = 0; 
        // for (i=0; i<n; i++) {
        //     for (r = 0; r < n_rhs; r++) {
        //         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
        //             printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
        //             errors++; 
        //         }
        //     }
        // }
        // printf("\n\n %d different elements between CPU and GPU. \n\n", errors);

        
        // *** Debug - check with per element output 
#ifdef DEBUG
        // int i; 
        // int r; 
        cs_lsolve (N->L, x) ;           /* x = L\x */
        cs_ltsolve (N->L, x) ;          /* x = L'\x */
        for (i=0; i<n; i++) {
            printf("cpu: %f\t gpu:\t", x[i]); 
            for (r = 0; r < n_rhs; r++) {
                if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
                    printf("OK\t");
                else
                    printf("Er\t"); 
                //    printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
                printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
            }
            printf("\n");
        }
        printf("\n");
#else 
        cs_lsolve (N->L, x) ;           /* x = L\x */        
        cs_ltsolve (N->L, x) ;          /* x = L'\x */
#endif

        // *** END - GPU code  


        cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
    }
    cs_free (x) ;
    cs_sfree (S) ;
    cs_nfree (N) ;
    return (ok) ;
}


csi cs_cholsolx (csi order, const cs *A, double *b)
{

    printf("\n *** Running GPU version with - kernel 1 without transposed RHS - non coalesced memory access. \n");

    int n_rhs = 1000; 


    double *x ;
    css *S ;
    csn *N ;
    csi n, ok ;
    if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
    n = A->n ;
    S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
    N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
    x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
    ok = (S && N && x) ;
    if (ok)
    {
        cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
            
        double *rhs_t;
        rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   

        int ri;
        int ni;
        for (ni = 0; ni < n; ni++) {
            printf("rhs[%d] %f - ",ni,x[ni]); 
            for (ri = 0; ri < n_rhs; ri++) {
                rhs_t[ni*n_rhs+ri] = x[ni];
                printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
            }
            printf("\n");
        }


        int    *d_Lp;
        int    *d_Li;
        double *d_Lx;
        double *d_x;

        //cudaSetDevice(1);     
        
        cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
        cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
        cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
        cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );

        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
        
        int rr;
        for (rr = 0; rr < n_rhs; rr++) {
            cudaMemcpy( d_x + rr * n , x       ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
        }
        
        cs_lsolve_gpu     (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);

        double *x_gpu;
        x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
        cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );


        cs_lsolve (N->L, x) ;           /* x = L\x */


        int i; 
        int r; 
        for (i=0; i<n; i++) {
            printf("cpu: %f\t gpu:\t", x[i]); 
            for (r = 0; r < n_rhs; r++) {
                printf("%f\t", x_gpu[n*r + i] -x[i] );
            }
            printf("\n");
        }
        printf("\n");


        cs_ltsolve (N->L, x) ;          /* x = L'\x */
        cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
    }
    cs_free (x) ;
    cs_sfree (S) ;
    cs_nfree (N) ;
    return (ok) ;
}