Skip to content
Snippets Groups Projects
cs_cholsol.c 16.9 KiB
Newer Older
  • Learn to ignore specific revisions
  • Lubomir Riha's avatar
    Lubomir Riha committed
    #include "cs.h"
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    #include "cuda_runtime.h"
    
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    /* x=A\b where A is symmetric positive definite; b overwritten with solution */
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    csi cs_cholsol_cpu (csi order, const cs *A, double *b)
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    {
        double *x ;
        css *S ;
        csn *N ;
        csi n, ok ;
        if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
        n = A->n ;
        S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
        N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
        x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
        ok = (S && N && x) ;
        if (ok)
        {
            cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
            cs_lsolve (N->L, x) ;           /* x = L\x */
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
            cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
        }
        cs_free (x) ;
        cs_sfree (S) ;
        cs_nfree (N) ;
        return (ok) ;
    }
    
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    csi cs_cholsol (csi order, const cs *A, double *b)
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    {    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        double *x ;
        css *S ;
        csn *N ;
        csi n, ok ;
        if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
        n = A->n ;
    
    Lubomir Riha's avatar
    Lubomir Riha committed
     
        printf("\n--------------------------------------------------------------------------------------------------------------------\n");
        printf("Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
     
        double GPUmem_sizeGB        = 32.0; // GB
        double cube_size            = cbrt((double)n/3.0);
        double n_rhs_ratio          = (cube_size*cube_size*cube_size) / (cube_size*cube_size*cube_size - (cube_size-2.0)*(cube_size-2.0)*(cube_size-2.0) );  
        int    n_rhs                = (int)((double)n / n_rhs_ratio); 
        double LSCsize              = (double)n_rhs*n_rhs / 1024.0 / 1024.0 / 2.0 * sizeof(double);
        int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
        int    blocks               = total_num_of_LSC_perGPU;
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        blocks = 2
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        n_rhs  = 2; 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
    
    
        printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
        printf(" - LSC size         = %d x %d ( 1/2 for symmetric system) \n", n_rhs, n_rhs);
        printf(" - LSC size (symm.) = %f MB \n", LSCsize); 
        printf(" - number of RHS for this matrix is : %d \n", n_rhs );
        printf(" - Total namber of LSCs to fit into %f GB RAM of GPU : %d \n", GPUmem_sizeGB, (int)total_num_of_LSC_perGPU );
        printf(" - Total problem size is : %d DOF \n", total_num_of_LSC_perGPU * n);
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        printf("\n");
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
        N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
        x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
        ok = (S && N && x) ;
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        if (ok)
        {
            cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                    
            // *** GPU code start 
            // - cs_lsolve (N->L, x) ;           /* x = L\x */
    
            // Copy RHS vector to multiple columns 
            double *rhs_t;
            rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   
    
            int ri;
            int ni;
            for (ni = 0; ni < n; ni++) {
    #ifdef DEBUG
                // printf("rhs[%d] %f - ",ni,x[ni]); 
    #endif
                for (ri = 0; ri < n_rhs; ri++) {
                    rhs_t[ni*n_rhs+ri] = x[ni];
    #ifdef DEBUG
                    // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
    #endif
                }
    #ifdef DEBUG
                // printf("\n");
    #endif 
            }
            // END - Copy RHS vector to multiple columns 
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    #define FULL_MEM
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    #ifdef  FULL_MEM
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        {
    
            int GPU_mem = 0; 
    
            int num_of_arrays = blocks;
    
            // create intermediate host array for storage of device row-pointers
            int**    h_array_Lp = (int**)   malloc(num_of_arrays * sizeof(int*));
            int**    h_array_Li = (int**)   malloc(num_of_arrays * sizeof(int*));
            double** h_array_Lx = (double**)malloc(num_of_arrays * sizeof(double*));
            double** h_array_x  = (double**)malloc(num_of_arrays * sizeof(double*));
            
            // create top-level device array pointer
            int**    d_array_Lp;
            int**    d_array_Li;
            double** d_array_Lx;
            double** d_array_x ; 
    
            cudaMalloc((void**)&d_array_Lp, num_of_arrays * sizeof(int*));
            cudaMalloc((void**)&d_array_Li, num_of_arrays * sizeof(int*));
            cudaMalloc((void**)&d_array_Lx, num_of_arrays * sizeof(double*));
            cudaMalloc((void**)&d_array_x , num_of_arrays * sizeof(double*));
            
    
    
            size_t free_byte ;
            size_t total_byte ;
            cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
            double free_db = (double)free_byte ;
            double total_db = (double)total_byte ;
            double used_db = total_db - free_db ;
    
            printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
    
    
            // allocate each device row-pointer, then copy host data to it
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        
    
            int i; 
            for(i = 0 ; i < num_of_arrays ; i++){
                cudaMalloc(&h_array_Lp[i],           ((N->L->n)+1) * sizeof(int));
                cudaMalloc(&h_array_Li[i],           (N->L->nzmax) * sizeof(int));
                cudaMalloc(&h_array_Lx[i],           (N->L->nzmax) * sizeof(double));
                cudaMalloc(&h_array_x [i],               n * n_rhs * sizeof(double));
    
                printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int))     / 1024.0 / 1024.0);
                printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int))     / 1024.0 / 1024.0);
                printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double))  / 1024.0 / 1024.0);
                printf(" x size: %f MB\n",     (double)(n * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
    
                printf("LSCsize: %f MB\n",     (double)(0.5*n_rhs * n_rhs * sizeof(double))  / 1024.0 / 1024.0);
    
    
                cudaMemcpy(h_array_Lp[i] , N->L->p , ((N->L->n)+1) * sizeof(int),    cudaMemcpyHostToDevice);
                cudaMemcpy(h_array_Li[i] , N->L->i , (N->L->nzmax) * sizeof(int),    cudaMemcpyHostToDevice);
                cudaMemcpy(h_array_Lx[i] , N->L->x , (N->L->nzmax) * sizeof(double), cudaMemcpyHostToDevice);
                cudaMemcpy(h_array_x [i] , rhs_t   ,     n * n_rhs * sizeof(double), cudaMemcpyHostToDevice);
    
                size_t free_byte ;
                size_t total_byte ;
                cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
                double free_db = (double)free_byte ;
                double total_db = (double)total_byte ;
                double used_db = total_db - free_db ;
    
                printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
    
            }
            // fixup top level device array pointer to point to array of device row-pointers
            cudaMemcpy(d_array_Lp, h_array_Lp, num_of_arrays * sizeof(int*)   , cudaMemcpyHostToDevice);
            cudaMemcpy(d_array_Li, h_array_Li, num_of_arrays * sizeof(int*)   , cudaMemcpyHostToDevice);
            cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice);
            cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double*), cudaMemcpyHostToDevice);
    
            cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
            cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
    
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            double** x_gpu_array  = (double**)malloc(num_of_arrays * sizeof(double*));
            for(i = 0 ; i < num_of_arrays ; i++) {
                x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ;
                cudaMemcpy(x_gpu_array[i], h_array_x [i], n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
            }
    
            double *x_gpu = x_gpu_array[1]; 
    
        }
    #else 
        {
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            // *** Vesion with 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            // Copy Chol. factor and RHSs from CPU to GPU 
            int    *d_Lp;
            int    *d_Li;
            double *d_Lx;
            double *d_x;
    
            cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
            cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
            cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
            cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
    
            cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
    
            cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
            cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
    
    
            // Transfer data back to CPU if needed 
            double *x_gpu;
            x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
    
    Lubomir Riha's avatar
    Lubomir Riha committed
        }
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    #endif
    
    
    
            // CPU code verification - lsolve for multiple RSH 
            cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
    
    	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
    
            // *** Debug - check with per element output 
    #ifdef DEBUG
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            {
            int i; 
            int r; 
            
            cs_lsolve  (N->L, x) ;          /* x = L\x */
    
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
    
            for (i=0; i<n; i++) {
                printf("cpu: %f\t gpu:\t", x[i]); 
                for (r = 0; r < n_rhs; r++) {
                    if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
                        printf("OK\t");
                    else
                        printf("Er\t"); 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                    //  printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
    
                    printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
                }
                printf("\n");
            }
            printf("\n");
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            }
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            cs_lsolve  (N->L, x) ;          /* x = L\x */        
    
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
    #endif
    
            // *** END - GPU code  
    
            cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
        }
        cs_free (x) ;
        cs_sfree (S) ;
        cs_nfree (N) ;
        return (ok) ;
    }
    
    
    
    csi cs_cholsol_single (csi order, const cs *A, double *b)
    {
        
        printf("\n *** Running GPU version with - kernel 1 with transposed RHS - with coalesced memory access. \n");
    
        int n_rhs_ratio = 6;
        int n_rhs; 
        int blocks=600; // cca 30 GB
    
        double *x ;
        css *S ;
        csn *N ;
        csi n, ok ;
        if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
        n = A->n ;
        n_rhs = n / n_rhs_ratio; 
        S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
        N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
        x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
        ok = (S && N && x) ;
        if (ok)
        {
            cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
                    
            // *** GPU code start 
            // - cs_lsolve (N->L, x) ;           /* x = L\x */
    
            // Copy RHS vector to multiple columns 
            double *rhs_t;
            rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   
    
            int ri;
            int ni;
            for (ni = 0; ni < n; ni++) {
    #ifdef DEBUG
                // printf("rhs[%d] %f - ",ni,x[ni]); 
    #endif
                for (ri = 0; ri < n_rhs; ri++) {
                    rhs_t[ni*n_rhs+ri] = x[ni];
    #ifdef DEBUG
                    // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
    #endif
                }
    #ifdef DEBUG
                // printf("\n");
    #endif 
            }
            // END - Copy RHS vector to multiple columns 
    
    
            // Copy Chol. factor and RHSs from CPU to GPU 
            int    *d_Lp;
            int    *d_Li;
            double *d_Lx;
            double *d_x;
    
            cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
            cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
            cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
            cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
    
            cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_x , rhs_t   ,  n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
    
            cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
            cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
            // Transfer data back to CPU if needed 
            double *x_gpu;
            x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
            cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
    
            // CPU code verification - lsolve for multiple RSH 
            cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
            cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
    
            // int i; 
            // int r; 
            // int errors = 0; 
            // for (i=0; i<n; i++) {
            //     for (r = 0; r < n_rhs; r++) {
            //         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
            //             printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
            //             errors++; 
            //         }
            //     }
            // }
            // printf("\n\n %d different elements between CPU and GPU. \n\n", errors);
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            // *** Debug - check with per element output 
    #ifdef DEBUG
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            int i; 
            int r; 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            cs_lsolve (N->L, x) ;           /* x = L\x */
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
            for (i=0; i<n; i++) {
                printf("cpu: %f\t gpu:\t", x[i]); 
                for (r = 0; r < n_rhs; r++) {
                    if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
                        printf("OK\t");
                    else
                        printf("Er\t"); 
                    //    printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
                    printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
                }
                printf("\n");
            }
            printf("\n");
    #else 
            cs_lsolve (N->L, x) ;           /* x = L\x */        
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
    #endif
    
            // *** END - GPU code  
    
    
    
            cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
        }
        cs_free (x) ;
        cs_sfree (S) ;
        cs_nfree (N) ;
        return (ok) ;
    }
    
    
    
    
    csi cs_cholsolx (csi order, const cs *A, double *b)
    {
    
        printf("\n *** Running GPU version with - kernel 1 without transposed RHS - non coalesced memory access. \n");
    
        int n_rhs = 1000; 
    
    
        double *x ;
        css *S ;
        csn *N ;
        csi n, ok ;
        if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
        n = A->n ;
        S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
        N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
        x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
        ok = (S && N && x) ;
        if (ok)
        {
            cs_ipvec (S->pinv, b, x, n) ;   /* x = P*b */
                
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            double *rhs_t;
            rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   
    
            int ri;
            int ni;
            for (ni = 0; ni < n; ni++) {
    
                printf("rhs[%d] %f - ",ni,x[ni]); 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                for (ri = 0; ri < n_rhs; ri++) {
                    rhs_t[ni*n_rhs+ri] = x[ni];
    
                    printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                }
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            }
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            int    *d_Lp;
            int    *d_Li;
            double *d_Lx;
            double *d_x;
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            //cudaSetDevice(1);     
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            
            cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
            cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
            cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
            cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
            cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
            cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            int rr;
            for (rr = 0; rr < n_rhs; rr++) {
                cudaMemcpy( d_x + rr * n , x       ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
            }
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            cs_lsolve_gpu     (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
            double *x_gpu;
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
            cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
    
            cs_lsolve (N->L, x) ;           /* x = L\x */
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            int i; 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            int r; 
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            for (i=0; i<n; i++) {
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                printf("cpu: %f\t gpu:\t", x[i]); 
                for (r = 0; r < n_rhs; r++) {
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                    printf("%f\t", x_gpu[n*r + i] -x[i] );
    
    Lubomir Riha's avatar
    Lubomir Riha committed
                }
                printf("\n");
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            }
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            printf("\n");
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    
    
    Lubomir Riha's avatar
    Lubomir Riha committed
            cs_ltsolve (N->L, x) ;          /* x = L'\x */
            cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
        }
        cs_free (x) ;
        cs_sfree (S) ;
        cs_nfree (N) ;
        return (ok) ;
    
    Lubomir Riha's avatar
    Lubomir Riha committed
    }
    
    Lubomir Riha's avatar
    Lubomir Riha committed