diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index 96103f7614286dd576cb12b47aa247d921a67d54..de981fd78ceb120871f9d43c0b09e03e03335010 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -29,6 +29,7 @@ csi cs_cholsol_cpu (csi order, const cs *A, double *b)
 }
 
+
 csi cs_cholsol (csi order, const cs *A, double *b)
 {
@@ -36,7 +37,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int n_rhs_ratio = 6;
     int n_rhs;
-    int blocks=600; // cca 30 GB
+    int blocks = 13;                        // was 600 (cca 30 GB)
 
     double *x ;
     css *S ;
@@ -49,6 +50,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
     x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
     ok = (S && N && x) ;
+
     if (ok)
     {
         cs_ipvec (S->pinv, b, x, n) ;       /* x = P*b */
@@ -79,6 +81,79 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         // END - Copy RHS vector to multiple columns
 
+        int num_of_arrays = blocks;         // one copy of L and of the RHS block per CUDA block
+
+        // intermediate host arrays holding the per-block device pointers
+        int    **h_array_Lp = (int **)    malloc(num_of_arrays * sizeof(int *));
+        int    **h_array_Li = (int **)    malloc(num_of_arrays * sizeof(int *));
+        double **h_array_Lx = (double **) malloc(num_of_arrays * sizeof(double *));
+        double **h_array_x  = (double **) malloc(num_of_arrays * sizeof(double *));
+
+        // top-level device pointer tables, indexed by blockIdx.x in the kernels
+        int    **d_array_Lp;
+        int    **d_array_Li;
+        double **d_array_Lx;
+        double **d_array_x ;
+
+        cudaMalloc((void **) &d_array_Lp, num_of_arrays * sizeof(int *));
+        cudaMalloc((void **) &d_array_Li, num_of_arrays * sizeof(int *));
+        cudaMalloc((void **) &d_array_Lx, num_of_arrays * sizeof(double *));
+        cudaMalloc((void **) &d_array_x , num_of_arrays * sizeof(double *));
+
+        size_t free_byte ;
+        size_t total_byte ;
+        cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+        double free_db  = (double) free_byte ;
+        double total_db = (double) total_byte ;
+        double used_db  = total_db - free_db ;
+
+        printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
+               used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+        // allocate each per-block device buffer, then copy the host data to it
+        int i;
+        for (i = 0 ; i < num_of_arrays ; i++)
+        {
+            cudaMalloc(&h_array_Lp[i], ((N->L->n)+1) * sizeof(int));
+            cudaMalloc(&h_array_Li[i], (N->L->nzmax) * sizeof(int));
+            cudaMalloc(&h_array_Lx[i], (N->L->nzmax) * sizeof(double));
+            cudaMalloc(&h_array_x [i], n * n_rhs     * sizeof(double));
+
+            printf("Lp size: %f MB\n", (double)(((N->L->n)+1) * sizeof(int))    / 1024.0 / 1024.0);
+            printf("Li size: %f MB\n", (double)((N->L->nzmax) * sizeof(int))    / 1024.0 / 1024.0);
+            printf("Lx size: %f MB\n", (double)((N->L->nzmax) * sizeof(double)) / 1024.0 / 1024.0);
+            printf(" x size: %f MB\n", (double)(n * n_rhs     * sizeof(double)) / 1024.0 / 1024.0);
+            printf("LSCsize: %f MB\n", (double)(n_rhs * n_rhs * sizeof(double)) / 1024.0 / 1024.0);
+
+            cudaMemcpy(h_array_Lp[i], N->L->p, ((N->L->n)+1) * sizeof(int),    cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_Li[i], N->L->i, (N->L->nzmax) * sizeof(int),    cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_Lx[i], N->L->x, (N->L->nzmax) * sizeof(double), cudaMemcpyHostToDevice);
+            cudaMemcpy(h_array_x [i], rhs_t,   n * n_rhs     * sizeof(double), cudaMemcpyHostToDevice);
+
+            size_t free_byte ;
+            size_t total_byte ;
+            cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+            double free_db  = (double) free_byte ;
+            double total_db = (double) total_byte ;
+            double used_db  = total_db - free_db ;
+
+            printf("GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",
+                   used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+        }
+
+        // fix up the top-level device tables to point to the per-block device buffers
+        cudaMemcpy(d_array_Lp, h_array_Lp, num_of_arrays * sizeof(int *),    cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_Li, h_array_Li, num_of_arrays * sizeof(int *),    cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_Lx, h_array_Lx, num_of_arrays * sizeof(double *), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_array_x , h_array_x , num_of_arrays * sizeof(double *), cudaMemcpyHostToDevice);
+
+        cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
+
+
         // Copy Chol. factor and RHSs from CPU to GPU
         int *d_Lp;
         int *d_Li;
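The block above is the standard array-of-device-pointers idiom: each `h_array_*[i]` receives its own `cudaMalloc`'d buffer holding a full copy of the factor (`Lp`, `Li`, `Lx`) and of the right-hand-side block, and the host arrays of those device pointers are then mirrored into device-resident tables (`d_array_*`) that the kernels index by `blockIdx.x`. None of the CUDA calls are checked, however, and with `blocks` copies of a multi-gigabyte working set a failed `cudaMalloc` would only surface as garbage results. A minimal sketch of the same setup with error checking — the `CUDA_CHECK` macro and `device_table` helper are illustrative, not part of CSparse:

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t e_ = (call);                                       \
            if (e_ != cudaSuccess) {                                       \
                fprintf (stderr, "CUDA error '%s' at %s:%d\n",             \
                         cudaGetErrorString (e_), __FILE__, __LINE__);     \
                exit (EXIT_FAILURE);                                       \
            }                                                              \
        } while (0)

    /* Allocate `count` device buffers of `bytes` each, fill every one with
     * the same host data `src`, and return a device-resident table of the
     * buffer pointers.  The host-side copy of the table is returned through
     * h_tab_out so the buffers can be freed later. */
    static void **device_table (int count, size_t bytes, const void *src,
                                void ***h_tab_out)
    {
        void **h_tab = (void **) malloc (count * sizeof (void *));
        void **d_tab = NULL;
        int i;
        for (i = 0; i < count; i++)
        {
            CUDA_CHECK (cudaMalloc (&h_tab[i], bytes));
            CUDA_CHECK (cudaMemcpy (h_tab[i], src, bytes,
                                    cudaMemcpyHostToDevice));
        }
        CUDA_CHECK (cudaMalloc ((void **) &d_tab, count * sizeof (void *)));
        CUDA_CHECK (cudaMemcpy (d_tab, h_tab, count * sizeof (void *),
                                cudaMemcpyHostToDevice));
        *h_tab_out = h_tab;
        return d_tab;
    }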
@@ -95,24 +170,138 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
         cudaMemcpy( d_x , rhs_t   , n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
-
-        // cudaEvent_t start, stop;
-        // cudaEventCreate(&start);
-        // cudaEventCreate(&stop);
-
-        // Run Solve on GPU
-        // cudaEventRecord(start);
         cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
-        // cudaEventRecord(stop);
+        // Transfer data back to CPU if needed
+        double *x_gpu;
+        x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
+        // cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
 
-        // cudaEventSynchronize(stop);
-        // float milliseconds = 0;
-        // cudaEventElapsedTime(&milliseconds, start, stop);
+        // CPU code verification - lsolve for multiple RHS
+        cs_lsolve_mrhs  (N->L, rhs_t, n_rhs) ;      /* X = L\X  */
+        cs_ltsolve_mrhs (N->L, rhs_t, n_rhs) ;      /* X = L'\X */
 
-        // printf("GPU solve time for %d RHSs is: %f n", n_rhs, milliseconds);
+        // int i;
+        // int r;
+        // int errors = 0;
+        // for (i=0; i<n; i++) {
+        //     for (r = 0; r < n_rhs; r++) {
+        //         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
+        //             printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+        //             errors++;
+        //         }
+        //     }
+        // }
+        // printf("\n\n %d different elements between CPU and GPU. \n\n", errors);
+
+        // *** Debug - check with per-element output
+#ifdef DEBUG
+        int r;                              /* i is declared above */
+        cs_lsolve (N->L, x) ;               /* x = L\x  */
+        cs_ltsolve (N->L, x) ;              /* x = L'\x */
+        for (i=0; i<n; i++) {
+            printf("cpu: %f\t gpu:\t", x[i]);
+            for (r = 0; r < n_rhs; r++) {
+                if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) < 1e-12 )
+                    printf("OK\t");
+                else
+                    printf("Er\t");
+                // printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
+            }
+            printf("\n");
+        }
+        printf("\n");
+#else
+        cs_lsolve (N->L, x) ;               /* x = L\x  */
+        cs_ltsolve (N->L, x) ;              /* x = L'\x */
+#endif
+        // *** END - GPU code
+
+        cs_pvec (S->pinv, x, b, n) ;        /* b = P'*x */
+    }
+    cs_free (x) ;
+    cs_sfree (S) ;
+    cs_nfree (N) ;
+    return (ok) ;
+}
+
+
+csi cs_cholsol_single (csi order, const cs *A, double *b)
+{
+    printf("\n *** Running GPU version - kernel 1 with transposed RHS and coalesced memory access. \n");
+
+    int n_rhs_ratio = 6;
+    int n_rhs;
+    int blocks = 600;                       // cca 30 GB
+
+    double *x ;
+    css *S ;
+    csn *N ;
+    csi n, ok ;
+    if (!CS_CSC (A) || !b) return (0) ;     /* check inputs */
+    n = A->n ;
+    n_rhs = n / n_rhs_ratio;
+    S = cs_schol (order, A) ;               /* ordering and symbolic analysis */
+    N = cs_chol (A, S) ;                    /* numeric Cholesky factorization */
+    x = cs_malloc (n, sizeof (double)) ;    /* get workspace */
+    ok = (S && N && x) ;
+    if (ok)
+    {
+        cs_ipvec (S->pinv, b, x, n) ;       /* x = P*b */
+
+        // *** GPU code start
+        // - cs_lsolve (N->L, x) ;          /* x = L\x */
+
+        // Copy RHS vector to multiple columns
+        double *rhs_t;
+        rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;
+
+        int ri;
+        int ni;
+        for (ni = 0; ni < n; ni++) {
+#ifdef DEBUG
+            // printf("rhs[%d] %f - ", ni, x[ni]);
+#endif
+            for (ri = 0; ri < n_rhs; ri++) {
+                rhs_t[ni*n_rhs+ri] = x[ni];
+#ifdef DEBUG
+                // printf(" ri %d addr: %d %f - ", ri, ni*n_rhs+ri, rhs_t[ni*n_rhs+ri]);
+#endif
+            }
+#ifdef DEBUG
+            // printf("\n");
+#endif
+        }
+        // END - Copy RHS vector to multiple columns
+
+        // Copy Chol. factor and RHSs from CPU to GPU
+        int *d_Lp;
+        int *d_Li;
+        double *d_Lx;
+        double *d_x;
+
+        cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int)    );
+        cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int)    );
+        cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
+        cudaMalloc( &d_x , n * n_rhs     * sizeof(double) );
+
+        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_x , rhs_t   , n * n_rhs     * sizeof(double) , cudaMemcpyHostToDevice) ;
+
+        cs_lsolve_gpu_trans  (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
+        cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks);
 
         // Transfer data back to CPU if needed
         double *x_gpu;
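The CPU reference path above relies on `cs_lsolve_mrhs`/`cs_ltsolve_mrhs`, which are not part of this diff (nor of stock CSparse). Under the transposed layout used throughout — entry (row `j`, right-hand side `r`) stored at `x[j*n_rhs + r]`, so the `n_rhs` values of one matrix row are contiguous — a multi-RHS forward solve presumably looks like the following sketch, which mirrors the indexing of `cs_lsolve_gpu_kernel_trans_multi` on the host:

    #include "cs.h"

    /* Sketch: solve L*X = B for n_rhs right-hand sides stored interleaved,
     * X(j,r) at x [j*n_rhs + r]; same loop structure as CSparse cs_lsolve. */
    csi cs_lsolve_mrhs_sketch (const cs *L, double *x, csi n_rhs)
    {
        csi p, j, r, n, *Lp, *Li ;
        double *Lx ;
        if (!CS_CSC (L) || !x) return (0) ;
        n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
        for (j = 0 ; j < n ; j++)
        {
            for (r = 0 ; r < n_rhs ; r++)
                x [j*n_rhs + r] /= Lx [Lp [j]] ;      /* divide by diagonal */
            for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
                for (r = 0 ; r < n_rhs ; r++)         /* update rows below j */
                    x [Li [p]*n_rhs + r] -= Lx [p] * x [j*n_rhs + r] ;
        }
        return (1) ;
    }

The backward solve (`cs_ltsolve_mrhs`) would run `j` from `n-1` down to `0` and accumulate into row `j` instead, exactly as the GPU kernel `cs_ltsolve_gpu_kernel_trans_multi` does below.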
\n"); + + int n_rhs_ratio = 6; + int n_rhs; + int blocks=600; // cca 30 GB + + double *x ; + css *S ; + csn *N ; + csi n, ok ; + if (!CS_CSC (A) || !b) return (0) ; /* check inputs */ + n = A->n ; + n_rhs = n / n_rhs_ratio; + S = cs_schol (order, A) ; /* ordering and symbolic analysis */ + N = cs_chol (A, S) ; /* numeric Cholesky factorization */ + x = cs_malloc (n, sizeof (double)) ; /* get workspace */ + ok = (S && N && x) ; + if (ok) + { + cs_ipvec (S->pinv, b, x, n) ; /* x = P*b */ + + // *** GPU code start + // - cs_lsolve (N->L, x) ; /* x = L\x */ + + // Copy RHS vector to multiple columns + double *rhs_t; + rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; + + int ri; + int ni; + for (ni = 0; ni < n; ni++) { +#ifdef DEBUG + // printf("rhs[%d] %f - ",ni,x[ni]); +#endif + for (ri = 0; ri < n_rhs; ri++) { + rhs_t[ni*n_rhs+ri] = x[ni]; +#ifdef DEBUG + // printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); +#endif + } +#ifdef DEBUG + // printf("\n"); +#endif + } + // END - Copy RHS vector to multiple columns + + + // Copy Chol. factor and RHSs from CPU to GPU + int *d_Lp; + int *d_Li; + double *d_Lx; + double *d_x; + + cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); + cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); + cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); + cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); + + cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; + + cs_lsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); + cs_ltsolve_gpu_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, blocks); // Transfer data back to CPU if needed double *x_gpu; diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index 6de33e4ead46b9196bc61f8f201db1640946278c..966045ef35ae8c40109f15901e2839463b2509dd 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ b/CSparse/Source/cs_lsolve_gpu.cu @@ -78,6 +78,52 @@ void cs_lsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const doub } +__global__ +void cs_lsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const double **Lx_a, double **x_a, int n_rhs) +{ + + const int *Lp = Lp_a[blockIdx.x]; + const int *Li = Li_a[blockIdx.x]; + const double *Lx = Lx_a[blockIdx.x]; + double *x = x_a[blockIdx.x]; + + + int tid = threadIdx.x; // + blockIdx.x * blockDim.x; + int g_id = 0; + + for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x) + { + // printf("g_id = %d \n", g_id); + if ( g_id + tid < n_rhs ) { + + int p, j; + + int rhs_offset = g_id + tid; + + for (j = 0 ; j < n ; j++) + { + + int addr = j * n_rhs + rhs_offset; + + x [addr] = x [addr] / Lx [Lp [j]] ; + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + + x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ; + + // printf("GPU; gid=%d tid=%d Li[p]=%d addr=%d x=%f\n", g_id, tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); + + } + + } + + } + + } + +} + __global__ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs) { @@ -115,6 +161,47 @@ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const dou } +__global__ +void cs_ltsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const 
@@ -115,6 +161,47 @@ void cs_ltsolve_gpu_kernel_trans (int n, const int *Lp, const int *Li, const dou
 }
 
+__global__
+void cs_ltsolve_gpu_kernel_trans_multi (int n, const int **Lp_a, const int **Li_a, const double **Lx_a, double **x_a, int n_rhs)
+{
+    const int    *Lp = Lp_a[blockIdx.x];
+    const int    *Li = Li_a[blockIdx.x];
+    const double *Lx = Lx_a[blockIdx.x];
+    double       *x  = x_a [blockIdx.x];
+
+    int tid  = threadIdx.x;     // + blockIdx.x * blockDim.x;
+    int g_id = 0;
+
+    for (g_id = 0; g_id < n_rhs; g_id = g_id + blockDim.x)
+    {
+        // printf("g_id = %d \n", g_id);
+        if ( g_id + tid < n_rhs )
+        {
+            int p, j;
+            int rhs_offset = g_id + tid;
+
+            for (j = n-1 ; j >= 0 ; j--)
+            {
+                int addr = j * n_rhs + rhs_offset;
+
+                for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+                {
+                    x [addr] = x [addr] - Lx [p] * x [Li [p] * n_rhs + rhs_offset] ;
+                    // printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
+                }
+                x [addr] = x [addr] / Lx [Lp [j]] ;
+            }
+        }
+    }
+}
+
 extern "C" {
     void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
     {
@@ -142,6 +229,33 @@ extern "C" {
 }
 
+extern "C" {
+    void cs_lsolve_gpu_trans_multi (int n, const int **Lp, const int **Li, const double **Lx, double **x, int n_rhs, int threads, int blocks)
+    {
+        if (threads > n_rhs) threads = n_rhs;
+        if (threads > MAX_THREADS) threads = MAX_THREADS;
+
+        printf("Running kernel ** cs_lsolve_gpu_trans_multi ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+
+        cudaEventRecord(start);
+        cs_lsolve_gpu_kernel_trans_multi <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
+        cudaEventRecord(stop);
+
+        cudaEventSynchronize(stop);
+        float milliseconds = 0;
+        cudaEventElapsedTime(&milliseconds, start, stop);
+
+        printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+    }
+}
+
 extern "C" {
     void cs_ltsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
     {
@@ -169,6 +283,51 @@ extern "C" {
 }
 
+extern "C" {
+    void cs_ltsolve_gpu_trans_multi (int n, const int **Lp, const int **Li, const double **Lx, double **x, int n_rhs, int threads, int blocks)
+    {
+        if (threads > n_rhs) threads = n_rhs;
+        if (threads > MAX_THREADS) threads = MAX_THREADS;
+
+        printf("Running kernel ** cs_ltsolve_gpu_trans_multi ** with %d BLOCKS and %d THREADS - %d GROUPS. \n", blocks, threads, (int)ceil((double)n_rhs/(double)threads));
+
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+
+        cudaEventRecord(start);
+        cs_ltsolve_gpu_kernel_trans_multi <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
+        cudaEventRecord(stop);
+
+        cudaEventSynchronize(stop);
+        float milliseconds = 0;
+        cudaEventElapsedTime(&milliseconds, start, stop);
+
+        printf("GPU solve time for %d RHSs is: %f ms \n", n_rhs, milliseconds);
+    }
+}
+
+
 // *** OLD functions
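One gap worth noting: nothing in this patch releases the GPU working set — the per-block buffers behind `h_array_*`, the pointer tables `d_array_*`, the single-copy buffers `d_Lp`/`d_Li`/`d_Lx`/`d_x`, or the host buffers `rhs_t` and `x_gpu` — so every call to `cs_cholsol` leaks roughly `blocks` copies of the factor. A teardown sketch for the multi-copy path, assuming the `h_array_*` host tables are still in scope at the end of `cs_cholsol`:

    /* Sketch: free the per-block device buffers, then the device pointer
     * tables, then the host-side tables and work arrays. */
    for (i = 0; i < num_of_arrays; i++)
    {
        cudaFree (h_array_Lp[i]);
        cudaFree (h_array_Li[i]);
        cudaFree (h_array_Lx[i]);
        cudaFree (h_array_x [i]);
    }
    cudaFree (d_array_Lp); cudaFree (d_array_Li);
    cudaFree (d_array_Lx); cudaFree (d_array_x );
    free (h_array_Lp); free (h_array_Li);
    free (h_array_Lx); free (h_array_x );
    cs_free (rhs_t);
    cs_free (x_gpu);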