diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index 59bd4ee51c02b1bc14277357bea34e8df2ed13b3..aeafe9ef1c9b7ea53f4f6395edc6da5681c375a8 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -51,7 +51,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
     int    blocks               = total_num_of_LSC_perGPU;
 
-    blocks = 1; 
+    blocks = 2;   // hard-coded override of the total_num_of_LSC_perGPU value assigned above
     n_rhs  = 2; 
 
 
@@ -97,9 +97,9 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         // END - Copy RHS vector to multiple columns 
 
-// #define FULL_MEM
+#define FULL_MEM
 #ifdef  FULL_MEM
-
+    {
         int GPU_mem = 0; 
 
         int num_of_arrays = blocks;
@@ -134,6 +134,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
 
         // allocate each device row-pointer, then copy host data to it
+    
         int i; 
         for(i = 0 ; i < num_of_arrays ; i++){
             cudaMalloc(&h_array_Lp[i],           ((N->L->n)+1) * sizeof(int));
@@ -171,8 +172,18 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
 
-#else 
 
+        double** x_gpu_array  = (double**)malloc(num_of_arrays * sizeof(double*));
+        for(i = 0 ; i < num_of_arrays ; i++) {
+            x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ;
+            cudaMemcpy(x_gpu_array[i], h_array_x [i], n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
+        }
+
+        double *x_gpu = x_gpu_array[1]; 
+
+    }
+#else 
+    {
         // *** Vesion with 
         // Copy Chol. factor and RHSs from CPU to GPU 
         int    *d_Lp;
@@ -197,7 +208,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         double *x_gpu;
         x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
         cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
-
+    }
 #endif