diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 55010dd275b56ca446f4ccfce4a97af2abb4b150..d3c84d7d1662c7cfd462d66e0b69a04cf633155d 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -8,11 +8,11 @@ CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
 all: lib cs_demo1 cs_demo2 cs_demo3
 #	- ./cs_demo2 < ../Matrix/t1
 # 	- ./cs_demo2 < ../Matrix/FEM-2S
-	- ./cs_demo2 < ../Matrix/bcsstk26-v2
-	- ./cs_demo2 < ../Matrix/bcsstk25
-	- ./cs_demo2 < ../Matrix/bcsstk17
-	- ./cs_demo2 < ../Matrix/ship_001
-# 	- ./cs_demo2 < ../Matrix/bcsstk01
+# 	- ./cs_demo2 < ../Matrix/bcsstk26-v2
+# 	- ./cs_demo2 < ../Matrix/bcsstk25
+# 	- ./cs_demo2 < ../Matrix/bcsstk17
+# 	- ./cs_demo2 < ../Matrix/ship_001
+	- ./cs_demo2 < ../Matrix/bcsstk01
 # 	- ./cs_demo2 < ../Matrix/crystm02
 #  	- ./cs_demo2 < ../Matrix/bcsstk26-v2
 
diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c
index 95d225e4f91b4d6374f7498f695b9135bb62c82e..abcd210e80d8a134570093aa178130bc63f0eabf 100644
--- a/CSparse/Demo/cs_demo.c
+++ b/CSparse/Demo/cs_demo.c
@@ -83,7 +83,7 @@ problem *get_problem (FILE *f, double tol)
     problem *Prob ;
     Prob = cs_calloc (1, sizeof (problem)) ;
     if (!Prob) return (NULL) ;
-    T = cs_load2 (f) ;                   /* load triplet matrix T from a file */
+    T = cs_load (f) ;                   /* load triplet matrix T from a file */
     Prob->A = A = cs_compress (T) ;     /* A = compressed-column form of T */
     cs_spfree (T) ;                     /* clear T */
     if (!cs_dupl (A)) return (free_problem (Prob)) ; /* sum up duplicates */
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index aa4afc8a2081d19360fda4a491df34cc397a71f5..1a847e583803f632581215db86c531850bdd7b3f 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -16,7 +16,7 @@
 
 LIBRARY = libcsparse
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
-CC = nvcc -dc -g -G -O0 --cudart shared # -DDEBUG
+CC = nvcc -dc -g -G -O0 --cudart shared -DDEBUG
 I = -I../Include
 RANLIB = ranlib
 ARCHIVE = $(AR) $(ARFLAGS)
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index c77f2ef01543ef235740609a81918c88043ae2e6..59bd4ee51c02b1bc14277357bea34e8df2ed13b3 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -52,7 +52,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int    blocks               = total_num_of_LSC_perGPU;
 
     blocks = 1; 
-    n_rhs  = 1; 
+    n_rhs  = 2; 
 
 
     printf(" - K to LSC size ratio is : %f - cube size is %f \n", n_rhs_ratio, cube_size);
@@ -97,7 +97,9 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         }
         // END - Copy RHS vector to multiple columns 
 
-/*
+// #define FULL_MEM
+#ifdef  FULL_MEM
+
         int GPU_mem = 0; 
 
         int num_of_arrays = blocks;
@@ -168,9 +170,10 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
         cs_lsolve_gpu_trans_multi  (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
         cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
-*/
 
+#else 
 
+        // *** Version without FULL_MEM (the #else branch of the FULL_MEM switch above)
         // Copy Chol. factor and RHSs from CPU to GPU 
         int    *d_Lp;
         int    *d_Li;
@@ -193,32 +196,24 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         // Transfer data back to CPU if needed 
         double *x_gpu;
         x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
-        // cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
+        cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
+
+#endif
+
 
         // CPU code verification - lsolve for multiple RSH 
         cs_lsolve_mrhs  (N->L, rhs_t, n_rhs);    /* X = L\X */
 	    cs_ltsolve_mrhs (N->L, rhs_t, n_rhs);    /* X = L\X */
-
-        // int i; 
-        // int r; 
-        // int errors = 0; 
-        // for (i=0; i<n; i++) {
-        //     for (r = 0; r < n_rhs; r++) {
-        //         if ( fabs(x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r]) > 1e-12 ) {
-        //             printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
-        //             errors++; 
-        //         }
-        //     }
-        // }
-        // printf("\n\n %d different elements between CPU and GPU. \n\n", errors);
-
-        
+ 
         // *** Debug - check with per element output 
 #ifdef DEBUG
-        // int i; 
-        // int r; 
-        cs_lsolve (N->L, x) ;           /* x = L\x */
+        {
+        int i; 
+        int r; 
+        
+        cs_lsolve  (N->L, x) ;          /* x = L\x */
         cs_ltsolve (N->L, x) ;          /* x = L'\x */
+        
         for (i=0; i<n; i++) {
             printf("cpu: %f\t gpu:\t", x[i]); 
             for (r = 0; r < n_rhs; r++) {
@@ -226,14 +221,15 @@ csi cs_cholsol (csi order, const cs *A, double *b)
                     printf("OK\t");
                 else
                     printf("Er\t"); 
-                //    printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
+                //  printf("%f\t", x_gpu[i*n_rhs + r] - rhs_t[i*n_rhs + r] );
                 printf("%f %f \t", x_gpu[i*n_rhs + r], rhs_t[i*n_rhs + r] );
             }
             printf("\n");
         }
         printf("\n");
+        }
 #else 
-        cs_lsolve (N->L, x) ;           /* x = L\x */        
+        cs_lsolve  (N->L, x) ;          /* x = L\x */        
         cs_ltsolve (N->L, x) ;          /* x = L'\x */
 #endif
 
@@ -345,8 +341,8 @@ csi cs_cholsol_single (csi order, const cs *A, double *b)
         
         // *** Debug - check with per element output 
 #ifdef DEBUG
-        // int i; 
-        // int r; 
+        int i; 
+        int r; 
         cs_lsolve (N->L, x) ;           /* x = L\x */
         cs_ltsolve (N->L, x) ;          /* x = L'\x */
         for (i=0; i<n; i++) {