diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 2b1cf26e8ef46c7f0cc33e76826db46f3927938d..26aa709e9c17ef377242636a50ebc4903448ee0e 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -6,9 +6,10 @@ LDLIBS += -lm --cudart shared
 CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
 
 all: lib cs_demo1 cs_demo2 cs_demo3
-	- ./cs_demo2 < ../Matrix/t1
-	- ./cs_demo2 < ../Matrix/ash219
+#	- ./cs_demo2 < ../Matrix/t1
+#	- ./cs_demo2 < ../Matrix/ash219
 	- ./cs_demo2 < ../Matrix/bcsstk01
+
 lib:
 	( cd ../Lib ; $(MAKE) )
 
diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c
index 7b95a972f822b7c8e1005eebc78e91c384cf5b16..a76fefbfc3be335568de31bb222d300f0fb303ec 100644
--- a/CSparse/Demo/cs_demo.c
+++ b/CSparse/Demo/cs_demo.c
@@ -170,7 +170,7 @@ csi demo2 (problem *Prob)
     {
         if (!order && m > 1000) continue ;
         printf ("Chol \n") ;
-        //print_order (order) ;
+        print_order (order) ;
         rhs (x, b, m) ;                         /* compute right-hand side */
         t = tic () ;
         ok = cs_cholsol (order, C, x) ;         /* solve Ax=b with Cholesky */
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index f8f42e295073d144499080f6df4b1e6c54e846b4..7961d785152635a267e0c90fab19a47aceaef5ee 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -57,7 +57,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         //     csi nz ;        /* # of entries in triplet matrix, -1 for compressed-col */
         // } cs ;
 
-        int n_rhs = 7; 
+        int n_rhs = 4; 
         
         double *rhs_t;
         rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   
@@ -65,11 +65,16 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         int ri;
         int ni;
         for (ni = 0; ni < n; ni++) {
+            printf("rhs[%d] %f - ",ni,x[ni]); 
             for (ri = 0; ri < n_rhs; ri++) {
                 rhs_t[ni*n_rhs+ri] = x[ni];
+                printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); 
             }
+            printf("\n");
         }
 
+
+
         int    *d_Lp;
         int    *d_Li;
         double *d_Lx;
@@ -92,12 +97,13 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         //     cudaMemcpy( d_x + rr * n , x       ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
         // }
         
-        cudaMemcpy( d_x , rhs_t ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
+        cudaMemcpy( d_x , rhs_t ,  n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ;
 
         // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x, n_rhs);
-        //    void cs_lsolve_acc (int n, const int *Lp, const int *Lissssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss, const double *Lx, double *x, int n_rhs, int threads, int blocks)
+        //    void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
 
         cs_lsolve_acc_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
+        //cs_lsolve_acc     (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
 
         double *x_gpu;
         x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
@@ -106,17 +112,32 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
         cs_lsolve (N->L, x) ;           /* x = L\x */
 
+
+        // int i; 
+        // int r; 
+        // for (i=0; i<n; i++) {
+        //     printf("cpu: %f\t gpu:\t", x[i]); 
+        //     for (r = 0; r < n_rhs; r++) {
+        //         printf("%f\t", x_gpu[n*r + i] -x[i] );
+        //     }
+        //     printf("\n");
+        // }
+        // printf("\n");
+
+
         int i; 
         int r; 
         for (i=0; i<n; i++) {
             printf("cpu: %f\t gpu:\t", x[i]); 
             for (r = 0; r < n_rhs; r++) {
-                printf("%f\t", x_gpu[i*n_rhs + r] );
+                printf("%f\t", x_gpu[i*n_rhs + r] - x[i] );
             }
             printf("\n");
         }
         printf("\n");
 
+        exit(0);
+
         cs_ltsolve (N->L, x) ;          /* x = L'\x */
         cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
     }
diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu
index d068ba1ae3b485c3181d9a9c3428f9bf3741f1be..c6c6ab05608301d325888b3d1c7ac217c1dcfcdc 100644
--- a/CSparse/Source/cs_lsolve_gpu.cu
+++ b/CSparse/Source/cs_lsolve_gpu.cu
@@ -91,19 +91,44 @@ void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx,
         // Li = L->i ; 
         // Lx = L->x ;
         
-        int rhs_offset = tid;; 
+
+
+// csi cs_lsolve (const cs *L, double *x)
+// {
+//   csi p, j, n, *Lp, *Li ;
+//    double *Lx ;
+//    if (!CS_CSC (L) || !x) return (0) ;                     /* check inputs */
+//    n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ;
+   
+
+//    for (j = 0 ; j < n ; j++)
+//    {
+//        x [j] /= Lx [Lp [j]] ;
+//        for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+//        {
+//            x [Li [p]] -= Lx [p] * x [j] ;
+//        }
+//    }
+
+
+//    return (1) ;
+// }
+
+        int rhs_offset = tid;
 
         for (j = 0 ; j < n ; j++)
         {
             
-            x [rhs_offset + j * n_rhs]  = x [rhs_offset + j * n_rhs] / Lx [Lp [j]] ;
+            int addr = j * n_rhs + rhs_offset;
+
+            x [addr]  = x [addr] / Lx [Lp [j]] ;
             
             for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
             {
                 
-                x [rhs_offset + Li[p] * n_rhs] = x [rhs_offset + Li[p] * n_rhs] - Lx [p] * x [rhs_offset + j * n_rhs] ;
+                x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ;
                 
-                //printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
+                printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
             
             }