diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile
index 5062ceccf9c7400f120d6d74175a665fbfdd6ba2..2b1cf26e8ef46c7f0cc33e76826db46f3927938d 100644
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
@@ -1,13 +1,14 @@
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
 I = -I../Include
-CC = /usr/local/cuda/bin/nvcc -g -G -O0
+CC = nvcc -g -G -O0
 
-LDLIBS += -lm
+LDLIBS += -lm --cudart shared
 CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
 
 all: lib cs_demo1 cs_demo2 cs_demo3
 	- ./cs_demo2 < ../Matrix/t1
-
+	- ./cs_demo2 < ../Matrix/ash219
+	- ./cs_demo2 < ../Matrix/bcsstk01
 lib:
 	( cd ../Lib ; $(MAKE) )
 
diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c
index 2d520001275e8390b21afec60a42aadd2606d0f1..7b95a972f822b7c8e1005eebc78e91c384cf5b16 100644
--- a/CSparse/Demo/cs_demo.c
+++ b/CSparse/Demo/cs_demo.c
@@ -142,30 +142,30 @@ csi demo2 (problem *Prob)
     printf ("blocks: %g singletons: %g structural rank: %g\n",
         (double) nb, (double) ns, (double) sprank) ;
     cs_dfree (D) ;
-    // for (order = 0 ; order <= 3 ; order += 3)   /* natural and amd(A'*A) */
-    // {
-    //     if (!order && m > 1000) continue ;
-    //     printf ("QR   ") ;
-    //     print_order (order) ;
-    //     rhs (x, b, m) ;                         /* compute right-hand side */
-    //     t = tic () ;
-    //     ok = cs_qrsol (order, C, x) ;           /* min norm(Ax-b) with QR */
-    //     printf ("time: %8.2f ", toc (t)) ;
-    //     print_resid (ok, C, x, b, resid) ;      /* print residual */
-    // }
-    // if (m != n || sprank < n) return (1) ;      /* return if rect. or singular*/
-    // for (order = 0 ; order <= 3 ; order++)      /* try all orderings */
-    // {
-    //     if (!order && m > 1000) continue ;
-    //     printf ("LU   ") ;
-    //     print_order (order) ;
-    //     rhs (x, b, m) ;                         /* compute right-hand side */
-    //     t = tic () ;
-    //     ok = cs_lusol (order, C, x, tol) ;      /* solve Ax=b with LU */
-    //     printf ("time: %8.2f ", toc (t)) ;
-    //     print_resid (ok, C, x, b, resid) ;      /* print residual */
-    // }
-    // if (!Prob->sym) return (1) ;
+    for (order = 0 ; order <= 3 ; order += 3)   /* natural and amd(A'*A) */
+    {
+        if (!order && m > 1000) continue ;
+        printf ("QR   ") ;
+        print_order (order) ;
+        rhs (x, b, m) ;                         /* compute right-hand side */
+        t = tic () ;
+        ok = cs_qrsol (order, C, x) ;           /* min norm(Ax-b) with QR */
+        printf ("time: %8.2f ", toc (t)) ;
+        print_resid (ok, C, x, b, resid) ;      /* print residual */
+    }
+    if (m != n || sprank < n) return (1) ;      /* return if rect. or singular*/
+    for (order = 0 ; order <= 3 ; order++)      /* try all orderings */
+    {
+        if (!order && m > 1000) continue ;
+        printf ("LU   ") ;
+        print_order (order) ;
+        rhs (x, b, m) ;                         /* compute right-hand side */
+        t = tic () ;
+        ok = cs_lusol (order, C, x, tol) ;      /* solve Ax=b with LU */
+        printf ("time: %8.2f ", toc (t)) ;
+        print_resid (ok, C, x, b, resid) ;      /* print residual */
+    }
+    if (!Prob->sym) return (1) ;
     for (order = 0 ; order <= 1 ; order++)      /* natural and amd(A+A') */
     {
         if (!order && m > 1000) continue ;
diff --git a/CSparse/Include/cs.h b/CSparse/Include/cs.h
index 82d70b031c139663d1d143bf5f79855bd4ec19ea..a986bf5d28d8b73ff62f3c45cf9d835e7b454373 100644
--- a/CSparse/Include/cs.h
+++ b/CSparse/Include/cs.h
@@ -19,7 +19,7 @@
 #define csi mwSignedIndex
 #endif
 #ifndef csi
-#define csi ptrdiff_t
+#define csi int //ptrdiff_t
 #endif
 
 /* --- primary CSparse routines and data structures ------------------------- */
diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile
index 930641849b3cf57593b66e4797075d4f3a428681..524ece857044aede10a8db0647a9fb35efbfe6eb 100644
--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -16,7 +16,7 @@
 
 LIBRARY = libcsparse
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
-CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0
+CC = nvcc -dc -g -G -O0 --cudart shared
 I = -I../Include
 RANLIB = ranlib
 ARCHIVE = $(AR) $(ARFLAGS)
diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index 70c80f900208e2ab76eebc8e9ea28e053e34dbcd..f8f42e295073d144499080f6df4b1e6c54e846b4 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -56,37 +56,64 @@ csi cs_cholsol (csi order, const cs *A, double *b)
         //     double *x ;     /* numerical values, size nzmax */
         //     csi nz ;        /* # of entries in triplet matrix, -1 for compressed-col */
         // } cs ;
+
+        int n_rhs = 7; 
         
+        double *rhs_t;
+        rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;   
+
+        int ri;
+        int ni;
+        for (ni = 0; ni < n; ni++) {
+            for (ri = 0; ri < n_rhs; ri++) {
+                rhs_t[ni*n_rhs+ri] = x[ni];
+            }
+        }
+
         int    *d_Lp;
         int    *d_Li;
         double *d_Lx;
         double *d_x;
 
-        cudaMalloc(&d_Lp, ((N->L->n)+1)  * sizeof(int)    );
-        cudaMalloc(&d_Li,  (N->L->nzmax) * sizeof(int)    );
-        cudaMalloc(&d_Lx,  (N->L->nzmax) * sizeof(double) );
-        cudaMalloc(&d_x ,  n             * sizeof(double) );
+        //cudaSetDevice(1); 	
+        
+
+        cudaMalloc( &d_Lp, ((N->L->n)+1)  * sizeof(int)    );
+        cudaMalloc( &d_Li,  (N->L->nzmax) * sizeof(int)    );
+        cudaMalloc( &d_Lx,  (N->L->nzmax) * sizeof(double) );
+        cudaMalloc( &d_x ,  n * n_rhs     * sizeof(double) );
 
         cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
         cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
         cudaMemcpy( d_Lx, N->L->x ,  (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
-        cudaMemcpy( d_x , x       ,  n             * sizeof(double) , cudaMemcpyHostToDevice) ;
+        
+        // int rr;
+        // for (rr = 0; rr < n_rhs; rr++) {
+        //     cudaMemcpy( d_x + rr * n , x       ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
+        // }
+        
+        cudaMemcpy( d_x , rhs_t ,  n * sizeof(double) , cudaMemcpyHostToDevice) ;
 
+        // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x, n_rhs);
+        //    void cs_lsolve_acc (int n, const int *Lp, const int *Lissssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss, const double *Lx, double *x, int n_rhs, int threads, int blocks)
 
-        // csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
-        // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x);
-        cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x);
+        cs_lsolve_acc_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
 
         double *x_gpu;
-        x_gpu = cs_malloc (n, sizeof (double)) ;   
-        cudaMemcpy(x_gpu, d_x, n * sizeof(double), cudaMemcpyDeviceToHost );
+        x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;   
+        cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
 
 
         cs_lsolve (N->L, x) ;           /* x = L\x */
 
         int i; 
+        int r; 
         for (i=0; i<n; i++) {
-            printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] );
+            printf("cpu: %f\t gpu:\t", x[i]); 
+            for (r = 0; r < n_rhs; r++) {
+                printf("%f\t", x_gpu[i*n_rhs + r] );
+            }
+            printf("\n");
         }
         printf("\n");
 
diff --git a/CSparse/Source/cs_lsolve.c b/CSparse/Source/cs_lsolve.c
index 442ab38b6eb5bcde6ebae61be10d5a422b0765ce..4fcc689d09b887000ed0097c8a83c365c96d48e3 100644
--- a/CSparse/Source/cs_lsolve.c
+++ b/CSparse/Source/cs_lsolve.c
@@ -12,7 +12,7 @@ csi cs_lsolve (const cs *L, double *x)
         for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
         {
             x [Li [p]] -= Lx [p] * x [j] ;
-            printf("XPU: %f", x [Li [p]]);
+            //printf("XPU: %f", x [Li [p]]);
 
         }
     }
diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu
index c21ff84f7974f0efe2b4618cc46d6fa8dd6ff8ba..d068ba1ae3b485c3181d9a9c3428f9bf3741f1be 100644
--- a/CSparse/Source/cs_lsolve_gpu.cu
+++ b/CSparse/Source/cs_lsolve_gpu.cu
@@ -38,35 +38,92 @@
 
 
 __global__
-void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
+void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs)
 {
-    int p, j;
     
-    // double *Lx ;
-    // n = L->n ; 
-    // Lp = L->p ; 
-    // Li = L->i ; 
-    // Lx = L->x ;
-      
-    for (j = 0 ; j < n ; j++)
-    {
-        x [j] /= Lx [Lp [j]] ;
-        for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+    int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
+
+    if (tid < n_rhs) {
+
+        int p, j;
+        
+        // double *Lx ;
+        // n = L->n ; 
+        // Lp = L->p ; 
+        // Li = L->i ; 
+        // Lx = L->x ;
+        
+        int rhs_offset = tid * n; 
+
+        for (j = 0 ; j < n ; j++)
         {
-            x [Li [p]] -= Lx [p] * x [j] ;
-            printf("GPU: %f", x [Li [p]]);
+            
+            x [rhs_offset + j]  = x [rhs_offset + j] / Lx [Lp [j]] ;
+            
+            for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+            {
+                
+                x [rhs_offset + Li [p]] = x [rhs_offset + Li [p]] - Lx [p] * x [rhs_offset + j] ;
+                
+                printf("GPU; tid=%d Li[p]=%d addr=%d %f\n", tid, Li [p], rhs_offset + Li [p],x [rhs_offset + Li [p]]);
+            
+            }
+
         }
+           
     }
-    
-    //return (1);
+
 }
 
+__global__
+void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs)
+{
+    
+    int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
+
+    if (tid < n_rhs) {
 
+        int p, j;
+        
+        // double *Lx ;
+        // n = L->n ; 
+        // Lp = L->p ; 
+        // Li = L->i ; 
+        // Lx = L->x ;
+        
+        int rhs_offset = tid;; 
 
+        for (j = 0 ; j < n ; j++)
+        {
+            
+            x [rhs_offset + j * n_rhs]  = x [rhs_offset + j * n_rhs] / Lx [Lp [j]] ;
+            
+            for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+            {
+                
+                x [rhs_offset + Li[p] * n_rhs] = x [rhs_offset + Li[p] * n_rhs] - Lx [p] * x [rhs_offset + j * n_rhs] ;
+                
+                //printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
+            
+            }
+
+        }
+           
+    }
+
+}
+
+
+extern "C" {
+    void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
+    {
+      cs_lsolve_gpu <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
+    }
+}
 
 extern "C" {
-    void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x)
+    void cs_lsolve_acc_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
     {
-      cs_lsolve_gpu <<<1, 1>>> (n, Lp, Li, Lx, x);
+      cs_lsolve_gpu_trans <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
     }
 }