update

d2be194c · Lubomir Riha · 3da58453 · d2be194c · d2be194c · d2be194c
Commit d2be194c authored 6 years ago by Lubomir Riha
--- a/CSparse/Demo/Makefile
+++ b/CSparse/Demo/Makefile
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
 I = -I../Include
+CC = /usr/local/cuda/bin/nvcc -g -G -O0

 LDLIBS += -lm
-CS = $(LDFLAGS) ../Lib/libcsparse.dylib $(LDLIBS)
+CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)

 all: lib cs_demo1 cs_demo2 cs_demo3
-	- ./cs_demo1 < ../Matrix/t1
 	- ./cs_demo2 < ../Matrix/t1
-	- ./cs_demo2 < ../Matrix/ash219
-	- ./cs_demo2 < ../Matrix/bcsstk01
-	- ./cs_demo2 < ../Matrix/fs_183_1
-	- ./cs_demo2 < ../Matrix/mbeacxc
-	- ./cs_demo2 < ../Matrix/west0067
-	- ./cs_demo2 < ../Matrix/lp_afiro
-	- ./cs_demo2 < ../Matrix/bcsstk16
-	- ./cs_demo3 < ../Matrix/bcsstk01
-	- ./cs_demo3 < ../Matrix/bcsstk16

 lib:
 	( cd ../Lib ; $(MAKE) )

--- a/CSparse/Demo/cs_demo.c
+++ b/CSparse/Demo/cs_demo.c
@@ -142,35 +142,35 @@ csi demo2 (problem *Prob)
    printf ("blocks: %g singletons: %g structural rank: %g\n",
        (double) nb, (double) ns, (double) sprank) ;
    cs_dfree (D) ;
-    for (order = 0 ; order <= 3 ; order += 3)   /* natural and amd(A'*A) */
-    {
-        if (!order && m > 1000) continue ;
-        printf ("QR   ") ;
-        print_order (order) ;
-        rhs (x, b, m) ;                         /* compute right-hand side */
-        t = tic () ;
-        ok = cs_qrsol (order, C, x) ;           /* min norm(Ax-b) with QR */
-        printf ("time: %8.2f ", toc (t)) ;
-        print_resid (ok, C, x, b, resid) ;      /* print residual */
-    }
-    if (m != n || sprank < n) return (1) ;      /* return if rect. or singular*/
-    for (order = 0 ; order <= 3 ; order++)      /* try all orderings */
-    {
-        if (!order && m > 1000) continue ;
-        printf ("LU   ") ;
-        print_order (order) ;
-        rhs (x, b, m) ;                         /* compute right-hand side */
-        t = tic () ;
-        ok = cs_lusol (order, C, x, tol) ;      /* solve Ax=b with LU */
-        printf ("time: %8.2f ", toc (t)) ;
-        print_resid (ok, C, x, b, resid) ;      /* print residual */
-    }
-    if (!Prob->sym) return (1) ;
+    // for (order = 0 ; order <= 3 ; order += 3)   /* natural and amd(A'*A) */
+    // {
+    //     if (!order && m > 1000) continue ;
+    //     printf ("QR   ") ;
+    //     print_order (order) ;
+    //     rhs (x, b, m) ;                         /* compute right-hand side */
+    //     t = tic () ;
+    //     ok = cs_qrsol (order, C, x) ;           /* min norm(Ax-b) with QR */
+    //     printf ("time: %8.2f ", toc (t)) ;
+    //     print_resid (ok, C, x, b, resid) ;      /* print residual */
+    // }
+    // if (m != n || sprank < n) return (1) ;      /* return if rect. or singular*/
+    // for (order = 0 ; order <= 3 ; order++)      /* try all orderings */
+    // {
+    //     if (!order && m > 1000) continue ;
+    //     printf ("LU   ") ;
+    //     print_order (order) ;
+    //     rhs (x, b, m) ;                         /* compute right-hand side */
+    //     t = tic () ;
+    //     ok = cs_lusol (order, C, x, tol) ;      /* solve Ax=b with LU */
+    //     printf ("time: %8.2f ", toc (t)) ;
+    //     print_resid (ok, C, x, b, resid) ;      /* print residual */
+    // }
+    // if (!Prob->sym) return (1) ;
    for (order = 0 ; order <= 1 ; order++)      /* natural and amd(A+A') */
    {
        if (!order && m > 1000) continue ;
-        printf ("Chol ") ;
-        print_order (order) ;
+        printf ("Chol \n") ;
+        //print_order (order) ;
        rhs (x, b, m) ;                         /* compute right-hand side */
        t = tic () ;
        ok = cs_cholsol (order, C, x) ;         /* solve Ax=b with Cholesky */

--- a/CSparse/Include/cs.h
+++ b/CSparse/Include/cs.h
@@ -36,6 +36,7 @@ typedef struct cs_sparse    /* matrix in compressed-column or triplet form */

 cs *cs_add (const cs *A, const cs *B, double alpha, double beta) ;
 csi cs_cholsol (csi order, const cs *A, double *b) ;
+csi cs_cholsol_gpu (csi order, const cs *A, double *b) ;
 cs *cs_compress (const cs *T) ;
 csi cs_dupl (cs *A) ;
 csi cs_entry (cs *T, csi i, csi j, double x) ;
@@ -56,6 +57,10 @@ cs *cs_spfree (cs *A) ;
 csi cs_sprealloc (cs *A, csi nzmax) ;
 void *cs_malloc (csi n, size_t size) ;

+//void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x);
+//void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x);
+
+
 /* --- secondary CSparse routines and data structures ----------------------- */
 typedef struct cs_symbolic  /* symbolic Cholesky, LU, or QR analysis */
 {

--- a/CSparse/Lib/Makefile
+++ b/CSparse/Lib/Makefile
@@ -16,7 +16,7 @@

 LIBRARY = libcsparse
 CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
-
+CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0
 I = -I../Include
 RANLIB = ranlib
 ARCHIVE = $(AR) $(ARFLAGS)
@@ -29,7 +29,7 @@ all: install
 CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \
 	cs_droptol.o cs_dropzeros.o cs_dupl.o cs_entry.o \
 	cs_etree.o cs_fkeep.o cs_gaxpy.o cs_happly.o cs_house.o cs_ipvec.o \
-	cs_lsolve.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
+	cs_lsolve.o cs_lsolve_gpu.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
 	cs_permute.o cs_pinv.o cs_post.o cs_pvec.o cs_qr.o cs_qrsol.o \
 	cs_scatter.o cs_schol.o cs_sqr.o cs_symperm.o cs_tdfs.o cs_malloc.o \
 	cs_transpose.o cs_compress.o cs_usolve.o cs_utsolve.o cs_scc.o \
@@ -38,7 +38,7 @@ CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \

 $(CS): ../Include/cs.h Makefile

-%.o: ../Source/%.c ../Include/cs.h
+%.o: ../Source/%.c* ../Include/cs.h
 	$(CC) $(CF) $(I) -c $<

 static: $(AR_TARGET)

--- a/CSparse/Lib/a.out
+++ b/CSparse/Lib/a.out
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
 #include "cs.h"
+#include "cuda_runtime.h"
+
+
 /* x=A\b where A is symmetric positive definite; b overwritten with solution */
-csi cs_cholsol (csi order, const cs *A, double *b)
+csi cs_cholsol_xxx (csi order, const cs *A, double *b)
 {
    double *x ;
    css *S ;
@@ -26,7 +29,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 }


-csi cs_cholsol_gpu (csi order, const cs *A, double *b)
+csi cs_cholsol (csi order, const cs *A, double *b)
 {
    double *x ;
    css *S ;
@@ -59,10 +62,10 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
        double *d_Lx;
        double *d_x;

-        cudaMalloc(&d_Lp, ((N->L->n)+1)  * sizeof(int)    ;
-        cudaMalloc(&d_Li,  (N->L->nzmax) * sizeof(int)    ;
-        cudaMalloc(&d_Lx,  (N->L->nzmax) * sizeof(double) ;
-        cudaMalloc(&d_x ,  n             * sizeof(double) ;
+        cudaMalloc(&d_Lp, ((N->L->n)+1)  * sizeof(int)    );
+        cudaMalloc(&d_Li,  (N->L->nzmax) * sizeof(int)    );
+        cudaMalloc(&d_Lx,  (N->L->nzmax) * sizeof(double) );
+        cudaMalloc(&d_x ,  n             * sizeof(double) );

        cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1)  * sizeof(int)    , cudaMemcpyHostToDevice) ;
        cudaMemcpy( d_Li, N->L->i ,  (N->L->nzmax) * sizeof(int)    , cudaMemcpyHostToDevice) ;
@@ -72,7 +75,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)

        // csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
        // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x);
-        cs_lsolve_gpu <<<1, 1>>> (n, d_Lp, d_Li, d_Lx, d_x);
+        cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x);

        double *x_gpu;
        x_gpu = cs_malloc (n, sizeof (double)) ;   
@@ -85,6 +88,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
        for (i=0; i<n; i++) {
            printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] );
        }
+        printf("\n");

        cs_ltsolve (N->L, x) ;          /* x = L'\x */
        cs_pvec (S->pinv, x, b, n) ;    /* b = P'*x */
@@ -93,4 +97,4 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
    cs_sfree (S) ;
    cs_nfree (N) ;
    return (ok) ;
-}
\ No newline at end of file
+}
--- a/CSparse/Source/cs_lsolve.c
+++ b/CSparse/Source/cs_lsolve.c
@@ -12,107 +12,112 @@ csi cs_lsolve (const cs *L, double *x)
        for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
        {
            x [Li [p]] -= Lx [p] * x [j] ;
+            printf("XPU: %f", x [Li [p]]);
+
        }
    }
    return (1) ;
 }


-/* solve Lx=b where x and b are dense.  x=b on input, solution on output. */
+// /* solve Lx=b where x and b are dense.  x=b on input, solution on output. */

-//csi cs_lsolve_gpu (const cs *L, double *x)
-// typedef struct cs_sparse    /* matrix in compressed-column or triplet form */
-// {
-//     csi nzmax ;     /* maximum number of entries */
-//     csi m ;         /* number of rows */
-//     csi n ;         /* number of columns */
-//     csi *p ;        /* column pointers (size n+1) or col indices (size nzmax) */
-//     csi *i ;        /* row indices, size nzmax */
-//     double *x ;     /* numerical values, size nzmax */
-//     csi nz ;        /* # of entries in triplet matrix, -1 for compressed-col */
-// } cs ;
-
-__global__
-void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
-{
-    int p, j;
+// //csi cs_lsolve_gpu (const cs *L, double *x)
+// // typedef struct cs_sparse    /* matrix in compressed-column or triplet form */
+// // {
+// //     csi nzmax ;     /* maximum number of entries */
+// //     csi m ;         /* number of rows */
+// //     csi n ;         /* number of columns */
+// //     csi *p ;        /* column pointers (size n+1) or col indices (size nzmax) */
+// //     csi *i ;        /* row indices, size nzmax */
+// //     double *x ;     /* numerical values, size nzmax */
+// //     csi nz ;        /* # of entries in triplet matrix, -1 for compressed-col */
+// // } cs ;
+
+// // __global__
+// // void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
+// // {
+// //     int p, j;
    
-    // double *Lx ;
-    // n = L->n ; 
-    // Lp = L->p ; 
-    // Li = L->i ; 
-    // Lx = L->x ;
+// //     // double *Lx ;
+// //     // n = L->n ; 
+// //     // Lp = L->p ; 
+// //     // Li = L->i ; 
+// //     // Lx = L->x ;
    
-    for (j = 0 ; j < n ; j++)
-    {
-        x [j] /= Lx [Lp [j]] ;
-        for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
-        {
-            x [Li [p]] -= Lx [p] * x [j] ;
-        }
-    }
+// //     for (j = 0 ; j < n ; j++)
+// //     {
+// //         x [j] /= Lx [Lp [j]] ;
+// //         for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
+// //         {
+// //             x [Li [p]] -= Lx [p] * x [j] ;
+// //         }
+// //     }
    
-    return (1) ;
-}
+// //     //return (1);
+// // }


    


-#include <iostream>
-#include <math.h>
+// #include <iostream>
+// #include <math.h>
 
-// CUDA kernel to add elements of two arrays
-__global__
-void add(int n, float *x, float *y)
-{
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-  for (int i = index; i < n; i += stride)
-    y[i] = x[i] + y[i];
-}
+// // CUDA kernel to add elements of two arrays
+// __global__
+// void add(int n, float *x, float *y)
+// {
+//   int index = blockIdx.x * blockDim.x + threadIdx.x;
+//   int stride = blockDim.x * gridDim.x;
+//   for (int i = index; i < n; i += stride)
+//     y[i] = x[i] + y[i];
+// }
 
-int main(void)
-{
-  int N = 1<<20;
-  float *x, *y;
+// int main(void)
+// {
+//   int N = 1<<20;
+//   float *x, *y;
 
-  // Allocate Unified Memory -- accessible from CPU or GPU
-  cudaMallocManaged(&x, N*sizeof(float));
-  cudaMallocManaged(&y, N*sizeof(float));
+//   // Allocate Unified Memory -- accessible from CPU or GPU
+//   cudaMallocManaged(&x, N*sizeof(float));
+//   cudaMallocManaged(&y, N*sizeof(float));
 
-  // initialize x and y arrays on the host
-  for (int i = 0; i < N; i++) {
-    x[i] = 1.0f;
-    y[i] = 2.0f;
-  }
+//   // initialize x and y arrays on the host
+//   for (int i = 0; i < N; i++) {
+//     x[i] = 1.0f;
+//     y[i] = 2.0f;
+//   }
 


-    // Prefetch the data to the GPU
-  int device = -1;
-  cudaGetDevice(&device);
-  cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
-  cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);
+//     // Prefetch the data to the GPU
+//   int device = -1;
+//   cudaGetDevice(&device);
+//   cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
+//   cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);


-  // Launch kernel on 1M elements on the GPU
-  int blockSize = 256;
-  int numBlocks = (N + blockSize - 1) / blockSize;
-  add<<<numBlocks, blockSize>>>(N, x, y);
+//   // Launch kernel on 1M elements on the GPU
+//   int blockSize = 256;
+//   int numBlocks = (N + blockSize - 1) / blockSize;
+//   add<<<numBlocks, blockSize>>>(N, x, y);
 
-  // Wait for GPU to finish before accessing on host
-  cudaDeviceSynchronize();
+//   // Wait for GPU to finish before accessing on host
+//   cudaDeviceSynchronize();
 
-  // Check for errors (all values should be 3.0f)
-  float maxError = 0.0f;
-  for (int i = 0; i < N; i++)
-    maxError = fmax(maxError, fabs(y[i]-3.0f));
-  std::cout << "Max error: " << maxError << std::endl;
+//   // Check for errors (all values should be 3.0f)
+//   float maxError = 0.0f;
+//   for (int i = 0; i < N; i++)
+//     maxError = fmax(maxError, fabs(y[i]-3.0f));
+//   std::cout << "Max error: " << maxError << std::endl;
 
-  // Free memory
-  cudaFree(x);
-  cudaFree(y);
+//   // Free memory
+//   cudaFree(x);
+//   cudaFree(y);
 
-  return 0;
-}
\ No newline at end of file
+//   return 0;
+// }
+
+
+