diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile index ecb9becc04abe35d458afcf05d85a80bb63007ff..5062ceccf9c7400f120d6d74175a665fbfdd6ba2 100644 --- a/CSparse/Demo/Makefile +++ b/CSparse/Demo/Makefile @@ -1,21 +1,12 @@ CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O I = -I../Include +CC = /usr/local/cuda/bin/nvcc -g -G -O0 LDLIBS += -lm -CS = $(LDFLAGS) ../Lib/libcsparse.dylib $(LDLIBS) +CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS) all: lib cs_demo1 cs_demo2 cs_demo3 - - ./cs_demo1 < ../Matrix/t1 - ./cs_demo2 < ../Matrix/t1 - - ./cs_demo2 < ../Matrix/ash219 - - ./cs_demo2 < ../Matrix/bcsstk01 - - ./cs_demo2 < ../Matrix/fs_183_1 - - ./cs_demo2 < ../Matrix/mbeacxc - - ./cs_demo2 < ../Matrix/west0067 - - ./cs_demo2 < ../Matrix/lp_afiro - - ./cs_demo2 < ../Matrix/bcsstk16 - - ./cs_demo3 < ../Matrix/bcsstk01 - - ./cs_demo3 < ../Matrix/bcsstk16 lib: ( cd ../Lib ; $(MAKE) ) diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c index f2937eb594b466c50641aaee66e9d13cfff62256..2d520001275e8390b21afec60a42aadd2606d0f1 100644 --- a/CSparse/Demo/cs_demo.c +++ b/CSparse/Demo/cs_demo.c @@ -142,35 +142,35 @@ csi demo2 (problem *Prob) printf ("blocks: %g singletons: %g structural rank: %g\n", (double) nb, (double) ns, (double) sprank) ; cs_dfree (D) ; - for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */ - { - if (!order && m > 1000) continue ; - printf ("QR ") ; - print_order (order) ; - rhs (x, b, m) ; /* compute right-hand side */ - t = tic () ; - ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */ - printf ("time: %8.2f ", toc (t)) ; - print_resid (ok, C, x, b, resid) ; /* print residual */ - } - if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/ - for (order = 0 ; order <= 3 ; order++) /* try all orderings */ - { - if (!order && m > 1000) continue ; - printf ("LU ") ; - print_order (order) ; - rhs (x, b, m) ; /* compute right-hand side */ - t = tic () ; - ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */ - printf ("time: %8.2f ", toc (t)) ; - print_resid (ok, C, x, b, resid) ; /* print residual */ - } - if (!Prob->sym) return (1) ; + // for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */ + // { + // if (!order && m > 1000) continue ; + // printf ("QR ") ; + // print_order (order) ; + // rhs (x, b, m) ; /* compute right-hand side */ + // t = tic () ; + // ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */ + // printf ("time: %8.2f ", toc (t)) ; + // print_resid (ok, C, x, b, resid) ; /* print residual */ + // } + // if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/ + // for (order = 0 ; order <= 3 ; order++) /* try all orderings */ + // { + // if (!order && m > 1000) continue ; + // printf ("LU ") ; + // print_order (order) ; + // rhs (x, b, m) ; /* compute right-hand side */ + // t = tic () ; + // ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */ + // printf ("time: %8.2f ", toc (t)) ; + // print_resid (ok, C, x, b, resid) ; /* print residual */ + // } + // if (!Prob->sym) return (1) ; for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */ { if (!order && m > 1000) continue ; - printf ("Chol ") ; - print_order (order) ; + printf ("Chol \n") ; + //print_order (order) ; rhs (x, b, m) ; /* compute right-hand side */ t = tic () ; ok = cs_cholsol (order, C, x) ; /* solve Ax=b with Cholesky */ diff --git a/CSparse/Include/cs.h b/CSparse/Include/cs.h index adbe032f019f59f182ce613fe041f43c0f51ca27..82d70b031c139663d1d143bf5f79855bd4ec19ea 100644 --- a/CSparse/Include/cs.h +++ b/CSparse/Include/cs.h @@ -36,6 +36,7 @@ typedef struct cs_sparse /* matrix in compressed-column or triplet form */ cs *cs_add (const cs *A, const cs *B, double alpha, double beta) ; csi cs_cholsol (csi order, const cs *A, double *b) ; +csi cs_cholsol_gpu (csi order, const cs *A, double *b) ; cs *cs_compress (const cs *T) ; csi cs_dupl (cs *A) ; csi cs_entry (cs *T, csi i, csi j, double x) ; @@ -56,6 +57,10 @@ cs *cs_spfree (cs *A) ; csi cs_sprealloc (cs *A, csi nzmax) ; void *cs_malloc (csi n, size_t size) ; +//void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x); +//void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x); + + /* --- secondary CSparse routines and data structures ----------------------- */ typedef struct cs_symbolic /* symbolic Cholesky, LU, or QR analysis */ { diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile index e3f32cec28e22620cb5b4d68595dc92837eb4ece..930641849b3cf57593b66e4797075d4f3a428681 100644 --- a/CSparse/Lib/Makefile +++ b/CSparse/Lib/Makefile @@ -16,7 +16,7 @@ LIBRARY = libcsparse CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O - +CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0 I = -I../Include RANLIB = ranlib ARCHIVE = $(AR) $(ARFLAGS) @@ -29,7 +29,7 @@ all: install CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \ cs_droptol.o cs_dropzeros.o cs_dupl.o cs_entry.o \ cs_etree.o cs_fkeep.o cs_gaxpy.o cs_happly.o cs_house.o cs_ipvec.o \ - cs_lsolve.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \ + cs_lsolve.o cs_lsolve_gpu.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \ cs_permute.o cs_pinv.o cs_post.o cs_pvec.o cs_qr.o cs_qrsol.o \ cs_scatter.o cs_schol.o cs_sqr.o cs_symperm.o cs_tdfs.o cs_malloc.o \ cs_transpose.o cs_compress.o cs_usolve.o cs_utsolve.o cs_scc.o \ @@ -38,7 +38,7 @@ CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \ $(CS): ../Include/cs.h Makefile -%.o: ../Source/%.c ../Include/cs.h +%.o: ../Source/%.c* ../Include/cs.h $(CC) $(CF) $(I) -c $< static: $(AR_TARGET) diff --git a/CSparse/Lib/a.out b/CSparse/Lib/a.out deleted file mode 100755 index 595e3901c555c3f1c5faad27f8a92cb29d7b4e0d..0000000000000000000000000000000000000000 Binary files a/CSparse/Lib/a.out and /dev/null differ diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index 6cff32d9398ae15626285a0111f7e2187826a3b5..70c80f900208e2ab76eebc8e9ea28e053e34dbcd 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -1,6 +1,9 @@ #include "cs.h" +#include "cuda_runtime.h" + + /* x=A\b where A is symmetric positive definite; b overwritten with solution */ -csi cs_cholsol (csi order, const cs *A, double *b) +csi cs_cholsol_xxx (csi order, const cs *A, double *b) { double *x ; css *S ; @@ -26,7 +29,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) } -csi cs_cholsol_gpu (csi order, const cs *A, double *b) +csi cs_cholsol (csi order, const cs *A, double *b) { double *x ; css *S ; @@ -59,10 +62,10 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b) double *d_Lx; double *d_x; - cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) ; - cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) ; - cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) ; - cudaMalloc(&d_x , n * sizeof(double) ; + cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) ); + cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) ); + cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) ); + cudaMalloc(&d_x , n * sizeof(double) ); cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; @@ -72,7 +75,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b) // csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x) // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x); - cs_lsolve_gpu <<<1, 1>>> (n, d_Lp, d_Li, d_Lx, d_x); + cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x); double *x_gpu; x_gpu = cs_malloc (n, sizeof (double)) ; @@ -85,6 +88,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b) for (i=0; i<n; i++) { printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] ); } + printf("\n"); cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ @@ -93,4 +97,4 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b) cs_sfree (S) ; cs_nfree (N) ; return (ok) ; -} \ No newline at end of file +} diff --git a/CSparse/Source/cs_lsolve.c b/CSparse/Source/cs_lsolve.c index 394f283f9ef6b5d5c9540b2ee93bf2fb86e4f081..442ab38b6eb5bcde6ebae61be10d5a422b0765ce 100644 --- a/CSparse/Source/cs_lsolve.c +++ b/CSparse/Source/cs_lsolve.c @@ -12,107 +12,112 @@ csi cs_lsolve (const cs *L, double *x) for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) { x [Li [p]] -= Lx [p] * x [j] ; + printf("XPU: %f", x [Li [p]]); + } } return (1) ; } -/* solve Lx=b where x and b are dense. x=b on input, solution on output. */ +// /* solve Lx=b where x and b are dense. x=b on input, solution on output. */ -//csi cs_lsolve_gpu (const cs *L, double *x) -// typedef struct cs_sparse /* matrix in compressed-column or triplet form */ -// { -// csi nzmax ; /* maximum number of entries */ -// csi m ; /* number of rows */ -// csi n ; /* number of columns */ -// csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */ -// csi *i ; /* row indices, size nzmax */ -// double *x ; /* numerical values, size nzmax */ -// csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */ -// } cs ; - -__global__ -void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x) -{ - int p, j; +// //csi cs_lsolve_gpu (const cs *L, double *x) +// // typedef struct cs_sparse /* matrix in compressed-column or triplet form */ +// // { +// // csi nzmax ; /* maximum number of entries */ +// // csi m ; /* number of rows */ +// // csi n ; /* number of columns */ +// // csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */ +// // csi *i ; /* row indices, size nzmax */ +// // double *x ; /* numerical values, size nzmax */ +// // csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */ +// // } cs ; + +// // __global__ +// // void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x) +// // { +// // int p, j; - // double *Lx ; - // n = L->n ; - // Lp = L->p ; - // Li = L->i ; - // Lx = L->x ; +// // // double *Lx ; +// // // n = L->n ; +// // // Lp = L->p ; +// // // Li = L->i ; +// // // Lx = L->x ; - for (j = 0 ; j < n ; j++) - { - x [j] /= Lx [Lp [j]] ; - for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) - { - x [Li [p]] -= Lx [p] * x [j] ; - } - } +// // for (j = 0 ; j < n ; j++) +// // { +// // x [j] /= Lx [Lp [j]] ; +// // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) +// // { +// // x [Li [p]] -= Lx [p] * x [j] ; +// // } +// // } - return (1) ; -} +// // //return (1); +// // } -#include <iostream> -#include <math.h> +// #include <iostream> +// #include <math.h> -// CUDA kernel to add elements of two arrays -__global__ -void add(int n, float *x, float *y) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < n; i += stride) - y[i] = x[i] + y[i]; -} +// // CUDA kernel to add elements of two arrays +// __global__ +// void add(int n, float *x, float *y) +// { +// int index = blockIdx.x * blockDim.x + threadIdx.x; +// int stride = blockDim.x * gridDim.x; +// for (int i = index; i < n; i += stride) +// y[i] = x[i] + y[i]; +// } -int main(void) -{ - int N = 1<<20; - float *x, *y; +// int main(void) +// { +// int N = 1<<20; +// float *x, *y; - // Allocate Unified Memory -- accessible from CPU or GPU - cudaMallocManaged(&x, N*sizeof(float)); - cudaMallocManaged(&y, N*sizeof(float)); +// // Allocate Unified Memory -- accessible from CPU or GPU +// cudaMallocManaged(&x, N*sizeof(float)); +// cudaMallocManaged(&y, N*sizeof(float)); - // initialize x and y arrays on the host - for (int i = 0; i < N; i++) { - x[i] = 1.0f; - y[i] = 2.0f; - } +// // initialize x and y arrays on the host +// for (int i = 0; i < N; i++) { +// x[i] = 1.0f; +// y[i] = 2.0f; +// } - // Prefetch the data to the GPU - int device = -1; - cudaGetDevice(&device); - cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL); - cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL); +// // Prefetch the data to the GPU +// int device = -1; +// cudaGetDevice(&device); +// cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL); +// cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL); - // Launch kernel on 1M elements on the GPU - int blockSize = 256; - int numBlocks = (N + blockSize - 1) / blockSize; - add<<<numBlocks, blockSize>>>(N, x, y); +// // Launch kernel on 1M elements on the GPU +// int blockSize = 256; +// int numBlocks = (N + blockSize - 1) / blockSize; +// add<<<numBlocks, blockSize>>>(N, x, y); - // Wait for GPU to finish before accessing on host - cudaDeviceSynchronize(); +// // Wait for GPU to finish before accessing on host +// cudaDeviceSynchronize(); - // Check for errors (all values should be 3.0f) - float maxError = 0.0f; - for (int i = 0; i < N; i++) - maxError = fmax(maxError, fabs(y[i]-3.0f)); - std::cout << "Max error: " << maxError << std::endl; +// // Check for errors (all values should be 3.0f) +// float maxError = 0.0f; +// for (int i = 0; i < N; i++) +// maxError = fmax(maxError, fabs(y[i]-3.0f)); +// std::cout << "Max error: " << maxError << std::endl; - // Free memory - cudaFree(x); - cudaFree(y); +// // Free memory +// cudaFree(x); +// cudaFree(y); - return 0; -} \ No newline at end of file +// return 0; +// } + + +