diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile index 2b1cf26e8ef46c7f0cc33e76826db46f3927938d..26aa709e9c17ef377242636a50ebc4903448ee0e 100644 --- a/CSparse/Demo/Makefile +++ b/CSparse/Demo/Makefile @@ -6,9 +6,10 @@ LDLIBS += -lm --cudart shared CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS) all: lib cs_demo1 cs_demo2 cs_demo3 - - ./cs_demo2 < ../Matrix/t1 - - ./cs_demo2 < ../Matrix/ash219 +# - ./cs_demo2 < ../Matrix/t1 +# - ./cs_demo2 < ../Matrix/ash219 - ./cs_demo2 < ../Matrix/bcsstk01 + lib: ( cd ../Lib ; $(MAKE) ) diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c index 7b95a972f822b7c8e1005eebc78e91c384cf5b16..a76fefbfc3be335568de31bb222d300f0fb303ec 100644 --- a/CSparse/Demo/cs_demo.c +++ b/CSparse/Demo/cs_demo.c @@ -170,7 +170,7 @@ csi demo2 (problem *Prob) { if (!order && m > 1000) continue ; printf ("Chol \n") ; - //print_order (order) ; + print_order (order) ; rhs (x, b, m) ; /* compute right-hand side */ t = tic () ; ok = cs_cholsol (order, C, x) ; /* solve Ax=b with Cholesky */ diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index f8f42e295073d144499080f6df4b1e6c54e846b4..7961d785152635a267e0c90fab19a47aceaef5ee 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -57,7 +57,7 @@ csi cs_cholsol (csi order, const cs *A, double *b) // csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */ // } cs ; - int n_rhs = 7; + int n_rhs = 4; double *rhs_t; rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; @@ -65,11 +65,16 @@ csi cs_cholsol (csi order, const cs *A, double *b) int ri; int ni; for (ni = 0; ni < n; ni++) { + printf("rhs[%d] %f - ",ni,x[ni]); for (ri = 0; ri < n_rhs; ri++) { rhs_t[ni*n_rhs+ri] = x[ni]; + printf(" ri %d addr: %d %f - ",ri,ni*n_rhs+ri,rhs_t[ni*n_rhs+ri]); } + printf("\n"); } + + int *d_Lp; int *d_Li; double *d_Lx; @@ -92,12 +97,13 @@ csi cs_cholsol (csi order, const cs *A, double *b) // cudaMemcpy( d_x + rr * n , x , n * sizeof(double) , cudaMemcpyHostToDevice) ; // } - cudaMemcpy( d_x , rhs_t , n * sizeof(double) , cudaMemcpyHostToDevice) ; + cudaMemcpy( d_x , rhs_t , n * n_rhs * sizeof(double) , cudaMemcpyHostToDevice) ; // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x, n_rhs); - // void cs_lsolve_acc (int n, const int *Lp, const int *Lissssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss, const double *Lx, double *x, int n_rhs, int threads, int blocks) + // void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks) cs_lsolve_acc_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1); + //cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1); double *x_gpu; x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; @@ -106,17 +112,32 @@ csi cs_cholsol (csi order, const cs *A, double *b) cs_lsolve (N->L, x) ; /* x = L\x */ + + // int i; + // int r; + // for (i=0; i<n; i++) { + // printf("cpu: %f\t gpu:\t", x[i]); + // for (r = 0; r < n_rhs; r++) { + // printf("%f\t", x_gpu[n*r + i] -x[i] ); + // } + // printf("\n"); + // } + // printf("\n"); + + int i; int r; for (i=0; i<n; i++) { printf("cpu: %f\t gpu:\t", x[i]); for (r = 0; r < n_rhs; r++) { - printf("%f\t", x_gpu[i*n_rhs + r] ); + printf("%f\t", x_gpu[i*n_rhs + r] - x[i] ); } printf("\n"); } printf("\n"); + exit(0); + cs_ltsolve (N->L, x) ; /* x = L'\x */ cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */ } diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index d068ba1ae3b485c3181d9a9c3428f9bf3741f1be..c6c6ab05608301d325888b3d1c7ac217c1dcfcdc 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ b/CSparse/Source/cs_lsolve_gpu.cu @@ -91,19 +91,44 @@ void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, // Li = L->i ; // Lx = L->x ; - int rhs_offset = tid;; + + +// csi cs_lsolve (const cs *L, double *x) +// { +// csi p, j, n, *Lp, *Li ; +// double *Lx ; +// if (!CS_CSC (L) || !x) return (0) ; /* check inputs */ +// n = L->n ; Lp = L->p ; Li = L->i ; Lx = L->x ; + + +// for (j = 0 ; j < n ; j++) +// { +// x [j] /= Lx [Lp [j]] ; +// for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) +// { +// x [Li [p]] -= Lx [p] * x [j] ; +// } +// } + + +// return (1) ; +// } + + int rhs_offset = tid; for (j = 0 ; j < n ; j++) { - x [rhs_offset + j * n_rhs] = x [rhs_offset + j * n_rhs] / Lx [Lp [j]] ; + int addr = j * n_rhs + rhs_offset; + + x [addr] = x [addr] / Lx [Lp [j]] ; for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) { - x [rhs_offset + Li[p] * n_rhs] = x [rhs_offset + Li[p] * n_rhs] - Lx [p] * x [rhs_offset + j * n_rhs] ; + x [Li[p] * n_rhs + rhs_offset] = x [Li[p] * n_rhs + rhs_offset] - Lx [p] * x [addr] ; - //printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); + printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); }