diff --git a/CSparse/Demo/Makefile b/CSparse/Demo/Makefile index 5062ceccf9c7400f120d6d74175a665fbfdd6ba2..2b1cf26e8ef46c7f0cc33e76826db46f3927938d 100644 --- a/CSparse/Demo/Makefile +++ b/CSparse/Demo/Makefile @@ -1,13 +1,14 @@ CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O I = -I../Include -CC = /usr/local/cuda/bin/nvcc -g -G -O0 +CC = nvcc -g -G -O0 -LDLIBS += -lm +LDLIBS += -lm --cudart shared CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS) all: lib cs_demo1 cs_demo2 cs_demo3 - ./cs_demo2 < ../Matrix/t1 - + - ./cs_demo2 < ../Matrix/ash219 + - ./cs_demo2 < ../Matrix/bcsstk01 lib: ( cd ../Lib ; $(MAKE) ) diff --git a/CSparse/Demo/cs_demo.c b/CSparse/Demo/cs_demo.c index 2d520001275e8390b21afec60a42aadd2606d0f1..7b95a972f822b7c8e1005eebc78e91c384cf5b16 100644 --- a/CSparse/Demo/cs_demo.c +++ b/CSparse/Demo/cs_demo.c @@ -142,30 +142,30 @@ csi demo2 (problem *Prob) printf ("blocks: %g singletons: %g structural rank: %g\n", (double) nb, (double) ns, (double) sprank) ; cs_dfree (D) ; - // for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */ - // { - // if (!order && m > 1000) continue ; - // printf ("QR ") ; - // print_order (order) ; - // rhs (x, b, m) ; /* compute right-hand side */ - // t = tic () ; - // ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */ - // printf ("time: %8.2f ", toc (t)) ; - // print_resid (ok, C, x, b, resid) ; /* print residual */ - // } - // if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/ - // for (order = 0 ; order <= 3 ; order++) /* try all orderings */ - // { - // if (!order && m > 1000) continue ; - // printf ("LU ") ; - // print_order (order) ; - // rhs (x, b, m) ; /* compute right-hand side */ - // t = tic () ; - // ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */ - // printf ("time: %8.2f ", toc (t)) ; - // print_resid (ok, C, x, b, resid) ; /* print residual */ - // } - // if (!Prob->sym) return (1) ; + for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */ + { + if (!order && m > 1000) continue ; + printf ("QR ") ; + print_order (order) ; + rhs (x, b, m) ; /* compute right-hand side */ + t = tic () ; + ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */ + printf ("time: %8.2f ", toc (t)) ; + print_resid (ok, C, x, b, resid) ; /* print residual */ + } + if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/ + for (order = 0 ; order <= 3 ; order++) /* try all orderings */ + { + if (!order && m > 1000) continue ; + printf ("LU ") ; + print_order (order) ; + rhs (x, b, m) ; /* compute right-hand side */ + t = tic () ; + ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */ + printf ("time: %8.2f ", toc (t)) ; + print_resid (ok, C, x, b, resid) ; /* print residual */ + } + if (!Prob->sym) return (1) ; for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */ { if (!order && m > 1000) continue ; diff --git a/CSparse/Include/cs.h b/CSparse/Include/cs.h index 82d70b031c139663d1d143bf5f79855bd4ec19ea..a986bf5d28d8b73ff62f3c45cf9d835e7b454373 100644 --- a/CSparse/Include/cs.h +++ b/CSparse/Include/cs.h @@ -19,7 +19,7 @@ #define csi mwSignedIndex #endif #ifndef csi -#define csi ptrdiff_t +#define csi int //ptrdiff_t #endif /* --- primary CSparse routines and data structures ------------------------- */ diff --git a/CSparse/Lib/Makefile b/CSparse/Lib/Makefile index 930641849b3cf57593b66e4797075d4f3a428681..524ece857044aede10a8db0647a9fb35efbfe6eb 100644 --- a/CSparse/Lib/Makefile +++ b/CSparse/Lib/Makefile @@ -16,7 +16,7 @@ LIBRARY = libcsparse CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O -CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0 +CC = nvcc -dc -g -G -O0 --cudart shared I = -I../Include RANLIB = ranlib ARCHIVE = $(AR) $(ARFLAGS) diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c index 70c80f900208e2ab76eebc8e9ea28e053e34dbcd..f8f42e295073d144499080f6df4b1e6c54e846b4 100644 --- a/CSparse/Source/cs_cholsol.c +++ b/CSparse/Source/cs_cholsol.c @@ -56,37 +56,64 @@ csi cs_cholsol (csi order, const cs *A, double *b) // double *x ; /* numerical values, size nzmax */ // csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */ // } cs ; + + int n_rhs = 7; + double *rhs_t; + rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ; + + int ri; + int ni; + for (ni = 0; ni < n; ni++) { + for (ri = 0; ri < n_rhs; ri++) { + rhs_t[ni*n_rhs+ri] = x[ni]; + } + } + int *d_Lp; int *d_Li; double *d_Lx; double *d_x; - cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) ); - cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) ); - cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) ); - cudaMalloc(&d_x , n * sizeof(double) ); + //cudaSetDevice(1); + + + cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) ); + cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) ); + cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) ); + cudaMalloc( &d_x , n * n_rhs * sizeof(double) ); cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ; cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ; - cudaMemcpy( d_x , x , n * sizeof(double) , cudaMemcpyHostToDevice) ; + + // int rr; + // for (rr = 0; rr < n_rhs; rr++) { + // cudaMemcpy( d_x + rr * n , x , n * sizeof(double) , cudaMemcpyHostToDevice) ; + // } + + cudaMemcpy( d_x , rhs_t , n * sizeof(double) , cudaMemcpyHostToDevice) ; + // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x, n_rhs); + // void cs_lsolve_acc (int n, const int *Lp, const int *Lissssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss, const double *Lx, double *x, int n_rhs, int threads, int blocks) - // csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x) - // cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x); - cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x); + cs_lsolve_acc_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1); double *x_gpu; - x_gpu = cs_malloc (n, sizeof (double)) ; - cudaMemcpy(x_gpu, d_x, n * sizeof(double), cudaMemcpyDeviceToHost ); + x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ; + cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost ); cs_lsolve (N->L, x) ; /* x = L\x */ int i; + int r; for (i=0; i<n; i++) { - printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] ); + printf("cpu: %f\t gpu:\t", x[i]); + for (r = 0; r < n_rhs; r++) { + printf("%f\t", x_gpu[i*n_rhs + r] ); + } + printf("\n"); } printf("\n"); diff --git a/CSparse/Source/cs_lsolve.c b/CSparse/Source/cs_lsolve.c index 442ab38b6eb5bcde6ebae61be10d5a422b0765ce..4fcc689d09b887000ed0097c8a83c365c96d48e3 100644 --- a/CSparse/Source/cs_lsolve.c +++ b/CSparse/Source/cs_lsolve.c @@ -12,7 +12,7 @@ csi cs_lsolve (const cs *L, double *x) for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) { x [Li [p]] -= Lx [p] * x [j] ; - printf("XPU: %f", x [Li [p]]); + //printf("XPU: %f", x [Li [p]]); } } diff --git a/CSparse/Source/cs_lsolve_gpu.cu b/CSparse/Source/cs_lsolve_gpu.cu index c21ff84f7974f0efe2b4618cc46d6fa8dd6ff8ba..d068ba1ae3b485c3181d9a9c3428f9bf3741f1be 100644 --- a/CSparse/Source/cs_lsolve_gpu.cu +++ b/CSparse/Source/cs_lsolve_gpu.cu @@ -38,35 +38,92 @@ __global__ -void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x) +void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs) { - int p, j; - // double *Lx ; - // n = L->n ; - // Lp = L->p ; - // Li = L->i ; - // Lx = L->x ; - - for (j = 0 ; j < n ; j++) - { - x [j] /= Lx [Lp [j]] ; - for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + int tid = threadIdx.x; // + blockIdx.x * blockDim.x; + + if (tid < n_rhs) { + + int p, j; + + // double *Lx ; + // n = L->n ; + // Lp = L->p ; + // Li = L->i ; + // Lx = L->x ; + + int rhs_offset = tid * n; + + for (j = 0 ; j < n ; j++) { - x [Li [p]] -= Lx [p] * x [j] ; - printf("GPU: %f", x [Li [p]]); + + x [rhs_offset + j] = x [rhs_offset + j] / Lx [Lp [j]] ; + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + + x [rhs_offset + Li [p]] = x [rhs_offset + Li [p]] - Lx [p] * x [rhs_offset + j] ; + + printf("GPU; tid=%d Li[p]=%d addr=%d %f\n", tid, Li [p], rhs_offset + Li [p],x [rhs_offset + Li [p]]); + + } + } + } - - //return (1); + } +__global__ +void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs) +{ + + int tid = threadIdx.x; // + blockIdx.x * blockDim.x; + + if (tid < n_rhs) { + int p, j; + + // double *Lx ; + // n = L->n ; + // Lp = L->p ; + // Li = L->i ; + // Lx = L->x ; + + int rhs_offset = tid;; + for (j = 0 ; j < n ; j++) + { + + x [rhs_offset + j * n_rhs] = x [rhs_offset + j * n_rhs] / Lx [Lp [j]] ; + + for (p = Lp [j]+1 ; p < Lp [j+1] ; p++) + { + + x [rhs_offset + Li[p] * n_rhs] = x [rhs_offset + Li[p] * n_rhs] - Lx [p] * x [rhs_offset + j * n_rhs] ; + + //printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]); + + } + + } + + } + +} + + +extern "C" { + void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks) + { + cs_lsolve_gpu <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs); + } +} extern "C" { - void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x) + void cs_lsolve_acc_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks) { - cs_lsolve_gpu <<<1, 1>>> (n, Lp, Li, Lx, x); + cs_lsolve_gpu_trans <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs); } }