Skip to content
Snippets Groups Projects
Commit 368dc87c authored by Lubomir Riha's avatar Lubomir Riha
Browse files

to sal

parent 5c47f32a
Branches
No related tags found
No related merge requests found
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
I = -I../Include
CC = /usr/local/cuda/bin/nvcc -g -G -O0
CC = nvcc -g -G -O0
LDLIBS += -lm
LDLIBS += -lm --cudart shared
CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
all: lib cs_demo1 cs_demo2 cs_demo3
- ./cs_demo2 < ../Matrix/t1
- ./cs_demo2 < ../Matrix/ash219
- ./cs_demo2 < ../Matrix/bcsstk01
lib:
( cd ../Lib ; $(MAKE) )
......
......@@ -142,30 +142,30 @@ csi demo2 (problem *Prob)
printf ("blocks: %g singletons: %g structural rank: %g\n",
(double) nb, (double) ns, (double) sprank) ;
cs_dfree (D) ;
// for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
// {
// if (!order && m > 1000) continue ;
// printf ("QR ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
// for (order = 0 ; order <= 3 ; order++) /* try all orderings */
// {
// if (!order && m > 1000) continue ;
// printf ("LU ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (!Prob->sym) return (1) ;
for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
{
if (!order && m > 1000) continue ;
printf ("QR ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
for (order = 0 ; order <= 3 ; order++) /* try all orderings */
{
if (!order && m > 1000) continue ;
printf ("LU ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (!Prob->sym) return (1) ;
for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */
{
if (!order && m > 1000) continue ;
......
......@@ -19,7 +19,7 @@
#define csi mwSignedIndex
#endif
#ifndef csi
#define csi ptrdiff_t
#define csi int //ptrdiff_t
#endif
/* --- primary CSparse routines and data structures ------------------------- */
......
......@@ -16,7 +16,7 @@
LIBRARY = libcsparse
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0
CC = nvcc -dc -g -G -O0 --cudart shared
I = -I../Include
RANLIB = ranlib
ARCHIVE = $(AR) $(ARFLAGS)
......
......@@ -56,37 +56,64 @@ csi cs_cholsol (csi order, const cs *A, double *b)
// double *x ; /* numerical values, size nzmax */
// csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */
// } cs ;
int n_rhs = 7;
double *rhs_t;
rhs_t = cs_malloc ( n_rhs * n, sizeof (double)) ;
int ri;
int ni;
for (ni = 0; ni < n; ni++) {
for (ri = 0; ri < n_rhs; ri++) {
rhs_t[ni*n_rhs+ri] = x[ni];
}
}
int *d_Lp;
int *d_Li;
double *d_Lx;
double *d_x;
cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc(&d_x , n * sizeof(double) );
//cudaSetDevice(1);
cudaMalloc( &d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc( &d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc( &d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc( &d_x , n * n_rhs * sizeof(double) );
cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Lx, N->L->x , (N->L->nzmax) * sizeof(double) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_x , x , n * sizeof(double) , cudaMemcpyHostToDevice) ;
// int rr;
// for (rr = 0; rr < n_rhs; rr++) {
// cudaMemcpy( d_x + rr * n , x , n * sizeof(double) , cudaMemcpyHostToDevice) ;
// }
cudaMemcpy( d_x , rhs_t , n * sizeof(double) , cudaMemcpyHostToDevice) ;
// cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x, n_rhs);
// void cs_lsolve_acc (int n, const int *Lp, const int *Lissssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss, const double *Lx, double *x, int n_rhs, int threads, int blocks)
// csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
// cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_acc_trans (n, d_Lp, d_Li, d_Lx, d_x, n_rhs, n_rhs, 1);
double *x_gpu;
x_gpu = cs_malloc (n, sizeof (double)) ;
cudaMemcpy(x_gpu, d_x, n * sizeof(double), cudaMemcpyDeviceToHost );
x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
cs_lsolve (N->L, x) ; /* x = L\x */
int i;
int r;
for (i=0; i<n; i++) {
printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] );
printf("cpu: %f\t gpu:\t", x[i]);
for (r = 0; r < n_rhs; r++) {
printf("%f\t", x_gpu[i*n_rhs + r] );
}
printf("\n");
}
printf("\n");
......
......@@ -12,7 +12,7 @@ csi cs_lsolve (const cs *L, double *x)
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [Li [p]] -= Lx [p] * x [j] ;
printf("XPU: %f", x [Li [p]]);
//printf("XPU: %f", x [Li [p]]);
}
}
......
......@@ -38,35 +38,92 @@
__global__
void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs)
{
int p, j;
// double *Lx ;
// n = L->n ;
// Lp = L->p ;
// Li = L->i ;
// Lx = L->x ;
for (j = 0 ; j < n ; j++)
{
x [j] /= Lx [Lp [j]] ;
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
if (tid < n_rhs) {
int p, j;
// double *Lx ;
// n = L->n ;
// Lp = L->p ;
// Li = L->i ;
// Lx = L->x ;
int rhs_offset = tid * n;
for (j = 0 ; j < n ; j++)
{
x [Li [p]] -= Lx [p] * x [j] ;
printf("GPU: %f", x [Li [p]]);
x [rhs_offset + j] = x [rhs_offset + j] / Lx [Lp [j]] ;
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [rhs_offset + Li [p]] = x [rhs_offset + Li [p]] - Lx [p] * x [rhs_offset + j] ;
printf("GPU; tid=%d Li[p]=%d addr=%d %f\n", tid, Li [p], rhs_offset + Li [p],x [rhs_offset + Li [p]]);
}
}
}
//return (1);
}
__global__
void cs_lsolve_gpu_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs)
{
int tid = threadIdx.x; // + blockIdx.x * blockDim.x;
if (tid < n_rhs) {
int p, j;
// double *Lx ;
// n = L->n ;
// Lp = L->p ;
// Li = L->i ;
// Lx = L->x ;
int rhs_offset = tid;;
for (j = 0 ; j < n ; j++)
{
x [rhs_offset + j * n_rhs] = x [rhs_offset + j * n_rhs] / Lx [Lp [j]] ;
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [rhs_offset + Li[p] * n_rhs] = x [rhs_offset + Li[p] * n_rhs] - Lx [p] * x [rhs_offset + j * n_rhs] ;
//printf("GPU; tid=%d Li[p]=%d addr=%d x=%f\n", tid, Li [p], rhs_offset + Li [p]*n_rhs, x [rhs_offset + Li [p]*n_rhs]);
}
}
}
}
extern "C" {
void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
{
cs_lsolve_gpu <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
}
}
extern "C" {
void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x)
void cs_lsolve_acc_trans (int n, const int *Lp, const int *Li, const double *Lx, double *x, int n_rhs, int threads, int blocks)
{
cs_lsolve_gpu <<<1, 1>>> (n, Lp, Li, Lx, x);
cs_lsolve_gpu_trans <<<blocks, threads>>> (n, Lp, Li, Lx, x, n_rhs);
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment