Skip to content
Snippets Groups Projects
Commit d2be194c authored by Lubomir Riha's avatar Lubomir Riha
Browse files

update

parent 3da58453
Branches
Tags
No related merge requests found
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
I = -I../Include
CC = /usr/local/cuda/bin/nvcc -g -G -O0
LDLIBS += -lm
CS = $(LDFLAGS) ../Lib/libcsparse.dylib $(LDLIBS)
CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
all: lib cs_demo1 cs_demo2 cs_demo3
- ./cs_demo1 < ../Matrix/t1
- ./cs_demo2 < ../Matrix/t1
- ./cs_demo2 < ../Matrix/ash219
- ./cs_demo2 < ../Matrix/bcsstk01
- ./cs_demo2 < ../Matrix/fs_183_1
- ./cs_demo2 < ../Matrix/mbeacxc
- ./cs_demo2 < ../Matrix/west0067
- ./cs_demo2 < ../Matrix/lp_afiro
- ./cs_demo2 < ../Matrix/bcsstk16
- ./cs_demo3 < ../Matrix/bcsstk01
- ./cs_demo3 < ../Matrix/bcsstk16
lib:
( cd ../Lib ; $(MAKE) )
......
......@@ -142,35 +142,35 @@ csi demo2 (problem *Prob)
printf ("blocks: %g singletons: %g structural rank: %g\n",
(double) nb, (double) ns, (double) sprank) ;
cs_dfree (D) ;
for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
{
if (!order && m > 1000) continue ;
printf ("QR ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
for (order = 0 ; order <= 3 ; order++) /* try all orderings */
{
if (!order && m > 1000) continue ;
printf ("LU ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (!Prob->sym) return (1) ;
// for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
// {
// if (!order && m > 1000) continue ;
// printf ("QR ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
// for (order = 0 ; order <= 3 ; order++) /* try all orderings */
// {
// if (!order && m > 1000) continue ;
// printf ("LU ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (!Prob->sym) return (1) ;
for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */
{
if (!order && m > 1000) continue ;
printf ("Chol ") ;
print_order (order) ;
printf ("Chol \n") ;
//print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_cholsol (order, C, x) ; /* solve Ax=b with Cholesky */
......
......@@ -36,6 +36,7 @@ typedef struct cs_sparse /* matrix in compressed-column or triplet form */
cs *cs_add (const cs *A, const cs *B, double alpha, double beta) ;
csi cs_cholsol (csi order, const cs *A, double *b) ;
csi cs_cholsol_gpu (csi order, const cs *A, double *b) ;
cs *cs_compress (const cs *T) ;
csi cs_dupl (cs *A) ;
csi cs_entry (cs *T, csi i, csi j, double x) ;
......@@ -56,6 +57,10 @@ cs *cs_spfree (cs *A) ;
csi cs_sprealloc (cs *A, csi nzmax) ;
void *cs_malloc (csi n, size_t size) ;
//void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x);
//void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x);
/* --- secondary CSparse routines and data structures ----------------------- */
typedef struct cs_symbolic /* symbolic Cholesky, LU, or QR analysis */
{
......
......@@ -16,7 +16,7 @@
LIBRARY = libcsparse
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0
I = -I../Include
RANLIB = ranlib
ARCHIVE = $(AR) $(ARFLAGS)
......@@ -29,7 +29,7 @@ all: install
CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \
cs_droptol.o cs_dropzeros.o cs_dupl.o cs_entry.o \
cs_etree.o cs_fkeep.o cs_gaxpy.o cs_happly.o cs_house.o cs_ipvec.o \
cs_lsolve.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
cs_lsolve.o cs_lsolve_gpu.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
cs_permute.o cs_pinv.o cs_post.o cs_pvec.o cs_qr.o cs_qrsol.o \
cs_scatter.o cs_schol.o cs_sqr.o cs_symperm.o cs_tdfs.o cs_malloc.o \
cs_transpose.o cs_compress.o cs_usolve.o cs_utsolve.o cs_scc.o \
......@@ -38,7 +38,7 @@ CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \
$(CS): ../Include/cs.h Makefile
%.o: ../Source/%.c ../Include/cs.h
%.o: ../Source/%.c* ../Include/cs.h
$(CC) $(CF) $(I) -c $<
static: $(AR_TARGET)
......
File deleted
#include "cs.h"
#include "cuda_runtime.h"
/* x=A\b where A is symmetric positive definite; b overwritten with solution */
csi cs_cholsol (csi order, const cs *A, double *b)
csi cs_cholsol_xxx (csi order, const cs *A, double *b)
{
double *x ;
css *S ;
......@@ -26,7 +29,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
}
csi cs_cholsol_gpu (csi order, const cs *A, double *b)
csi cs_cholsol (csi order, const cs *A, double *b)
{
double *x ;
css *S ;
......@@ -59,10 +62,10 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
double *d_Lx;
double *d_x;
cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) ;
cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) ;
cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) ;
cudaMalloc(&d_x , n * sizeof(double) ;
cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc(&d_x , n * sizeof(double) );
cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
......@@ -72,7 +75,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
// csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
// cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_gpu <<<1, 1>>> (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x);
double *x_gpu;
x_gpu = cs_malloc (n, sizeof (double)) ;
......@@ -85,6 +88,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
for (i=0; i<n; i++) {
printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] );
}
printf("\n");
cs_ltsolve (N->L, x) ; /* x = L'\x */
cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */
......@@ -93,4 +97,4 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
cs_sfree (S) ;
cs_nfree (N) ;
return (ok) ;
}
\ No newline at end of file
}
......@@ -12,107 +12,112 @@ csi cs_lsolve (const cs *L, double *x)
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [Li [p]] -= Lx [p] * x [j] ;
printf("XPU: %f", x [Li [p]]);
}
}
return (1) ;
}
/* solve Lx=b where x and b are dense. x=b on input, solution on output. */
// /* solve Lx=b where x and b are dense. x=b on input, solution on output. */
//csi cs_lsolve_gpu (const cs *L, double *x)
// typedef struct cs_sparse /* matrix in compressed-column or triplet form */
// {
// csi nzmax ; /* maximum number of entries */
// csi m ; /* number of rows */
// csi n ; /* number of columns */
// csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */
// csi *i ; /* row indices, size nzmax */
// double *x ; /* numerical values, size nzmax */
// csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */
// } cs ;
__global__
void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
{
int p, j;
// //csi cs_lsolve_gpu (const cs *L, double *x)
// // typedef struct cs_sparse /* matrix in compressed-column or triplet form */
// // {
// // csi nzmax ; /* maximum number of entries */
// // csi m ; /* number of rows */
// // csi n ; /* number of columns */
// // csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */
// // csi *i ; /* row indices, size nzmax */
// // double *x ; /* numerical values, size nzmax */
// // csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */
// // } cs ;
// // __global__
// // void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
// // {
// // int p, j;
// double *Lx ;
// n = L->n ;
// Lp = L->p ;
// Li = L->i ;
// Lx = L->x ;
// // // double *Lx ;
// // // n = L->n ;
// // // Lp = L->p ;
// // // Li = L->i ;
// // // Lx = L->x ;
for (j = 0 ; j < n ; j++)
{
x [j] /= Lx [Lp [j]] ;
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [Li [p]] -= Lx [p] * x [j] ;
}
}
// // for (j = 0 ; j < n ; j++)
// // {
// // x [j] /= Lx [Lp [j]] ;
// // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
// // {
// // x [Li [p]] -= Lx [p] * x [j] ;
// // }
// // }
return (1) ;
}
// // //return (1);
// // }
#include <iostream>
#include <math.h>
// #include <iostream>
// #include <math.h>
// CUDA kernel to add elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
// // CUDA kernel to add elements of two arrays
// __global__
// void add(int n, float *x, float *y)
// {
// int index = blockIdx.x * blockDim.x + threadIdx.x;
// int stride = blockDim.x * gridDim.x;
// for (int i = index; i < n; i += stride)
// y[i] = x[i] + y[i];
// }
int main(void)
{
int N = 1<<20;
float *x, *y;
// int main(void)
// {
// int N = 1<<20;
// float *x, *y;
// Allocate Unified Memory -- accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// // Allocate Unified Memory -- accessible from CPU or GPU
// cudaMallocManaged(&x, N*sizeof(float));
// cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// // initialize x and y arrays on the host
// for (int i = 0; i < N; i++) {
// x[i] = 1.0f;
// y[i] = 2.0f;
// }
// Prefetch the data to the GPU
int device = -1;
cudaGetDevice(&device);
cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);
// // Prefetch the data to the GPU
// int device = -1;
// cudaGetDevice(&device);
// cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
// cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);
// Launch kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// // Launch kernel on 1M elements on the GPU
// int blockSize = 256;
// int numBlocks = (N + blockSize - 1) / blockSize;
// add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// // Wait for GPU to finish before accessing on host
// cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// // Check for errors (all values should be 3.0f)
// float maxError = 0.0f;
// for (int i = 0; i < N; i++)
// maxError = fmax(maxError, fabs(y[i]-3.0f));
// std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
// // Free memory
// cudaFree(x);
// cudaFree(y);
return 0;
}
\ No newline at end of file
// return 0;
// }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment