Skip to content
Snippets Groups Projects
Commit d2be194c authored by Lubomir Riha's avatar Lubomir Riha
Browse files

update

parent 3da58453
No related branches found
No related tags found
No related merge requests found
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
I = -I../Include
CC = /usr/local/cuda/bin/nvcc -g -G -O0
LDLIBS += -lm
CS = $(LDFLAGS) ../Lib/libcsparse.dylib $(LDLIBS)
CS = $(LDFLAGS) ../Lib/libcsparse.a $(LDLIBS)
all: lib cs_demo1 cs_demo2 cs_demo3
- ./cs_demo1 < ../Matrix/t1
- ./cs_demo2 < ../Matrix/t1
- ./cs_demo2 < ../Matrix/ash219
- ./cs_demo2 < ../Matrix/bcsstk01
- ./cs_demo2 < ../Matrix/fs_183_1
- ./cs_demo2 < ../Matrix/mbeacxc
- ./cs_demo2 < ../Matrix/west0067
- ./cs_demo2 < ../Matrix/lp_afiro
- ./cs_demo2 < ../Matrix/bcsstk16
- ./cs_demo3 < ../Matrix/bcsstk01
- ./cs_demo3 < ../Matrix/bcsstk16
lib:
( cd ../Lib ; $(MAKE) )
......
......@@ -142,35 +142,35 @@ csi demo2 (problem *Prob)
printf ("blocks: %g singletons: %g structural rank: %g\n",
(double) nb, (double) ns, (double) sprank) ;
cs_dfree (D) ;
for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
{
if (!order && m > 1000) continue ;
printf ("QR ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
for (order = 0 ; order <= 3 ; order++) /* try all orderings */
{
if (!order && m > 1000) continue ;
printf ("LU ") ;
print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
printf ("time: %8.2f ", toc (t)) ;
print_resid (ok, C, x, b, resid) ; /* print residual */
}
if (!Prob->sym) return (1) ;
// for (order = 0 ; order <= 3 ; order += 3) /* natural and amd(A'*A) */
// {
// if (!order && m > 1000) continue ;
// printf ("QR ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_qrsol (order, C, x) ; /* min norm(Ax-b) with QR */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (m != n || sprank < n) return (1) ; /* return if rect. or singular*/
// for (order = 0 ; order <= 3 ; order++) /* try all orderings */
// {
// if (!order && m > 1000) continue ;
// printf ("LU ") ;
// print_order (order) ;
// rhs (x, b, m) ; /* compute right-hand side */
// t = tic () ;
// ok = cs_lusol (order, C, x, tol) ; /* solve Ax=b with LU */
// printf ("time: %8.2f ", toc (t)) ;
// print_resid (ok, C, x, b, resid) ; /* print residual */
// }
// if (!Prob->sym) return (1) ;
for (order = 0 ; order <= 1 ; order++) /* natural and amd(A+A') */
{
if (!order && m > 1000) continue ;
printf ("Chol ") ;
print_order (order) ;
printf ("Chol \n") ;
//print_order (order) ;
rhs (x, b, m) ; /* compute right-hand side */
t = tic () ;
ok = cs_cholsol (order, C, x) ; /* solve Ax=b with Cholesky */
......
......@@ -36,6 +36,7 @@ typedef struct cs_sparse /* matrix in compressed-column or triplet form */
cs *cs_add (const cs *A, const cs *B, double alpha, double beta) ;
csi cs_cholsol (csi order, const cs *A, double *b) ;
csi cs_cholsol_gpu (csi order, const cs *A, double *b) ;
cs *cs_compress (const cs *T) ;
csi cs_dupl (cs *A) ;
csi cs_entry (cs *T, csi i, csi j, double x) ;
......@@ -56,6 +57,10 @@ cs *cs_spfree (cs *A) ;
csi cs_sprealloc (cs *A, csi nzmax) ;
void *cs_malloc (csi n, size_t size) ;
//void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x);
//void cs_lsolve_acc (int n, const int *Lp, const int *Li, const double *Lx, double *x);
/* --- secondary CSparse routines and data structures ----------------------- */
typedef struct cs_symbolic /* symbolic Cholesky, LU, or QR analysis */
{
......
......@@ -16,7 +16,7 @@
LIBRARY = libcsparse
CF = $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -O
CC = /usr/local/cuda/bin/nvcc -dc -g -G -O0
I = -I../Include
RANLIB = ranlib
ARCHIVE = $(AR) $(ARFLAGS)
......@@ -29,7 +29,7 @@ all: install
CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \
cs_droptol.o cs_dropzeros.o cs_dupl.o cs_entry.o \
cs_etree.o cs_fkeep.o cs_gaxpy.o cs_happly.o cs_house.o cs_ipvec.o \
cs_lsolve.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
cs_lsolve.o cs_lsolve_gpu.o cs_ltsolve.o cs_lu.o cs_lusol.o cs_util.o cs_multiply.o \
cs_permute.o cs_pinv.o cs_post.o cs_pvec.o cs_qr.o cs_qrsol.o \
cs_scatter.o cs_schol.o cs_sqr.o cs_symperm.o cs_tdfs.o cs_malloc.o \
cs_transpose.o cs_compress.o cs_usolve.o cs_utsolve.o cs_scc.o \
......@@ -38,7 +38,7 @@ CS = cs_add.o cs_amd.o cs_chol.o cs_cholsol.o cs_counts.o cs_cumsum.o \
$(CS): ../Include/cs.h Makefile
%.o: ../Source/%.c ../Include/cs.h
%.o: ../Source/%.c* ../Include/cs.h
$(CC) $(CF) $(I) -c $<
static: $(AR_TARGET)
......
File deleted
#include "cs.h"
#include "cuda_runtime.h"
/* x=A\b where A is symmetric positive definite; b overwritten with solution */
csi cs_cholsol (csi order, const cs *A, double *b)
csi cs_cholsol_xxx (csi order, const cs *A, double *b)
{
double *x ;
css *S ;
......@@ -26,7 +29,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
}
csi cs_cholsol_gpu (csi order, const cs *A, double *b)
csi cs_cholsol (csi order, const cs *A, double *b)
{
double *x ;
css *S ;
......@@ -59,10 +62,10 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
double *d_Lx;
double *d_x;
cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) ;
cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) ;
cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) ;
cudaMalloc(&d_x , n * sizeof(double) ;
cudaMalloc(&d_Lp, ((N->L->n)+1) * sizeof(int) );
cudaMalloc(&d_Li, (N->L->nzmax) * sizeof(int) );
cudaMalloc(&d_Lx, (N->L->nzmax) * sizeof(double) );
cudaMalloc(&d_x , n * sizeof(double) );
cudaMemcpy( d_Lp, N->L->p , ((N->L->n)+1) * sizeof(int) , cudaMemcpyHostToDevice) ;
cudaMemcpy( d_Li, N->L->i , (N->L->nzmax) * sizeof(int) , cudaMemcpyHostToDevice) ;
......@@ -72,7 +75,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
// csi cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
// cs_lsolve_gpu <<<gridSize, blockSize>>> (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_gpu <<<1, 1>>> (n, d_Lp, d_Li, d_Lx, d_x);
cs_lsolve_acc (n, d_Lp, d_Li, d_Lx, d_x);
double *x_gpu;
x_gpu = cs_malloc (n, sizeof (double)) ;
......@@ -85,6 +88,7 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
for (i=0; i<n; i++) {
printf("cpu: %f - gpu: %f\n",x[i], x_gpu[i] );
}
printf("\n");
cs_ltsolve (N->L, x) ; /* x = L'\x */
cs_pvec (S->pinv, x, b, n) ; /* b = P'*x */
......@@ -93,4 +97,4 @@ csi cs_cholsol_gpu (csi order, const cs *A, double *b)
cs_sfree (S) ;
cs_nfree (N) ;
return (ok) ;
}
\ No newline at end of file
}
......@@ -12,107 +12,112 @@ csi cs_lsolve (const cs *L, double *x)
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [Li [p]] -= Lx [p] * x [j] ;
printf("XPU: %f", x [Li [p]]);
}
}
return (1) ;
}
/* solve Lx=b where x and b are dense. x=b on input, solution on output. */
// /* solve Lx=b where x and b are dense. x=b on input, solution on output. */
//csi cs_lsolve_gpu (const cs *L, double *x)
// typedef struct cs_sparse /* matrix in compressed-column or triplet form */
// {
// csi nzmax ; /* maximum number of entries */
// csi m ; /* number of rows */
// csi n ; /* number of columns */
// csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */
// csi *i ; /* row indices, size nzmax */
// double *x ; /* numerical values, size nzmax */
// csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */
// } cs ;
__global__
void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
{
int p, j;
// //csi cs_lsolve_gpu (const cs *L, double *x)
// // typedef struct cs_sparse /* matrix in compressed-column or triplet form */
// // {
// // csi nzmax ; /* maximum number of entries */
// // csi m ; /* number of rows */
// // csi n ; /* number of columns */
// // csi *p ; /* column pointers (size n+1) or col indices (size nzmax) */
// // csi *i ; /* row indices, size nzmax */
// // double *x ; /* numerical values, size nzmax */
// // csi nz ; /* # of entries in triplet matrix, -1 for compressed-col */
// // } cs ;
// // __global__
// // void cs_lsolve_gpu (int n, const int *Lp, const int *Li, const double *Lx, double *x)
// // {
// // int p, j;
// double *Lx ;
// n = L->n ;
// Lp = L->p ;
// Li = L->i ;
// Lx = L->x ;
// // // double *Lx ;
// // // n = L->n ;
// // // Lp = L->p ;
// // // Li = L->i ;
// // // Lx = L->x ;
for (j = 0 ; j < n ; j++)
{
x [j] /= Lx [Lp [j]] ;
for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
{
x [Li [p]] -= Lx [p] * x [j] ;
}
}
// // for (j = 0 ; j < n ; j++)
// // {
// // x [j] /= Lx [Lp [j]] ;
// // for (p = Lp [j]+1 ; p < Lp [j+1] ; p++)
// // {
// // x [Li [p]] -= Lx [p] * x [j] ;
// // }
// // }
return (1) ;
}
// // //return (1);
// // }
#include <iostream>
#include <math.h>
// #include <iostream>
// #include <math.h>
// CUDA kernel to add elements of two arrays
__global__
void add(int n, float *x, float *y)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
y[i] = x[i] + y[i];
}
// // CUDA kernel to add elements of two arrays
// __global__
// void add(int n, float *x, float *y)
// {
// int index = blockIdx.x * blockDim.x + threadIdx.x;
// int stride = blockDim.x * gridDim.x;
// for (int i = index; i < n; i += stride)
// y[i] = x[i] + y[i];
// }
int main(void)
{
int N = 1<<20;
float *x, *y;
// int main(void)
// {
// int N = 1<<20;
// float *x, *y;
// Allocate Unified Memory -- accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// // Allocate Unified Memory -- accessible from CPU or GPU
// cudaMallocManaged(&x, N*sizeof(float));
// cudaMallocManaged(&y, N*sizeof(float));
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
// // initialize x and y arrays on the host
// for (int i = 0; i < N; i++) {
// x[i] = 1.0f;
// y[i] = 2.0f;
// }
// Prefetch the data to the GPU
int device = -1;
cudaGetDevice(&device);
cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);
// // Prefetch the data to the GPU
// int device = -1;
// cudaGetDevice(&device);
// cudaMemPrefetchAsync(x, N*sizeof(float), device, NULL);
// cudaMemPrefetchAsync(y, N*sizeof(float), device, NULL);
// Launch kernel on 1M elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
// // Launch kernel on 1M elements on the GPU
// int blockSize = 256;
// int numBlocks = (N + blockSize - 1) / blockSize;
// add<<<numBlocks, blockSize>>>(N, x, y);
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
// // Wait for GPU to finish before accessing on host
// cudaDeviceSynchronize();
// Check for errors (all values should be 3.0f)
float maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = fmax(maxError, fabs(y[i]-3.0f));
std::cout << "Max error: " << maxError << std::endl;
// // Check for errors (all values should be 3.0f)
// float maxError = 0.0f;
// for (int i = 0; i < N; i++)
// maxError = fmax(maxError, fabs(y[i]-3.0f));
// std::cout << "Max error: " << maxError << std::endl;
// Free memory
cudaFree(x);
cudaFree(y);
// // Free memory
// cudaFree(x);
// cudaFree(y);
return 0;
}
\ No newline at end of file
// return 0;
// }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment