diff --git a/CSparse/Source/cs_cholsol.c b/CSparse/Source/cs_cholsol.c
index 59bd4ee51c02b1bc14277357bea34e8df2ed13b3..aeafe9ef1c9b7ea53f4f6395edc6da5681c375a8 100644
--- a/CSparse/Source/cs_cholsol.c
+++ b/CSparse/Source/cs_cholsol.c
@@ -51,7 +51,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     int total_num_of_LSC_perGPU = GPUmem_sizeGB / LSCsize * 1024.0;
     int blocks = total_num_of_LSC_perGPU;
 
-    blocks = 1;
+    blocks = 2;
 
     n_rhs = 2;
 
@@ -97,9 +97,9 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     }
     // END - Copy RHS vector to multiple columns
 
-// #define FULL_MEM
+#define FULL_MEM
 #ifdef FULL_MEM
-
+    {
     int GPU_mem = 0;
     int num_of_arrays = blocks;
 
@@ -134,6 +134,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
     // allocate each device row-pointer, then copy host data to it
+    int i;
     for(i = 0 ; i < num_of_arrays ; i++){
 
         cudaMalloc(&h_array_Lp[i], ((N->L->n)+1) * sizeof(int));
 
@@ -171,8 +172,18 @@ csi cs_cholsol (csi order, const cs *A, double *b)
     cs_lsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
     cs_ltsolve_gpu_trans_multi (n, d_array_Lp, d_array_Li, d_array_Lx, d_array_x, n_rhs, n_rhs, blocks);
 
-#else
+    double** x_gpu_array = (double**)malloc(num_of_arrays * sizeof(double*));
+    for(i = 0 ; i < num_of_arrays ; i++) {
+        x_gpu_array[i] = cs_malloc ( n_rhs * n, sizeof (double)) ;
+        cudaMemcpy(x_gpu_array[i], h_array_x [i], n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
+    }
+
+    double *x_gpu = x_gpu_array[1];
+
+    }
+#else
+    {
     // *** Vesion with
     // Copy Chol. factor and RHSs from CPU to GPU
     int *d_Lp;
 
@@ -197,7 +208,7 @@ csi cs_cholsol (csi order, const cs *A, double *b)
 
     double *x_gpu;
     x_gpu = cs_malloc ( n_rhs * n, sizeof (double)) ;
     cudaMemcpy(x_gpu, d_x, n * n_rhs * sizeof(double), cudaMemcpyDeviceToHost );
-
+    }
 
 #endif