diff --git a/preconditioner.h b/preconditioner.h index 39dc9c882e7a5bdee7b41b11adbd79308ce8ac6a..ab3f5b783aa08f268953a456ee4cebf959c8ae42 100644 --- a/preconditioner.h +++ b/preconditioner.h @@ -57,37 +57,7 @@ template<class FT> __global__ void thomas_kernel(int const m, int block_size, int num_blocks, FT const alpha, FT const beta, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= m*num_blocks) { - return; - } - - int start = (tid / m) * block_size * m + (tid % m); - - FT work_buffer_reg = b[start] * beta / (FT(2) + alpha); - - x[start] = work_buffer_reg; - for (int i = 1; i < block_size; ++i) { - int j = start + i*m; - work_buffer_reg = (b[j] * beta + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - x[j] = work_buffer_reg; - } - - //FT x_reg = work_buffer_reg; - - //FT x_reg = x[start + (block_size-1)*m]; - //x[start + (block_size-1)*m] = x_reg; - for (int i = block_size-2; i >= 0; --i) { - int j = start + i*m; - work_buffer_reg = x[j] - dev_c_prime[i] * work_buffer_reg; - x[j] = work_buffer_reg; - } -} - -template<class FT> -__global__ void thomas_kernel_x(int const m, int block_size, int num_blocks, FT const alpha, FT const beta, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid >= m*num_blocks) { return; } @@ -103,11 +73,6 @@ __global__ void thomas_kernel_x(int const m, int block_size, int num_blocks, FT x[j] = work_buffer_reg; } - //FT x_reg = work_buffer_reg; - - //FT x_reg = x[start + (block_size-1)*m]; - //x[start + (block_size-1)*m] = x_reg; - for (int i = block_size-2; i >= 0; --i) { int j = start + i*m; work_buffer_reg = x[j] - dev_c_prime[i] * work_buffer_reg; @@ -126,11 +91,7 @@ __global__ void thomas_kernel_trans(int const m, int block_size, int num_blocks, // b - input // x - output - //__shared__ FT sh_b[THREADS+1][THREADS]; - - //TODO: POZOR - //__shared__ FT sh_x[THREADS+1][THREADS]; - __shared__ FT sh_x[1][1]; + __shared__ FT sh_x[THREADS+1][THREADS]; int tid_l = threadIdx.x; int bid = blockIdx.x; @@ -207,184 +168,6 @@ __global__ void thomas_kernel_trans(int const m, int block_size, int num_blocks, } - - - - - - - - -// // dev_c_prime - precalculated factors -// // b - input -// // x - output -// -// __shared__ FT sh_b[THREADS+1][THREADS]; -// __shared__ FT sh_x[THREADS+1][THREADS]; -// -// int tid_l = threadIdx.x; -// int bid = blockIdx.x; -// int tid = blockIdx.x * blockDim.x + threadIdx.x; -// FT work_buffer_reg; -// -// //if (bid == 1) return; -// -// #pragma unroll -// for (int i = 0; i < THREADS; i++) { -// int adr = tid_l + m*(i+bid*THREADS) + 0*THREADS; -// FT val = b[ adr ]; -// sh_b[tid_l][i] = val; -// //if (tid_l == 0) printf("Global addr = %d - %f \n", adr, sh_b[i][tid_l]); -// } -// -// //FT work_buffer_reg = sh_b[0][tid_l] * beta / (FT(2) + alpha); -// //sh_x[0][tid_l] = work_buffer_reg; -// //if (tid_l == 0) printf(" x = %f \n",work_buffer_reg); -// -// for (int l = 0; l < M/THREADS; l++) { -// #pragma unroll -// for (int i = 0; i < THREADS; ++i) { -// // input -// FT b_l = sh_b[i][tid_l]; -// FT c_prime = dev_c_prime[l*THREADS+i-1]; -// // calc - ... -// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime); -// // output -// sh_x[i][tid_l] = work_buffer_reg; -// //if (tid_l == 0) printf(" x = %f \n",work_buffer_reg); -// } -// -// // save temp res and get new input data -// if ( l < (M/THREADS - 1) ) { -// #pragma unroll -// for (int i = 0; i < THREADS; i++) { -// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i]; -// sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ]; -//// if (tid_l == 0) printf("Global addr = %d", tid_l + m*(i+bid*THREADS) + l *THREADS); -//// if (tid_l == 0) printf(" - %d \n", tid_l + m*(i+bid*THREADS) + (l+1)*THREADS); -// } -// } -// } -// -//// #pragma unroll -//// for (int i = 0; i < THREADS; ++i) { -//// // input -//// FT b_l = sh_b[i][tid_l]; -//// FT c_prime = dev_c_prime[1*THREADS+i-1]; -//// // calc - ... -//// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime); -//// // output -//// sh_x[i][tid_l] = work_buffer_reg; -//// if (tid_l == 0) printf(" x = %f \n",work_buffer_reg); -//// } -// -// // backward -// int level = M/THREADS - 1; -// #pragma unroll -// for (int i = THREADS-2; i >= 0; --i) { -// // input -// FT x_l = sh_x[i][tid_l]; -// // calc - ... -// work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg; -// // output -// sh_x[i][tid_l] = work_buffer_reg; -//// if (tid_l == 0) printf(" x = %f \n",work_buffer_reg); -// } -// -// for (int l = level; l >= 1 ; l--) { -// // save temp res and get new input data -// #pragma unroll -// for (int i = 0; i < THREADS; i++) { -// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i]; -// sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ]; -//// if (tid_l == 0) printf("Global addr = %d", tid_l + m*(i+bid*THREADS) + (l-1)*THREADS); -//// if (tid_l == 0) printf(" - %d \n", tid_l + m*(i+bid*THREADS) + l *THREADS); -// } -// -// #pragma unroll -// for (int i = THREADS-1; i >= 0; --i) { -// // input -// FT x_l = sh_x[i][tid_l]; -// // calc - ... -// work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg; -// // output -// sh_x[i][tid_l] = work_buffer_reg; -//// if (tid_l == 0) printf(" x = %f \n",work_buffer_reg); -// -// } -// -// } -// -// #pragma unroll -// for (int i = 0; i < THREADS; i++) { -// x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i]; -// } - - - -// // dev_c_prime - precalculated factors -// // b - input -// // x - output -// -// __shared__ FT sh_b[M * M]; -// __shared__ FT sh_x[M * M]; -// -// int tid_l = threadIdx.x; -// int tid = blockIdx.x * blockDim.x + threadIdx.x; -// if (tid >= m*num_blocks) { -// return; -// } -// -// for (int i = 0; i < m; i++) { -// sh_b[ tid_l + m*i] = b[ tid_l + m*i]; -// } -// -// //int start = (tid / m) * block_size * m + (tid % m); // start incerements contigously by tid .. 0,1,2,3, ... m -// int start = tid*m; -// -// //FT work_buffer_reg = b[start] * beta / (FT(2) + alpha); -// FT work_buffer_reg = sh_b[start] * beta / (FT(2) + alpha); -// // x[start] = work_buffer_reg; -// sh_x[start] = work_buffer_reg; -// -// for (int i = 1; i < block_size; ++i) { -// // address -// //int j = start + i*m; // index for input data - start is contigous 0,1,2,3, .. + (0,1,2,3 ... ) * m -// int j = start + i; -// // input -// //FT b_l = b[j]; -// FT b_l = sh_b[j]; -// // calc - ... -// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); -// // output -// sh_x[j] = work_buffer_reg; -// } -// -// //FT x_reg = x[start + (block_size-1)*m]; -// FT x_reg = sh_x[start + block_size-1]; -// //x[start + (block_size-1)*m] = x_reg; -// sh_x[start + block_size-1] = x_reg; -// -// for (int i = block_size-2; i >= 0; --i) { -// // adress -// //int j = start + i*m; -// int j = start + i; -// // input -// FT x_l = sh_x[j]; -// // calc - ... -// x_reg = x_l - dev_c_prime[i] * x_reg; -// // output -// // x[j] = x_reg; -// sh_x[j] = x_reg; -// } -// -// __syncthreads(); -// -// for (int i = 0; i < m; i++) { -// x[ tid_l + m*i] = sh_x[ tid_l + m*i]; -// } - - } @@ -550,9 +333,9 @@ public: FT *y = b_trans; cublas_transpose(cublas_handle, m, y_trans, y); - thomas_kernel_x<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x); + thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x); - std::cout << "Tk 1 - cublas trans " << std::endl; + //std::cout << "Tk 1 - cublas trans " << std::endl; } else { @@ -563,102 +346,21 @@ public: FT const * const b_trans_const = b_trans; thomas_kernel <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans_const, x); //thomas_kernel <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, x); - std::cout << "Tk 1 - implicit trans " << std::endl; + //std::cout << "Tk 1 - implicit trans " << std::endl; } } virtual void run_t(FT const * const b, FT * const x) { - - - //int block_count = M/THREADS; - //int threads_per_block = THREADS; - - //thomas_kernel_trans<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b, b_trans); - thomas_kernel_trans<FT><<<M/THREADS, THREADS >>> (m, m, 1, alpha, beta, c_prime, b, b_trans); - - int block_count = divide_and_round_up(m, thomas_kernel_block_size); - int threads_per_block = thomas_kernel_block_size; -// thomas_kernel <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, x); - - - cublas_transpose(cublas_handle, m, b, b_trans); - FT * const y_trans = x; - thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans); - FT * const y = b_trans; - cublas_transpose(cublas_handle, m, y_trans, y); - thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x); - - -// std::cout << "Blocks: "<< block_count << "; threads: " << threads_per_block << std::endl; -// -// std::cout.setf(std::ios_base::scientific, std::ios_base::floatfield); -// std::cout.precision(6); -// -// cublas_transpose(cublas_handle, m, b, x); -// thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, x, b_trans); -// cublas_transpose(cublas_handle, m, b_trans, x); -// -// FT *h_x; -// h_x = (FT * ) malloc ( (m*m)*sizeof(FT) ); -// -// cudaMemcpy( h_x, x, (m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); -// for (int i = 0; i < m*m; i++) { -// if (i % m == 0) std::cout << std::endl; -// std::cout << h_x[i] << "\t"; -// } -// std::cout << std::endl; -// -// device_memset<FT>(x, FT(0), m*m); -// -// thomas_kernel_trans<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b, x); -// -// cudaMemcpy( h_x, x, (m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); -// for (int i = 0; i < m*m; i++) { -// if (i % m == 0) std::cout << std::endl; -// std::cout << h_x[i] << "\t"; -// } -// std::cout << std::endl; -// -// std::cout.unsetf ( std::ios::floatfield ); // floatfield not set - - - - std::cout << "Tk 1_t" << std::endl; - } virtual void run2(FT const * const b, FT * const x) { - FT block_count = divide_and_round_up(m, thomas_kernel_block_size); - FT threads_per_block = thomas_kernel_block_size; - - cublas_transpose(cublas_handle, m, b, b_trans); - FT *y_trans = x; - thomas_kernel2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT) >>>(m, m, 1, alpha, beta, c_prime, b_trans, y_trans); - FT *y = b_trans; - cublas_transpose(cublas_handle, m, y_trans, y); - - thomas_kernel2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT) >>>(m, m, 1, alpha, beta, c_prime, y, x); - - std::cout << "Tk 2" << std::endl; - } + } virtual void run3(FT const * const b, FT * const x) { - FT block_count = divide_and_round_up(m, thomas_kernel_block_size); - FT threads_per_block = thomas_kernel_block_size; - - cublas_transpose(cublas_handle, m, b, b_trans); - FT *y_trans = x; - thomas_kernel3<FT><<<block_count, threads_per_block, 2 * threads_per_block * (m+1) * sizeof(FT) >>>(m, m, 1, alpha, beta, c_prime, b_trans, y_trans); - FT *y = b_trans; - cublas_transpose(cublas_handle, m, y_trans, y); - - thomas_kernel3<FT><<<block_count, threads_per_block, 2 * threads_per_block * (m+1) * sizeof(FT) >>>(m, m, 1, alpha, beta, c_prime, y, x); - - std::cout << "Tk 3" << std::endl; } @@ -670,678 +372,678 @@ public: }; -/* - * Kernel that performs the thomas algorithm for the 3D problem. Used for ThomasPreconditioner3 - * - * @param FT Field Type - Either float or double - * @param m grid size i.e. M is of size mxm - * @param n number of vectors that we invert simultanously. Usually has value m*m - * @param alpha > 0. - * @param alpha_23 = alpha^(2/3) - * @param c_prime coefficients for the thomas algorithm that where precualculated - * @param b != NULL input vector of size m*n - * @param x != NULL output vector of size m*n - * - * @return M\X - * - */ - -template<class FT> -__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= n) { - return; - } - - int start = tid; - - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - x[start] = work_buffer_reg; - - for (int i = 1; i < m; ++i) { - int j = start + i*n; - work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - x[j] = work_buffer_reg; - } - - FT x_reg = x[start + (m-1)*n]; - x[start + (m-1)*n] = x_reg; - - for (int i = m-2; i >= 0; --i) { - int j = start + i*n; - x_reg = x[j] - dev_c_prime[i] * x_reg; - x[j] = x_reg; - } -} - - -template<class FT> -__global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - int n = m*m; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= n) { - return; - } - - int start = tid; - - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - x[start] = work_buffer_reg; - - for (int i = 1; i < m; ++i) { - int j = start + i*n; - work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - x[j] = work_buffer_reg; - } - - FT x_reg = x[start + (m-1)*n]; - x[start + (m-1)*n] = x_reg; - - for (int i = m-2; i >= 0; --i) { - int j = start + i*n; - x_reg = x[j] - dev_c_prime[i] * x_reg; - x[j] = x_reg; - } -} - - -template<class FT> -__global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - int n = m*m; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (tid >= n) { - return; - } - - int start = (n) * (tid/m) + (tid % m); // + (i * m) - - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - x[start] = work_buffer_reg; - - for (int i = 1; i < m; ++i) { - int j = start + i*m; - work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - x[j] = work_buffer_reg; - } - - FT x_reg = x[start + (m-1)*m]; - x[start + (m-1)*m] = x_reg; - - for (int i = m-2; i >= 0; --i) { - int j = start + i*m; - x_reg = x[j] - dev_c_prime[i] * x_reg; - x[j] = x_reg; - } -} - - - -template<class FT> -__global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - //#define TILE_SIZE 2 - //TODO: Should be #define - //int TILE_SIZE = blockDim.x; - - __shared__ FT sh_b[TILE_SIZE][TILE_SIZE+1]; - __shared__ FT sh_x[TILE_SIZE][TILE_SIZE+1]; - - int TILES = m / TILE_SIZE; - - int tid_l = threadIdx.x; - int bid = blockIdx.x; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - - // Basis of an adress used to read dat from global memory to tiles in shared memory - // - this is rused multiple times - //int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m; - int base_addr = tid_l + - (tid / m) * m + - (m*m) * TILE_SIZE * ( bid % (m / TILE_SIZE)); - // + tile * TILE_SIZE - // + i*m*m - - // ************************************************************************************************************** - // *** Forward substitution ************************************************************************************ - - // Read input data to fill the first tile in shared memoru - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int a = base_addr + m*m*i; - sh_b[tid_l][i] = b[a]; - //printf("tid = %d - SM a = [%d,%d] - g a = %d ; val = %f \n", tid, tid_l, i, a, b[ a ] ); - } - - // Calculate first element of the forward substitution - FT work_buffer_reg = sh_b[0][tid_l] * alpha_23 / (FT(2) + alpha); - sh_x[0][tid_l] = work_buffer_reg; - //printf("A tid = %d - work_buffer_reg = %f ; in_val = %f \n", tid, work_buffer_reg, sh_b[0][tid_l]); - - // Calculate the rest of the forward substitution for the first tile - #pragma unroll - for (int i = 1; i < TILE_SIZE; ++i) { - int ca = i - 1; - work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]); - sh_x[i][tid_l] = work_buffer_reg; - //printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); - } - - // Save results of for the first tile to the global memory - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int a = base_addr + m*m*i; - x[a] = sh_x[tid_l][i]; - //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - // Processing of the remaining tiles - for (int tile = 1; tile < TILES; tile++) { - - // Read data from global memory to tile in shared memory - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int a = base_addr + m*m*i + (tile * TILE_SIZE); - sh_b[tid_l][i] = b[a]; - //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - // Calculate forward substitution for the current tile - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int ca = (tile * TILE_SIZE) + i - 1; - work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]); - sh_x[i][tid_l] = work_buffer_reg; - //printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); - } - - // Save the results of the forward substitution of the current tile to a global memory - this does not have to be done fot the last tile - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int a = base_addr + m*m*i + (tile * TILE_SIZE); - x[a] = sh_x[tid_l][i]; - //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - - } - // *** END - Forward substitution ************************************************************************************ - // ************************************************************************************************************** - - __syncthreads(); - - // ************************************************************************************************************** - // *** Backward substitution ************************************************************************************ - - // Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory - #pragma unroll - for (int i = TILE_SIZE-2; i >= 0; --i) { - int ca = (TILES-1) * TILE_SIZE + i; - work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[ ca ] * work_buffer_reg; - sh_x[i][tid_l] = work_buffer_reg; - //printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i); - - } - - // Backward substitution - last TILE - store results from tile in shared memory to global memory - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - int a = base_addr + m*m*i + (TILES-1) * TILE_SIZE; //m - TILE_SIZE; - x[ a ] = sh_x[tid_l][i]; - //printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - // Backward substitution - remainig tiles - for (int tile = TILES - 2; tile >= 0; tile--) { - - // Load new tile from global memory to tile in shared memory - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - //sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ]; - int a = base_addr + m*m*i + tile * TILE_SIZE; - sh_b[tid_l][i] = x[ a ]; - //printf("Lod1 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - // compute backward substitution - use date stored in tile in shared memory - #pragma unroll - for (int i = TILE_SIZE-1; i >= 0; --i) { - int ca = tile * TILE_SIZE + i; - work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[ca] * work_buffer_reg; - sh_x[i][tid_l] = work_buffer_reg; - //printf("B1 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); - } - - // Store current tile from shared memory to global memory - #pragma unroll - for (int i = 0; i < TILE_SIZE; i++) { - //sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ]; - int a = base_addr + m*m*i + tile * TILE_SIZE; - x[a] = sh_x[tid_l][i]; - //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); - } - - } - // *** Backward substitution - END ****************************************************************************** - // ************************************************************************************************************** - - return; - - - -} - - - - -template<class FT> -__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) { - -#define T W -//#define THREADS 4 -//#define BLOCKS 4 - - __shared__ FT SM [THREADS/T][T][T]; - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int tmt = threadIdx.x % T; - int tdt = threadIdx.x / T; - - FT x_reg; - - if (tid >= n) { - return; - } - - int start = tid; - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - - tmp_g[start] = work_buffer_reg; - for (int i = 1; i < m; ++i) { - int j = start + i*n; - double input = b[j]; - double c_prime = dev_c_prime[i-1]; - - work_buffer_reg = (input * alpha_23 + work_buffer_reg) / (double(2) + alpha + c_prime); - tmp_g[j] = work_buffer_reg; - } - - x_reg = tmp_g[start + (m-1)*n]; - SM[tdt][tmt][0] = x_reg; - - for (int i = 1; i < T; i++) { - int j = start + (m-1-i)*n; - x_reg = tmp_g[j] - dev_c_prime[ (m-1-i) ] * x_reg; - SM[tdt][tmt][i] = x_reg; - } - - - int addr1 = T * tdt; - int addr2 = blockIdx.x; - int addr3 = m - 1 - tmt; - int addr4; - - int addr = (addr2 * m * m) + addr3; - - for (int i = 0; i < T; i++) { - x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt]; - } - - for (int T_loop = 1; T_loop < m/T; T_loop++) { - - int T_offset = T * T_loop; - addr4 = m - 1 - T_offset; - addr3 = addr4 - tmt; - addr = (addr2 * m * m) + addr3; - - for (int i = 0; i < T; i++) { - int g_addr = addr4 - i; - int j = start + n * g_addr; - - x_reg = tmp_g[j] - dev_c_prime[ g_addr ] * x_reg; - SM[tdt][tmt][i] = x_reg; - } - - for (int i = 0; i < T; i++) { - x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt]; - } - - - } - - - - // int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced] - // return s*m + r*m*m + c; - // } - - -} - - - -// Thomas algorithm with storing intermediate results in shared memory - saves one global memory read and global memory write into x - compare with previous Thomas kernel -template<class FT> -__global__ void thomas_kernel3D2(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - extern __shared__ FT xs[]; - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int tid_l = threadIdx.x; - int block_size = m; - - if (tid >= n) { - return; - } - - int start = tid; - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - - //x[start] = work_buffer_reg; - xs[tid_l * block_size + 0 + tid_l] = work_buffer_reg; - for (int i = 1; i < m; ++i) { - int j = start + i*n; - work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - //x[j] = work_buffer_reg; - xs[ tid_l * block_size + i + tid_l ] = work_buffer_reg; - } - - FT x_reg = work_buffer_reg; //x[start + (m-1)*n]; - x[start + (m-1)*n] = x_reg; - for (int i = m-2; i >= 0; --i) { - int j = start + i*n; - //x_reg = x[j] - dev_c_prime[i] * x_reg; - x_reg = xs[tid_l * block_size + i + tid_l] - dev_c_prime[i] * x_reg; - x[j] = x_reg; - } - -} - - - -template<class FT> -__global__ void thomas_kernel3D3(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - - -} - - -/* - * Preconditioner for the 3D problem. Uses the thomas algorithm to invert M. - * - * @param FT Field Type - Either float or double - * - */ -template<class FT> -class ThomasPreconditioner3D : public Preconditioner<FT> { -private: - FT *c_prime; - int m; - cublasHandle_t cublas_handle; - FT *b_trans; - FT alpha, alpha_23; - - int thomas_kernel_block_size; -public: - virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { - this->m = m; - this->alpha = alpha; - this-> cublas_handle = cublas_handle; - - FT *host_c_prime = new FT[m]; - - alpha_23 = pow(alpha, FT(2)/FT(3)); - - host_c_prime[0] = FT(-1) / (alpha + FT(2)); - for (int i = 1; i < m; ++i) { - host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); - } - - cudaMalloc((void **) &c_prime, m*sizeof(FT)); - cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); - - delete[] host_c_prime; - - cudaMalloc((void **) &b_trans, m*m*m*sizeof(FT)); - - int minGridSize; - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &thomas_kernel_block_size, thomas_kernel3D<FT>, 0, m*m); - - - - - } - - virtual void run(FT const * const b, FT * const x) { - FT block_count = m; //divide_and_round_up(m*m, thomas_kernel_block_size); - FT threads_per_block = m; //thomas_kernel_block_size; - - std::cout << "Blocks = " << block_count << std::endl; - std::cout << "threads_per_block = " << threads_per_block << std::endl; - - //int n = m*m*m; - - // delete -------------- - - FT *h_x; - h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) ); - FT *h_x2; - h_x2 = (FT * ) malloc ( (m*m*m)*sizeof(FT) ); - - cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - if (m <= 8) std::cout << h_x[i] << "\t"; - } - if (m <= 8) std::cout << std::endl; - - FT *xx = x; - - FT *bb; - cudaMalloc((void **) &bb, m*m*m*sizeof(FT)); - FT *bbb; - cudaMalloc((void **) &bbb, m*m*m*sizeof(FT)); - FT *bbbb; - cudaMalloc((void **) &bbbb, m*m*m*sizeof(FT)); - - - // Ker 1 ************* - - device_memset<FT>(xx, FT(0), m*m*m); - thomas_kernel3D_X1<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, b, xx); //bb); - - FT sum = 0.0; - cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - //std::cout << h_x[i] << "\t"; - if (m <= 8) printf("%4.1f\t",h_x[i]); - - sum+=h_x[i]; - } - if (m <= 8) std::cout << std::endl; - std::cout << sum << std::endl; - - // Ker 2 ***************** - - cublas_transpose2(cublas_handle, m, m*m, b, bb); - device_memset<FT>(xx, FT(0), m*m*m); - cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - if (m <= 8) std::cout << h_x[i] << "\t"; - } - if (m <= 8) std::cout << std::endl; - - thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb); - cublas_transpose2(cublas_handle, m*m, m, xx, bbb); - - - cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - sum = 0.0; - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - //std::cout << h_x[i] << "\t"; - if (m <= 8) printf("%4.1f\t",h_x[i]); - sum+=h_x[i]; - h_x2[i] = h_x[i]; - } - if (m <= 8) std::cout << std::endl; - std::cout << sum <<std::endl; - - - // Ker 3 ***************** - - cublas_transpose2(cublas_handle, m, m*m, b, bb); - cublas_transpose2(cublas_handle, m, m*m, bb, bbb); - - device_memset<FT>(xx, FT(0), m*m*m); - cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - if (m <= 8) std::cout << h_x[i] << "\t"; - } - if (m <= 8) std::cout << std::endl; - - int blocks = m*m / TILE_SIZE; - int threads = TILE_SIZE; - //blocks = 1; - //threads = 2; - std::cout << "\nThomas 3D Tiled kernel - Blocks: " << blocks << " Threads = " << threads << "\n"; - - thomas_kernel3D_XT<FT><<<blocks, threads>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb); - - cublas_transpose2(cublas_handle, m*m, m, xx, bbb); - cublas_transpose2(cublas_handle, m*m, m, bbb, xx); - - cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); - sum = 0.0; - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - //std::cout << h_x[i] << "\t"; - if (m <= 8) - printf("%4.1f\t",h_x[i]); - sum+=h_x[i]; - } - if (m <= 8) std::cout << std::endl; - std::cout << sum <<std::endl; - - - sum = 0.0; - for (int i = 0; i < m*m*m; i++) { - if (m <= 8) - if (i % (m*m) == 0) - std::cout << std::endl; - //std::cout << h_x[i] << "\t"; - if (m <= 8) - printf("%4.1f\t",h_x2[i] - h_x[i]); - - sum+=h_x[i]; - } - if (m <= 8) std::cout << std::endl; - std::cout << sum <<std::endl; - - - cudaFree(bb); - cudaFree(bbb); - cudaFree(bbbb); - - - //device_memset<FT>(x, FT(0), m*m); - - // delete -------------- - - return; - - FT *y = x; - thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, y); - - FT *y_trans = b_trans; - cublas_transpose2(cublas_handle, m*m, m, y, y_trans); - - FT *z_trans = y; - thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans); - - FT *z_trans2 = y_trans; - cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); - FT *x_trans2 = z_trans; - - thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2); - FT *x_trans = z_trans2; - cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); - - cublas_transpose2(cublas_handle, m, m*m, x_trans, x); - } - - - virtual void run2(FT const * const b, FT * const x) { - - std::cout << "thomas_kernel_block_size = " << thomas_kernel_block_size << std::endl; - - int inverses_per_block = 32; - - FT block_count = divide_and_round_up(m*m, inverses_per_block); // thomas_kernel_block_size); - FT threads_per_block = inverses_per_block; //thomas_kernel_block_size; - - //int n = m*m*m; - - - FT *y = x; - thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, b, y); - - FT *y_trans = b_trans; - cublas_transpose2(cublas_handle, m*m, m, y, y_trans); - - FT *z_trans = y; - thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans); - - FT *z_trans2 = y_trans; - cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); - FT *x_trans2 = z_trans; - - thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2); - FT *x_trans = z_trans2; - cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); - - cublas_transpose2(cublas_handle, m, m*m, x_trans, x); - } - - virtual void run3(FT const * const b, FT * const x) { - } - - virtual void run_t(FT const * const b, FT * const x) { - } - - ~ThomasPreconditioner3D() { - cudaFree(c_prime); - cudaFree(b_trans); - } - -}; +///* +// * Kernel that performs the thomas algorithm for the 3D problem. Used for ThomasPreconditioner3 +// * +// * @param FT Field Type - Either float or double +// * @param m grid size i.e. M is of size mxm +// * @param n number of vectors that we invert simultanously. Usually has value m*m +// * @param alpha > 0. +// * @param alpha_23 = alpha^(2/3) +// * @param c_prime coefficients for the thomas algorithm that where precualculated +// * @param b != NULL input vector of size m*n +// * @param x != NULL output vector of size m*n +// * +// * @return M\X +// * +// */ +// +//template<class FT> +//__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// +// if (tid >= n) { +// return; +// } +// +// int start = tid; +// +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// x[start] = work_buffer_reg; +// +// for (int i = 1; i < m; ++i) { +// int j = start + i*n; +// work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); +// x[j] = work_buffer_reg; +// } +// +// FT x_reg = x[start + (m-1)*n]; +// x[start + (m-1)*n] = x_reg; +// +// for (int i = m-2; i >= 0; --i) { +// int j = start + i*n; +// x_reg = x[j] - dev_c_prime[i] * x_reg; +// x[j] = x_reg; +// } +//} +// +// +//template<class FT> +//__global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// int n = m*m; +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// +// if (tid >= n) { +// return; +// } +// +// int start = tid; +// +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// x[start] = work_buffer_reg; +// +// for (int i = 1; i < m; ++i) { +// int j = start + i*n; +// work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); +// x[j] = work_buffer_reg; +// } +// +// FT x_reg = x[start + (m-1)*n]; +// x[start + (m-1)*n] = x_reg; +// +// for (int i = m-2; i >= 0; --i) { +// int j = start + i*n; +// x_reg = x[j] - dev_c_prime[i] * x_reg; +// x[j] = x_reg; +// } +//} +// +// +//template<class FT> +//__global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// int n = m*m; +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// +// if (tid >= n) { +// return; +// } +// +// int start = (n) * (tid/m) + (tid % m); // + (i * m) +// +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// x[start] = work_buffer_reg; +// +// for (int i = 1; i < m; ++i) { +// int j = start + i*m; +// work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); +// x[j] = work_buffer_reg; +// } +// +// FT x_reg = x[start + (m-1)*m]; +// x[start + (m-1)*m] = x_reg; +// +// for (int i = m-2; i >= 0; --i) { +// int j = start + i*m; +// x_reg = x[j] - dev_c_prime[i] * x_reg; +// x[j] = x_reg; +// } +//} +// +// +// +//template<class FT> +//__global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// //#define TILE_SIZE 2 +// //TODO: Should be #define +// //int TILE_SIZE = blockDim.x; +// +// __shared__ FT sh_b[TILE_SIZE][TILE_SIZE+1]; +// __shared__ FT sh_x[TILE_SIZE][TILE_SIZE+1]; +// +// int TILES = m / TILE_SIZE; +// +// int tid_l = threadIdx.x; +// int bid = blockIdx.x; +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// +// // Basis of an adress used to read dat from global memory to tiles in shared memory +// // - this is rused multiple times +// //int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m; +// int base_addr = tid_l + +// (tid / m) * m + +// (m*m) * TILE_SIZE * ( bid % (m / TILE_SIZE)); +// // + tile * TILE_SIZE +// // + i*m*m +// +// // ************************************************************************************************************** +// // *** Forward substitution ************************************************************************************ +// +// // Read input data to fill the first tile in shared memoru +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int a = base_addr + m*m*i; +// sh_b[tid_l][i] = b[a]; +// //printf("tid = %d - SM a = [%d,%d] - g a = %d ; val = %f \n", tid, tid_l, i, a, b[ a ] ); +// } +// +// // Calculate first element of the forward substitution +// FT work_buffer_reg = sh_b[0][tid_l] * alpha_23 / (FT(2) + alpha); +// sh_x[0][tid_l] = work_buffer_reg; +// //printf("A tid = %d - work_buffer_reg = %f ; in_val = %f \n", tid, work_buffer_reg, sh_b[0][tid_l]); +// +// // Calculate the rest of the forward substitution for the first tile +// #pragma unroll +// for (int i = 1; i < TILE_SIZE; ++i) { +// int ca = i - 1; +// work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]); +// sh_x[i][tid_l] = work_buffer_reg; +// //printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); +// } +// +// // Save results of for the first tile to the global memory +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int a = base_addr + m*m*i; +// x[a] = sh_x[tid_l][i]; +// //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// // Processing of the remaining tiles +// for (int tile = 1; tile < TILES; tile++) { +// +// // Read data from global memory to tile in shared memory +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int a = base_addr + m*m*i + (tile * TILE_SIZE); +// sh_b[tid_l][i] = b[a]; +// //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// // Calculate forward substitution for the current tile +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int ca = (tile * TILE_SIZE) + i - 1; +// work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]); +// sh_x[i][tid_l] = work_buffer_reg; +// //printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); +// } +// +// // Save the results of the forward substitution of the current tile to a global memory - this does not have to be done fot the last tile +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int a = base_addr + m*m*i + (tile * TILE_SIZE); +// x[a] = sh_x[tid_l][i]; +// //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// +// } +// // *** END - Forward substitution ************************************************************************************ +// // ************************************************************************************************************** +// +// __syncthreads(); +// +// // ************************************************************************************************************** +// // *** Backward substitution ************************************************************************************ +// +// // Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory +// #pragma unroll +// for (int i = TILE_SIZE-2; i >= 0; --i) { +// int ca = (TILES-1) * TILE_SIZE + i; +// work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[ ca ] * work_buffer_reg; +// sh_x[i][tid_l] = work_buffer_reg; +// //printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i); +// +// } +// +// // Backward substitution - last TILE - store results from tile in shared memory to global memory +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// int a = base_addr + m*m*i + (TILES-1) * TILE_SIZE; //m - TILE_SIZE; +// x[ a ] = sh_x[tid_l][i]; +// //printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// // Backward substitution - remainig tiles +// for (int tile = TILES - 2; tile >= 0; tile--) { +// +// // Load new tile from global memory to tile in shared memory +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// //sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ]; +// int a = base_addr + m*m*i + tile * TILE_SIZE; +// sh_b[tid_l][i] = x[ a ]; +// //printf("Lod1 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// // compute backward substitution - use date stored in tile in shared memory +// #pragma unroll +// for (int i = TILE_SIZE-1; i >= 0; --i) { +// int ca = tile * TILE_SIZE + i; +// work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[ca] * work_buffer_reg; +// sh_x[i][tid_l] = work_buffer_reg; +// //printf("B1 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca); +// } +// +// // Store current tile from shared memory to global memory +// #pragma unroll +// for (int i = 0; i < TILE_SIZE; i++) { +// //sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ]; +// int a = base_addr + m*m*i + tile * TILE_SIZE; +// x[a] = sh_x[tid_l][i]; +// //printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a ); +// } +// +// } +// // *** Backward substitution - END ****************************************************************************** +// // ************************************************************************************************************** +// +// return; +// +// +// +//} +// +// +// +// +//template<class FT> +//__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) { +// +//#define T W +////#define THREADS 4 +////#define BLOCKS 4 +// +// __shared__ FT SM [THREADS/T][T][T]; +// +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// int tmt = threadIdx.x % T; +// int tdt = threadIdx.x / T; +// +// FT x_reg; +// +// if (tid >= n) { +// return; +// } +// +// int start = tid; +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// +// tmp_g[start] = work_buffer_reg; +// for (int i = 1; i < m; ++i) { +// int j = start + i*n; +// double input = b[j]; +// double c_prime = dev_c_prime[i-1]; +// +// work_buffer_reg = (input * alpha_23 + work_buffer_reg) / (double(2) + alpha + c_prime); +// tmp_g[j] = work_buffer_reg; +// } +// +// x_reg = tmp_g[start + (m-1)*n]; +// SM[tdt][tmt][0] = x_reg; +// +// for (int i = 1; i < T; i++) { +// int j = start + (m-1-i)*n; +// x_reg = tmp_g[j] - dev_c_prime[ (m-1-i) ] * x_reg; +// SM[tdt][tmt][i] = x_reg; +// } +// +// +// int addr1 = T * tdt; +// int addr2 = blockIdx.x; +// int addr3 = m - 1 - tmt; +// int addr4; +// +// int addr = (addr2 * m * m) + addr3; +// +// for (int i = 0; i < T; i++) { +// x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt]; +// } +// +// for (int T_loop = 1; T_loop < m/T; T_loop++) { +// +// int T_offset = T * T_loop; +// addr4 = m - 1 - T_offset; +// addr3 = addr4 - tmt; +// addr = (addr2 * m * m) + addr3; +// +// for (int i = 0; i < T; i++) { +// int g_addr = addr4 - i; +// int j = start + n * g_addr; +// +// x_reg = tmp_g[j] - dev_c_prime[ g_addr ] * x_reg; +// SM[tdt][tmt][i] = x_reg; +// } +// +// for (int i = 0; i < T; i++) { +// x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt]; +// } +// +// +// } +// +// +// +// // int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced] +// // return s*m + r*m*m + c; +// // } +// +// +//} +// +// +// +//// Thomas algorithm with storing intermediate results in shared memory - saves one global memory read and global memory write into x - compare with previous Thomas kernel +//template<class FT> +//__global__ void thomas_kernel3D2(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// extern __shared__ FT xs[]; +// +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// int tid_l = threadIdx.x; +// int block_size = m; +// +// if (tid >= n) { +// return; +// } +// +// int start = tid; +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// +// //x[start] = work_buffer_reg; +// xs[tid_l * block_size + 0 + tid_l] = work_buffer_reg; +// for (int i = 1; i < m; ++i) { +// int j = start + i*n; +// work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); +// //x[j] = work_buffer_reg; +// xs[ tid_l * block_size + i + tid_l ] = work_buffer_reg; +// } +// +// FT x_reg = work_buffer_reg; //x[start + (m-1)*n]; +// x[start + (m-1)*n] = x_reg; +// for (int i = m-2; i >= 0; --i) { +// int j = start + i*n; +// //x_reg = x[j] - dev_c_prime[i] * x_reg; +// x_reg = xs[tid_l * block_size + i + tid_l] - dev_c_prime[i] * x_reg; +// x[j] = x_reg; +// } +// +//} +// +// +// +//template<class FT> +//__global__ void thomas_kernel3D3(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// +// +//} +// +// +///* +// * Preconditioner for the 3D problem. Uses the thomas algorithm to invert M. +// * +// * @param FT Field Type - Either float or double +// * +// */ +//template<class FT> +//class ThomasPreconditioner3D : public Preconditioner<FT> { +//private: +// FT *c_prime; +// int m; +// cublasHandle_t cublas_handle; +// FT *b_trans; +// FT alpha, alpha_23; +// +// int thomas_kernel_block_size; +//public: +// virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { +// this->m = m; +// this->alpha = alpha; +// this-> cublas_handle = cublas_handle; +// +// FT *host_c_prime = new FT[m]; +// +// alpha_23 = pow(alpha, FT(2)/FT(3)); +// +// host_c_prime[0] = FT(-1) / (alpha + FT(2)); +// for (int i = 1; i < m; ++i) { +// host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); +// } +// +// cudaMalloc((void **) &c_prime, m*sizeof(FT)); +// cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); +// +// delete[] host_c_prime; +// +// cudaMalloc((void **) &b_trans, m*m*m*sizeof(FT)); +// +// int minGridSize; +// cudaOccupancyMaxPotentialBlockSize(&minGridSize, &thomas_kernel_block_size, thomas_kernel3D<FT>, 0, m*m); +// +// +// +// +// } +// +// virtual void run(FT const * const b, FT * const x) { +// FT block_count = m; //divide_and_round_up(m*m, thomas_kernel_block_size); +// FT threads_per_block = m; //thomas_kernel_block_size; +// +// std::cout << "Blocks = " << block_count << std::endl; +// std::cout << "threads_per_block = " << threads_per_block << std::endl; +// +// //int n = m*m*m; +// +// // delete -------------- +// +// FT *h_x; +// h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) ); +// FT *h_x2; +// h_x2 = (FT * ) malloc ( (m*m*m)*sizeof(FT) ); +// +// cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// if (m <= 8) std::cout << h_x[i] << "\t"; +// } +// if (m <= 8) std::cout << std::endl; +// +// FT *xx = x; +// +// FT *bb; +// cudaMalloc((void **) &bb, m*m*m*sizeof(FT)); +// FT *bbb; +// cudaMalloc((void **) &bbb, m*m*m*sizeof(FT)); +// FT *bbbb; +// cudaMalloc((void **) &bbbb, m*m*m*sizeof(FT)); +// +// +// // Ker 1 ************* +// +// device_memset<FT>(xx, FT(0), m*m*m); +// thomas_kernel3D_X1<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, b, xx); //bb); +// +// FT sum = 0.0; +// cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// //std::cout << h_x[i] << "\t"; +// if (m <= 8) printf("%4.1f\t",h_x[i]); +// +// sum+=h_x[i]; +// } +// if (m <= 8) std::cout << std::endl; +// std::cout << sum << std::endl; +// +// // Ker 2 ***************** +// +// cublas_transpose2(cublas_handle, m, m*m, b, bb); +// device_memset<FT>(xx, FT(0), m*m*m); +// cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// if (m <= 8) std::cout << h_x[i] << "\t"; +// } +// if (m <= 8) std::cout << std::endl; +// +// thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb); +// cublas_transpose2(cublas_handle, m*m, m, xx, bbb); +// +// +// cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// sum = 0.0; +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// //std::cout << h_x[i] << "\t"; +// if (m <= 8) printf("%4.1f\t",h_x[i]); +// sum+=h_x[i]; +// h_x2[i] = h_x[i]; +// } +// if (m <= 8) std::cout << std::endl; +// std::cout << sum <<std::endl; +// +// +// // Ker 3 ***************** +// +// cublas_transpose2(cublas_handle, m, m*m, b, bb); +// cublas_transpose2(cublas_handle, m, m*m, bb, bbb); +// +// device_memset<FT>(xx, FT(0), m*m*m); +// cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// if (m <= 8) std::cout << h_x[i] << "\t"; +// } +// if (m <= 8) std::cout << std::endl; +// +// int blocks = m*m / TILE_SIZE; +// int threads = TILE_SIZE; +// //blocks = 1; +// //threads = 2; +// std::cout << "\nThomas 3D Tiled kernel - Blocks: " << blocks << " Threads = " << threads << "\n"; +// +// thomas_kernel3D_XT<FT><<<blocks, threads>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb); +// +// cublas_transpose2(cublas_handle, m*m, m, xx, bbb); +// cublas_transpose2(cublas_handle, m*m, m, bbb, xx); +// +// cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost ); +// sum = 0.0; +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// //std::cout << h_x[i] << "\t"; +// if (m <= 8) +// printf("%4.1f\t",h_x[i]); +// sum+=h_x[i]; +// } +// if (m <= 8) std::cout << std::endl; +// std::cout << sum <<std::endl; +// +// +// sum = 0.0; +// for (int i = 0; i < m*m*m; i++) { +// if (m <= 8) +// if (i % (m*m) == 0) +// std::cout << std::endl; +// //std::cout << h_x[i] << "\t"; +// if (m <= 8) +// printf("%4.1f\t",h_x2[i] - h_x[i]); +// +// sum+=h_x[i]; +// } +// if (m <= 8) std::cout << std::endl; +// std::cout << sum <<std::endl; +// +// +// cudaFree(bb); +// cudaFree(bbb); +// cudaFree(bbbb); +// +// +// //device_memset<FT>(x, FT(0), m*m); +// +// // delete -------------- +// +// return; +// +// FT *y = x; +// thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, y); +// +// FT *y_trans = b_trans; +// cublas_transpose2(cublas_handle, m*m, m, y, y_trans); +// +// FT *z_trans = y; +// thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans); +// +// FT *z_trans2 = y_trans; +// cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); +// FT *x_trans2 = z_trans; +// +// thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2); +// FT *x_trans = z_trans2; +// cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); +// +// cublas_transpose2(cublas_handle, m, m*m, x_trans, x); +// } +// +// +// virtual void run2(FT const * const b, FT * const x) { +// +// std::cout << "thomas_kernel_block_size = " << thomas_kernel_block_size << std::endl; +// +// int inverses_per_block = 32; +// +// FT block_count = divide_and_round_up(m*m, inverses_per_block); // thomas_kernel_block_size); +// FT threads_per_block = inverses_per_block; //thomas_kernel_block_size; +// +// //int n = m*m*m; +// +// +// FT *y = x; +// thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, b, y); +// +// FT *y_trans = b_trans; +// cublas_transpose2(cublas_handle, m*m, m, y, y_trans); +// +// FT *z_trans = y; +// thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans); +// +// FT *z_trans2 = y_trans; +// cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); +// FT *x_trans2 = z_trans; +// +// thomas_kernel3D2<FT><<<block_count, threads_per_block, threads_per_block * (m+1) * sizeof(FT)>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2); +// FT *x_trans = z_trans2; +// cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); +// +// cublas_transpose2(cublas_handle, m, m*m, x_trans, x); +// } +// +// virtual void run3(FT const * const b, FT * const x) { +// } +// +// virtual void run_t(FT const * const b, FT * const x) { +// } +// +// ~ThomasPreconditioner3D() { +// cudaFree(c_prime); +// cudaFree(b_trans); +// } +// +//}; @@ -1733,522 +1435,880 @@ private: threads_per_block = ST_block_borders_kernel_block_size; ST_block_borders_kernel<FT><<<block_count, threads_per_block>>>(m, block_size, nLU, y, b2); - block_count = divide_and_round_up(m, ST_lu_forwards_kernel_block_size); - threads_per_block = ST_lu_forwards_kernel_block_size; - ST_lu_forwards_kernel<FT><<<block_count, threads_per_block>>>(m, nLU, l1, l2, b2, Ux2); - - block_count = divide_and_round_up(m, ST_lu_backwards_kernel_block_size); - threads_per_block = ST_lu_backwards_kernel_block_size; - ST_lu_backwards_kernel<FT><<<block_count, threads_per_block>>>(m, nLU, u0, u1, u2, Ux2, x2); - - block_count = divide_and_round_up(m*m, ST_backsubstitution_kernel_block_size); - threads_per_block = ST_backsubstitution_kernel_block_size; - - if (this->iter == 0) { - std::cout << "m :" << m << std::endl; - std::cout << "block_size :" << block_size << std::endl; - std::cout << "nLU :" << nLU << std::endl; - std::cout << "left_spike :" << left_spike << std::endl; - std::cout << "right_spike :" << right_spike << std::endl; - std::cout << "y :" << y << std::endl; - std::cout << "x2 :" << x2 << std::endl; - std::cout << "x :" << x << std::endl; - } - - ST_backsubstitution_kernel<FT><<<block_count, threads_per_block>>>(m, block_size, nLU, left_spike, right_spike, y, x2, x); - this->iter++; - - } - -public: - ~SpikeThomasPreconditioner() { - cudaFree(c_prime); - cudaFree(l1); - cudaFree(l2); - cudaFree(u0); - cudaFree(u1); - cudaFree(u2); - // TODO more - } -}; - - -/* - * Preconditioner for the 2D problem. This uses the PCR algorithm from cusparse. - * - * @param FT Field Type - Either float or double - * - */ -template<class FT> -class CusparsePCRPreconditioner : public Preconditioner<FT> { -private: - int m; - cusparseHandle_t cusparse_handle; - cublasHandle_t cublas_handle; - FT *dl, *d, *du; - FT *d_x_trans; -public: - virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { - - this->m = m; - this->cusparse_handle = cusparse_handle; - this->cublas_handle = cublas_handle; - - cudaMalloc((void **) &dl, m*sizeof(FT)); - cudaMalloc((void **) &d, m*sizeof(FT)); - cudaMalloc((void **) &du, m*sizeof(FT)); - - FT beta = sqrt(alpha); - - device_memset<FT>(d, (alpha + FT(2)) / beta, m, 0); - device_memset<FT>(dl, FT(0), 1, 0); - device_memset<FT>(dl, FT(-1)/beta, m-1, 1); - device_memset<FT>(du, FT(0), 1, m-1); - device_memset<FT>(du, FT(-1)/beta, m-1, 0); - - cudaMalloc((void **) &d_x_trans, m*m * sizeof(FT)); - } - - virtual void run(FT const * const d_b, FT * const d_x) { - cublas_copy(cublas_handle, m*m, d_b, d_x); - - cusparse_gtsv(cusparse_handle, m, m, dl, d, du, d_x); - - cublas_transpose(cublas_handle, m, d_x, d_x_trans); - - cusparse_gtsv(cusparse_handle, m, m, dl, d, du, d_x_trans); - - cublas_transpose(cublas_handle, m, d_x_trans, d_x); - } - - ~CusparsePCRPreconditioner() { - cudaFree(dl); - cudaFree(d); - cudaFree(du); - - cudaFree(d_x_trans); - } -}; - - - - - - -/* - * Kernel that performs multiplacion of vector with Inverse matrix of N. - * - Used for InvPreconditioner - * - * @param FT Field Type - Either float or double - * @param m grid size i.e. M is of size mxm - * @param m >= block_size >= 1 the size of a block that is inverted. For the ThomasPreconditioner this is of size m - * @param num_blocks the number of blocks that we invert - * @param alpha > 0. - * @param beta = sqrt(alpha) - * @param c_prime coefficients for the thomas algorithm that where precualculated - * @param b != NULL input vector of size m*num_blocks*block_size - * @param x != NULL output vector of size m*num_blocks*block_size - * - * @return M\X - * - */ - -//#define W 4 // -//#define THREADS 1024 // -//#define M 1024 // -//#define BANDS 10 // - defined at compile time during the compile command - - -template<class FT> -__global__ void invM_kernel_n( - int const m, // size of the problem - in 2D m*m in 3D m*m*m - int block_size, // not used - int num_blocks, // not used - FT const alpha, // not used - FT const beta, // not used - FT const * const __restrict__ dev_c_prime, // not used - FT const * const __restrict__ data_in, // input - righ hand side - FT * const __restrict__ data_out) { // output - solution - - int tid_l = threadIdx.x; // thread number within the block - int bid = blockIdx.x; - //int BLOCKS = M / W ; - int G = M / THREADS; - - int SM_a; - int G_a; - - //extern __shared__ FT bs[]; - __shared__ FT bs[BANDS + THREADS + BANDS + 1]; - - FT result; - FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal - // Just for tunning - later must be loaded particular values of inv matrix - // Kernel should be compiled with these coeficients - - // Band matrix coeeficient innitialization - should be compiled into the code to avoid unnecessary memory accesses - #pragma unroll - for (int i = 0; i < BANDS+1; i++) { - b_coefs[i] = BANDS+1 - i; - } - - // Example - how coefficients can be loaded from global to shared - it is too slow - coefs must be compiled into code - // if (tid_l < BANDS+1) - // b_coefs[tid_l] = dev_c_prime[tid_l]; - - //#pragma unroll - for (int g = 0; g < G; g++ ) { - - __syncthreads(); - - // Prev buff - if ( tid_l < BANDS ) { - SM_a = tid_l; - G_a = bid * G * THREADS + g * THREADS - BANDS + tid_l; - if ( g == 0 ) - bs[SM_a] = 0.0; - else - bs[SM_a] = data_in[G_a]; - } - - // After buff - if ( tid_l < BANDS ) { - SM_a = tid_l + THREADS + BANDS; - G_a = bid * G * THREADS + g * THREADS + THREADS + tid_l; - if ( g < G - 1 ) { - bs[SM_a] = data_in[G_a]; - } else { - bs[SM_a] = 0; - } - - } - - // Main buff - SM_a = tid_l + BANDS; - G_a = bid * G * THREADS + g * THREADS + tid_l; - bs[SM_a] = data_in[G_a]; - - __syncthreads(); - - // Processing - SM_a = tid_l + BANDS; - - result = b_coefs[0] * bs[ SM_a ]; - - #pragma unroll - for (int i = 0; i < BANDS; i++) { - result += b_coefs[BANDS - i] * bs[SM_a - BANDS + i]; -// } -// for (int i = 0; i < BANDS; i++) { - result += b_coefs[i+1] * bs[SM_a + i + 1]; - } - - // Save results - //G_a = bid * G * THREADS + g * THREADS + tid_l; - data_out[G_a] = result; - - - } // end G loop - -} - - -template<class FT> -__global__ void invM_kernel2_n( - int const m, - int block_size, - int num_blocks, - FT const alpha, - FT const beta, - FT const * const __restrict__ dev_c_prime, - FT const * const __restrict__ data_in, - FT * const __restrict__ data_out) -{ - - // Shared memory for W (at least 8) - __shared__ FT bs[W][BANDS + THREADS/W + BANDS + 1]; // +1 for avoiding memory banks conflict - - int tid_l = threadIdx.x; // thread number within the block - int bid = blockIdx.x; - //int BLOCKS = M / W ; - int G = M / (THREADS / W ); - - int SM_row_a; - int SM_col_a; - int G_row_a; - int G_col_a; - int G_a; - - FT result; // stores W results per thread - FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal - // Just for tunning - later must be loaded particular values of inv matrix - // Kernel should be compiled with these coeficients - - // Band matrix coeeficient innitialization - should be compiled into the code to avoid unnecessary memory accesses - #pragma unroll - for (int i = 0; i < BANDS+1; i++) { - b_coefs[i] = BANDS+1 - i; - } - - for (int g = 0; g < G; g++ ) { - - __syncthreads(); - - // START - loading data to shared memory - SM_row_a = tid_l % W; - SM_col_a = tid_l / W + 2 * BANDS; - G_row_a = tid_l / W + THREADS / W * g + BANDS; - G_col_a = tid_l % W + bid * W; - G_a = M * G_row_a + G_col_a; - - bs[SM_row_a][SM_col_a] = data_in[G_a]; - - if ( g == 0 && tid_l < W * BANDS) { - SM_row_a = tid_l % W; - SM_col_a = tid_l / W + 1 * BANDS; - G_row_a = tid_l / W + THREADS/W * g + 0 * BANDS; - G_col_a = tid_l % W + bid * W; - G_a = M*G_row_a + G_col_a; - - bs[SM_row_a][SM_col_a] = data_in[G_a]; - bs[tid_l % W][tid_l / W] = 0.0; - } - - __syncthreads(); - - if ( g == G-1 && tid_l < W * BANDS ) { - SM_row_a = tid_l % W; - SM_col_a = tid_l / W + THREADS/W + 1 * BANDS; - - bs[SM_row_a][SM_col_a] = 0.0; - } - - SM_row_a = tid_l % W; - SM_col_a = tid_l / W + 1 * BANDS; - - __syncthreads(); - // END - loading data to shared memory - - // START - Processing - result = b_coefs[0] * bs[ SM_row_a ][ SM_col_a ]; - - #pragma unroll - for (int i = 0; i < BANDS; i++) { - result += b_coefs[BANDS - i] * bs[SM_row_a][SM_col_a - BANDS + i]; - } - - #pragma unroll - for (int i = 0; i < BANDS; i++) { - result += b_coefs[i+1] * bs[SM_row_a][SM_col_a + i + 1]; - } - // END - Processing - - __syncthreads(); + block_count = divide_and_round_up(m, ST_lu_forwards_kernel_block_size); + threads_per_block = ST_lu_forwards_kernel_block_size; + ST_lu_forwards_kernel<FT><<<block_count, threads_per_block>>>(m, nLU, l1, l2, b2, Ux2); - // START - Shifting of overlapped regions in shared memory - if ( tid_l < 2 * BANDS * W ) { - SM_row_a = tid_l % W; - SM_col_a = tid_l / W; - bs[SM_row_a][SM_col_a] = bs[SM_row_a][SM_col_a + THREADS / W]; - } - // END - Shifting of overlapped regions in shared memory + block_count = divide_and_round_up(m, ST_lu_backwards_kernel_block_size); + threads_per_block = ST_lu_backwards_kernel_block_size; + ST_lu_backwards_kernel<FT><<<block_count, threads_per_block>>>(m, nLU, u0, u1, u2, Ux2, x2); - // Save results to global memory -// SM_row_a = tid_l % W; -// SM_col_a = tid_l / W + 2 * BANDS; - G_row_a = tid_l / W + THREADS/W * g + 0*BANDS; - G_col_a = tid_l % W + bid * W; - G_a = M * G_row_a + G_col_a; + block_count = divide_and_round_up(m*m, ST_backsubstitution_kernel_block_size); + threads_per_block = ST_backsubstitution_kernel_block_size; - data_out[G_a] = result; + if (this->iter == 0) { + std::cout << "m :" << m << std::endl; + std::cout << "block_size :" << block_size << std::endl; + std::cout << "nLU :" << nLU << std::endl; + std::cout << "left_spike :" << left_spike << std::endl; + std::cout << "right_spike :" << right_spike << std::endl; + std::cout << "y :" << y << std::endl; + std::cout << "x2 :" << x2 << std::endl; + std::cout << "x :" << x << std::endl; + } - } // end G loop + ST_backsubstitution_kernel<FT><<<block_count, threads_per_block>>>(m, block_size, nLU, left_spike, right_spike, y, x2, x); + this->iter++; + } -} +public: + ~SpikeThomasPreconditioner() { + cudaFree(c_prime); + cudaFree(l1); + cudaFree(l2); + cudaFree(u0); + cudaFree(u1); + cudaFree(u2); + // TODO more + } +}; /* - * Preconditioner for the 2D problem. This use an elements of inverse matrix of M + * Preconditioner for the 2D problem. This uses the PCR algorithm from cusparse. * * @param FT Field Type - Either float or double * */ - template<class FT> -class InvPreconditioner : public Preconditioner<FT> { +class CusparsePCRPreconditioner : public Preconditioner<FT> { private: - FT *c_prime; int m; - FT alpha, beta; + cusparseHandle_t cusparse_handle; cublasHandle_t cublas_handle; - FT *b_trans; + FT *dl, *d, *du; + FT *d_x_trans; +public: + virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { - int kernel_block_size; - bool USE_IMPLICIT_TRANSPOSE; + this->m = m; + this->cusparse_handle = cusparse_handle; + this->cublas_handle = cublas_handle; -public: - InvPreconditioner(bool useImplicitTranspose) { - this->USE_IMPLICIT_TRANSPOSE = useImplicitTranspose; - } + cudaMalloc((void **) &dl, m*sizeof(FT)); + cudaMalloc((void **) &d, m*sizeof(FT)); + cudaMalloc((void **) &du, m*sizeof(FT)); - virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { - this->m = m; - this->alpha = alpha; - beta = sqrt(alpha); - this-> cublas_handle = cublas_handle; + FT beta = sqrt(alpha); - // needed for Thomas help ... - FT *host_c_prime = new FT[m]; - host_c_prime[0] = FT(-1) / (alpha + FT(2)); - for (int i = 1; i < m; ++i) { - host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); - } + device_memset<FT>(d, (alpha + FT(2)) / beta, m, 0); + device_memset<FT>(dl, FT(0), 1, 0); + device_memset<FT>(dl, FT(-1)/beta, m-1, 1); + device_memset<FT>(du, FT(0), 1, m-1); + device_memset<FT>(du, FT(-1)/beta, m-1, 0); - cudaMalloc((void **) &c_prime, m * sizeof(FT)); - cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); - delete[] host_c_prime; + cudaMalloc((void **) &d_x_trans, m*m * sizeof(FT)); + } - cudaMalloc((void **) &b_trans, m * m * sizeof(FT)); - // END - needed for Thomas help ... + virtual void run(FT const * const d_b, FT * const d_x) { + cublas_copy(cublas_handle, m*m, d_b, d_x); + + cusparse_gtsv(cusparse_handle, m, m, dl, d, du, d_x); + cublas_transpose(cublas_handle, m, d_x, d_x_trans); + cusparse_gtsv(cusparse_handle, m, m, dl, d, du, d_x_trans); - int minGridSize; - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &kernel_block_size, invM_kernel_n<FT>, 0, m); - //kernel_block_size /= 8; + cublas_transpose(cublas_handle, m, d_x_trans, d_x); + } - kernel_block_size = THREADS; + ~CusparsePCRPreconditioner() { + cudaFree(dl); + cudaFree(d); + cudaFree(du); + cudaFree(d_x_trans); } +}; + - virtual void run(FT const * const b_XXX, FT * const x_XXX) { -// double *h_x, *h_b; -// double *b, *x; + + + +///* +// * Kernel that performs multiplacion of vector with Inverse matrix of N. +// * - Used for InvPreconditioner +// * +// * @param FT Field Type - Either float or double +// * @param m grid size i.e. M is of size mxm +// * @param m >= block_size >= 1 the size of a block that is inverted. For the ThomasPreconditioner this is of size m +// * @param num_blocks the number of blocks that we invert +// * @param alpha > 0. +// * @param beta = sqrt(alpha) +// * @param c_prime coefficients for the thomas algorithm that where precualculated +// * @param b != NULL input vector of size m*num_blocks*block_size +// * @param x != NULL output vector of size m*num_blocks*block_size +// * +// * @return M\X +// * +// */ +// +////#define W 4 // +////#define THREADS 1024 // +////#define M 1024 // +////#define BANDS 10 // - defined at compile time during the compile command +// +// +//template<class FT> +//__global__ void invM_kernel_n( +// int const m, // size of the problem - in 2D m*m in 3D m*m*m +// int block_size, // not used +// int num_blocks, // not used +// FT const alpha, // not used +// FT const beta, // not used +// FT const * const __restrict__ dev_c_prime, // not used +// FT const * const __restrict__ data_in, // input - righ hand side +// FT * const __restrict__ data_out) { // output - solution +// +// int tid_l = threadIdx.x; // thread number within the block +// int bid = blockIdx.x; +// //int BLOCKS = M / W ; +// int G = M / THREADS; +// +// int SM_a; +// int G_a; +// +// //extern __shared__ FT bs[]; +// __shared__ FT bs[BANDS + THREADS + BANDS + 1]; +// +// FT result; +// FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal +// // Just for tunning - later must be loaded particular values of inv matrix +// // Kernel should be compiled with these coeficients +// +// // Band matrix coeeficient innitialization - should be compiled into the code to avoid unnecessary memory accesses +// #pragma unroll +// for (int i = 0; i < BANDS+1; i++) { +// b_coefs[i] = BANDS+1 - i; +// } +// +// // Example - how coefficients can be loaded from global to shared - it is too slow - coefs must be compiled into code +// // if (tid_l < BANDS+1) +// // b_coefs[tid_l] = dev_c_prime[tid_l]; +// +// //#pragma unroll +// for (int g = 0; g < G; g++ ) { +// +// __syncthreads(); +// +// // Prev buff +// if ( tid_l < BANDS ) { +// SM_a = tid_l; +// G_a = bid * G * THREADS + g * THREADS - BANDS + tid_l; +// if ( g == 0 ) +// bs[SM_a] = 0.0; +// else +// bs[SM_a] = data_in[G_a]; +// } +// +// // After buff +// if ( tid_l < BANDS ) { +// SM_a = tid_l + THREADS + BANDS; +// G_a = bid * G * THREADS + g * THREADS + THREADS + tid_l; +// if ( g < G - 1 ) { +// bs[SM_a] = data_in[G_a]; +// } else { +// bs[SM_a] = 0; +// } +// +// } +// +// // Main buff +// SM_a = tid_l + BANDS; +// G_a = bid * G * THREADS + g * THREADS + tid_l; +// bs[SM_a] = data_in[G_a]; +// +// __syncthreads(); +// +// // Processing +// SM_a = tid_l + BANDS; +// +// result = b_coefs[0] * bs[ SM_a ]; +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[BANDS - i] * bs[SM_a - BANDS + i]; +//// } +//// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[i+1] * bs[SM_a + i + 1]; +// } +// +// // Save results +// //G_a = bid * G * THREADS + g * THREADS + tid_l; +// data_out[G_a] = result; +// +// +// } // end G loop +// +//} +// +// +//template<class FT> +//__global__ void invM_kernel2_n( +// int const m, +// int block_size, +// int num_blocks, +// FT const alpha, +// FT const beta, +// FT const * const __restrict__ dev_c_prime, +// FT const * const __restrict__ data_in, +// FT * const __restrict__ data_out) +//{ +// +// // Shared memory for W (at least 8) +// __shared__ FT bs[W][BANDS + THREADS/W + BANDS + 1]; // +1 for avoiding memory banks conflict +// +// int tid_l = threadIdx.x; // thread number within the block +// int bid = blockIdx.x; +// //int BLOCKS = M / W ; +// int G = M / (THREADS / W ); +// +// int SM_row_a; +// int SM_col_a; +// int G_row_a; +// int G_col_a; +// int G_a; +// +// FT result; // stores W results per thread +// FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal +// // Just for tunning - later must be loaded particular values of inv matrix +// // Kernel should be compiled with these coeficients +// +// // Band matrix coeeficient innitialization - should be compiled into the code to avoid unnecessary memory accesses +// #pragma unroll +// for (int i = 0; i < BANDS+1; i++) { +// b_coefs[i] = BANDS+1 - i; +// } +// +// for (int g = 0; g < G; g++ ) { +// +// __syncthreads(); +// +// // START - loading data to shared memory +// SM_row_a = tid_l % W; +// SM_col_a = tid_l / W + 2 * BANDS; +// G_row_a = tid_l / W + THREADS / W * g + BANDS; +// G_col_a = tid_l % W + bid * W; +// G_a = M * G_row_a + G_col_a; +// +// bs[SM_row_a][SM_col_a] = data_in[G_a]; +// +// if ( g == 0 && tid_l < W * BANDS) { +// SM_row_a = tid_l % W; +// SM_col_a = tid_l / W + 1 * BANDS; +// G_row_a = tid_l / W + THREADS/W * g + 0 * BANDS; +// G_col_a = tid_l % W + bid * W; +// G_a = M*G_row_a + G_col_a; +// +// bs[SM_row_a][SM_col_a] = data_in[G_a]; +// bs[tid_l % W][tid_l / W] = 0.0; +// } +// +// __syncthreads(); +// +// if ( g == G-1 && tid_l < W * BANDS ) { +// SM_row_a = tid_l % W; +// SM_col_a = tid_l / W + THREADS/W + 1 * BANDS; +// +// bs[SM_row_a][SM_col_a] = 0.0; +// } +// +// SM_row_a = tid_l % W; +// SM_col_a = tid_l / W + 1 * BANDS; +// +// __syncthreads(); +// // END - loading data to shared memory +// +// // START - Processing +// result = b_coefs[0] * bs[ SM_row_a ][ SM_col_a ]; +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[BANDS - i] * bs[SM_row_a][SM_col_a - BANDS + i]; +// } +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[i+1] * bs[SM_row_a][SM_col_a + i + 1]; +// } +// // END - Processing +// +// __syncthreads(); +// +// // START - Shifting of overlapped regions in shared memory +// if ( tid_l < 2 * BANDS * W ) { +// SM_row_a = tid_l % W; +// SM_col_a = tid_l / W; +// bs[SM_row_a][SM_col_a] = bs[SM_row_a][SM_col_a + THREADS / W]; +// } +// // END - Shifting of overlapped regions in shared memory +// +// // Save results to global memory +//// SM_row_a = tid_l % W; +//// SM_col_a = tid_l / W + 2 * BANDS; +// G_row_a = tid_l / W + THREADS/W * g + 0*BANDS; +// G_col_a = tid_l % W + bid * W; +// G_a = M * G_row_a + G_col_a; +// +// data_out[G_a] = result; +// +// } // end G loop +// +// +//} +// +// +///* +// * Preconditioner for the 2D problem. This use an elements of inverse matrix of M +// * +// * @param FT Field Type - Either float or double +// * +// */ +// +//template<class FT> +//class InvPreconditioner : public Preconditioner<FT> { +//private: +// FT *c_prime; +// int m; +// FT alpha, beta; +// cublasHandle_t cublas_handle; +// FT *b_trans; +// +// int kernel_block_size; +// bool USE_IMPLICIT_TRANSPOSE; +// +//public: +// InvPreconditioner(bool useImplicitTranspose) { +// this->USE_IMPLICIT_TRANSPOSE = useImplicitTranspose; +// } +// +// virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { +// this->m = m; +// this->alpha = alpha; +// beta = sqrt(alpha); +// this-> cublas_handle = cublas_handle; +// +// // needed for Thomas help ... +// FT *host_c_prime = new FT[m]; +// host_c_prime[0] = FT(-1) / (alpha + FT(2)); +// for (int i = 1; i < m; ++i) { +// host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); +// } +// +// cudaMalloc((void **) &c_prime, m * sizeof(FT)); +// cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); +// delete[] host_c_prime; +// +// cudaMalloc((void **) &b_trans, m * m * sizeof(FT)); +// // END - needed for Thomas help ... +// +// +// +// int minGridSize; +// cudaOccupancyMaxPotentialBlockSize(&minGridSize, &kernel_block_size, invM_kernel_n<FT>, 0, m); +// //kernel_block_size /= 8; +// +// kernel_block_size = THREADS; +// +// } +// +// virtual void run(FT const * const b_XXX, FT * const x_XXX) { +// +//// double *h_x, *h_b; +//// double *b, *x; +//// +//// h_x = (double * ) malloc ( (m*m)*sizeof(double) ); +//// h_b = (double * ) malloc ( (m*m)*sizeof(double) ); +//// +//// for (int i = 0; i < M; i++) { +//// for (int j = 0; j < M; j++) +//// { +//// h_b[i*M + j] = 100*j + i; +//// h_x[i*M + j] = 0.0; +//// } +//// } +//// +//// cudaMalloc((void **) &b, (m*m)*sizeof(double)); +//// cudaMalloc((void **) &x, (m*m)*sizeof(double)); +//// +//// device_memset_inc<double>(x, 0.0, m*m); +//// cudaMemcpy( b, h_b, (m*m)*sizeof(double) , cudaMemcpyHostToDevice ); +//// +//// for (int i = 0; i < M; i++) { +//// for (int j = 0; j < M; j++) +//// { +//// //std::cout << std::setw(5) << h_b[i*M + j]; +//// +//// } +//// //std::cout << std::endl; +//// } +//// std::cout << std::endl; +//// std::cout << std::endl; +//// +////// for (int i = 0; i < m*m; i++) +////// std::cout << h_b[i] << " "; +////// std::cout << std::endl; +//// +//// +// +// +// kernel_block_size = THREADS; +// FT block_count = divide_and_round_up(m*m, kernel_block_size); +// FT threads_per_block = kernel_block_size; +// std::cout << "Inv M - Tk 2 = API computed block size : " << kernel_block_size << " Grid size: " << block_count << std::endl; +// +// +//// cublas_transpose(cublas_handle, m, b, b_trans); +//// +//// invM_kernel <FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b, x); +//// std::cout << "Inv M - Tk 2 = 1 " << std::endl; +// +// //invM_kernel_n<FT><<<m, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b, x); +// invM_kernel_n<FT><<<m, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b_XXX, x_XXX); +// std::cout << "Inv M - Tk 2 = 2 - Blocks: " << m << " Threads: " << threads_per_block << std::endl; +// +//// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); +//// device_memset_inc<double>(x, 0.0, m*m); +//// long long sum = 0; +//// for (int i = 0; i < M; i++) { +//// for (int j = 0; j < M; j++) +//// { +//// sum += h_x[i*M + j]; +//// //std::cout << std::setw(5) << h_x[i*M + j]; +//// } +//// //std::cout << std::endl; +//// } +//// std::cout << "Sum 1 = " << sum << std::endl; +//// std::cout << std::endl; +//// +//// //cublas_transpose(cublas_handle, m, b, b_trans); +//// +//// // invM_kernel2 <FT><<<block_count/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b, x); +//// // std::cout << "Inv M - Tk 2 = 3 " << std::endl; +// +// //invM_kernel2_n<FT><<<M/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b, x); +// invM_kernel2_n<FT><<<M/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b_XXX, x_XXX); +// std::cout << "Inv M - Tk 2 = 4 - Blocks: " << M/W << " Threads: " << THREADS << std::endl; +// +//// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); +//// device_memset_inc<double>(x, 0.0, m*m); +//// sum = 0; +//// for (int i = 0; i < M; i++) { +//// for (int j = 0; j < M; j++) +//// { +//// sum += h_x[i*M + j]; +//// //std::cout << std::setw(5) << h_x[i*M + j]; +//// } +//// //std::cout << std::endl; +//// } +//// std::cout << "Sum 2 = " << sum << std::endl; +//// std::cout << std::endl; +//// +//// cublas_transpose(cublas_handle, m, b, x); +//// +//// cudaFree(b); +//// cudaFree(x); +// +// +// // To let the iterative solver finish - recalculate with Thomas +// int minGridSize; +// int thomas_kernel_block_size; +// cudaOccupancyMaxPotentialBlockSize(&minGridSize, &thomas_kernel_block_size, thomas_kernel<FT>, 0, m); +// thomas_kernel_block_size /= 8; +// +// block_count = divide_and_round_up(m, thomas_kernel_block_size); +// threads_per_block = thomas_kernel_block_size; +// +// +// cublas_transpose(cublas_handle, m, b_XXX, b_trans); +// FT *y_trans = x_XXX; +// thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans); +// FT *y = b_trans; +// cublas_transpose(cublas_handle, m, y_trans, y); +// thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x_XXX); +// +// +//// thomas_kernel_trans<FT><<<M/THREADS, THREADS >>> (m, m, 1, alpha, beta, c_prime, b_XXX, b_trans); +//// thomas_kernel <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, x_XXX); +// +// std::cout << "Tk Inverse(TH-hp) - implicit trans " << std::endl; +// +// +// } +// +// +// virtual void run2(FT const * const b, FT * const x) { +// +//// std::cout << "Inv M - Tk 1 = API computed block size : " << kernel_block_size << std::endl; +//// +//// //kernel_block_size = 1024; +//// FT block_count = divide_and_round_up(m*m, kernel_block_size); +//// FT threads_per_block = kernel_block_size; +//// +//// cublas_transpose(cublas_handle, m, b, b_trans); +//// FT *y_trans = x; +//// +//// invM_kernel<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans); +//// +//// FT *y = b_trans; +//// cublas_transpose(cublas_handle, m, y_trans, y); +//// +//// invM_kernel<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, y, x); +//// +//// std::cout << "Inv M - Tk 1" << std::endl; +// +// } +// +// +// virtual void run3(FT const * const b, FT * const x) { +// +// } +// +// virtual void run_t(FT const * const b, FT * const x) { +// +// } +// +// +// +// +//}; +// +// +// +// +// +// +///* +// * Kernel that performs Inverse of matrix for 3D case +// * +// * @param FT Field Type - Either float or double +// * @param m grid size i.e. M is of size mxm +// * @param n number of vectors that we invert simultanously. Usually has value m*m +// * @param alpha > 0. +// * @param alpha_23 = alpha^(2/3) +// * @param c_prime coefficients for the thomas algorithm that where precualculated +// * @param b != NULL input vector of size m*n +// * @param x != NULL output vector of size m*n +// * +// * @return M\X +// * +// */ +// +//template<class FT> +//__global__ void invM_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// if (tid >= n) { +// return; +// } +// +// int start = tid; +// +// +// FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); +// +// x[start] = work_buffer_reg; +// for (int i = 1; i < m; ++i) { +// int j = start + i*n; +// work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); +// x[j] = work_buffer_reg; +// } +// +// FT x_reg = x[start + (m-1)*n]; +// x[start + (m-1)*n] = x_reg; +// for (int i = m-2; i >= 0; --i) { +// int j = start + i*n; +// x_reg = x[j] - dev_c_prime[i] * x_reg; +// x[j] = x_reg; +// } +//} +// +// +// +// +// +//template<class FT> +//__global__ void invM_kernel3D( int const m, // size of the problem - in 2D m*m in 3D m*m*m +// int block_size, // not used +// int num_blocks, // not used +// FT const alpha, // not used +// FT const beta, // not used +// FT const * const __restrict__ dev_c_prime, // not used +// FT const * const __restrict__ b, // input - righ hand side +// FT * const __restrict__ x) { // output - solution +// +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// int tid_l = threadIdx.x; +// +// //int start = tid; +// +// // There are different coeficients for cornes of the matrix - will be done by separate kernel most probably +// //if (tid < BANDS || tid >= m * m * blockDim.x - BANDS) { +// if (tid_l < BANDS || tid_l >= blockDim.x - BANDS) { +// return; +// } +// +// extern __shared__ FT bs[]; +// //__shared__ FT b_coefs[BANDS+1]; +// +// FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal +// // Just for tunning - later must be loaded particular values of inv matrix +// // Kernel should be compiled with these coeficients +// for (int i = 0; i < BANDS+1; i++) { +// b_coefs[i] = 0.001*i; +// } +// +// //if (tid_l < BANDS+1) +// // b_coefs[tid_l] = dev_c_prime[tid_l]; +// +// +// bs[tid_l+BANDS] = b[tid]; +// +// if (tid_l < BANDS) { +// bs[tid_l ] = b[tid-BANDS]; +// bs[blockDim.x+tid_l] = b[tid+BANDS]; +// } +// +// FT result = b_coefs[0] * bs[BANDS + tid_l]; +// +// __syncthreads(); // -// h_x = (double * ) malloc ( (m*m)*sizeof(double) ); -// h_b = (double * ) malloc ( (m*m)*sizeof(double) ); +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[BANDS - i] * bs[tid_l + i]; +// } // -// for (int i = 0; i < M; i++) { -// for (int j = 0; j < M; j++) -// { -// h_b[i*M + j] = 100*j + i; -// h_x[i*M + j] = 0.0; -// } -// } +// //result += b_coefs[0] * bs[BANDS + tid_l]; // -// cudaMalloc((void **) &b, (m*m)*sizeof(double)); -// cudaMalloc((void **) &x, (m*m)*sizeof(double)); +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[i] * bs[BANDS + tid_l + i]; +// } // -// device_memset_inc<double>(x, 0.0, m*m); -// cudaMemcpy( b, h_b, (m*m)*sizeof(double) , cudaMemcpyHostToDevice ); +// x[tid] = result; // -// for (int i = 0; i < M; i++) { -// for (int j = 0; j < M; j++) -// { -// //std::cout << std::setw(5) << h_b[i*M + j]; // -// } -// //std::cout << std::endl; -// } -// std::cout << std::endl; -// std::cout << std::endl; +//} // -//// for (int i = 0; i < m*m; i++) -//// std::cout << h_b[i] << " "; -//// std::cout << std::endl; // +//template<class FT> +//__global__ void invM_kernel3D_2(int const m, // size of the problem - in 2D m*m in 3D m*m*m +// int block_size, // not used +// int num_blocks, // not used +// FT const alpha, // not used +// FT const beta, // not used +// FT const * const __restrict__ dev_c_prime, // not used +// FT const * const __restrict__ b, // input - righ hand side +// FT * const __restrict__ x) { // output - solution // - - - kernel_block_size = THREADS; - FT block_count = divide_and_round_up(m*m, kernel_block_size); - FT threads_per_block = kernel_block_size; - std::cout << "Inv M - Tk 2 = API computed block size : " << kernel_block_size << " Grid size: " << block_count << std::endl; - - -// cublas_transpose(cublas_handle, m, b, b_trans); +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// int tid_l = threadIdx.x; +// int n = m*m; +// // int start = tid - indexing in 1st dimension +// int start = (n) * (tid/m) + (tid % m); // indexing in 2nd dimension +// // // -// invM_kernel <FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b, x); -// std::cout << "Inv M - Tk 2 = 1 " << std::endl; - - //invM_kernel_n<FT><<<m, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b, x); - invM_kernel_n<FT><<<m, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b_XXX, x_XXX); - std::cout << "Inv M - Tk 2 = 2 - Blocks: " << m << " Threads: " << threads_per_block << std::endl; - -// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// device_memset_inc<double>(x, 0.0, m*m); -// long long sum = 0; -// for (int i = 0; i < M; i++) { -// for (int j = 0; j < M; j++) -// { -// sum += h_x[i*M + j]; -// //std::cout << std::setw(5) << h_x[i*M + j]; -// } -// //std::cout << std::endl; -// } -// std::cout << "Sum 1 = " << sum << std::endl; -// std::cout << std::endl; -// -// //cublas_transpose(cublas_handle, m, b, b_trans); -// -// // invM_kernel2 <FT><<<block_count/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b, x); -// // std::cout << "Inv M - Tk 2 = 3 " << std::endl; - - //invM_kernel2_n<FT><<<M/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b, x); - invM_kernel2_n<FT><<<M/W, THREADS >>> (m, m, 1, alpha, beta, c_prime, b_XXX, x_XXX); - std::cout << "Inv M - Tk 2 = 4 - Blocks: " << M/W << " Threads: " << THREADS << std::endl; - -// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// device_memset_inc<double>(x, 0.0, m*m); -// sum = 0; -// for (int i = 0; i < M; i++) { -// for (int j = 0; j < M; j++) -// { -// sum += h_x[i*M + j]; -// //std::cout << std::setw(5) << h_x[i*M + j]; -// } -// //std::cout << std::endl; -// } -// std::cout << "Sum 2 = " << sum << std::endl; -// std::cout << std::endl; +// // There are different coeficients for cornes of the matrix - will be done by separate kernel most probably +// //if (tid < BANDS || tid >= m * m * blockDim.x - BANDS) { +// if (tid_l < BANDS || tid_l >= blockDim.x - BANDS) { +// return; +// } // -// cublas_transpose(cublas_handle, m, b, x); +// extern __shared__ FT bs[]; +// //__shared__ FT b_coefs[BANDS+1]; // -// cudaFree(b); -// cudaFree(x); - - - // To let the iterative solver finish - recalculate with Thomas - int minGridSize; - int thomas_kernel_block_size; - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &thomas_kernel_block_size, thomas_kernel<FT>, 0, m); - thomas_kernel_block_size /= 8; - - block_count = divide_and_round_up(m, thomas_kernel_block_size); - threads_per_block = thomas_kernel_block_size; - - - cublas_transpose(cublas_handle, m, b_XXX, b_trans); - FT *y_trans = x_XXX; - thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans); - FT *y = b_trans; - cublas_transpose(cublas_handle, m, y_trans, y); - thomas_kernel_x<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x_XXX); - - -// thomas_kernel_trans<FT><<<M/THREADS, THREADS >>> (m, m, 1, alpha, beta, c_prime, b_XXX, b_trans); -// thomas_kernel <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, x_XXX); - - std::cout << "Tk Inverse(TH-hp) - implicit trans " << std::endl; - - - } - - - virtual void run2(FT const * const b, FT * const x) { - +// FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal +// // Just for tunning - later must be loaded particular values of inv matrix +// // Kernel should be compiled with these coeficients +// for (int i = 0; i < BANDS+1; i++) { +// b_coefs[i] = 0.001*i; +// } +// +// //if (tid_l < BANDS+1) +// // b_coefs[tid_l] = dev_c_prime[tid_l]; +// +// +// bs[tid_l+BANDS] = b[start]; +// +// if (tid_l < BANDS) { +// bs[tid_l ] = b[start-BANDS]; +// bs[blockDim.x+tid_l] = b[start+BANDS]; +// } +// +// FT result = b_coefs[0] * bs[BANDS + tid_l]; +// +// __syncthreads(); +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[BANDS - i] * bs[tid_l + i]; +// } +// +// //result += b_coefs[0] * bs[BANDS + tid_l]; +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[i] * bs[BANDS + tid_l + i]; +// } +// +// x[tid] = result; +// +// +//} +// +// +//template<class FT> +//__global__ void invM_kernel3D_T(int const m, // size of the problem - in 2D m*m in 3D m*m*m +// int block_size, // not used +// int num_blocks, // not used +// FT const alpha, // not used +// FT const beta, // not used +// FT const * const __restrict__ dev_c_prime, // not used +// FT const * const __restrict__ b, // input - righ hand side +// FT * const __restrict__ x) { // output - solution +// +// int tid = blockIdx.x * blockDim.x + threadIdx.x; +// int tid_l = threadIdx.x; +// int n = m*m; +// // int start = tid - indexing in 1st dimension +// int start = (n) * (tid/m) + (tid % m); // indexing in 2nd dimension +// //int start = +// +// // There are different coeficients for cornes of the matrix - will be done by separate kernel most probably +// //if (tid < BANDS || tid >= m * m * blockDim.x - BANDS) { +// if (tid_l < BANDS || tid_l >= blockDim.x - BANDS) { +// return; +// } +// +// extern __shared__ FT bs[]; +// //__shared__ FT b_coefs[BANDS+1]; +// +// FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal +// // Just for tunning - later must be loaded particular values of inv matrix +// // Kernel should be compiled with these coeficients +// for (int i = 0; i < BANDS+1; i++) { +// b_coefs[i] = 0.001*i; +// } +// +// //if (tid_l < BANDS+1) +// // b_coefs[tid_l] = dev_c_prime[tid_l]; +// +// +// bs[tid_l+BANDS] = b[start]; +// +// if (tid_l < BANDS) { +// bs[tid_l ] = b[start-BANDS]; +// bs[blockDim.x+tid_l] = b[start+BANDS]; +// } +// +// FT result = b_coefs[0] * bs[BANDS + tid_l]; +// +// __syncthreads(); +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[BANDS - i] * bs[tid_l + i]; +// } +// +// //result += b_coefs[0] * bs[BANDS + tid_l]; +// +// #pragma unroll +// for (int i = 0; i < BANDS; i++) { +// result += b_coefs[i] * bs[BANDS + tid_l + i]; +// } +// +// x[tid] = result; +// +// +//} +// +// +// +//template<class FT> +//class InvPreconditioner3D : public Preconditioner<FT> { +//private: +// FT *c_prime; +// int m; +// cublasHandle_t cublas_handle; +// FT *b_trans; +// FT alpha, alpha_23; +// +// int kernel_block_size; +//public: +// virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { +// this->m = m; +// this->alpha = alpha; +// this-> cublas_handle = cublas_handle; +// +//// FT *host_c_prime = new FT[m]; +// +// alpha_23 = pow(alpha, FT(2)/FT(3)); +// +//// host_c_prime[0] = FT(-1) / (alpha + FT(2)); +//// for (int i = 1; i < m; ++i) { +//// host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); +//// } +// +// cudaMalloc((void **) &c_prime, m*sizeof(FT)); +//// cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); +// +//// delete[] host_c_prime; +// +// cudaMalloc((void **) &b_trans, m*m*m*sizeof(FT)); +// +// int minGridSize; +// cudaOccupancyMaxPotentialBlockSize(&minGridSize, &kernel_block_size, invM_kernel_n<FT>, 0, m*m); +// if ( m <= 1024 ) +// kernel_block_size = m; +// else +// kernel_block_size = 1024; +// } +// +// virtual void run(FT const * const b, FT * const x) { +// +// //kernel_block_size = 1024; +// int n = m*m*m; +// FT block_count = divide_and_round_up(n, kernel_block_size); +// FT threads_per_block = kernel_block_size; +// +// std::cout << "Inv M 3D - Tk 1 = API computed block size : " << kernel_block_size << std::endl; +// std::cout << "Inv M 3D - Tk 1 = API computed grid size : " << block_count << std::endl; +// +// FT *y = x; +// invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, b, y); +// +// FT *y_trans = b_trans; +// cublas_transpose2(cublas_handle, m*m, m, y, y_trans); +// +// FT *z_trans = y; +// invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, y_trans, z_trans); +// +// FT *z_trans2 = y_trans; +// cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); +// FT *x_trans2 = z_trans; +// +// invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, z_trans2, x_trans2); +// FT *x_trans = z_trans2; +// cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); +// +// cublas_transpose2(cublas_handle, m, m*m, x_trans, x); +// } +// +// +// virtual void run2(FT const * const b, FT * const x) { +// +///* // std::cout << "Inv M - Tk 1 = API computed block size : " << kernel_block_size << std::endl; // // //kernel_block_size = 1024; @@ -2266,244 +2326,15 @@ public: // invM_kernel<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, y, x); // // std::cout << "Inv M - Tk 1" << std::endl; - - } - - - virtual void run3(FT const * const b, FT * const x) { - - } - - virtual void run_t(FT const * const b, FT * const x) { - - } - - - - -}; - - - - - - -/* - * Kernel that performs Inverse of matrix for 3D case - * - * @param FT Field Type - Either float or double - * @param m grid size i.e. M is of size mxm - * @param n number of vectors that we invert simultanously. Usually has value m*m - * @param alpha > 0. - * @param alpha_23 = alpha^(2/3) - * @param c_prime coefficients for the thomas algorithm that where precualculated - * @param b != NULL input vector of size m*n - * @param x != NULL output vector of size m*n - * - * @return M\X - * - */ - -template<class FT> -__global__ void invM_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid >= n) { - return; - } - - int start = tid; - - - FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha); - - x[start] = work_buffer_reg; - for (int i = 1; i < m; ++i) { - int j = start + i*n; - work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]); - x[j] = work_buffer_reg; - } - - FT x_reg = x[start + (m-1)*n]; - x[start + (m-1)*n] = x_reg; - for (int i = m-2; i >= 0; --i) { - int j = start + i*n; - x_reg = x[j] - dev_c_prime[i] * x_reg; - x[j] = x_reg; - } -} - - - - - -template<class FT> -__global__ void invM_kernel3D( int const m, // size of the problem - in 2D m*m in 3D m*m*m - int block_size, // not used - int num_blocks, // not used - FT const alpha, // not used - FT const beta, // not used - FT const * const __restrict__ dev_c_prime, // not used - FT const * const __restrict__ b, // input - righ hand side - FT * const __restrict__ x) { // output - solution - - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int tid_l = threadIdx.x; - - - // There are different coeficients for cornes of the matrix - will be done by separate kernel most probably - //if (tid < BANDS || tid >= m * m * blockDim.x - BANDS) { - if (tid_l < BANDS || tid_l >= blockDim.x - BANDS) { - return; - } - - extern __shared__ FT bs[]; - //__shared__ FT b_coefs[BANDS+1]; - - FT b_coefs[BANDS + 1]; // b_coef[0] is the one on the diagonal b_coef[BANDS-1] the most remote diagonal - // Just for tunning - later must be loaded particular values of inv matrix - // Kernel should be compiled with these coeficients - for (int i = 0; i < BANDS+1; i++) { - b_coefs[i] = 0.001*i; - } - - //if (tid_l < BANDS+1) - // b_coefs[tid_l] = dev_c_prime[tid_l]; - - - bs[tid_l+BANDS] = b[tid]; - - if (tid_l < BANDS) { - bs[tid_l ] = b[tid-BANDS]; - bs[blockDim.x+tid_l] = b[tid+BANDS]; - } - - FT result = b_coefs[0] * bs[BANDS + tid_l]; - - __syncthreads(); - - #pragma unroll - for (int i = 0; i < BANDS; i++) { - result += b_coefs[BANDS - i] * bs[tid_l + i]; - } - - //result += b_coefs[0] * bs[BANDS + tid_l]; - - #pragma unroll - for (int i = 0; i < BANDS; i++) { - result += b_coefs[i] * bs[BANDS + tid_l + i]; - } - - x[tid] = result; - - -} - - - - - - - -template<class FT> -class InvPreconditioner3D : public Preconditioner<FT> { -private: - FT *c_prime; - int m; - cublasHandle_t cublas_handle; - FT *b_trans; - FT alpha, alpha_23; - - int kernel_block_size; -public: - virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) { - this->m = m; - this->alpha = alpha; - this-> cublas_handle = cublas_handle; - -// FT *host_c_prime = new FT[m]; - - alpha_23 = pow(alpha, FT(2)/FT(3)); - -// host_c_prime[0] = FT(-1) / (alpha + FT(2)); -// for (int i = 1; i < m; ++i) { -// host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha); -// } - - cudaMalloc((void **) &c_prime, m*sizeof(FT)); -// cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice); - -// delete[] host_c_prime; - - cudaMalloc((void **) &b_trans, m*m*m*sizeof(FT)); - - int minGridSize; - cudaOccupancyMaxPotentialBlockSize(&minGridSize, &kernel_block_size, invM_kernel_n<FT>, 0, m*m); - if ( m <= 1024 ) - kernel_block_size = m; - else - kernel_block_size = 1024; - } - - virtual void run(FT const * const b, FT * const x) { - - //kernel_block_size = 1024; - int n = m*m*m; - FT block_count = divide_and_round_up(n, kernel_block_size); - FT threads_per_block = kernel_block_size; - - std::cout << "Inv M 3D - Tk 1 = API computed block size : " << kernel_block_size << std::endl; - std::cout << "Inv M 3D - Tk 1 = API computed grid size : " << block_count << std::endl; - - FT *y = x; - invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, b, y); - - FT *y_trans = b_trans; - cublas_transpose2(cublas_handle, m*m, m, y, y_trans); - - FT *z_trans = y; - invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, y_trans, z_trans); - - FT *z_trans2 = y_trans; - cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2); - FT *x_trans2 = z_trans; - - invM_kernel3D<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>>(m, n, 1, alpha, alpha_23, c_prime, z_trans2, x_trans2); - FT *x_trans = z_trans2; - cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans); - - cublas_transpose2(cublas_handle, m, m*m, x_trans, x); - } - - - virtual void run2(FT const * const b, FT * const x) { - -/* - std::cout << "Inv M - Tk 1 = API computed block size : " << kernel_block_size << std::endl; - - //kernel_block_size = 1024; - FT block_count = divide_and_round_up(m*m, kernel_block_size); - FT threads_per_block = kernel_block_size; - - cublas_transpose(cublas_handle, m, b, b_trans); - FT *y_trans = x; - - invM_kernel<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans); - - FT *y = b_trans; - cublas_transpose(cublas_handle, m, y_trans, y); - - invM_kernel<FT><<<block_count, threads_per_block, (2*BANDS + threads_per_block)*sizeof(FT) >>> (m, m, 1, alpha, beta, c_prime, y, x); - - std::cout << "Inv M - Tk 1" << std::endl; - -*/ - - } - - virtual void run3(FT const * const b, FT * const x) { - - } - - - -}; +// +//*/ +// +// } +// +// virtual void run3(FT const * const b, FT * const x) { +// +// } +// +// +// +//}; diff --git a/solver.h b/solver.h index eab8eb3d2bc74d5d26628bcec0b30611117fea84..a7c143309e10174a38aeb098b5da0228c0dad891 100644 --- a/solver.h +++ b/solver.h @@ -386,12 +386,13 @@ int solve_with_conjugate_gradient3D(cublasHandle_t const cublas_handle, preconditioner->init(m, alpha, cublas_handle, cusparse_handle); } - //delete - for (int r = 0; r < 10; r++) { - preconditioner->run(b, x); - } - return 0; - // delete - end +// LR: TODO: delete +// //delete +// for (int r = 0; r < 10; r++) { +// preconditioner->run(b, x); +// } +// return 0; +// // delete - end device_memset<FT>(x, FT(0), n); // x = 0 FT *p, *r, *h, *Ax, *q; diff --git a/test b/test index 396a79ca903455a855f8bd28cdaba0d23e7a1809..881688afc90d49e3921fddef7fbde91fd302db9a 100755 Binary files a/test and b/test differ diff --git a/test.cu b/test.cu index 75a7d77f5294a8f2ddf2892e531a44051ef4fa2f..5be58710a684b9e7637044a0a238f97ca99938fe 100644 --- a/test.cu +++ b/test.cu @@ -41,184 +41,85 @@ int main (int argc, char *argv[]) cublasCreate(&cublas_handle); cusparseCreate(&cusparse_handle); - const double alpha = 1; //0.01f; - int max_iter = 2; //10000; + const double alpha = 0.01f; //0.01f; + int max_iter = 10000; //10000; double tolerance = 0.00001; -// const int m = M; //1 * 1024; //32; //8 * 1024; // 2048; -// -// int experiment; -// experiment = atoi(argv[1]); -// printf("Running experiment: %d \n",experiment); -// -// int num_iter; -// double *b, *x; -// -// double *h_x, *h_b; -// -// h_x = (double * ) malloc ( (m*m)*sizeof(double) ); -// h_b = (double * ) malloc ( (m*m)*sizeof(double) ); -// -// for (int i = 0; i < M; i++) { -// for (int j = 0; j < M; j++) -// { -// if ( j == 0) -// h_b[i*M + j] = i+1.0; -// else -// h_b[i*M + j] = 0.0; -// -// h_x[i*M + j] = 0.0; -// } -// } -// -// -// -// cudaMalloc((void **) &b, (m*m)*sizeof(double)); -// cudaMalloc((void **) &x, (m*m)*sizeof(double)); -// -// device_memset_inc<double>(x, 0.0, m*m); -// cudaMemcpy( b, h_b, (m*m)*sizeof(double) , cudaMemcpyHostToDevice ); -// -// -// -///* -// num_iter = solve_with_conjugate_gradient<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, NULL ); -// cout << num_iter << " CG only - iterations" << endl; -// -// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// -// //for (int i = 0; i < m*m; i++) -// // cout << h_x[i] << " "; -// -// cout << endl; -//*/ -// -// -// if (experiment == 1 || experiment == 0) { -// ThomasPreconditioner<double> preconditioner_th(false); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_th); -// cout << " Chebyschev - Thomas Prec w. cublas transpose. - iterations: " << num_iter << endl; -// -// // cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// // for (int i = 0; i < m*m; i++) { -// // if (i % m == 0) cout << endl; -// // cout << h_x[i] << "\t"; -// // } -// // cout << endl; -// } -// -// if (experiment == 2 || experiment == 0) { -// ThomasPreconditioner<double> preconditioner_th_it(true); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_th_it); -// cout << " Chebyschev - Thomas Prec. w implicit transpose - iterations: " << num_iter << endl; -// } -// -// if (experiment == 10 || experiment == 0) { -// SpikeThomasPreconditioner<double> preconditioner_sp4(4); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp4); -// cout << " Chebyschev - Thomas-SPIKE(4) Prec. - iterations: " << num_iter << endl; -// } -// -// if (experiment == 11 || experiment == 0) { -// SpikeThomasPreconditioner<double> preconditioner_sp8(8); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp8); -// cout << " Chebyschev - Thomas-SPIKE(8) Prec. - iterations: " << num_iter << endl; -// } -// -// if (experiment == 12 || experiment == 0) { -// SpikeThomasPreconditioner<double> preconditioner_sp16(16); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp16); -// cout << " Chebyschev - Thomas-SPIKE(16) Prec. - iterations: " << num_iter << endl; -// } -// -// if (experiment == 20 || experiment == 0) { -// InvPreconditioner<double> preconditioner_in(false); -// num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_in); -// cout << " Chebyschev - inverse Matrix prec. w. cublas trans - iterations: " << num_iter << endl; -// } -// -// -// -// -// -// -// -// -///* -// SpikeThomasPreconditioner<double> preconditioner(8); -// num_iter = solve_with_conjugate_gradient<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner ); -// cout << num_iter << " CG Spike-Thomas prec - iterations" << endl; -// -// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// //for (int i = 0; i < m*m; i++) -// // cout << h_x[i] << " "; -// cout << endl; -//*/ -// -// -///* -// ThomasPreconditioner<double> preconditioner2; -// num_iter = solve_with_conjugate_gradient<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner2 ); -// cout << num_iter << " CG Thomas prec - iterations" << endl; -// -// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -// //for (int i = 0; i < m*m; i++) -// // cout << h_x[i] << " "; -// -// cout << endl; -//*/ -// -// -// -//// InvPreconditioner<double> preconditionerInv; -//// num_iter = solve_with_conjugate_gradient<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditionerInv ); -//// cout << num_iter << " CG InvM prec - iterations" << endl; -// -//// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); -//// for (int i = 0; i < m*m; i++) -//// cout << h_x[i] << " "; -// -// // cout << endl; -// - - - - int num_iter; - -#define PREC double - - PREC *b_3d, *x_3d; - int m_3d = M; - - PREC * h_x_3d; - h_x_3d = (PREC * ) malloc ( (m_3d*m_3d*m_3d) * sizeof(PREC) ); - - - cudaMalloc((void **) &b_3d, (m_3d*m_3d*m_3d)*sizeof(PREC)); - cudaMalloc((void **) &x_3d, (m_3d*m_3d*m_3d)*sizeof(PREC)); - - for (int i = 0; i < m_3d*m_3d*m_3d; i++) - h_x_3d[i] = i; - - cudaMemcpy( b_3d, h_x_3d, (m_3d*m_3d*m_3d)*sizeof(PREC) , cudaMemcpyHostToDevice ); - - //device_memset<double>(b_3d, 1.0, m_3d*m_3d*m_3d); - - //Inv - - ThomasPreconditioner3D<PREC> preconditioner_inv3d; - num_iter = solve_with_conjugate_gradient3D<PREC>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_inv3d); // NULL ); - cout << num_iter << " CG 3D only - iterations" << endl; - - cudaMemcpy( h_x_3d, b_3d, (m_3d*m_3d*m_3d)*sizeof(double) , cudaMemcpyDeviceToHost ); - //for (int i = 0; i < m_3d*m_3d*m_3d; i++) - // cout << h_x_3d[i] << endl; - cout << endl; - - - -// ThomasPreconditioner3D<double> preconditioner_3d; -// num_iter = solve_with_conjugate_gradient3D<double>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_3d); -// cout << num_iter << " CG 3D Thomas Prec - iterations" << endl; + const int m = M; //1 * 1024; //32; //8 * 1024; // 2048; + + int experiment; + experiment = atoi(argv[1]); + printf("Running experiment: %d \n",experiment); + + int num_iter; + double *b, *x; + + double *h_x, *h_b; + + h_x = (double * ) malloc ( (m*m)*sizeof(double) ); + h_b = (double * ) malloc ( (m*m)*sizeof(double) ); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < M; j++) + { + if ( j == 0) + h_b[i*M + j] = i+1.0; + else + h_b[i*M + j] = 0.0; + + h_x[i*M + j] = 0.0; + } + } + + + cudaMalloc((void **) &b, (m*m)*sizeof(double)); + cudaMalloc((void **) &x, (m*m)*sizeof(double)); + + + device_memset_inc<double>(x, 0.0, m*m); + cudaMemcpy( b, h_b, (m*m)*sizeof(double) , cudaMemcpyHostToDevice ); + + + if (experiment == 1 || experiment == 0) { + ThomasPreconditioner<double> preconditioner_th(false); + num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_th); + cout << " Chebyschev - Thomas Prec w. cublas transpose. - iterations: " << num_iter << endl; + } + + if (experiment == 2 || experiment == 0) { + ThomasPreconditioner<double> preconditioner_th_it(true); + num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_th_it); + cout << " Chebyschev - Thomas Prec. w implicit transpose - iterations: " << num_iter << endl; + } + + if (experiment == 10 || experiment == 0) { + SpikeThomasPreconditioner<double> preconditioner_sp4(4); + num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp4); + cout << " Chebyschev - Thomas-SPIKE(4) Prec. - iterations: " << num_iter << endl; + } + + if (experiment == 11 || experiment == 0) { + SpikeThomasPreconditioner<double> preconditioner_sp8(8); + num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp8); + cout << " Chebyschev - Thomas-SPIKE(8) Prec. - iterations: " << num_iter << endl; + } + + if (experiment == 12 || experiment == 0) { + SpikeThomasPreconditioner<double> preconditioner_sp16(16); + num_iter = solve_with_chebyshev_iteration<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner_sp16); + cout << " Chebyschev - Thomas-SPIKE(16) Prec. - iterations: " << num_iter << endl; + } + + if (experiment == 30 || experiment == 0) { + SpikeThomasPreconditioner<double> preconditioner(8); + num_iter = solve_with_conjugate_gradient<double>(cublas_handle, cusparse_handle, m, alpha, b, x, max_iter, tolerance, &preconditioner ); + cout << " CG - Thomas-SPIKE(8) Prec. - iterations: " << num_iter << endl; + } + + // Print results +// cudaMemcpy( h_x, x, (m*m)*sizeof(double) , cudaMemcpyDeviceToHost ); +// for (int i = 0; i < m*m; i++) +// cout << h_x[i] << "\n"; +// cout << endl; }