// preconditioner.h

#include "misc.h"
#include "cublas_wrapper.h"
#include <iostream>
#include <iomanip>
#include <stdio.h>


/*

 *      preconditioner.h
 *
 *      This file contains different preconditioner implementations. They are all implementations of the abstract
 *      type Preconditioner. It is the user's responsibility to check whether a preconditioner solves the 2D or
 *      the 3D problem.
 *
 *      @author Simon Schoelly
 *
 */

/*
 *      Abstract preconditioner interface. Concrete preconditioners implement:
 *      init:  initializes the preconditioner before the first run
 *      run:   solves M\b, reading b and writing the result to x
 *      run2, run3, run_t: further solve entry points with the same (b, x) signature;
 *                         their exact meaning is defined by each implementation
 *
 *      @param FT Field Type - Either float or double
 *
 */
template<class FT>
class Preconditioner {
public:
        // Virtual destructor: implementations own resources, so destroying one through a
        // Preconditioner<FT>* must run the derived destructor.
        virtual ~Preconditioner() {}

        virtual void init (int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) = 0;
        virtual void run  (FT const * const b, FT * const x) = 0;
        virtual void run2 (FT const * const b, FT * const x) = 0;
        virtual void run3 (FT const * const b, FT * const x) = 0;
        virtual void run_t(FT const * const b, FT * const x) = 0;

};


/*
 *      Kernel that performs the Thomas algorithm (forward elimination followed by back substitution)
 *      on many tridiagonal systems at once. One thread handles one system; consecutive unknowns of a
 *      system are m elements apart in b and x. Used by ThomasPreconditioner.
 *
 *      @param FT Field Type - Either float or double
 *      @param m stride between consecutive unknowns of one system, and number of systems per group
 *      @param block_size >= 1 number of unknowns in each system (ThomasPreconditioner passes m)
 *      @param num_blocks number of groups of m systems; m*num_blocks threads do work, the rest exit
 *      @param alpha the diagonal used in the elimination is 2 + alpha
 *      @param beta factor applied to every entry of b (ThomasPreconditioner passes sqrt(alpha))
 *      @param dev_c_prime precalculated elimination coefficients; entries 0 .. block_size-2 are read
 *      @param b != NULL input vector, indexed up to (num_blocks-1)*block_size*m + (block_size-1)*m + m-1
 *      @param x != NULL output vector, same indexing as b; also holds the intermediate forward results
 *
 */
template<class FT>
__global__ void thomas_kernel(int const m, int block_size, int num_blocks, FT const alpha, FT const beta, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {

        int const system_id = blockIdx.x * blockDim.x + threadIdx.x;

        // Threads beyond the last system have nothing to do.
        if (system_id < m*num_blocks) {

                // Index of the first unknown of this thread's system.
                int const first = (system_id / m) * block_size * m + (system_id % m);
                FT const diag = FT(2) + alpha;

                // Forward elimination: the running value is kept in a register and mirrored to x.
                FT carry = b[first] * beta / diag;
                x[first] = carry;

                int idx = first;
                for (int row = 1; row < block_size; ++row) {
                        idx += m;
                        carry = (b[idx] * beta + carry) / (diag + dev_c_prime[row - 1]);
                        x[idx] = carry;
                }

                // Back substitution: idx sits on the last unknown, whose value is already final.
                for (int row = block_size - 1; row > 0; --row) {
                        idx -= m;
                        carry = x[idx] - dev_c_prime[row - 1] * carry;
                        x[idx] = carry;
                }
        }
}


/*
 *      Thomas algorithm in which each thread solves a system whose unknowns are contiguous in memory
 *      (thread tid_l of block bid handles the m-strided row bid*THREADS + tid_l). Data is moved
 *      between global and shared memory in THREADS x THREADS tiles: a tile is loaded/stored with
 *      thread tid_l touching sh_x[tid_l][*] and processed with thread tid_l touching sh_x[*][tid_l].
 *
 *      Launch requirements (taken from the indexing below):
 *        - blockDim.x must equal THREADS, one block per THREADS rows
 *        - the code iterates over M/THREADS tiles per row while striding rows by m, so it is only
 *          consistent when m == M and M is a multiple of THREADS
 *      THREADS and M are compile-time constants defined outside this section of the file.
 *
 *      @param FT Field Type - Either float or double
 *      @param m row stride of b and x
 *      @param block_size unused
 *      @param num_blocks unused
 *      @param alpha the diagonal used in the elimination is 2 + alpha
 *      @param beta factor applied to every entry of b
 *      @param dev_c_prime precalculated elimination coefficients
 *      @param b != NULL input
 *      @param x != NULL output; also used as scratch for the forward-sweep results
 */
template<class FT>
__global__ void thomas_kernel_trans(int const m, int block_size, int num_blocks, FT const alpha, FT const beta,
		FT const * const __restrict__ dev_c_prime,
		FT const * const __restrict__ b,
		FT       * const __restrict__ x) {

	// The padding belongs on the inner dimension: in the load/store phases the threads of a warp
	// access sh_x[tid_l][i], i.e. addresses one row apart, and a row length of THREADS+1 keeps
	// those accesses in different banks.
	__shared__ FT sh_x[THREADS][THREADS+1];

	int const tid_l = threadIdx.x;
	int const bid   = blockIdx.x;
	int const level = M/THREADS - 1;   // index of the last tile of a row
	FT  work_buffer_reg = FT(0);

	// read first patch of input data
	#pragma unroll
	for (int i = 0; i < THREADS; i++) {
		sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
	}
	// The tile was written row-wise and is read column-wise next, so other threads' writes
	// must be visible before continuing.
	__syncthreads();

	// forward elimination, tile by tile
	for (int l = 0; l <= level; l++) {
		#pragma unroll
		for (int i = 0; i < THREADS; ++i) {
			int const k = l*THREADS + i;   // position of this unknown inside the system
			FT const b_l     = sh_x[i][tid_l];
			// The first unknown has no predecessor; reading dev_c_prime[-1] would be out of bounds.
			FT const c_prime = (k > 0) ? dev_c_prime[k-1] : FT(0);
			work_buffer_reg  = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime);
			sh_x[i][tid_l] = work_buffer_reg;
		}
		__syncthreads();   // column-wise results complete before the row-wise store

		// save temp res and get new input data (the last tile stays in shared memory)
		if ( l < level ) {
			#pragma unroll
			for (int i = 0; i < THREADS; i++) {
				x[ tid_l + m*(i+bid*THREADS) +  l   *THREADS ] = sh_x[tid_l][i];
				sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
			}
		}
		__syncthreads();   // new tile fully loaded before the next column-wise pass
	}

	// back substitution on the last tile; its last unknown is already final
	#pragma unroll
	for (int i = THREADS-2; i >= 0; --i) {
		FT const x_l = sh_x[i][tid_l];
		work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg;
		sh_x[i][tid_l] = work_buffer_reg;
	}
	__syncthreads();

	for (int l = level; l >= 1 ; l--) {
		// store the finished tile and fetch the forward results of the previous one
		#pragma unroll
		for (int i = 0; i < THREADS; i++) {
			x[ tid_l + m*(i+bid*THREADS) +  l   *THREADS ] = sh_x[tid_l][i];
			sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ];
		}
		__syncthreads();

		#pragma unroll
		for (int i = THREADS-1; i >= 0; --i) {
			FT const x_l = sh_x[i][tid_l];
			work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg;
			sh_x[i][tid_l] = work_buffer_reg;
		}
		__syncthreads();
	}

	// write last patch of output data
	#pragma unroll
	for (int i = 0; i < THREADS; i++) {
		x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i];
	}
}


// Thomas algorithm that keeps the forward-sweep results in shared memory instead of x - saves one
// global memory read and one global memory write per unknown compared with thomas_kernel.
//
// Parameters are the same as for thomas_kernel. One thread handles one system; consecutive
// unknowns of a system are m elements apart in b and x.
//
// Dynamic shared memory required at launch: blockDim.x * (block_size + 1) * sizeof(FT) bytes.
// Each thread owns a row of block_size + 1 entries (the extra entry skews consecutive rows,
// presumably to avoid bank conflicts).

template<class FT>
__global__ void thomas_kernel2(int const m, int block_size, int num_blocks, FT const alpha, FT const beta, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {

        // Declared as raw bytes and cast: an "extern __shared__ FT xs[]" inside a template is
        // declared with a different type per instantiation, which the compiler rejects as soon
        // as the kernel is instantiated for both float and double.
        extern __shared__ __align__(16) unsigned char thomas_kernel2_smem[];
        FT * const xs = reinterpret_cast<FT *>(thomas_kernel2_smem);

        int const tid   = blockIdx.x * blockDim.x + threadIdx.x;
        int const tid_l = threadIdx.x;

        if (tid >= m*num_blocks) {
                return;
        }

        // Index of the first unknown of this thread's system, and this thread's row in xs.
        int const start = (tid / m) * block_size * m + (tid % m);
        FT * const row  = xs + tid_l * block_size + tid_l;

        // Forward elimination into shared memory.
        FT work_buffer_reg = b[start] * beta / (FT(2) + alpha);
        row[0] = work_buffer_reg;
        for (int i = 1; i < block_size; ++i) {
                int const j = start + i*m;
                work_buffer_reg  = (b[j] * beta + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
                row[i] = work_buffer_reg;
        }

        // Back substitution: the last unknown is already final, the rest are written straight to x.
        FT x_reg = work_buffer_reg;
        x[start + (block_size-1)*m] = x_reg;
        for (int i = block_size-2; i >= 0; --i) {
                int const j = start + i*m;
                x_reg = row[i] - dev_c_prime[i] * x_reg;
                x[j] = x_reg;
        }
}


// Thomas algorithm that calculates the elimination coefficients on the fly - eliminates reading of
// cprime from global memory - SHARED MEMORY will become the bottleneck - reduces the number of
// blocks that can sit on one Streaming Multiprocessor.
//
// Solves tridiag(gama, delta, gama) x = b with delta = beta + 2/beta and gama = -1/beta, one
// system per thread; consecutive unknowns of a system are m elements apart in b and x.
// alpha and dev_c_prime are accepted for signature compatibility with the other kernels but unused.
//
// Dynamic shared memory required at launch: 2 * blockDim.x * (m+1) * sizeof(FT) bytes, split into
// two halves (inverse pivots d, intermediate right-hand sides xs). Each thread uses a row of
// block_size + 1 entries in each half, so block_size <= m is required for the halves not to overlap.

template<class FT>
__global__ void thomas_kernel3(int const m, int block_size, int num_blocks, FT const alpha, FT const beta, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {

	int const tid   = blockIdx.x * blockDim.x + threadIdx.x;
	int const tid_l = threadIdx.x;

	if (tid >= m*num_blocks) {
		return;
	}

	// Declared as raw bytes and cast: an "extern __shared__ FT pool[]" inside a template is
	// declared with a different type per instantiation, which the compiler rejects as soon as
	// the kernel is instantiated for both float and double.
	extern __shared__ __align__(16) unsigned char thomas_kernel3_smem[];
	FT * const shared_pool = reinterpret_cast<FT *>(thomas_kernel3_smem);

	// This thread's rows inside the two halves of the pool.
	FT * const d  = shared_pool + tid_l * block_size + tid_l;
	FT * const xs = shared_pool + blockDim.x * (m+1) + tid_l * block_size + tid_l;

	// tridiag(gama,delta,gama) x = b
	FT const delta = beta + (FT(2) / beta);  // value on diag
	FT const gama  = FT(-1) / beta;          // value on both off-diagonals
	FT const gama2 = gama*gama;

	int const start = (tid / m) * block_size * m + (tid % m);

	// Forward elimination: d_reg is the inverse of the current pivot, x_reg the eliminated rhs.
	FT d_reg = FT(1) / delta;
	d[0] = d_reg;

	FT x_reg = b[start];
	xs[0] = x_reg;

	for (int i = 1; i < block_size; ++i) {
		int const j = start + i*m;

		// Uses the previous row's inverse pivot, so it must be computed before d_reg is updated.
		x_reg = b[j] - gama * x_reg * d_reg;

		d_reg = FT(1) / ( delta - gama2 * d_reg );
		d[i]  = d_reg;

		xs[i] = x_reg;
	}

	// Back substitution, starting from the last unknown.
	x_reg = x_reg * d_reg;
	x[start + (block_size - 1) * m] = x_reg;

	for (int i = block_size-2; i >= 0; --i) {
		int const j = start + i*m;

		x_reg = (xs[i] - gama * x_reg) * d[i];
		x[j]  = x_reg;
	}
}


/*
 *      Preconditioner for the 2D problem. Uses the thomas algorithm to invert M.
 *
 *      run() applies the Thomas solve along both grid directions. Depending on the constructor
 *      flag, the first direction is handled either by transposing with cuBLAS around a
 *      thomas_kernel call, or by thomas_kernel_trans, which needs no explicit transpose.
 *
 *      The object owns two device buffers (the elimination coefficients and an m*m scratch
 *      matrix). They are allocated in init() and released in the destructor; the class does not
 *      define copy semantics, so instances must not be copied.
 *
 *      @param FT Field Type - Either float or double
 *
 */
template<class FT>
class ThomasPreconditioner : public Preconditioner<FT> {
private:
        FT *c_prime;                    // device: elimination coefficients, m entries
        int m;                          // grid size
        FT alpha, beta;                 // beta = sqrt(alpha)
        cublasHandle_t cublas_handle;
        FT *b_trans;                    // device: m*m scratch matrix

        bool USE_IMPLICIT_TRANSPOSE;
        int thomas_kernel_block_size;   // threads per block used for thomas_kernel launches

        // Prints a diagnostic when a CUDA call or kernel launch did not succeed.
        static void report_cuda_error(cudaError_t const status, char const * const what) {
                if (status != cudaSuccess) {
                        std::cerr << "ThomasPreconditioner: " << what << " failed: "
                                  << cudaGetErrorString(status) << std::endl;
                }
        }

        // Frees the device buffers (if any) and resets the pointers so a second call is harmless.
        void release_device_buffers() {
                if (b_trans != NULL) {
                        report_cuda_error(cudaFree(b_trans), "cudaFree(b_trans)");
                        b_trans = NULL;
                }
                if (c_prime != NULL) {
                        report_cuda_error(cudaFree(c_prime), "cudaFree(c_prime)");
                        c_prime = NULL;
                }
        }

public:
        /*
         *      @param useImplicitTranspose true: use thomas_kernel_trans for the first direction;
         *                                  false: transpose explicitly with cuBLAS
         */
        ThomasPreconditioner(bool useImplicitTranspose)
                : c_prime(NULL), m(0), alpha(FT(0)), beta(FT(0)), cublas_handle(NULL), b_trans(NULL),
                  USE_IMPLICIT_TRANSPOSE(useImplicitTranspose), thomas_kernel_block_size(1) {
        }

        /*
         *      Precomputes the elimination coefficients on the host, uploads them, allocates the
         *      scratch matrix and chooses the launch block size. May be called again to
         *      re-initialize; buffers from a previous call are released first.
         *
         *      @param m grid size, must be > 0
         *      @param alpha problem parameter; beta is set to sqrt(alpha)
         *      @param cublas_handle handle used for the explicit transposes in run()
         *      @param cusparse_handle unused by this preconditioner
         */
        virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) {
                release_device_buffers();

                this->m = m;
                this->alpha = alpha;
                beta = sqrt(alpha);
                this-> cublas_handle = cublas_handle;

                if (m <= 0) {
                        // Nothing to allocate; writing host_c_prime[0] below would be out of bounds.
                        std::cerr << "ThomasPreconditioner: init called with non-positive m" << std::endl;
                        return;
                }

                FT *host_c_prime = new FT[m];

                host_c_prime[0] = FT(-1) / (alpha + FT(2));
                for (int i = 1; i < m; ++i) {
                        host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha);
                }

                report_cuda_error(cudaMalloc((void **) &c_prime, m*sizeof(FT)), "cudaMalloc(c_prime)");
                report_cuda_error(cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice), "cudaMemcpy(c_prime)");

                delete[] host_c_prime;

                report_cuda_error(cudaMalloc((void **) &b_trans, m*m*sizeof(FT)), "cudaMalloc(b_trans)");

                int minGridSize = 0;
                int suggested_block_size = 0;
                report_cuda_error(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &suggested_block_size, thomas_kernel<FT>, 0, m),
                                  "cudaOccupancyMaxPotentialBlockSize");
                // One eighth of the suggested size is used (the rationale is not recorded here);
                // clamp so the launch configuration can never end up with zero threads.
                thomas_kernel_block_size = suggested_block_size / 8;
                if (thomas_kernel_block_size < 1) {
                        thomas_kernel_block_size = 1;
                }
        }


        /*
         *      Solves M\b.
         *
         *      @param b != NULL device input of size m*m
         *      @param x != NULL device output of size m*m
         */
        virtual void run(FT const * const b, FT * const x) {

                if (m <= 0 || c_prime == NULL || b_trans == NULL) {
                        std::cerr << "ThomasPreconditioner: run called before a successful init" << std::endl;
                        return;
                }

                // Launch dimensions are integers; they were previously held in the field type FT.
                int const threads_per_block = thomas_kernel_block_size;
                int const block_count = static_cast<int>(divide_and_round_up(m, threads_per_block));

                if (!USE_IMPLICIT_TRANSPOSE) {

                        // Solve on the transposed input, transpose back, then solve again.
                        cublas_transpose(cublas_handle, m, b, b_trans);
                        FT *y_trans = x;
                        thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans, y_trans);
                        report_cuda_error(cudaGetLastError(), "thomas_kernel launch");
                        FT *y = b_trans;
                        cublas_transpose(cublas_handle, m, y_trans, y);

                        thomas_kernel<FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, y, x);
                        report_cuda_error(cudaGetLastError(), "thomas_kernel launch");

                } else {

                        // thomas_kernel_trans is launched with the compile-time sizes M and THREADS.
                        thomas_kernel_trans<FT><<<M/THREADS, THREADS >>>             (m, m, 1, alpha, beta, c_prime, b, b_trans);
                        report_cuda_error(cudaGetLastError(), "thomas_kernel_trans launch");
                        FT const * const b_trans_const = b_trans;
                        thomas_kernel      <FT><<<block_count, threads_per_block >>> (m, m, 1, alpha, beta, c_prime, b_trans_const, x);
                        report_cuda_error(cudaGetLastError(), "thomas_kernel launch");

                }
        }

        // Not implemented for this preconditioner: does nothing and leaves x untouched.
        virtual void run_t(FT const * const b, FT * const x) {
        }


        // Not implemented for this preconditioner: does nothing and leaves x untouched.
        virtual void run2(FT const * const b, FT * const x) {
        }


        // Not implemented for this preconditioner: does nothing and leaves x untouched.
        virtual void run3(FT const * const b, FT * const x) {
        }


        virtual ~ThomasPreconditioner() {
                release_device_buffers();
        }

};

///*
// *      Kernel that performs the thomas algorithm for the 3D problem. Used for ThomasPreconditioner3
// *
// *      @param FT Field Type - Either float or double
// *      @param m grid size i.e. M is of size mxm
// *      @param n number of vectors that we invert simultaneously. Usually has value m*m
// *      @param alpha > 0.
// *      @param alpha_23 = alpha^(2/3)
// *      @param c_prime coefficients for the thomas algorithm that were precalculated
// *      @param b != NULL input vector of size m*n
// *      @param x != NULL output vector of size m*n
// *
// *      @return M\X
// *
// */
//
//template<class FT>
//__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//	int tid = blockIdx.x * blockDim.x + threadIdx.x;
//
//	if (tid >= n) {
//	  return;
//	}
//
//        int start = tid;
//
//
//        FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
//        x[start] = work_buffer_reg;
//
//        for (int i = 1; i < m; ++i) {
//                int j = start + i*n;
//                work_buffer_reg  = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
//                x[j] = work_buffer_reg;
//        }
//
//        FT x_reg = x[start + (m-1)*n];
//        x[start + (m-1)*n] = x_reg;
//
//        for (int i = m-2; i >= 0; --i) {
//                int j = start + i*n;
//                x_reg = x[j] - dev_c_prime[i] * x_reg;
//                x[j] = x_reg;
//        }
//}
//
//
//template<class FT>
//__global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//	int n = m*m;
//	int tid = blockIdx.x * blockDim.x + threadIdx.x;
//
//	if (tid >= n) {
//	  return;
//	}
//
//        int start = tid;
//
//
//        FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
//        x[start] = work_buffer_reg;
//
//        for (int i = 1; i < m; ++i) {
//                int j = start + i*n;
//                work_buffer_reg  = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
//                x[j] = work_buffer_reg;
//        }
//
//        FT x_reg = x[start + (m-1)*n];
//        x[start + (m-1)*n] = x_reg;
//
//        for (int i = m-2; i >= 0; --i) {
//                int j = start + i*n;
//                x_reg = x[j] - dev_c_prime[i] * x_reg;
//                x[j] = x_reg;
//        }
//}
//
//
//template<class FT>
//__global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//	int n = m*m;
//	int tid = blockIdx.x * blockDim.x + threadIdx.x;
//
//	if (tid >= n) {
//	  return;
//	}
//
//        int start = (n) * (tid/m) + (tid % m); // + (i * m)
//
//
//        FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
//        x[start] = work_buffer_reg;
//
//        for (int i = 1; i < m; ++i) {
//                int j = start + i*m;
//                work_buffer_reg  = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
//                x[j] = work_buffer_reg;
//        }
//
//        FT x_reg = x[start + (m-1)*m];
//        x[start + (m-1)*m] = x_reg;
//
//        for (int i = m-2; i >= 0; --i) {
//                int j = start + i*m;
//                x_reg = x[j] - dev_c_prime[i] * x_reg;
//                x[j] = x_reg;
//        }
//}
//
//
//
//template<class FT>
//__global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//	//#define TILE_SIZE 2
//	//TODO: Should be #define
//	//int TILE_SIZE = blockDim.x;
//
//	__shared__ FT sh_b[TILE_SIZE][TILE_SIZE+1];
//	__shared__ FT sh_x[TILE_SIZE][TILE_SIZE+1];
//
//	int TILES = m / TILE_SIZE;
//
//	int tid_l =  threadIdx.x;
//	int bid   =  blockIdx.x;
//	int tid   =  blockIdx.x * blockDim.x + threadIdx.x;
//
//	// Basis of an adress used to read dat from global memory to tiles in shared memory
//	// - this is rused multiple times
//	//int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m;
//	int base_addr = tid_l +
//					(tid / m) * m +
//				    (m*m) * TILE_SIZE *  ( bid % (m / TILE_SIZE));
//			//    + tile * TILE_SIZE
//			//    + i*m*m
//
//    // **************************************************************************************************************
//    // *** Forward substitution ************************************************************************************
//
//	// Read input data to fill the first tile in shared memoru
//	#pragma unroll
//	for (int i = 0; i < TILE_SIZE; i++) {
//		int a = base_addr +  m*m*i;
//		sh_b[tid_l][i] = b[a];
//		//printf("tid = %d - SM a = [%d,%d] - g a = %d ; val = %f \n", tid, tid_l, i, a, b[ a ] );
//	}
//
//	// Calculate first element of the forward substitution
//    FT work_buffer_reg = sh_b[0][tid_l] * alpha_23 / (FT(2) + alpha);
//    sh_x[0][tid_l] = work_buffer_reg;
//    //printf("A tid = %d - work_buffer_reg = %f ; in_val = %f \n", tid, work_buffer_reg, sh_b[0][tid_l]);
//
//    // Calculate the rest of the forward substitution for the first tile
//    #pragma unroll
//    for (int i = 1; i < TILE_SIZE; ++i) {
//        int ca = i - 1;
//    	work_buffer_reg  = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]);
//        sh_x[i][tid_l] = work_buffer_reg;
//        //printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
//    }
//
//    // Save results of for the first tile to the global memory
//	#pragma unroll
//	for (int i = 0; i < TILE_SIZE; i++) {
//		int a = base_addr +  m*m*i;
//		x[a] = sh_x[tid_l][i];
//		//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//	}
//
//	// Processing of the remaining tiles
//    for (int tile = 1; tile < TILES; tile++) {
//
//    	// Read data from global memory to tile in shared memory
//    	#pragma unroll
//		for (int i = 0; i < TILE_SIZE; i++) {
//			int a = base_addr + m*m*i + (tile * TILE_SIZE);
//			sh_b[tid_l][i] = b[a];
//			//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//		}
//
//		// Calculate forward substitution for the current tile
//    	#pragma unroll
//		for (int i = 0; i < TILE_SIZE; i++) {
//			int ca = (tile * TILE_SIZE) + i - 1;
//            work_buffer_reg  = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]);
//            sh_x[i][tid_l] = work_buffer_reg;
//            //printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
//		}
//
//		// Save the results of the forward substitution of the current tile to a global memory - this does not have to be done fot the last tile
//		#pragma unroll
//		for (int i = 0; i < TILE_SIZE; i++) {
//			int a = base_addr + m*m*i + (tile * TILE_SIZE);
//			x[a] = sh_x[tid_l][i];
//			//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//		}
//
//
//    }
//    // *** END - Forward substitution ************************************************************************************
//    // **************************************************************************************************************
//
//    __syncthreads();
//
//    // **************************************************************************************************************
//    // *** Backward substitution ************************************************************************************
//
//    // Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory
//	#pragma unroll
//    for (int i = TILE_SIZE-2; i >= 0; --i) {
//    	int ca = (TILES-1) * TILE_SIZE + i;
//    	work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[ ca ] * work_buffer_reg;
//        sh_x[i][tid_l] = work_buffer_reg;
//        //printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i);
//
//    }
//
//    // Backward substitution - last TILE - store results from tile in shared memory to global memory
//	#pragma unroll
//	for (int i = 0; i < TILE_SIZE; i++) {
//		int a = base_addr +  m*m*i + (TILES-1) * TILE_SIZE;   //m - TILE_SIZE;
//		x[ a ] = sh_x[tid_l][i];
//		//printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//	}
//
//    // Backward substitution - remainig tiles
//    for (int tile = TILES - 2; tile >= 0; tile--) {
//
//    	// Load new tile from global memory to tile in shared memory
//    	#pragma unroll
//		for (int i = 0; i < TILE_SIZE; i++) {
//			//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
//			int a = base_addr + m*m*i + tile * TILE_SIZE;
//			sh_b[tid_l][i] = x[ a ];
//			//printf("Lod1 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//		}
//
//		// compute backward substitution - use date stored in tile in shared memory
//		#pragma unroll
//		for (int i = TILE_SIZE-1; i >= 0; --i) {
//			int ca = tile * TILE_SIZE + i;
//			work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[ca] * work_buffer_reg;
//			sh_x[i][tid_l] = work_buffer_reg;
//            //printf("B1 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
//		}
//
//		// Store current tile from shared memory to global memory
//		#pragma unroll
//		for (int i = 0; i < TILE_SIZE; i++) {
//			//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
//			int a = base_addr + m*m*i + tile * TILE_SIZE;
//			x[a] = sh_x[tid_l][i];
//			//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
//		}
//
//    }
//    // *** Backward substitution - END ******************************************************************************
//    // **************************************************************************************************************
//
//	return;
//
//
//
//}
//
//
//
//
//template<class FT>
//__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) {
//
//#define T 		W
////#define THREADS 	4
////#define BLOCKS		4
//
//	__shared__ FT SM [THREADS/T][T][T];
//
//	int tid	 	= blockIdx.x * blockDim.x + threadIdx.x;
//	int tmt	    = threadIdx.x % T;
//	int tdt		= threadIdx.x / T;
//
//	FT x_reg;
//
//	if (tid >= n) {
//		return;
//	}
//
//	int start = tid;
//
//	FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
//
//	tmp_g[start] = work_buffer_reg;
//	for (int i = 1; i < m; ++i) {
//		int        j   = start + i*n;
//		double input   = b[j];
//		double c_prime = dev_c_prime[i-1];
//
//		work_buffer_reg  = (input * alpha_23 + work_buffer_reg) / (double(2) + alpha + c_prime);
//		tmp_g[j] = work_buffer_reg;
//	}
//
//	x_reg = tmp_g[start + (m-1)*n];
//	SM[tdt][tmt][0] = x_reg;
//
//	for (int i = 1; i < T; i++) {
//		int j = start + (m-1-i)*n;
//		x_reg = tmp_g[j] - dev_c_prime[ (m-1-i) ] * x_reg;
//		SM[tdt][tmt][i] = x_reg;
//	}
//
//
//	int addr1	 = T * tdt;
//	int addr2	 = blockIdx.x;
//	int addr3 	 = m - 1 - tmt;
//	int addr4;
//
//	int addr	 = (addr2 * m * m) + addr3;
//
//	for (int i = 0; i < T; i++) {
//		x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
//	}
//
//	for (int T_loop = 1; T_loop < m/T; T_loop++) {
//
//		int T_offset 	= T * T_loop;
//		addr4 	= m - 1 - T_offset;
//		addr3   = addr4 - tmt;
//		addr    = (addr2 * m * m) + addr3;
//
//		for (int i = 0; i < T; i++) {
//			int g_addr = addr4 - i;
//			int j 	   = start + n * g_addr;
//
//			x_reg           = tmp_g[j] - dev_c_prime[ g_addr ] * x_reg;
//			SM[tdt][tmt][i] = x_reg;
//		}
//
//		for (int i = 0; i < T; i++) {
//			x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
//		}
//
//
//	}
//
//
//
//	//	int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
//	//	  return s*m + r*m*m + c;
//	//	}
//
//
//}
//
//
//
//// Thomas algorithm with storing intermediate results in shared memory - saves one global memory read and global memory write into x - compare with previous Thomas kernel
//template<class FT>
//__global__ void thomas_kernel3D2(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//	extern __shared__ FT xs[];
//
//	int tid   = blockIdx.x * blockDim.x + threadIdx.x;
//	int tid_l = threadIdx.x;
//	int block_size = m;
//
//        if (tid >= n) {
//          return;
//        }
//
//        int start = tid;
//
//        FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
//
//        //x[start] = work_buffer_reg;
//        xs[tid_l * block_size + 0 + tid_l] = work_buffer_reg;
//	for (int i = 1; i < m; ++i) {
//                int j = start + i*n;
//                work_buffer_reg  = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
//                //x[j] = work_buffer_reg;
//		xs[ tid_l * block_size + i + tid_l ] = work_buffer_reg;
//        }
//
//        FT x_reg = work_buffer_reg; //x[start + (m-1)*n];
//        x[start + (m-1)*n] = x_reg;
//        for (int i = m-2; i >= 0; --i) {
//                int j = start + i*n;
//                //x_reg = x[j] - dev_c_prime[i] * x_reg;
//                x_reg = xs[tid_l * block_size + i + tid_l] - dev_c_prime[i] * x_reg;
//		x[j] = x_reg;
//        }
//
//}
//
//
//
//template<class FT>
//__global__ void thomas_kernel3D3(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime,  FT const * const __restrict__ b, FT * const __restrict__ x) {
//
//
//}
//
//
///*
// *      Preconditioner for the 3D problem. Uses the thomas algorithm to invert M.
// *
// *      @param FT Field Type - Either float or double
// *
// */
//template<class FT>
//class ThomasPreconditioner3D : public Preconditioner<FT> {
//private:
//        FT *c_prime;
//        int m;
//        cublasHandle_t cublas_handle;
//        FT *b_trans;
//        FT alpha, alpha_23;
//
//        int thomas_kernel_block_size;
//public:
//        virtual void init(int const m, FT const alpha, cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle) {
//                this->m = m;
//                this->alpha = alpha;
//                this-> cublas_handle = cublas_handle;
//
//                FT *host_c_prime = new FT[m];
//
//                alpha_23 = pow(alpha, FT(2)/FT(3));
//
//                host_c_prime[0] = FT(-1) / (alpha + FT(2));
//                for (int i = 1; i < m; ++i) {
//                        host_c_prime[i] = FT(-1) / (host_c_prime[i-1] + FT(2) + alpha);
//                }
//
//                cudaMalloc((void **) &c_prime, m*sizeof(FT));
//                cudaMemcpy(c_prime, host_c_prime, m*sizeof(FT), cudaMemcpyHostToDevice);
//
//                delete[] host_c_prime;
//
//                cudaMalloc((void **) &b_trans, m*m*m*sizeof(FT));
//
//                int minGridSize;
//                cudaOccupancyMaxPotentialBlockSize(&minGridSize, &thomas_kernel_block_size, thomas_kernel3D<FT>, 0, m*m);
//
//
//
//
//        }
//
//        virtual void run(FT const * const b, FT * const x) {
//        	FT block_count       = m; //divide_and_round_up(m*m, thomas_kernel_block_size);
//        	FT threads_per_block = m; //thomas_kernel_block_size;
//
//        	std::cout << "Blocks            = " << block_count << std::endl;
//        	std::cout << "threads_per_block = " << threads_per_block << std::endl;
//
//        	//int n = m*m*m;
//
//        	// delete --------------
//
//        	FT *h_x;
//        	h_x  = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
//        	FT *h_x2;
//        	h_x2 = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
//
//        	cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		if (m <= 8) std::cout << h_x[i] << "\t";
//        	}
//        	if (m <= 8) std::cout << std::endl;
//
//        	FT *xx = x;
//
//        	FT *bb;
//        	cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
//        	FT *bbb;
//        	cudaMalloc((void **) &bbb, m*m*m*sizeof(FT));
//        	FT *bbbb;
//        	cudaMalloc((void **) &bbbb, m*m*m*sizeof(FT));
//
//
//        	// Ker 1 *************
//
//        	device_memset<FT>(xx, FT(0), m*m*m);
//        	thomas_kernel3D_X1<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, b, xx); //bb);
//
//        	FT sum = 0.0;
//        	cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		//std::cout << h_x[i] << "\t";
//        		if (m <= 8) printf("%4.1f\t",h_x[i]);
//
//        		sum+=h_x[i];
//        	}
//        	if (m <= 8) std::cout << std::endl;
//        	std::cout << sum << std::endl;
//
//        	// Ker 2 *****************
//
//        	cublas_transpose2(cublas_handle, m, m*m, b, bb);
//        	device_memset<FT>(xx, FT(0), m*m*m);
//        	cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		if (m <= 8) std::cout << h_x[i] << "\t";
//        	}
//        	if (m <= 8) std::cout << std::endl;
//
//        	thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb);
//        	cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
//
//
//        	cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//        	sum = 0.0;
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		//std::cout << h_x[i] << "\t";
//        		if (m <= 8) printf("%4.1f\t",h_x[i]);
//        		sum+=h_x[i];
//        		h_x2[i] = h_x[i];
//        	}
//        	if (m <= 8) std::cout << std::endl;
//        	std::cout << sum <<std::endl;
//
//
//        	// Ker 3 *****************
//
//        	cublas_transpose2(cublas_handle, m, m*m, b, bb);
//        	cublas_transpose2(cublas_handle, m, m*m, bb, bbb);
//
//        	device_memset<FT>(xx, FT(0), m*m*m);
//        	cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		if (m <= 8) std::cout << h_x[i] << "\t";
//        	}
//        	if (m <= 8) std::cout << std::endl;
//
//        	int blocks  = m*m / TILE_SIZE;
//        	int threads = TILE_SIZE;
//        	//blocks = 1;
//        	//threads = 2;
//        	std::cout << "\nThomas 3D Tiled kernel - Blocks: " << blocks << " Threads = " << threads << "\n";
//
//        	thomas_kernel3D_XT<FT><<<blocks, threads>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb);
//
//        	cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
//        	cublas_transpose2(cublas_handle, m*m, m, bbb, xx);
//
//        	cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//        	sum = 0.0;
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		//std::cout << h_x[i] << "\t";
//        		if (m <= 8)
//        			printf("%4.1f\t",h_x[i]);
//        		sum+=h_x[i];
//        	}
//        	if (m <= 8) std::cout << std::endl;
//        	std::cout << sum <<std::endl;
//
//
//        	sum = 0.0;
//        	for (int i = 0; i < m*m*m; i++) {
//        		if (m <= 8)
//        			if (i % (m*m) == 0)
//        				std::cout << std::endl;
//        		//std::cout << h_x[i] << "\t";
//        		if (m <= 8)
//        			printf("%4.1f\t",h_x2[i] - h_x[i]);
//
//        		sum+=h_x[i];
//        	}
//        	if (m <= 8) std::cout << std::endl;
//        	std::cout << sum <<std::endl;
//
//
//        	cudaFree(bb);
//        	cudaFree(bbb);
//        	cudaFree(bbbb);
//
//
//        	//device_memset<FT>(x, FT(0), m*m);
//
//        	// delete --------------
//
//        	return;
//
//        	FT *y = x;
//        	thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m,  m*m, alpha, alpha_23, c_prime, b, y);
//
//        	FT *y_trans = b_trans;
//        	cublas_transpose2(cublas_handle, m*m, m, y, y_trans);
//
//        	FT *z_trans = y;
//        	thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m,  m*m, alpha, alpha_23, c_prime, y_trans, z_trans);
//
//        	FT *z_trans2 = y_trans;
//        	cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2);
//        	FT *x_trans2 = z_trans;
//
//        	thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m,  m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2);
//        	FT *x_trans = z_trans2;
//        	cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans);
//
//        	cublas_transpose2(cublas_handle, m, m*m, x_trans, x);
//        }