Commit 31bcdda3 authored by Lubomir Riha's avatar Lubomir Riha
Browse files

ENH: 3D Thomas kernel works

parent e1c5b3ae
......@@ -740,14 +740,14 @@ __global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_2
x[j] = work_buffer_reg;
}
// FT x_reg = x[start + (m-1)*n];
// x[start + (m-1)*n] = x_reg;
//
// for (int i = m-2; i >= 0; --i) {
// int j = start + i*n;
// x_reg = x[j] - dev_c_prime[i] * x_reg;
// x[j] = x_reg;
// }
FT x_reg = x[start + (m-1)*n];
x[start + (m-1)*n] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
......@@ -773,14 +773,14 @@ __global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_2
x[j] = work_buffer_reg;
}
// FT x_reg = x[start + (m-1)*m];
// x[start + (m-1)*m] = x_reg;
//
// for (int i = m-2; i >= 0; --i) {
// int j = start + i*m;
// x_reg = x[j] - dev_c_prime[i] * x_reg;
// x[j] = x_reg;
// }
FT x_reg = x[start + (m-1)*m];
x[start + (m-1)*m] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*m;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
......@@ -789,26 +789,24 @@ template<class FT>
__global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
#define TILE_SIZE 2
//printf("Tile size = %d \n", TILE_SIZE);
__shared__ FT sh_b[TILE_SIZE][TILE_SIZE];
__shared__ FT sh_x[TILE_SIZE][TILE_SIZE];
__shared__ FT sh_b[TILE_SIZE][TILE_SIZE+1];
__shared__ FT sh_x[TILE_SIZE][TILE_SIZE+1];
int TILES = m / TILE_SIZE;
//int n = m*m;
int tid_l = threadIdx.x;
int bid = blockIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int start = tid;
int bid = blockIdx.x;
//int tid = blockIdx.x * blockDim.x + threadIdx.x;
int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m; // + m*m*i
// Basis of an adress used to read dat from global memory to tiles in shared memory
// - this is rused multiple times
int base_addr = tid_l + m*m*TILE_SIZE*(bid%TILE_SIZE) + (bid/TILE_SIZE)*m;
//FT work_buffer_reg = 0.0;
// **************************************************************************************************************
// *** Forward substitution ************************************************************************************
//if (bid != 0) return;
// read first patch of input data
// Read input data to fill the first tile in shared memoru
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
int a = base_addr + m*m*i;
......@@ -816,10 +814,12 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
//printf("tid = %d - SM a = [%d,%d] - g a = %d ; val = %f \n", tid, tid_l, i, a, b[ a ] );
}
// Calculate first element of the forward substitution
FT work_buffer_reg = sh_b[0][tid_l] * alpha_23 / (FT(2) + alpha);
sh_x[0][tid_l] = work_buffer_reg;
//printf("A tid = %d - work_buffer_reg = %f ; in_val = %f \n", tid, work_buffer_reg, sh_b[0][tid_l]);
// Calculate the rest of the forward substitution for the first tile
#pragma unroll
for (int i = 1; i < TILE_SIZE; ++i) {
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
......@@ -827,180 +827,104 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
//printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, i-1);
}
// Save results of for the first tile to the global memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
int a = base_addr + m*m*i;
x[ a ] = sh_x[tid_l][i];
x[a] = sh_x[tid_l][i];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// Processing of the remaining tiles
for (int tile = 1; tile < TILES; tile++) {
// Read data from global memory to tile in shared memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
//int a = tid_l + m*m*TILE_SIZE*bid + m*m*i + tile * TILE_SIZE;
int a = base_addr + m*m*i + tile * TILE_SIZE;
int a = base_addr + m*m*i + (tile * TILE_SIZE);
sh_b[tid_l][i] = b[a];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
//printf("Y tid = %d - work_buffer_reg = %f \n", tid, work_buffer_reg);
#pragma unroll
// Calculate forward substitution for the current tile
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[tile * TILE_SIZE + i - 1]);
int ca = (tile * TILE_SIZE) + i - 1;
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[ca]);
sh_x[i][tid_l] = work_buffer_reg;
//printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, tile * TILE_SIZE + i - 1);
//printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
}
// Save the results of the forward substitution of the current tile to a global memory - this does not have to be done fot the last tile
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
int a = base_addr + m*m*i + tile * TILE_SIZE;
x[ a ] = sh_x[tid_l][i];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i , a );
int a = base_addr + m*m*i + (tile * TILE_SIZE);
x[a] = sh_x[tid_l][i];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
}
// *** END - Forward substitution ************************************************************************************
// **************************************************************************************************************
FT x_reg = sh_x[TILE_SIZE-1][tid_l];
sh_x[TILE_SIZE-1][tid_l] = x_reg;
// **************************************************************************************************************
// *** Backward substitution ************************************************************************************
// Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory
#pragma unroll
for (int i = TILE_SIZE-2; i >= 0; --i) {
x_reg = sh_x[i][tid_l] - dev_c_prime[m - TILE_SIZE + i] * x_reg;
sh_x[i][tid_l] = x_reg;
}
work_buffer_reg = sh_x[i][tid_l] - dev_c_prime[m - TILE_SIZE + i] * work_buffer_reg;
sh_x[i][tid_l] = work_buffer_reg;
//printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i);
}
// for (int tile = TILES - 1; tile > 0; tile--) {
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; i++) {
// sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
// }
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; ++i) {
// work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[(1+tile) * TILE_SIZE - i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
// }
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; i++) {
// x[ start + m * m * ( tile * TILE_SIZE + i ) ] = sh_x[tid_l][i];
// }
//
// }
// Backward substitution - last TILE - store results from tile in shared memory to global memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
int a = base_addr + m*m*i + m - TILE_SIZE;
x[ a ] = sh_x[tid_l][i];
//printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// Backward substitution - remainig tiles
for (int tile = TILES - 2; tile >= 0; tile--) {
// Load new tile from global memory to tile in shared memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
int a = base_addr + m*m*i + tile * TILE_SIZE;
sh_b[tid_l][i] = x[ a ];
//printf("Lod1 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// compute backward substitution - use date stored in tile in shared memory
#pragma unroll
for (int i = TILE_SIZE-1; i >= 0; --i) {
int ca = tile * TILE_SIZE + i;
work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[ca] * work_buffer_reg;
sh_x[i][tid_l] = work_buffer_reg;
//printf("B1 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
}
//
// for (int i = m-2; i >= 0; --i) {
// int j = start + i*n;
// x_reg = sh_x[i][tid_l] - dev_c_prime[i] * x_reg;
// sh_x[i][tid_l] = x_reg;
// }
//
//
//
// // write patch of input data
// #pragma unroll
// for (int i = 0; i < 4; i++) {
// x[ start + m*m*i ] = sh_x[tid_l][i];
// }
// Store current tile from shared memory to global memory
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
int a = base_addr + m*m*i + tile * TILE_SIZE;
x[a] = sh_x[tid_l][i];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
}
// *** Backward substitution - END ******************************************************************************
// **************************************************************************************************************
return;
// FT beta = alpha_23;
// dev_c_prime - precalculated factors
// b - input
// x - output
//__shared__ FT sh_b[THREADS+1][THREADS];
//TODO: POZOR
//__shared__ FT sh_x[THREADS+1][THREADS];
// __shared__ FT sh_x[4][4];
//
// int tid_l = threadIdx.x;
// int bid = blockIdx.x;
// //int tid = blockIdx.x * blockDim.x + threadIdx.x;
// FT work_buffer_reg = 0.0;
//
// //if (bid != 0) return;
//
// // read first patch of input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// }
//
//
// for (int l = 0; l < M/THREADS; l++) {
// #pragma unroll
// for (int i = 0; i < THREADS; ++i) {
// //FT b_l = sh_b[i][tid_l];
// FT b_l = sh_x[i][tid_l];
// FT c_prime = dev_c_prime[l*THREADS+i-1];
// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime);
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f - %.10f - %.10f \n",sh_b [i][tid_l],sh_b[i][tid_l+1],sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// // save temp res and get new input data
// if ( l < (M/THREADS - 1) ) {
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// }
// }
// }
//
// // backward
//// if (tid_l == 0) printf("\n");
//
// int level = M/THREADS - 1;
// #pragma unroll
// for (int i = THREADS-2; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// for (int l = level; l >= 1 ; l--) {
// // save temp res and get new input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ];
// }
//
// #pragma unroll
// for (int i = THREADS-1; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
// }
// }
//
// // write last patch of output data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i];
// }
}
......
No preview for this file type
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment