Commit b85269af authored by Lubomir Riha's avatar Lubomir Riha
Browse files

3D Thomas kernel in 3D w Tiling works - basic version with one SM

parent ad78fcff
......@@ -784,6 +784,153 @@ __global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_2
}
template<class FT>
__global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
__shared__ FT sh_b[4][4];
__shared__ FT sh_x[4][4];
int n = m*m;
int tid_l = threadIdx.x;
int bid = blockIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int start = tid;
//FT work_buffer_reg = 0.0;
//if (bid != 0) return;
// read first patch of input data
#pragma unroll
for (int i = 0; i < 4; i++) {
sh_b[tid_l][i] = b[ start + m*m*i ];
}
FT work_buffer_reg = sh_b[0][tid_l] * alpha_23 / (FT(2) + alpha);
sh_x[0][tid_l] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
//int j = start + i*n;
work_buffer_reg = (sh_b[i][tid_l] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
sh_x[i][tid_l] = work_buffer_reg;
}
FT x_reg = sh_x[m-1][tid_l];
sh_x[m-1][tid_l] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = sh_x[i][tid_l] - dev_c_prime[i] * x_reg;
sh_x[i][tid_l] = x_reg;
}
// write patch of input data
#pragma unroll
for (int i = 0; i < 4; i++) {
x[ start + m*m*i ] = sh_x[tid_l][i];
}
return;
FT beta = alpha_23;
// dev_c_prime - precalculated factors
// b - input
// x - output
//__shared__ FT sh_b[THREADS+1][THREADS];
//TODO: POZOR
//__shared__ FT sh_x[THREADS+1][THREADS];
// __shared__ FT sh_x[4][4];
//
// int tid_l = threadIdx.x;
// int bid = blockIdx.x;
// //int tid = blockIdx.x * blockDim.x + threadIdx.x;
// FT work_buffer_reg = 0.0;
//
// //if (bid != 0) return;
//
// // read first patch of input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// }
//
//
// for (int l = 0; l < M/THREADS; l++) {
// #pragma unroll
// for (int i = 0; i < THREADS; ++i) {
// //FT b_l = sh_b[i][tid_l];
// FT b_l = sh_x[i][tid_l];
// FT c_prime = dev_c_prime[l*THREADS+i-1];
// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime);
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f - %.10f - %.10f \n",sh_b [i][tid_l],sh_b[i][tid_l+1],sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// // save temp res and get new input data
// if ( l < (M/THREADS - 1) ) {
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// }
// }
// }
//
// // backward
//// if (tid_l == 0) printf("\n");
//
// int level = M/THREADS - 1;
// #pragma unroll
// for (int i = THREADS-2; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// for (int l = level; l >= 1 ; l--) {
// // save temp res and get new input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ];
// }
//
// #pragma unroll
// for (int i = THREADS-1; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
// }
// }
//
// // write last patch of output data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i];
// }
}
template<class FT>
__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) {
......@@ -794,7 +941,7 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
__shared__ FT SM [THREADS/T][T][T];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tmt = threadIdx.x % T;
int tmt = threadIdx.x % T;
int tdt = threadIdx.x / T;
FT x_reg;
......@@ -965,8 +1112,8 @@ public:
}
virtual void run(FT const * const b, FT * const x) {
FT block_count = divide_and_round_up(m*m, thomas_kernel_block_size);
FT threads_per_block = thomas_kernel_block_size;
FT block_count = m; //divide_and_round_up(m*m, thomas_kernel_block_size);
FT threads_per_block = m; //thomas_kernel_block_size;
std::cout << "Blocks = " << block_count << std::endl;
std::cout << "threads_per_block = " << threads_per_block << std::endl;
......@@ -987,6 +1134,7 @@ public:
std::cout << std::endl;
FT *xx = x;
FT *bb;
cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
FT *bbb;
......@@ -995,14 +1143,10 @@ public:
cudaMalloc((void **) &bbbb, m*m*m*sizeof(FT));
// Ker 1 *************
device_memset<FT>(xx, FT(0), m*m*m);
thomas_kernel3D_X1<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, b, xx); //bb);
//cublas_transpose2(cublas_handle, m*m, m, bb, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
FT sum = 0.0;
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
......@@ -1014,12 +1158,9 @@ public:
}
std::cout << std::endl << std::endl << sum << std::endl;
// Ker 2 *****************
cublas_transpose2(cublas_handle, m, m*m, b, bb);
// cublas_transpose2(cublas_handle, m*m, m, bb, bbb);
// cublas_transpose2(cublas_handle, m*m, m, bbb, bbbb);
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
......@@ -1029,17 +1170,40 @@ public:
}
std::cout << std::endl;
thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb);
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
FT *tmp_g;
//cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
//thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb);
cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << std::endl << std::endl << sum <<std::endl;
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
// Ker 3 *****************
cublas_transpose2(cublas_handle, m, m*m, b, bb);
cublas_transpose2(cublas_handle, m, m*m, bb, bbb);
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
thomas_kernel3D_XT<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bbb, xx); //bb);
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
cublas_transpose2(cublas_handle, m*m, m, bbb, xx);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
......@@ -1048,6 +1212,13 @@ public:
}
std::cout << std::endl << std::endl << sum <<std::endl;
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
......
No preview for this file type
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment