Commit ad78fcff authored by Lubomir Riha's avatar Lubomir Riha
Browse files

ENH: Second kernel for 3D Thomas works - tileed is missing

parent 4b935cca
......@@ -688,7 +688,9 @@ public:
template<class FT>
__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= n) {
return;
}
......@@ -716,6 +718,72 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
}
template<class FT>
__global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
int n = m*m;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= n) {
return;
}
int start = tid;
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
x[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
x[j] = work_buffer_reg;
}
FT x_reg = x[start + (m-1)*n];
x[start + (m-1)*n] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
template<class FT>
__global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
int n = m*m;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= n) {
return;
}
int start = (n) * (tid/m) + (tid % m); // + (i * m)
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
x[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*m;
work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
x[j] = work_buffer_reg;
}
FT x_reg = x[start + (m-1)*m];
x[start + (m-1)*m] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*m;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
template<class FT>
__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) {
......@@ -725,23 +793,23 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
__shared__ FT SM [THREADS/T][T][T];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tmt = threadIdx.x % T;
int tdt = threadIdx.x / T;
FT x_reg;
if (tid >= n) {
return;
}
if (tid >= n) {
return;
}
int start = tid;
int start = tid;
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
tmp_g[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
tmp_g[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
double input = b[j];
double c_prime = dev_c_prime[i-1];
......@@ -763,15 +831,15 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
int addr2 = blockIdx.x;
int addr3 = m - 1 - tmt;
int addr4;
int addr = (addr2 * m * m) + addr3;
for (int i = 0; i < T; i++) {
x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
}
for (int T_loop = 1; T_loop < m/T; T_loop++) {
int T_offset = T * T_loop;
addr4 = m - 1 - T_offset;
addr3 = addr4 - tmt;
......@@ -788,15 +856,15 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
for (int i = 0; i < T; i++) {
x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
}
}
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
}
......@@ -900,6 +968,9 @@ public:
FT block_count = divide_and_round_up(m*m, thomas_kernel_block_size);
FT threads_per_block = thomas_kernel_block_size;
std::cout << "Blocks = " << block_count << std::endl;
std::cout << "threads_per_block = " << threads_per_block << std::endl;
//int n = m*m*m;
// delete --------------
......@@ -918,9 +989,17 @@ public:
FT *xx = x;
FT *bb;
cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
FT *bbb;
cudaMalloc((void **) &bbb, m*m*m*sizeof(FT));
FT *bbbb;
cudaMalloc((void **) &bbbb, m*m*m*sizeof(FT));
device_memset<FT>(xx, FT(0), m*m*m);
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, bb);
cublas_transpose2(cublas_handle, m*m, m, bb, xx);
thomas_kernel3D_X1<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, b, xx); //bb);
//cublas_transpose2(cublas_handle, m*m, m, bb, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
......@@ -937,8 +1016,12 @@ public:
cublas_transpose2(cublas_handle, m, m*m, b, bb);
// cublas_transpose2(cublas_handle, m*m, m, bb, bbb);
// cublas_transpose2(cublas_handle, m*m, m, bbb, bbbb);
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
cudaMemcpy( h_x, bb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
......@@ -949,10 +1032,14 @@ public:
FT *tmp_g;
cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
//cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
//thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
thomas_kernel3D_X2<FT><<<block_count, threads_per_block>>>(m, alpha, alpha_23, c_prime, bb, xx); //bb);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
cublas_transpose2(cublas_handle, m*m, m, xx, bbb);
cudaMemcpy( h_x, bbb, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
......
No preview for this file type
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment