Commit 93044da2 authored by Lubomir Riha's avatar Lubomir Riha
Browse files

Working versin on Anselm

parent ac0c5b0f
......@@ -715,6 +715,36 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
}
template<class FT>
__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= n) {
return;
}
int start = tid;
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
x[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
x[j] = work_buffer_reg;
}
FT x_reg = x[start + (m-1)*n];
x[start + (m-1)*n] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
// Thomas algorithm with storing intermediate results in shared memory - saves one global memory read and global memory write into x - compare with previous Thomas kernel
template<class FT>
__global__ void thomas_kernel3D2(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
......@@ -829,8 +859,10 @@ public:
FT *xx = x;
FT *bb;
cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, bb);
cublas_transpose2(cublas_handle, m*m, m, bb, xx);
cublas_transpose2(cublas_handle, m*m, m, b, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
......@@ -844,6 +876,14 @@ public:
}
std::cout << std::endl;
thomas_kernel3DT<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, xx);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
//device_memset<FT>(x, FT(0), m*m);
......
No preview for this file type
......@@ -184,7 +184,7 @@ int main (int argc, char *argv[])
int num_iter;
double *b_3d, *x_3d;
int m_3d = 5;
int m_3d = 6;
double * h_x_3d;
h_x_3d = (double * ) malloc ( (m_3d*m_3d*m_3d) * sizeof(double) );
......@@ -200,20 +200,21 @@ int main (int argc, char *argv[])
//device_memset<double>(b_3d, 1.0, m_3d*m_3d*m_3d);
// InvPreconditioner3D<double> preconditioner_inv3d;
// num_iter = solve_with_conjugate_gradient3D<double>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_inv3d); // NULL );
// cout << num_iter << " CG 3D only - iterations" << endl;
//Inv
ThomasPreconditioner3D<double> preconditioner_inv3d;
num_iter = solve_with_conjugate_gradient3D<double>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_inv3d); // NULL );
cout << num_iter << " CG 3D only - iterations" << endl;
cudaMemcpy( h_x_3d, b_3d, (m_3d*m_3d*m_3d)*sizeof(double) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m_3d*m_3d*m_3d; i++)
cout << h_x_3d[i] << endl;
cudaMemcpy( h_x_3d, b_3d, (m_3d*m_3d*m_3d)*sizeof(double) , cudaMemcpyDeviceToHost );
//for (int i = 0; i < m_3d*m_3d*m_3d; i++)
// cout << h_x_3d[i] << endl;
cout << endl;
ThomasPreconditioner3D<double> preconditioner_3d;
num_iter = solve_with_conjugate_gradient3D<double>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_3d);
cout << num_iter << " CG 3D Thomas Prec - iterations" << endl;
// ThomasPreconditioner3D<double> preconditioner_3d;
// num_iter = solve_with_conjugate_gradient3D<double>(cublas_handle, cusparse_handle, m_3d, alpha, b_3d, x_3d, max_iter, tolerance, &preconditioner_3d);
// cout << num_iter << " CG 3D Thomas Prec - iterations" << endl;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment