Commit 4b935cca authored by Lubomir Riha's avatar Lubomir Riha
Browse files

preparaion for 3D

parent 1baa3ecd
......@@ -697,8 +697,8 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
x[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
......@@ -707,6 +707,7 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
FT x_reg = x[start + (m-1)*n];
x[start + (m-1)*n] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = x[j] - dev_c_prime[i] * x_reg;
......@@ -896,92 +897,94 @@ public:
}
virtual void run(FT const * const b, FT * const x) {
FT block_count = divide_and_round_up(m*m, thomas_kernel_block_size);
FT threads_per_block = thomas_kernel_block_size;
FT block_count = divide_and_round_up(m*m, thomas_kernel_block_size);
FT threads_per_block = thomas_kernel_block_size;
//int n = m*m*m;
//int n = m*m*m;
// delete --------------
// delete --------------
FT *h_x;
h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
FT *h_x;
h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0)
std::cout << std::endl;
std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
FT *xx = x;
FT *bb;
cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, bb);
cublas_transpose2(cublas_handle, m*m, m, bb, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
FT sum = 0.0;
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << sum << std::endl;
FT *xx = x;
FT *bb;
cudaMalloc((void **) &bb, m*m*m*sizeof(FT));
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, bb);
cublas_transpose2(cublas_handle, m*m, m, bb, xx);
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
FT sum = 0.0;
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0)
std::cout << std::endl;
std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << std::endl << std::endl << sum << std::endl;
FT *tmp_g;
cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << sum <<std::endl;
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
return;
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
FT *y = x;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, y);
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
FT *y_trans = b_trans;
cublas_transpose2(cublas_handle, m*m, m, y, y_trans);
FT *z_trans = y;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans);
FT *z_trans2 = y_trans;
cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2);
FT *x_trans2 = z_trans;
FT *tmp_g;
cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2);
FT *x_trans = z_trans2;
cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << std::endl << std::endl << sum <<std::endl;
cublas_transpose2(cublas_handle, m, m*m, x_trans, x);
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
return;
FT *y = x;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, y);
FT *y_trans = b_trans;
cublas_transpose2(cublas_handle, m*m, m, y, y_trans);
FT *z_trans = y;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, y_trans, z_trans);
FT *z_trans2 = y_trans;
cublas_transpose2(cublas_handle, m*m, m, z_trans, z_trans2);
FT *x_trans2 = z_trans;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, z_trans2, x_trans2);
FT *x_trans = z_trans2;
cublas_transpose2(cublas_handle, m, m*m, x_trans2, x_trans);
cublas_transpose2(cublas_handle, m, m*m, x_trans, x);
}
......
......@@ -2,7 +2,7 @@
#echo "M BANDS THREADS W" | tr " " "\t"
for M in 256 # 512 1024 2048 4096 8192
for M in 4 # 512 1024 2048 4096 8192
do
#echo " --- "
for BANDS in 1 # 4 8 16 32
......
No preview for this file type
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment