Commit 1baa3ecd authored by Lubomir Riha's avatar Lubomir Riha
Browse files

Anselm - ne beginnig

parent 93044da2
......@@ -716,31 +716,88 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
template<class FT>
__global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const alpha_23, FT const * const __restrict__ dev_c_prime, FT const * const __restrict__ b, FT * const __restrict__ x, FT * const __restrict__ tmp_g) {
#define T W
//#define THREADS 4
//#define BLOCKS 4
__shared__ FT SM [THREADS/T][T][T];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int tmt = threadIdx.x % T;
int tdt = threadIdx.x / T;
FT x_reg;
if (tid >= n) {
return;
}
int start = tid;
FT work_buffer_reg = b[start] * alpha_23 / (FT(2) + alpha);
x[start] = work_buffer_reg;
tmp_g[start] = work_buffer_reg;
for (int i = 1; i < m; ++i) {
int j = start + i*n;
work_buffer_reg = (b[j] * alpha_23 + work_buffer_reg) / (FT(2) + alpha + dev_c_prime[i-1]);
x[j] = work_buffer_reg;
}
int j = start + i*n;
double input = b[j];
double c_prime = dev_c_prime[i-1];
work_buffer_reg = (input * alpha_23 + work_buffer_reg) / (double(2) + alpha + c_prime);
tmp_g[j] = work_buffer_reg;
}
x_reg = tmp_g[start + (m-1)*n];
SM[tdt][tmt][0] = x_reg;
for (int i = 1; i < T; i++) {
int j = start + (m-1-i)*n;
x_reg = tmp_g[j] - dev_c_prime[ (m-1-i) ] * x_reg;
SM[tdt][tmt][i] = x_reg;
}
int addr1 = T * tdt;
int addr2 = blockIdx.x;
int addr3 = m - 1 - tmt;
int addr4;
int addr = (addr2 * m * m) + addr3;
for (int i = 0; i < T; i++) {
x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
}
for (int T_loop = 1; T_loop < m/T; T_loop++) {
int T_offset = T * T_loop;
addr4 = m - 1 - T_offset;
addr3 = addr4 - tmt;
addr = (addr2 * m * m) + addr3;
for (int i = 0; i < T; i++) {
int g_addr = addr4 - i;
int j = start + n * g_addr;
x_reg = tmp_g[j] - dev_c_prime[ g_addr ] * x_reg;
SM[tdt][tmt][i] = x_reg;
}
for (int i = 0; i < T; i++) {
x[ addr + (addr1 + i) * m ] = SM[tdt][i][tmt];
}
}
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
FT x_reg = x[start + (m-1)*n];
x[start + (m-1)*n] = x_reg;
for (int i = m-2; i >= 0; --i) {
int j = start + i*n;
x_reg = x[j] - dev_c_prime[i] * x_reg;
x[j] = x_reg;
}
}
......@@ -851,8 +908,8 @@ public:
cudaMemcpy( h_x, b, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
......@@ -866,29 +923,46 @@ public:
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
//FT *h_x;
//h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
FT sum = 0.0;
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << sum << std::endl;
device_memset<FT>(xx, FT(0), m*m*m);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
thomas_kernel3DT<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, xx);
FT *tmp_g;
cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
cudaMemcpy( h_x, xx, (m*m*m)*sizeof(FT) , cudaMemcpyDeviceToHost );
for (int i = 0; i < m*m*m; i++) {
if (i % (m*m) == 0) std::cout << std::endl;
std::cout << h_x[i] << "\t";
}
std::cout << std::endl;
sum = 0.0;
for (int i = 0; i < m*m*m; i++) {
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum+=h_x[i];
}
std::cout << sum <<std::endl;
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
return;
FT *y = x;
thomas_kernel3D<FT><<<block_count, threads_per_block>>>(m, m*m, alpha, alpha_23, c_prime, b, y);
......
No preview for this file type
......@@ -184,7 +184,7 @@ int main (int argc, char *argv[])
int num_iter;
double *b_3d, *x_3d;
int m_3d = 6;
int m_3d = M;
double * h_x_3d;
h_x_3d = (double * ) malloc ( (m_3d*m_3d*m_3d) * sizeof(double) );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment