Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lubomir Riha
ETH-ADI
Commits
31bcdda3
Commit
31bcdda3
authored
Jul 20, 2016
by
Lubomir Riha
Browse files
ENH: 3D Thomas kernel works
parent
e1c5b3ae
Changes
2
Hide whitespace changes
Inline
Side-by-side
preconditioner.h
View file @
31bcdda3
...
...
@@ -740,14 +740,14 @@ __global__ void thomas_kernel3D_X1(int const m, FT const alpha, FT const alpha_2
x
[
j
]
=
work_buffer_reg
;
}
//
FT x_reg = x[start + (m-1)*n];
//
x[start + (m-1)*n] = x_reg;
//
//
for (int i = m-2; i >= 0; --i) {
//
int j = start + i*n;
//
x_reg = x[j] - dev_c_prime[i] * x_reg;
//
x[j] = x_reg;
//
}
FT
x_reg
=
x
[
start
+
(
m
-
1
)
*
n
];
x
[
start
+
(
m
-
1
)
*
n
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
n
;
x_reg
=
x
[
j
]
-
dev_c_prime
[
i
]
*
x_reg
;
x
[
j
]
=
x_reg
;
}
}
...
...
@@ -773,14 +773,14 @@ __global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_2
x
[
j
]
=
work_buffer_reg
;
}
//
FT x_reg = x[start + (m-1)*m];
//
x[start + (m-1)*m] = x_reg;
//
//
for (int i = m-2; i >= 0; --i) {
//
int j = start + i*m;
//
x_reg = x[j] - dev_c_prime[i] * x_reg;
//
x[j] = x_reg;
//
}
FT
x_reg
=
x
[
start
+
(
m
-
1
)
*
m
];
x
[
start
+
(
m
-
1
)
*
m
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
m
;
x_reg
=
x
[
j
]
-
dev_c_prime
[
i
]
*
x_reg
;
x
[
j
]
=
x_reg
;
}
}
...
...
@@ -789,26 +789,24 @@ template<class FT>
__global__
void
thomas_kernel3D_XT
(
int
const
m
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
#define TILE_SIZE 2
//printf("Tile size = %d \n", TILE_SIZE);
__shared__
FT
sh_b
[
TILE_SIZE
][
TILE_SIZE
];
__shared__
FT
sh_x
[
TILE_SIZE
][
TILE_SIZE
];
__shared__
FT
sh_b
[
TILE_SIZE
][
TILE_SIZE
+
1
];
__shared__
FT
sh_x
[
TILE_SIZE
][
TILE_SIZE
+
1
];
int
TILES
=
m
/
TILE_SIZE
;
//int n = m*m;
int
tid_l
=
threadIdx
.
x
;
int
bid
=
blockIdx
.
x
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
tid
;
int
bid
=
blockIdx
.
x
;
//int tid = blockIdx.x * blockDim.x + threadIdx.x;
int
base_addr
=
tid_l
+
m
*
m
*
TILE_SIZE
*
(
bid
%
TILE_SIZE
)
+
(
bid
/
TILE_SIZE
)
*
m
;
// + m*m*i
// Basis of an adress used to read dat from global memory to tiles in shared memory
// - this is rused multiple times
int
base_addr
=
tid_l
+
m
*
m
*
TILE_SIZE
*
(
bid
%
TILE_SIZE
)
+
(
bid
/
TILE_SIZE
)
*
m
;
//FT work_buffer_reg = 0.0;
// **************************************************************************************************************
// *** Forward substitution ************************************************************************************
//if (bid != 0) return;
// read first patch of input data
// Read input data to fill the first tile in shared memoru
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
int
a
=
base_addr
+
m
*
m
*
i
;
...
...
@@ -816,10 +814,12 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
//printf("tid = %d - SM a = [%d,%d] - g a = %d ; val = %f \n", tid, tid_l, i, a, b[ a ] );
}
// Calculate first element of the forward substitution
FT
work_buffer_reg
=
sh_b
[
0
][
tid_l
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
sh_x
[
0
][
tid_l
]
=
work_buffer_reg
;
//printf("A tid = %d - work_buffer_reg = %f ; in_val = %f \n", tid, work_buffer_reg, sh_b[0][tid_l]);
// Calculate the rest of the forward substitution for the first tile
#pragma unroll
for
(
int
i
=
1
;
i
<
TILE_SIZE
;
++
i
)
{
work_buffer_reg
=
(
sh_b
[
i
][
tid_l
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
i
-
1
]);
...
...
@@ -827,180 +827,104 @@ __global__ void thomas_kernel3D_XT(int const m, FT const alpha, FT const alpha_2
//printf("X tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, i-1);
}
// Save results of for the first tile to the global memory
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
int
a
=
base_addr
+
m
*
m
*
i
;
x
[
a
]
=
sh_x
[
tid_l
][
i
];
x
[
a
]
=
sh_x
[
tid_l
][
i
];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// Processing of the remaining tiles
for
(
int
tile
=
1
;
tile
<
TILES
;
tile
++
)
{
// Read data from global memory to tile in shared memory
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
//int a = tid_l + m*m*TILE_SIZE*bid + m*m*i + tile * TILE_SIZE;
int
a
=
base_addr
+
m
*
m
*
i
+
tile
*
TILE_SIZE
;
int
a
=
base_addr
+
m
*
m
*
i
+
(
tile
*
TILE_SIZE
);
sh_b
[
tid_l
][
i
]
=
b
[
a
];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
//printf("Y tid = %d - work_buffer_reg = %f \n", tid, work_buffer_reg);
#pragma unroll
// Calculate forward substitution for the current tile
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
work_buffer_reg
=
(
sh_b
[
i
][
tid_l
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
tile
*
TILE_SIZE
+
i
-
1
]);
int
ca
=
(
tile
*
TILE_SIZE
)
+
i
-
1
;
work_buffer_reg
=
(
sh_b
[
i
][
tid_l
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
ca
]);
sh_x
[
i
][
tid_l
]
=
work_buffer_reg
;
//printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg,
tile * TILE_SIZE + i - 1
);
//printf("Z tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg,
ca
);
}
// Save the results of the forward substitution of the current tile to a global memory - this does not have to be done fot the last tile
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
int
a
=
base_addr
+
m
*
m
*
i
+
tile
*
TILE_SIZE
;
x
[
a
]
=
sh_x
[
tid_l
][
i
];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i
, a );
int
a
=
base_addr
+
m
*
m
*
i
+
(
tile
*
TILE_SIZE
)
;
x
[
a
]
=
sh_x
[
tid_l
][
i
];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
}
// *** END - Forward substitution ************************************************************************************
// **************************************************************************************************************
FT
x_reg
=
sh_x
[
TILE_SIZE
-
1
][
tid_l
];
sh_x
[
TILE_SIZE
-
1
][
tid_l
]
=
x_reg
;
// **************************************************************************************************************
// *** Backward substitution ************************************************************************************
// Backward substitution - last TILE - compute backward substitution using data already stored in tile in shared memory
#pragma unroll
for
(
int
i
=
TILE_SIZE
-
2
;
i
>=
0
;
--
i
)
{
x
_reg
=
sh_x
[
i
][
tid_l
]
-
dev_c_prime
[
m
-
TILE_SIZE
+
i
]
*
x
_reg
;
sh_x
[
i
][
tid_l
]
=
x
_reg
;
}
work_buffer
_reg
=
sh_x
[
i
][
tid_l
]
-
dev_c_prime
[
m
-
TILE_SIZE
+
i
]
*
work_buffer
_reg
;
sh_x
[
i
][
tid_l
]
=
work_buffer
_reg
;
//printf("B0 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, m - TILE_SIZE + i);
}
// for (int tile = TILES - 1; tile > 0; tile--) {
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; i++) {
// sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
// }
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; ++i) {
// work_buffer_reg = sh_b[i][tid_l] - dev_c_prime[(1+tile) * TILE_SIZE - i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
// }
//
// #pragma unroll
// for (int i = 0; i < TILE_SIZE; i++) {
// x[ start + m * m * ( tile * TILE_SIZE + i ) ] = sh_x[tid_l][i];
// }
//
// }
// Backward substitution - last TILE - store results from tile in shared memory to global memory
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
int
a
=
base_addr
+
m
*
m
*
i
+
m
-
TILE_SIZE
;
x
[
a
]
=
sh_x
[
tid_l
][
i
];
//printf("Sav0 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// Backward substitution - remainig tiles
for
(
int
tile
=
TILES
-
2
;
tile
>=
0
;
tile
--
)
{
// Load new tile from global memory to tile in shared memory
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
int
a
=
base_addr
+
m
*
m
*
i
+
tile
*
TILE_SIZE
;
sh_b
[
tid_l
][
i
]
=
x
[
a
];
//printf("Lod1 - tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
// compute backward substitution - use date stored in tile in shared memory
#pragma unroll
for
(
int
i
=
TILE_SIZE
-
1
;
i
>=
0
;
--
i
)
{
int
ca
=
tile
*
TILE_SIZE
+
i
;
work_buffer_reg
=
sh_b
[
i
][
tid_l
]
-
dev_c_prime
[
ca
]
*
work_buffer_reg
;
sh_x
[
i
][
tid_l
]
=
work_buffer_reg
;
//printf("B1 - tid = %d - work_buffer_reg = %f - prim a = %d \n", tid, work_buffer_reg, ca);
}
//
// for (int i = m-2; i >= 0; --i) {
// int j = start + i*n;
// x_reg = sh_x[i][tid_l] - dev_c_prime[i] * x_reg;
// sh_x[i][tid_l] = x_reg;
// }
//
//
//
// // write patch of input data
// #pragma unroll
// for (int i = 0; i < 4; i++) {
// x[ start + m*m*i ] = sh_x[tid_l][i];
// }
// Store current tile from shared memory to global memory
#pragma unroll
for
(
int
i
=
0
;
i
<
TILE_SIZE
;
i
++
)
{
//sh_b[tid_l][i] = b[ start + m * m * ( tile * TILE_SIZE + i ) ];
int
a
=
base_addr
+
m
*
m
*
i
+
tile
*
TILE_SIZE
;
x
[
a
]
=
sh_x
[
tid_l
][
i
];
//printf("tid = %d - SM a = [%d,%d] - g a = %d \n", tid, tid_l, i, a );
}
}
// *** Backward substitution - END ******************************************************************************
// **************************************************************************************************************
return
;
// FT beta = alpha_23;
// dev_c_prime - precalculated factors
// b - input
// x - output
//__shared__ FT sh_b[THREADS+1][THREADS];
//TODO: POZOR
//__shared__ FT sh_x[THREADS+1][THREADS];
// __shared__ FT sh_x[4][4];
//
// int tid_l = threadIdx.x;
// int bid = blockIdx.x;
// //int tid = blockIdx.x * blockDim.x + threadIdx.x;
// FT work_buffer_reg = 0.0;
//
// //if (bid != 0) return;
//
// // read first patch of input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// }
//
//
// for (int l = 0; l < M/THREADS; l++) {
// #pragma unroll
// for (int i = 0; i < THREADS; ++i) {
// //FT b_l = sh_b[i][tid_l];
// FT b_l = sh_x[i][tid_l];
// FT c_prime = dev_c_prime[l*THREADS+i-1];
// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime);
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f - %.10f - %.10f \n",sh_b [i][tid_l],sh_b[i][tid_l+1],sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// // save temp res and get new input data
// if ( l < (M/THREADS - 1) ) {
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// }
// }
// }
//
// // backward
//// if (tid_l == 0) printf("\n");
//
// int level = M/THREADS - 1;
// #pragma unroll
// for (int i = THREADS-2; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// for (int l = level; l >= 1 ; l--) {
// // save temp res and get new input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ];
// }
//
// #pragma unroll
// for (int i = THREADS-1; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
// }
// }
//
// // write last patch of output data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i];
// }
}
...
...
test
View file @
31bcdda3
No preview for this file type
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment