Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lubomir Riha
ETH-ADI
Commits
b85269af
Commit
b85269af
authored
Jul 19, 2016
by
Lubomir Riha
Browse files
3D Thomas kernel in 3D w Tiling works - basic version with one SM
parent
ad78fcff
Changes
2
Hide whitespace changes
Inline
Side-by-side
preconditioner.h
View file @
b85269af
...
...
@@ -784,6 +784,153 @@ __global__ void thomas_kernel3D_X2(int const m, FT const alpha, FT const alpha_2
}
template
<
class
FT
>
__global__
void
thomas_kernel3D_XT
(
int
const
m
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
__shared__
FT
sh_b
[
4
][
4
];
__shared__
FT
sh_x
[
4
][
4
];
int
n
=
m
*
m
;
int
tid_l
=
threadIdx
.
x
;
int
bid
=
blockIdx
.
x
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
start
=
tid
;
//FT work_buffer_reg = 0.0;
//if (bid != 0) return;
// read first patch of input data
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
sh_b
[
tid_l
][
i
]
=
b
[
start
+
m
*
m
*
i
];
}
FT
work_buffer_reg
=
sh_b
[
0
][
tid_l
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
sh_x
[
0
][
tid_l
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
//int j = start + i*n;
work_buffer_reg
=
(
sh_b
[
i
][
tid_l
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
i
-
1
]);
sh_x
[
i
][
tid_l
]
=
work_buffer_reg
;
}
FT
x_reg
=
sh_x
[
m
-
1
][
tid_l
];
sh_x
[
m
-
1
][
tid_l
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
n
;
x_reg
=
sh_x
[
i
][
tid_l
]
-
dev_c_prime
[
i
]
*
x_reg
;
sh_x
[
i
][
tid_l
]
=
x_reg
;
}
// write patch of input data
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
x
[
start
+
m
*
m
*
i
]
=
sh_x
[
tid_l
][
i
];
}
return
;
FT
beta
=
alpha_23
;
// dev_c_prime - precalculated factors
// b - input
// x - output
//__shared__ FT sh_b[THREADS+1][THREADS];
//TODO: POZOR
//__shared__ FT sh_x[THREADS+1][THREADS];
// __shared__ FT sh_x[4][4];
//
// int tid_l = threadIdx.x;
// int bid = blockIdx.x;
// //int tid = blockIdx.x * blockDim.x + threadIdx.x;
// FT work_buffer_reg = 0.0;
//
// //if (bid != 0) return;
//
// // read first patch of input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + 0*THREADS ];
// }
//
//
// for (int l = 0; l < M/THREADS; l++) {
// #pragma unroll
// for (int i = 0; i < THREADS; ++i) {
// //FT b_l = sh_b[i][tid_l];
// FT b_l = sh_x[i][tid_l];
// FT c_prime = dev_c_prime[l*THREADS+i-1];
// work_buffer_reg = (b_l * beta + work_buffer_reg) / (FT(2) + alpha + c_prime);
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f - %.10f - %.10f \n",sh_b [i][tid_l],sh_b[i][tid_l+1],sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// // save temp res and get new input data
// if ( l < (M/THREADS - 1) ) {
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// //sh_b[tid_l][i] = b[ tid_l + m*(i+bid*THREADS) + (l+1)*THREADS ];
// }
// }
// }
//
// // backward
//// if (tid_l == 0) printf("\n");
//
// int level = M/THREADS - 1;
// #pragma unroll
// for (int i = THREADS-2; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[level*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
//
// }
//
// for (int l = level; l >= 1 ; l--) {
// // save temp res and get new input data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + l *THREADS ] = sh_x[tid_l][i];
// sh_x[tid_l][i] = x[ tid_l + m*(i+bid*THREADS) + (l-1)*THREADS ];
// }
//
// #pragma unroll
// for (int i = THREADS-1; i >= 0; --i) {
// FT x_l = sh_x[i][tid_l];
// work_buffer_reg = x_l - dev_c_prime[(l-1)*THREADS+i] * work_buffer_reg;
// sh_x[i][tid_l] = work_buffer_reg;
//// if (tid_l == 0) printf(" x = %.10f - %.10f\n",sh_x[i][tid_l],sh_x[i][tid_l+1]);
// }
// }
//
// // write last patch of output data
// #pragma unroll
// for (int i = 0; i < THREADS; i++) {
// x[ tid_l + m*(i+bid*THREADS) + 0*THREADS ] = sh_x[tid_l][i];
// }
}
template
<
class
FT
>
__global__
void
thomas_kernel3DT
(
int
const
m
,
int
n
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
,
FT
*
const
__restrict__
tmp_g
)
{
...
...
@@ -794,7 +941,7 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
__shared__
FT
SM
[
THREADS
/
T
][
T
][
T
];
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tmt
=
threadIdx
.
x
%
T
;
int
tmt
=
threadIdx
.
x
%
T
;
int
tdt
=
threadIdx
.
x
/
T
;
FT
x_reg
;
...
...
@@ -965,8 +1112,8 @@ public:
}
virtual
void
run
(
FT
const
*
const
b
,
FT
*
const
x
)
{
FT
block_count
=
divide_and_round_up
(
m
*
m
,
thomas_kernel_block_size
);
FT
threads_per_block
=
thomas_kernel_block_size
;
FT
block_count
=
m
;
//
divide_and_round_up(m*m, thomas_kernel_block_size);
FT
threads_per_block
=
m
;
//
thomas_kernel_block_size;
std
::
cout
<<
"Blocks = "
<<
block_count
<<
std
::
endl
;
std
::
cout
<<
"threads_per_block = "
<<
threads_per_block
<<
std
::
endl
;
...
...
@@ -987,6 +1134,7 @@ public:
std
::
cout
<<
std
::
endl
;
FT
*
xx
=
x
;
FT
*
bb
;
cudaMalloc
((
void
**
)
&
bb
,
m
*
m
*
m
*
sizeof
(
FT
));
FT
*
bbb
;
...
...
@@ -995,14 +1143,10 @@ public:
cudaMalloc
((
void
**
)
&
bbbb
,
m
*
m
*
m
*
sizeof
(
FT
));
// Ker 1 *************
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
thomas_kernel3D_X1
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
xx
);
//bb);
//cublas_transpose2(cublas_handle, m*m, m, bb, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
FT
sum
=
0.0
;
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
...
...
@@ -1014,12 +1158,9 @@ public:
}
std
::
cout
<<
std
::
endl
<<
std
::
endl
<<
sum
<<
std
::
endl
;
// Ker 2 *****************
cublas_transpose2
(
cublas_handle
,
m
,
m
*
m
,
b
,
bb
);
// cublas_transpose2(cublas_handle, m*m, m, bb, bbb);
// cublas_transpose2(cublas_handle, m*m, m, bbb, bbbb);
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
cudaMemcpy
(
h_x
,
bb
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
...
...
@@ -1029,17 +1170,40 @@ public:
}
std
::
cout
<<
std
::
endl
;
thomas_kernel3D_X2
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
bb
,
xx
);
//bb);
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
xx
,
bbb
);
FT
*
tmp_g
;
//cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
//thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
thomas_kernel3D_X2
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
bb
,
xx
);
//bb);
cudaMemcpy
(
h_x
,
bbb
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
sum
=
0.0
;
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
h_x
[
i
]
<<
"
\t
"
;
sum
+=
h_x
[
i
];
}
std
::
cout
<<
std
::
endl
<<
std
::
endl
<<
sum
<<
std
::
endl
;
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
xx
,
bbb
);
// Ker 3 *****************
cublas_transpose2
(
cublas_handle
,
m
,
m
*
m
,
b
,
bb
);
cublas_transpose2
(
cublas_handle
,
m
,
m
*
m
,
bb
,
bbb
);
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
cudaMemcpy
(
h_x
,
bbb
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
h_x
[
i
]
<<
"
\t
"
;
}
std
::
cout
<<
std
::
endl
;
thomas_kernel3D_XT
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
bbb
,
xx
);
//bb);
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
xx
,
bbb
);
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
bbb
,
xx
);
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
sum
=
0.0
;
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
...
...
@@ -1048,6 +1212,13 @@ public:
}
std
::
cout
<<
std
::
endl
<<
std
::
endl
<<
sum
<<
std
::
endl
;
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
...
...
test
View file @
b85269af
No preview for this file type
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment