Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lubomir Riha
ETH-ADI
Commits
ad78fcff
Commit
ad78fcff
authored
Jul 19, 2016
by
Lubomir Riha
Browse files
ENH: Second kernel for 3D Thomas works - tileed is missing
parent
4b935cca
Changes
2
Hide whitespace changes
Inline
Side-by-side
preconditioner.h
View file @
ad78fcff
...
...
@@ -688,7 +688,9 @@ public:
template
<
class
FT
>
__global__
void
thomas_kernel3D
(
int
const
m
,
int
n
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
>=
n
)
{
return
;
}
...
...
@@ -716,6 +718,72 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
}
template
<
class
FT
>
__global__
void
thomas_kernel3D_X1
(
int
const
m
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
int
n
=
m
*
m
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
>=
n
)
{
return
;
}
int
start
=
tid
;
FT
work_buffer_reg
=
b
[
start
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
x
[
start
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
int
j
=
start
+
i
*
n
;
work_buffer_reg
=
(
b
[
j
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
i
-
1
]);
x
[
j
]
=
work_buffer_reg
;
}
FT
x_reg
=
x
[
start
+
(
m
-
1
)
*
n
];
x
[
start
+
(
m
-
1
)
*
n
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
n
;
x_reg
=
x
[
j
]
-
dev_c_prime
[
i
]
*
x_reg
;
x
[
j
]
=
x_reg
;
}
}
template
<
class
FT
>
__global__
void
thomas_kernel3D_X2
(
int
const
m
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
int
n
=
m
*
m
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
tid
>=
n
)
{
return
;
}
int
start
=
(
n
)
*
(
tid
/
m
)
+
(
tid
%
m
);
// + (i * m)
FT
work_buffer_reg
=
b
[
start
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
x
[
start
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
int
j
=
start
+
i
*
m
;
work_buffer_reg
=
(
b
[
j
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
i
-
1
]);
x
[
j
]
=
work_buffer_reg
;
}
FT
x_reg
=
x
[
start
+
(
m
-
1
)
*
m
];
x
[
start
+
(
m
-
1
)
*
m
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
m
;
x_reg
=
x
[
j
]
-
dev_c_prime
[
i
]
*
x_reg
;
x
[
j
]
=
x_reg
;
}
}
template
<
class
FT
>
__global__
void
thomas_kernel3DT
(
int
const
m
,
int
n
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
,
FT
*
const
__restrict__
tmp_g
)
{
...
...
@@ -725,23 +793,23 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
__shared__
FT
SM
[
THREADS
/
T
][
T
][
T
];
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tmt
=
threadIdx
.
x
%
T
;
int
tdt
=
threadIdx
.
x
/
T
;
FT
x_reg
;
if
(
tid
>=
n
)
{
return
;
}
if
(
tid
>=
n
)
{
return
;
}
int
start
=
tid
;
int
start
=
tid
;
FT
work_buffer_reg
=
b
[
start
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
FT
work_buffer_reg
=
b
[
start
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
tmp_g
[
start
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
int
j
=
start
+
i
*
n
;
tmp_g
[
start
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
int
j
=
start
+
i
*
n
;
double
input
=
b
[
j
];
double
c_prime
=
dev_c_prime
[
i
-
1
];
...
...
@@ -763,15 +831,15 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
int
addr2
=
blockIdx
.
x
;
int
addr3
=
m
-
1
-
tmt
;
int
addr4
;
int
addr
=
(
addr2
*
m
*
m
)
+
addr3
;
for
(
int
i
=
0
;
i
<
T
;
i
++
)
{
x
[
addr
+
(
addr1
+
i
)
*
m
]
=
SM
[
tdt
][
i
][
tmt
];
}
for
(
int
T_loop
=
1
;
T_loop
<
m
/
T
;
T_loop
++
)
{
int
T_offset
=
T
*
T_loop
;
addr4
=
m
-
1
-
T_offset
;
addr3
=
addr4
-
tmt
;
...
...
@@ -788,15 +856,15 @@ __global__ void thomas_kernel3DT(int const m, int n, FT const alpha, FT const al
for
(
int
i
=
0
;
i
<
T
;
i
++
)
{
x
[
addr
+
(
addr1
+
i
)
*
m
]
=
SM
[
tdt
][
i
][
tmt
];
}
}
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
}
...
...
@@ -900,6 +968,9 @@ public:
FT
block_count
=
divide_and_round_up
(
m
*
m
,
thomas_kernel_block_size
);
FT
threads_per_block
=
thomas_kernel_block_size
;
std
::
cout
<<
"Blocks = "
<<
block_count
<<
std
::
endl
;
std
::
cout
<<
"threads_per_block = "
<<
threads_per_block
<<
std
::
endl
;
//int n = m*m*m;
// delete --------------
...
...
@@ -918,9 +989,17 @@ public:
FT
*
xx
=
x
;
FT
*
bb
;
cudaMalloc
((
void
**
)
&
bb
,
m
*
m
*
m
*
sizeof
(
FT
));
FT
*
bbb
;
cudaMalloc
((
void
**
)
&
bbb
,
m
*
m
*
m
*
sizeof
(
FT
));
FT
*
bbbb
;
cudaMalloc
((
void
**
)
&
bbbb
,
m
*
m
*
m
*
sizeof
(
FT
));
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
thomas_kernel3D
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
m
*
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
bb
);
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
bb
,
xx
);
thomas_kernel3D
_X1
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
xx
);
//
bb);
//
cublas_transpose2(cublas_handle, m*m, m, bb, xx);
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
...
...
@@ -937,8 +1016,12 @@ public:
cublas_transpose2
(
cublas_handle
,
m
,
m
*
m
,
b
,
bb
);
// cublas_transpose2(cublas_handle, m*m, m, bb, bbb);
// cublas_transpose2(cublas_handle, m*m, m, bbb, bbbb);
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
cudaMemcpy
(
h_x
,
bb
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
...
...
@@ -949,10 +1032,14 @@ public:
FT
*
tmp_g
;
cudaMalloc
((
void
**
)
&
tmp_g
,
m
*
m
*
m
*
sizeof
(
FT
));
thomas_kernel3DT
<
FT
><<<
m
,
m
>>>
(
m
,
m
*
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
xx
,
tmp_g
);
//cudaMalloc((void **) &tmp_g, m*m*m*sizeof(FT));
//thomas_kernel3DT<FT><<<m, m>>>(m, m*m, alpha, alpha_23, c_prime, b, xx, tmp_g);
thomas_kernel3D_X2
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
alpha
,
alpha_23
,
c_prime
,
bb
,
xx
);
//bb);
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
cublas_transpose2
(
cublas_handle
,
m
*
m
,
m
,
xx
,
bbb
);
cudaMemcpy
(
h_x
,
bbb
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
sum
=
0.0
;
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
...
...
test
View file @
ad78fcff
No preview for this file type
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment