Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Lubomir Riha
ETH-ADI
Commits
1baa3ecd
Commit
1baa3ecd
authored
Jul 18, 2016
by
Lubomir Riha
Browse files
Anselm - ne beginnig
parent
93044da2
Changes
3
Hide whitespace changes
Inline
Side-by-side
preconditioner.h
View file @
1baa3ecd
...
...
@@ -716,31 +716,88 @@ __global__ void thomas_kernel3D(int const m, int n, FT const alpha, FT const alp
template
<
class
FT
>
__global__
void
thomas_kernel3D
(
int
const
m
,
int
n
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
__global__
void
thomas_kernel3DT
(
int
const
m
,
int
n
,
FT
const
alpha
,
FT
const
alpha_23
,
FT
const
*
const
__restrict__
dev_c_prime
,
FT
const
*
const
__restrict__
b
,
FT
*
const
__restrict__
x
,
FT
*
const
__restrict__
tmp_g
)
{
#define T W
//#define THREADS 4
//#define BLOCKS 4
__shared__
FT
SM
[
THREADS
/
T
][
T
][
T
];
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
tmt
=
threadIdx
.
x
%
T
;
int
tdt
=
threadIdx
.
x
/
T
;
FT
x_reg
;
if
(
tid
>=
n
)
{
return
;
}
int
start
=
tid
;
FT
work_buffer_reg
=
b
[
start
]
*
alpha_23
/
(
FT
(
2
)
+
alpha
);
x
[
start
]
=
work_buffer_reg
;
tmp_g
[
start
]
=
work_buffer_reg
;
for
(
int
i
=
1
;
i
<
m
;
++
i
)
{
int
j
=
start
+
i
*
n
;
work_buffer_reg
=
(
b
[
j
]
*
alpha_23
+
work_buffer_reg
)
/
(
FT
(
2
)
+
alpha
+
dev_c_prime
[
i
-
1
]);
x
[
j
]
=
work_buffer_reg
;
}
int
j
=
start
+
i
*
n
;
double
input
=
b
[
j
];
double
c_prime
=
dev_c_prime
[
i
-
1
];
work_buffer_reg
=
(
input
*
alpha_23
+
work_buffer_reg
)
/
(
double
(
2
)
+
alpha
+
c_prime
);
tmp_g
[
j
]
=
work_buffer_reg
;
}
x_reg
=
tmp_g
[
start
+
(
m
-
1
)
*
n
];
SM
[
tdt
][
tmt
][
0
]
=
x_reg
;
for
(
int
i
=
1
;
i
<
T
;
i
++
)
{
int
j
=
start
+
(
m
-
1
-
i
)
*
n
;
x_reg
=
tmp_g
[
j
]
-
dev_c_prime
[
(
m
-
1
-
i
)
]
*
x_reg
;
SM
[
tdt
][
tmt
][
i
]
=
x_reg
;
}
int
addr1
=
T
*
tdt
;
int
addr2
=
blockIdx
.
x
;
int
addr3
=
m
-
1
-
tmt
;
int
addr4
;
int
addr
=
(
addr2
*
m
*
m
)
+
addr3
;
for
(
int
i
=
0
;
i
<
T
;
i
++
)
{
x
[
addr
+
(
addr1
+
i
)
*
m
]
=
SM
[
tdt
][
i
][
tmt
];
}
for
(
int
T_loop
=
1
;
T_loop
<
m
/
T
;
T_loop
++
)
{
int
T_offset
=
T
*
T_loop
;
addr4
=
m
-
1
-
T_offset
;
addr3
=
addr4
-
tmt
;
addr
=
(
addr2
*
m
*
m
)
+
addr3
;
for
(
int
i
=
0
;
i
<
T
;
i
++
)
{
int
g_addr
=
addr4
-
i
;
int
j
=
start
+
n
*
g_addr
;
x_reg
=
tmp_g
[
j
]
-
dev_c_prime
[
g_addr
]
*
x_reg
;
SM
[
tdt
][
tmt
][
i
]
=
x_reg
;
}
for
(
int
i
=
0
;
i
<
T
;
i
++
)
{
x
[
addr
+
(
addr1
+
i
)
*
m
]
=
SM
[
tdt
][
i
][
tmt
];
}
}
// int addr3D(int s, int r, int c, int m){ //[system, row, col_in_system = coallesced]
// return s*m + r*m*m + c;
// }
FT
x_reg
=
x
[
start
+
(
m
-
1
)
*
n
];
x
[
start
+
(
m
-
1
)
*
n
]
=
x_reg
;
for
(
int
i
=
m
-
2
;
i
>=
0
;
--
i
)
{
int
j
=
start
+
i
*
n
;
x_reg
=
x
[
j
]
-
dev_c_prime
[
i
]
*
x_reg
;
x
[
j
]
=
x_reg
;
}
}
...
...
@@ -851,8 +908,8 @@ public:
cudaMemcpy
(
h_x
,
b
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
h_x
[
i
]
<<
"
\t
"
;
//
if (i % (m*m) == 0) std::cout << std::endl;
//
std::cout << h_x[i] << "\t";
}
std
::
cout
<<
std
::
endl
;
...
...
@@ -866,29 +923,46 @@ public:
//cublas_transpose2(cublas_handle, m*m, m, xx, bb);
//xx = bb;
//FT *h_x;
//h_x = (FT * ) malloc ( (m*m*m)*sizeof(FT) );
FT
sum
=
0.0
;
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum
+=
h_x
[
i
];
}
std
::
cout
<<
sum
<<
std
::
endl
;
device_memset
<
FT
>
(
xx
,
FT
(
0
),
m
*
m
*
m
);
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
h_x
[
i
]
<<
"
\t
"
;
//
if (i % (m*m) == 0) std::cout << std::endl;
//
std::cout << h_x[i] << "\t";
}
std
::
cout
<<
std
::
endl
;
thomas_kernel3DT
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
m
*
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
xx
);
FT
*
tmp_g
;
cudaMalloc
((
void
**
)
&
tmp_g
,
m
*
m
*
m
*
sizeof
(
FT
));
thomas_kernel3DT
<
FT
><<<
m
,
m
>>>
(
m
,
m
*
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
xx
,
tmp_g
);
cudaMemcpy
(
h_x
,
xx
,
(
m
*
m
*
m
)
*
sizeof
(
FT
)
,
cudaMemcpyDeviceToHost
);
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
if
(
i
%
(
m
*
m
)
==
0
)
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
h_x
[
i
]
<<
"
\t
"
;
}
std
::
cout
<<
std
::
endl
;
sum
=
0.0
;
for
(
int
i
=
0
;
i
<
m
*
m
*
m
;
i
++
)
{
//if (i % (m*m) == 0) std::cout << std::endl;
//std::cout << h_x[i] << "\t";
sum
+=
h_x
[
i
];
}
std
::
cout
<<
sum
<<
std
::
endl
;
//device_memset<FT>(x, FT(0), m*m);
// delete --------------
return
;
FT
*
y
=
x
;
thomas_kernel3D
<
FT
><<<
block_count
,
threads_per_block
>>>
(
m
,
m
*
m
,
alpha
,
alpha_23
,
c_prime
,
b
,
y
);
...
...
test
View file @
1baa3ecd
No preview for this file type
test.cu
View file @
1baa3ecd
...
...
@@ -184,7 +184,7 @@ int main (int argc, char *argv[])
int
num_iter
;
double
*
b_3d
,
*
x_3d
;
int
m_3d
=
6
;
int
m_3d
=
M
;
double
*
h_x_3d
;
h_x_3d
=
(
double
*
)
malloc
(
(
m_3d
*
m_3d
*
m_3d
)
*
sizeof
(
double
)
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment