Commit 3fab3cd7 authored by Pavel Gajdušek's avatar Pavel Gajdušek
Browse files

mdlint

parent 6fc0e23d
......@@ -193,79 +193,79 @@ One way of paralelization a code for Xeon Phi is using OpenMP directives. The fo
```console
$ cat ./vect-add
#include <stdio.h>
#include <stdio.h>
typedef int T;
typedef int T;
#define SIZE 1000
#define SIZE 1000
#pragma offload_attribute(push, target(mic))
T in1[SIZE];
T in2[SIZE];
T res[SIZE];
#pragma offload_attribute(pop)
#pragma offload_attribute(push, target(mic))
T in1[SIZE];
T in2[SIZE];
T res[SIZE];
#pragma offload_attribute(pop)
// MIC function to add two vectors
__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
int i = 0;
#pragma omp parallel for
for (i = 0; i < size; i++)
c[i] = a[i] + b[i];
}
// MIC function to add two vectors
__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
int i = 0;
#pragma omp parallel for
for (i = 0; i < size; i++)
c[i] = a[i] + b[i];
}
// CPU function to add two vectors
void add_cpu (T *a, T *b, T *c, int size) {
int i;
for (i = 0; i < size; i++)
c[i] = a[i] + b[i];
}
// CPU function to add two vectors
void add_cpu (T *a, T *b, T *c, int size) {
int i;
for (i = 0; i < size; i++)
c[i] = a[i] + b[i];
}
// CPU function to generate a vector of random numbers
void random_T (T *a, int size) {
int i;
for (i = 0; i < size; i++)
a[i] = rand() % 10000; // random number between 0 and 9999
}
// CPU function to generate a vector of random numbers
void random_T (T *a, int size) {
int i;
for (i = 0; i < size; i++)
a[i] = rand() % 10000; // random number between 0 and 9999
}
// CPU function to compare two vectors
int compare(T *a, T *b, T size ){
int pass = 0;
int i;
for (i = 0; i < size; i++){
if (a[i] != b[i]) {
printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
pass = 1;
}
}
if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
return pass;
// CPU function to compare two vectors
int compare(T *a, T *b, T size ){
int pass = 0;
int i;
for (i = 0; i < size; i++){
if (a[i] != b[i]) {
printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
pass = 1;
}
}
if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
return pass;
}
int main()
{
int i;
random_T(in1, SIZE);
random_T(in2, SIZE);
int main()
{
int i;
random_T(in1, SIZE);
random_T(in2, SIZE);
#pragma offload target(mic) in(in1,in2) inout(res)
{
#pragma offload target(mic) in(in1,in2) inout(res)
{
// Parallel loop from main function
#pragma omp parallel for
for (i=0; i<SIZE; i++)
res[i] = in1[i] + in2[i];
// Parallel loop from main function
#pragma omp parallel for
for (i=0; i<SIZE; i++)
res[i] = in1[i] + in2[i];
// or parallel loop is called inside the function
add_mic(in1, in2, res, SIZE);
// or parallel loop is called inside the function
add_mic(in1, in2, res, SIZE);
}
}
//Check the results with CPU implementation
T res_cpu[SIZE];
add_cpu(in1, in2, res_cpu, SIZE);
compare(res, res_cpu, SIZE);
//Check the results with CPU implementation
T res_cpu[SIZE];
add_cpu(in1, in2, res_cpu, SIZE);
compare(res, res_cpu, SIZE);
}
}
```
During the compilation Intel compiler shows which loops have been vectorized in both host and accelerator. This can be enabled with compiler option "-vec-report2". To compile and execute the code run
......@@ -566,29 +566,29 @@ $ g++ capsbasic.cpp -lOpenCL -o capsbasic -I/apps/intel/opencl/include/
After executing the complied binary file, following output should be displayed.
```console
./capsbasic
./capsbasic
Number of available platforms: 1
Platform names:
[0] Intel(R) OpenCL [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 1
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 1
Number of available platforms: 1
Platform names:
[0] Intel(R) OpenCL [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 1
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 1
** Detailed information for each device ***
** Detailed information for each device ***
CL_DEVICE_TYPE_CPU[0]
CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
CL_DEVICE_AVAILABLE: 1
CL_DEVICE_TYPE_CPU[0]
CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
CL_DEVICE_AVAILABLE: 1
...
...
CL_DEVICE_TYPE_ACCELERATOR[0]
CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
CL_DEVICE_AVAILABLE: 1
CL_DEVICE_TYPE_ACCELERATOR[0]
CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
CL_DEVICE_AVAILABLE: 1
...
...
```
!!! note
......@@ -612,23 +612,23 @@ $ g++ cmdoptions.cpp gemm.cpp ../common/basic.cpp ../common/cmdparser.cpp ../com
To see the performance of Intel Xeon Phi performing the DGEMM run the example as follows:
```console
./gemm -d 1
Platforms (1):
[0] Intel(R) OpenCL [Selected]
Devices (2):
[0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
[1] Intel(R) Many Integrated Core Acceleration Card [Selected]
Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8"
Running gemm_nn kernel with matrix size: 3968x3968
Memory row stride to ensure necessary alignment: 15872 bytes
Size of memory region for one matrix: 62980096 bytes
Using alpha = 0.57599 and beta = 0.872412
...
Host time: 0.292953 sec.
Host perf: 426.635 GFLOPS
Host time: 0.293334 sec.
Host perf: 426.081 GFLOPS
...
./gemm -d 1
Platforms (1):
[0] Intel(R) OpenCL [Selected]
Devices (2):
[0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
[1] Intel(R) Many Integrated Core Acceleration Card [Selected]
Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8"
Running gemm_nn kernel with matrix size: 3968x3968
Memory row stride to ensure necessary alignment: 15872 bytes
Size of memory region for one matrix: 62980096 bytes
Using alpha = 0.57599 and beta = 0.872412
...
Host time: 0.292953 sec.
Host perf: 426.635 GFLOPS
Host time: 0.293334 sec.
Host perf: 426.081 GFLOPS
...
```
!!! hint
......@@ -685,28 +685,28 @@ $ mpiifort -mmic -o mpi-test-mic mpi-test.f90
An example of basic MPI version of "hello-world" example in C language, that can be executed on both host and Xeon Phi is (can be directly copy and pasted to a .c file)
```cpp
#include <stdio.h>
#include <mpi.h>
#include <stdio.h>
#include <mpi.h>
int main (argc, argv)
int argc;
char *argv[];
{
int rank, size;
int main (argc, argv)
int argc;
char *argv[];
{
int rank, size;
int len;
char node[MPI_MAX_PROCESSOR_NAME];
int len;
char node[MPI_MAX_PROCESSOR_NAME];
MPI_Init (&argc, &argv); /* starts MPI */
MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */
MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */
MPI_Init (&argc, &argv); /* starts MPI */
MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */
MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */
MPI_Get_processor_name(node,&len);
MPI_Get_processor_name(node,&len);
printf( "Hello world from process %d of %d on host %s n", rank, size, node );
MPI_Finalize();
return 0;
}
printf( "Hello world from process %d of %d on host %s n", rank, size, node );
MPI_Finalize();
return 0;
}
```
### MPI Programming Models
......
......@@ -56,7 +56,7 @@ Because a direct connection is not allowed to compute nodes on Salomon, you must
$ ssh -TN -L 12345:r37u29n1006:11111 username@salomon.it4i.cz
```
replace username with your login and r37u29n1006 with the name of compute node your ParaView server is running on (see previous step).
replace username with your login and r37u29n1006 with the name of compute node your ParaView server is running on (see previous step).
If you use PuTTY on Windows, load Salomon connection configuration, then go to *Connection* -> *SSH* -> *Tunnels* to set up the port forwarding.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment