Commit 3fab3cd7 authored by Pavel Gajdušek's avatar Pavel Gajdušek

mdlint

parent 6fc0e23d
@@ -193,79 +193,79 @@ One way of parallelizing a code for Xeon Phi is using OpenMP directives.
```console
$ cat ./vect-add
#include <stdio.h>
#include <stdlib.h>

typedef int T;

#define SIZE 1000

#pragma offload_attribute(push, target(mic))
T in1[SIZE];
T in2[SIZE];
T res[SIZE];
#pragma offload_attribute(pop)

// MIC function to add two vectors
__attribute__((target(mic))) void add_mic(T *a, T *b, T *c, int size) {
  int i = 0;
  #pragma omp parallel for
  for (i = 0; i < size; i++)
    c[i] = a[i] + b[i];
}

// CPU function to add two vectors
void add_cpu(T *a, T *b, T *c, int size) {
  int i;
  for (i = 0; i < size; i++)
    c[i] = a[i] + b[i];
}

// CPU function to generate a vector of random numbers
void random_T(T *a, int size) {
  int i;
  for (i = 0; i < size; i++)
    a[i] = rand() % 10000; // random number between 0 and 9999
}

// CPU function to compare two vectors
int compare(T *a, T *b, int size) {
  int pass = 0;
  int i;
  for (i = 0; i < size; i++) {
    if (a[i] != b[i]) {
      printf("Value mismatch at location %d, values %d and %d\n", i, a[i], b[i]);
      pass = 1;
    }
  }
  if (pass == 0) printf("Test passed\n"); else printf("Test Failed\n");
  return pass;
}

int main()
{
  int i;

  random_T(in1, SIZE);
  random_T(in2, SIZE);

  #pragma offload target(mic) in(in1,in2) inout(res)
  {
    // Parallel loop from main function
    #pragma omp parallel for
    for (i = 0; i < SIZE; i++)
      res[i] = in1[i] + in2[i];

    // or the parallel loop can be called inside a function
    add_mic(in1, in2, res, SIZE);
  }

  // Check the results against the CPU implementation
  T res_cpu[SIZE];
  add_cpu(in1, in2, res_cpu, SIZE);
  compare(res, res_cpu, SIZE);
  return 0;
}
```
During compilation, the Intel compiler reports which loops have been vectorized on both the host and the accelerator; this report is enabled with the compiler option `-vec-report2`. To compile and execute the code, run:
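A typical invocation might look like the following (a sketch: the source file name `vect-add.c` and the `-openmp` flag are assumptions for the offload-capable `icc` of this era; `-vec-report2` enables the vectorization report mentioned above):

```console
$ icc -openmp -vec-report2 vect-add.c -o vect-add
$ ./vect-add
```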
@@ -566,29 +566,29 @@ $ g++ capsbasic.cpp -lOpenCL -o capsbasic -I/apps/intel/opencl/include/
After executing the compiled binary, the following output should be displayed:
```console
$ ./capsbasic
Number of available platforms: 1
Platform names:
    [0] Intel(R) OpenCL [Selected]
Number of devices available for each type:
    CL_DEVICE_TYPE_CPU: 1
    CL_DEVICE_TYPE_GPU: 0
    CL_DEVICE_TYPE_ACCELERATOR: 1

** Detailed information for each device ***

CL_DEVICE_TYPE_CPU[0]
    CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
    CL_DEVICE_AVAILABLE: 1
...
CL_DEVICE_TYPE_ACCELERATOR[0]
    CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
    CL_DEVICE_AVAILABLE: 1
...
```
!!! note
@@ -612,23 +612,23 @@ $ g++ cmdoptions.cpp gemm.cpp ../common/basic.cpp ../common/cmdparser.cpp ../com
To see the performance of the Intel Xeon Phi when performing the GEMM, run the example as follows:
```console
$ ./gemm -d 1
Platforms (1):
    [0] Intel(R) OpenCL [Selected]
Devices (2):
    [0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
    [1] Intel(R) Many Integrated Core Acceleration Card [Selected]
Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8"
Running gemm_nn kernel with matrix size: 3968x3968
Memory row stride to ensure necessary alignment: 15872 bytes
Size of memory region for one matrix: 62980096 bytes
Using alpha = 0.57599 and beta = 0.872412
...
Host time: 0.292953 sec.
Host perf: 426.635 GFLOPS
Host time: 0.293334 sec.
Host perf: 426.081 GFLOPS
...
```
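For reference, the reported figure is consistent with the standard 2·N³ operation count for a matrix-matrix multiply: 2 × 3968³ ≈ 1.25 × 10¹¹ floating-point operations, divided by the measured host time of 0.292953 s, gives roughly 4.27 × 10¹¹ FLOPS, matching the ≈427 GFLOPS printed above.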
!!! hint
@@ -685,28 +685,28 @@ $ mpiifort -mmic -o mpi-test-mic mpi-test.f90
A basic MPI "hello world" example in C, which can be executed on both the host and the Xeon Phi, follows; it can be copied and pasted directly into a .c file:
```cpp
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
  int rank, size;

  int len;
  char node[MPI_MAX_PROCESSOR_NAME];

  MPI_Init(&argc, &argv);                /* starts MPI */
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);  /* get current process id */
  MPI_Comm_size(MPI_COMM_WORLD, &size);  /* get number of processes */
  MPI_Get_processor_name(node, &len);

  printf("Hello world from process %d of %d on host %s\n", rank, size, node);

  MPI_Finalize();
  return 0;
}
```
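By analogy with the `mpiifort` command shown in the hunk header above for the Fortran version, the C source would be cross-compiled for the coprocessor with the Intel MPI C wrapper (a sketch; the file name `mpi-test.c` is an assumption):

```console
$ mpiicc -mmic -o mpi-test-mic mpi-test.c
```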
### MPI Programming Models
@@ -56,7 +56,7 @@ Because a direct connection is not allowed to compute nodes on Salomon, you must
```console
$ ssh -TN -L 12345:r37u29n1006:11111 username@salomon.it4i.cz
```
Replace `username` with your login and `r37u29n1006` with the name of the compute node your ParaView server is running on (see the previous step).
If you use PuTTY on Windows, load the Salomon connection configuration, then go to *Connection* -> *SSH* -> *Tunnels* to set up the port forwarding.
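The same tunnel can also be created from the Windows command line with PuTTY's companion tool `plink` (a sketch, assuming `plink.exe` is on the PATH; substitute your login and compute node as above):

```console
plink -N -L 12345:r37u29n1006:11111 username@salomon.it4i.cz
```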