mdlint

3fab3cd7 · Pavel Gajdušek · 6fc0e23d · 3fab3cd7 · 3fab3cd7
Commit 3fab3cd7 authored 7 years ago by Pavel Gajdušek
--- a/docs.it4i/salomon/software/intel-xeon-phi.md
+++ b/docs.it4i/salomon/software/intel-xeon-phi.md
@@ -193,79 +193,79 @@ One way of paralelization a code for Xeon Phi is using OpenMP directives. The fo
 ```console
 $ cat ./vect-add
-    #include <stdio.h>
+#include <stdio.h>
-    typedef int T;
+typedef int T;
-    #define SIZE 1000
+#define SIZE 1000
-    #pragma offload_attribute(push, target(mic))
+#pragma offload_attribute(push, target(mic))
-    T in1[SIZE];
+T in1[SIZE];
-    T in2[SIZE];
+T in2[SIZE];
-    T res[SIZE];
+T res[SIZE];
-    #pragma offload_attribute(pop)
+#pragma offload_attribute(pop)
-    // MIC function to add two vectors
+// MIC function to add two vectors
-    __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
+__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
-      int i = 0;
+  int i = 0;
-      #pragma omp parallel for
+  #pragma omp parallel for
-        for (i = 0; i < size; i++)
+    for (i = 0; i < size; i++)
-          c[i] = a[i] + b[i];
+      c[i] = a[i] + b[i];
-    }
+}
-    // CPU function to add two vectors
+// CPU function to add two vectors
-    void add_cpu (T *a, T *b, T *c, int size) {
+void add_cpu (T *a, T *b, T *c, int size) {
-      int i;
+  int i;
-      for (i = 0; i < size; i++)
+  for (i = 0; i < size; i++)
-        c[i] = a[i] + b[i];
+    c[i] = a[i] + b[i];
-    }
+}
-    // CPU function to generate a vector of random numbers
+// CPU function to generate a vector of random numbers
-    void random_T (T *a, int size) {
+void random_T (T *a, int size) {
-      int i;
+  int i;
-      for (i = 0; i < size; i++)
+  for (i = 0; i < size; i++)
-        a[i] = rand() % 10000; // random number between 0 and 9999
+    a[i] = rand() % 10000; // random number between 0 and 9999
-    }
+}
-    // CPU function to compare two vectors
+// CPU function to compare two vectors
-    int compare(T *a, T *b, T size ){
+int compare(T *a, T *b, T size ){
-      int pass = 0;
+  int pass = 0;
-      int i;
+  int i;
-      for (i = 0; i < size; i++){
+  for (i = 0; i < size; i++){
-        if (a[i] != b[i]) {
+    if (a[i] != b[i]) {
-          printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
+      printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
-          pass = 1;
+      pass = 1;
-        }
-      }
-      if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
-      return pass;
    }
+  }
+  if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
+  return pass;
+}
-    int main()
+int main()
-    {
+{
-      int i;
+  int i;
-      random_T(in1, SIZE);
+  random_T(in1, SIZE);
-      random_T(in2, SIZE);
+  random_T(in2, SIZE);
-      #pragma offload target(mic) in(in1,in2)  inout(res)
+  #pragma offload target(mic) in(in1,in2)  inout(res)
-      {
+  {
-        // Parallel loop from main function
+    // Parallel loop from main function
-        #pragma omp parallel for
+    #pragma omp parallel for
-        for (i=0; i<SIZE; i++)
+    for (i=0; i<SIZE; i++)
-          res[i] = in1[i] + in2[i];
+      res[i] = in1[i] + in2[i];
-        // or parallel loop is called inside the function
+    // or parallel loop is called inside the function
-        add_mic(in1, in2, res, SIZE);
+    add_mic(in1, in2, res, SIZE);
-      }
+  }
-      //Check the results with CPU implementation
+  //Check the results with CPU implementation
-      T res_cpu[SIZE];
+  T res_cpu[SIZE];
-      add_cpu(in1, in2, res_cpu, SIZE);
+  add_cpu(in1, in2, res_cpu, SIZE);
-      compare(res, res_cpu, SIZE);
+  compare(res, res_cpu, SIZE);
-    }
+}
 ```
 During compilation, the Intel compiler shows which loops have been vectorized on both the host and the accelerator. This can be enabled with the compiler option "-vec-report2". To compile and execute the code, run
@@ -566,29 +566,29 @@ $ g++ capsbasic.cpp -lOpenCL -o capsbasic -I/apps/intel/opencl/include/
 After executing the compiled binary file, the following output should be displayed.
 ```console
-    ./capsbasic
+./capsbasic
-    Number of available platforms: 1
+Number of available platforms: 1
-    Platform names:
+Platform names:
-        [0] Intel(R) OpenCL [Selected]
+    [0] Intel(R) OpenCL [Selected]
-    Number of devices available for each type:
+Number of devices available for each type:
-        CL_DEVICE_TYPE_CPU: 1
+    CL_DEVICE_TYPE_CPU: 1
-        CL_DEVICE_TYPE_GPU: 0
+    CL_DEVICE_TYPE_GPU: 0
-        CL_DEVICE_TYPE_ACCELERATOR: 1
+    CL_DEVICE_TYPE_ACCELERATOR: 1
-    ** Detailed information for each device ***
+** Detailed information for each device ***
-    CL_DEVICE_TYPE_CPU[0]
+CL_DEVICE_TYPE_CPU[0]
-        CL_DEVICE_NAME:        Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
+    CL_DEVICE_NAME:        Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
-        CL_DEVICE_AVAILABLE: 1
+    CL_DEVICE_AVAILABLE: 1
-    ...
+...
-    CL_DEVICE_TYPE_ACCELERATOR[0]
+CL_DEVICE_TYPE_ACCELERATOR[0]
-        CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
+    CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
-        CL_DEVICE_AVAILABLE: 1
+    CL_DEVICE_AVAILABLE: 1
-    ...
+...
 ```
 !!! note
@@ -612,23 +612,23 @@ $ g++ cmdoptions.cpp gemm.cpp ../common/basic.cpp ../common/cmdparser.cpp ../com
 To see the performance of the Intel Xeon Phi performing DGEMM, run the example as follows:
 ```console
-    ./gemm -d 1
+./gemm -d 1
-    Platforms (1):
+Platforms (1):
-     [0] Intel(R) OpenCL [Selected]
+ [0] Intel(R) OpenCL [Selected]
-    Devices (2):
+Devices (2):
-     [0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
+ [0] Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
-     [1] Intel(R) Many Integrated Core Acceleration Card [Selected]
+ [1] Intel(R) Many Integrated Core Acceleration Card [Selected]
-    Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8"
+Build program options: "-DT=float -DTILE_SIZE_M=1 -DTILE_GROUP_M=16 -DTILE_SIZE_N=128 -DTILE_GROUP_N=1 -DTILE_SIZE_K=8"
-    Running gemm_nn kernel with matrix size: 3968x3968
+Running gemm_nn kernel with matrix size: 3968x3968
-    Memory row stride to ensure necessary alignment: 15872 bytes
+Memory row stride to ensure necessary alignment: 15872 bytes
-    Size of memory region for one matrix: 62980096 bytes
+Size of memory region for one matrix: 62980096 bytes
-    Using alpha = 0.57599 and beta = 0.872412
+Using alpha = 0.57599 and beta = 0.872412
-    ...
+...
-    Host time: 0.292953 sec.
+Host time: 0.292953 sec.
-    Host perf: 426.635 GFLOPS
+Host perf: 426.635 GFLOPS
-    Host time: 0.293334 sec.
+Host time: 0.293334 sec.
-    Host perf: 426.081 GFLOPS
+Host perf: 426.081 GFLOPS
-    ...
+...
 ```
 !!! hint
@@ -685,28 +685,28 @@ $ mpiifort -mmic -o mpi-test-mic mpi-test.f90
 A basic MPI version of the "hello-world" example in C, which can be executed on both the host and the Xeon Phi, is shown below (it can be copied and pasted directly into a .c file)
 ```cpp
-    #include <stdio.h>
+#include <stdio.h>
-    #include <mpi.h>
+#include <mpi.h>
-    int main (argc, argv)
+int main (argc, argv)
-         int argc;
+     int argc;
-         char *argv[];
+     char *argv[];
-    {
+{
-      int rank, size;
+  int rank, size;
-      int len;
+  int len;
-      char node[MPI_MAX_PROCESSOR_NAME];
+  char node[MPI_MAX_PROCESSOR_NAME];
-      MPI_Init (&argc, &argv);      /* starts MPI */
+  MPI_Init (&argc, &argv);      /* starts MPI */
-      MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
+  MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
-      MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
+  MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
-      MPI_Get_processor_name(node,&len);
+  MPI_Get_processor_name(node,&len);
-      printf( "Hello world from process %d of %d on host %s n", rank, size, node );
+  printf( "Hello world from process %d of %d on host %s n", rank, size, node );
-      MPI_Finalize();
+  MPI_Finalize();
-      return 0;
+  return 0;
-    }
+}
 ```
 ### MPI Programming Models

--- a/docs.it4i/salomon/software/paraview.md
+++ b/docs.it4i/salomon/software/paraview.md
@@ -56,7 +56,7 @@ Because a direct connection is not allowed to compute nodes on Salomon, you must
 $ ssh -TN -L 12345:r37u29n1006:11111 username@salomon.it4i.cz
 ```
-replace username with your login and r37u29n1006 with the name of compute node your ParaView server is running on (see previous step). 
+replace username with your login and r37u29n1006 with the name of compute node your ParaView server is running on (see previous step).
 If you use PuTTY on Windows, load the Salomon connection configuration, then go to *Connection* -> *SSH* -> *Tunnels* to set up the port forwarding.