From 5c20d591a4316b5befad14ff3a2a2bde6bf480c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Krup=C4=8D=C3=ADk?= <lukas.krupcik@vsb.cz>
Date: Fri, 20 Jan 2017 09:38:59 +0100
Subject: [PATCH] test

---
 .../software/intel-xeon-phi.md                | 330 +++++++++---------
 1 file changed, 165 insertions(+), 165 deletions(-)

diff --git a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md
index 8ea1ba38a..80e76efec 100644
--- a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md
+++ b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md
@@ -19,7 +19,7 @@ To set up the environment module "Intel" has to be loaded
     $ module load intel/13.5.192
 ```
 
-Information about the hardware can be obtained by running the micinfo program on the host.
+Information about the hardware can be obtained by running the micinfo program on the host.
 
 ```bash
     $ /usr/bin/micinfo
@@ -32,61 +32,61 @@ The output of the "micinfo" utility executed on one of the Anselm node is as fol
 
     Created Mon Jul 22 00:23:50 2013
 
-            System Info
-                    HOST OS                 : Linux
-                    OS Version              : 2.6.32-279.5.2.bl6.Bull.33.x86_64
-                    Driver Version          : 6720-15
-                    MPSS Version            : 2.1.6720-15
-                    Host Physical Memory    : 98843 MB
+            System Info
+                    HOST OS                 : Linux
+                    OS Version              : 2.6.32-279.5.2.bl6.Bull.33.x86_64
+                    Driver Version          : 6720-15
+                    MPSS Version            : 2.1.6720-15
+                    Host Physical Memory    : 98843 MB
 
     Device No: 0, Device Name: mic0
 
-            Version
-                    Flash Version            : 2.1.03.0386
-                    SMC Firmware Version     : 1.15.4830
-                    SMC Boot Loader Version  : 1.8.4326
-                    uOS Version              : 2.6.38.8-g2593b11
-                    Device Serial Number     : ADKC30102482
-
-            Board
-                    Vendor ID                : 0x8086
-                    Device ID                : 0x2250
-                    Subsystem ID             : 0x2500
-                    Coprocessor Stepping ID  : 3
-                    PCIe Width               : x16
-                    PCIe Speed               : 5 GT/s
-                    PCIe Max payload size    : 256 bytes
-                    PCIe Max read req size   : 512 bytes
-                    Coprocessor Model        : 0x01
-                    Coprocessor Model Ext    : 0x00
-                    Coprocessor Type         : 0x00
-                    Coprocessor Family       : 0x0b
-                    Coprocessor Family Ext   : 0x00
-                    Coprocessor Stepping     : B1
-                    Board SKU                : B1PRQ-5110P/5120D
-                    ECC Mode                 : Enabled
-                    SMC HW Revision          : Product 225W Passive CS
-
-            Cores
-                    Total No of Active Cores : 60
-                    Voltage                  : 1032000 uV
-                    Frequency                : 1052631 kHz
-
-            Thermal
-                    Fan Speed Control        : N/A
-                    Fan RPM                  : N/A
-                    Fan PWM                  : N/A
-                    Die Temp                 : 49 C
-
-            GDDR
-                    GDDR Vendor              : Elpida
-                    GDDR Version             : 0x1
-                    GDDR Density             : 2048 Mb
-                    GDDR Size                : 7936 MB
-                    GDDR Technology          : GDDR5
-                    GDDR Speed               : 5.000000 GT/s
-                    GDDR Frequency           : 2500000 kHz
-                    GDDR Voltage             : 1501000 uV
+            Version
+                    Flash Version            : 2.1.03.0386
+                    SMC Firmware Version     : 1.15.4830
+                    SMC Boot Loader Version  : 1.8.4326
+                    uOS Version              : 2.6.38.8-g2593b11
+                    Device Serial Number     : ADKC30102482
+
+            Board
+                    Vendor ID                : 0x8086
+                    Device ID                : 0x2250
+                    Subsystem ID             : 0x2500
+                    Coprocessor Stepping ID  : 3
+                    PCIe Width               : x16
+                    PCIe Speed               : 5 GT/s
+                    PCIe Max payload size    : 256 bytes
+                    PCIe Max read req size   : 512 bytes
+                    Coprocessor Model        : 0x01
+                    Coprocessor Model Ext    : 0x00
+                    Coprocessor Type         : 0x00
+                    Coprocessor Family       : 0x0b
+                    Coprocessor Family Ext   : 0x00
+                    Coprocessor Stepping     : B1
+                    Board SKU                : B1PRQ-5110P/5120D
+                    ECC Mode                 : Enabled
+                    SMC HW Revision          : Product 225W Passive CS
+
+            Cores
+                    Total No of Active Cores : 60
+                    Voltage                  : 1032000 uV
+                    Frequency                : 1052631 kHz
+
+            Thermal
+                    Fan Speed Control        : N/A
+                    Fan RPM                  : N/A
+                    Fan PWM                  : N/A
+                    Die Temp                 : 49 C
+
+            GDDR
+                    GDDR Vendor              : Elpida
+                    GDDR Version             : 0x1
+                    GDDR Density             : 2048 Mb
+                    GDDR Size                : 7936 MB
+                    GDDR Technology          : GDDR5
+                    GDDR Speed               : 5.000000 GT/s
+                    GDDR Frequency           : 2500000 kHz
+                    GDDR Voltage             : 1501000 uV
 ```
 
 Offload Mode
@@ -113,16 +113,16 @@ A very basic example of code that employs offload programming technique is shown
 
     int main(int argc, char* argv[])
     {
-        const int niter = 100000;
-        double result = 0;
-
-     #pragma offload target(mic)
-        for (int i = 0; i < niter; ++i) {
-            const double t = (i + 0.5) / niter;
-            result += 4.0 / (t * t + 1.0);
-        }
-        result /= niter;
-        std::cout << "Pi ~ " << result << 'n';
+        const int niter = 100000;
+        double result = 0;
+
+        #pragma offload target(mic)
+        for (int i = 0; i < niter; ++i) {
+            const double t = (i + 0.5) / niter;
+            result += 4.0 / (t * t + 1.0);
+        }
+        result /= niter;
+        std::cout << "Pi ~ " << result << '\n';
     }
 ```
 
@@ -159,63 +159,63 @@ One way of paralelization a code for Xeon Phi is using OpenMP directives. The fo
 
     // MIC function to add two vectors
     __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
-      int i = 0;
-      #pragma omp parallel for
-        for (i = 0; i < size; i++)
-          c[i] = a[i] + b[i];
+      int i = 0;
+      #pragma omp parallel for
+        for (i = 0; i < size; i++)
+          c[i] = a[i] + b[i];
     }
 
     // CPU function to add two vectors
     void add_cpu (T *a, T *b, T *c, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        c[i] = a[i] + b[i];
+      int i;
+      for (i = 0; i < size; i++)
+        c[i] = a[i] + b[i];
     }
 
     // CPU function to generate a vector of random numbers
     void random_T (T *a, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        a[i] = rand() % 10000; // random number between 0 and 9999
+      int i;
+      for (i = 0; i < size; i++)
+        a[i] = rand() % 10000; // random number between 0 and 9999
     }
 
     // CPU function to compare two vectors
     int compare(T *a, T *b, T size ){
-      int pass = 0;
-      int i;
-      for (i = 0; i < size; i++){
-        if (a[i] != b[i]) {
-          printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
-          pass = 1;
-        }
-      }
-      if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
-      return pass;
+      int pass = 0;
+      int i;
+      for (i = 0; i < size; i++){
+        if (a[i] != b[i]) {
+          printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
+          pass = 1;
+        }
+      }
+      if (pass == 0) printf ("Test passed\n"); else printf ("Test Failed\n");
+      return pass;
     }
 
     int main()
     {
-      int i;
-      random_T(in1, SIZE);
-      random_T(in2, SIZE);
+      int i;
+      random_T(in1, SIZE);
+      random_T(in2, SIZE);
 
-      #pragma offload target(mic) in(in1,in2)  inout(res)
-      {
+      #pragma offload target(mic) in(in1,in2)  inout(res)
+      {
 
-        // Parallel loop from main function
-        #pragma omp parallel for
-        for (i=0; i<SIZE; i++)
-          res[i] = in1[i] + in2[i];
+        // Parallel loop from main function
+        #pragma omp parallel for
+        for (i=0; i<SIZE; i++)
+          res[i] = in1[i] + in2[i];
 
-        // or parallel loop is called inside the function
-        add_mic(in1, in2, res, SIZE);
+        // or parallel loop is called inside the function
+        add_mic(in1, in2, res, SIZE);
 
-      }
+      }
 
-      //Check the results with CPU implementation
-      T res_cpu[SIZE];
-      add_cpu(in1, in2, res_cpu, SIZE);
-      compare(res, res_cpu, SIZE);
+      //Check the results with CPU implementation
+      T res_cpu[SIZE];
+      add_cpu(in1, in2, res_cpu, SIZE);
+      compare(res, res_cpu, SIZE);
 
     }
 ```
@@ -282,48 +282,48 @@ Following example show how to automatically offload an SGEMM (single precision -
 
     int main(int argc, char **argv)
     {
-            float *A, *B, *C; /* Matrices */
+            float *A, *B, *C; /* Matrices */
 
-            MKL_INT N = 2560; /* Matrix dimensions */
-            MKL_INT LD = N; /* Leading dimension */
-            int matrix_bytes; /* Matrix size in bytes */
-            int matrix_elements; /* Matrix size in elements */
+            MKL_INT N = 2560; /* Matrix dimensions */
+            MKL_INT LD = N; /* Leading dimension */
+            int matrix_bytes; /* Matrix size in bytes */
+            int matrix_elements; /* Matrix size in elements */
 
-            float alpha = 1.0, beta = 1.0; /* Scaling factors */
-            char transa = 'N', transb = 'N'; /* Transposition options */
+            float alpha = 1.0, beta = 1.0; /* Scaling factors */
+            char transa = 'N', transb = 'N'; /* Transposition options */
 
-            int i, j; /* Counters */
+            int i, j; /* Counters */
 
-            matrix_elements = N * N;
-            matrix_bytes = sizeof(float) * matrix_elements;
+            matrix_elements = N * N;
+            matrix_bytes = sizeof(float) * matrix_elements;
 
-            /* Allocate the matrices */
-            A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+            /* Allocate the matrices */
+            A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
 
-            /* Initialize the matrices */
-            for (i = 0; i < matrix_elements; i++) {
-                    A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-            }
+            /* Initialize the matrices */
+            for (i = 0; i < matrix_elements; i++) {
+                    A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+            }
 
-            printf("Computing SGEMM on the hostn");
-            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+            printf("Computing SGEMM on the hostn");
+            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-            printf("Enabling Automatic Offloadn");
-            /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-            mkl_mic_enable();
+            printf("Enabling Automatic Offloadn");
+            /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+            mkl_mic_enable();
 
-            int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-            printf("Automatic Offload enabled: %d MIC devices presentn",   ndevices);
+            int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+            printf("Automatic Offload enabled: %d MIC devices presentn",   ndevices);
 
-            printf("Computing SGEMM with automatic workdivisionn");
-            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+            printf("Computing SGEMM with automatic workdivisionn");
+            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-            /* Free the matrix memory */
-            free(A); free(B); free(C);
+            /* Free the matrix memory */
+            free(A); free(B); free(C);
 
-            printf("Donen");
+            printf("Donen");
 
-        return 0;
+        return 0;
     }
 ```
 
@@ -349,10 +349,10 @@ The output of a code should look similar to following listing, where lines start
     Enabling Automatic Offload
     Automatic Offload enabled: 1 MIC devices present
     Computing SGEMM with automatic workdivision
-    [MKL] [MIC --] [AO Function]    SGEMM
-    [MKL] [MIC --] [AO SGEMM Workdivision]  0.00 1.00
-    [MKL] [MIC 00] [AO SGEMM CPU Time]      0.463351 seconds
-    [MKL] [MIC 00] [AO SGEMM MIC Time]      0.179608 seconds
+    [MKL] [MIC --] [AO Function]    SGEMM
+    [MKL] [MIC --] [AO SGEMM Workdivision]  0.00 1.00
+    [MKL] [MIC 00] [AO SGEMM CPU Time]      0.463351 seconds
+    [MKL] [MIC 00] [AO SGEMM MIC Time]      0.179608 seconds
     [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 52428800 bytes
     [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 26214400 bytes
     Done
@@ -479,23 +479,23 @@ After executing the complied binary file, following output should be displayed.
 
     Number of available platforms: 1
     Platform names:
-        [0] Intel(R) OpenCL [Selected]
+        [0] Intel(R) OpenCL [Selected]
     Number of devices available for each type:
-        CL_DEVICE_TYPE_CPU: 1
-        CL_DEVICE_TYPE_GPU: 0
-        CL_DEVICE_TYPE_ACCELERATOR: 1
+        CL_DEVICE_TYPE_CPU: 1
+        CL_DEVICE_TYPE_GPU: 0
+        CL_DEVICE_TYPE_ACCELERATOR: 1
 
     ** Detailed information for each device ***
 
     CL_DEVICE_TYPE_CPU[0]
-        CL_DEVICE_NAME:        Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
-        CL_DEVICE_AVAILABLE: 1
+        CL_DEVICE_NAME:        Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
+        CL_DEVICE_AVAILABLE: 1
 
     ...
 
     CL_DEVICE_TYPE_ACCELERATOR[0]
-        CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
-        CL_DEVICE_AVAILABLE: 1
+        CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
+        CL_DEVICE_AVAILABLE: 1
 
     ...
 ```
@@ -579,23 +579,23 @@ An example of basic MPI version of "hello-world" example in C language, that can
     #include <mpi.h>
 
     int main (argc, argv)
-         int argc;
-         char *argv[];
+         int argc;
+         char *argv[];
     {
-      int rank, size;
+      int rank, size;
 
-      int len;
-      char node[MPI_MAX_PROCESSOR_NAME];
+      int len;
+      char node[MPI_MAX_PROCESSOR_NAME];
 
-      MPI_Init (&argc, &argv);      /* starts MPI */
-      MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
-      MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
+      MPI_Init (&argc, &argv);      /* starts MPI */
+      MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
+      MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
 
-      MPI_Get_processor_name(node,&len);
+      MPI_Get_processor_name(node,&len);
 
-      printf( "Hello world from process %d of %d on host %s n", rank, size, node );
-      MPI_Finalize();
-      return 0;
+      printf( "Hello world from process %d of %d on host %s n", rank, size, node );
+      MPI_Finalize();
+      return 0;
     }
 ```
 
@@ -722,8 +722,8 @@ The output should be again similar to:
 	Please note that the **"mpiexec.hydra"** requires a file the MIC filesystem. If the file is missing please contact the system administrators. A simple test to see if the file is present is to execute:
 
 ```bash
-      $ ssh mic0 ls /bin/pmi_proxy
-      /bin/pmi_proxy
+      $ ssh mic0 ls /bin/pmi_proxy
+      /bin/pmi_proxy
 ```
 
 **Execution on host - MPI processes distributed over multiple accelerators on multiple nodes**
@@ -769,8 +769,8 @@ The launch the MPI program use:
 ```bash
     $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
      -genv I_MPI_FABRICS_LIST tcp
-     -genv I_MPI_FABRICS shm:tcp
-     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+     -genv I_MPI_FABRICS shm:tcp
+     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
      -host cn204-mic0 -n 4 ~/mpi-test-mic
     : -host cn205-mic0 -n 6 ~/mpi-test-mic
 ```
@@ -779,8 +779,8 @@ or using mpirun:
 ```bash
     $ mpirun -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
      -genv I_MPI_FABRICS_LIST tcp
-     -genv I_MPI_FABRICS shm:tcp
-     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+     -genv I_MPI_FABRICS shm:tcp
+     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
      -host cn204-mic0 -n 4 ~/mpi-test-mic
     : -host cn205-mic0 -n 6 ~/mpi-test-mic
 ```
@@ -805,8 +805,8 @@ The same way MPI program can be executed on multiple hosts:
 ```bash
     $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
      -genv I_MPI_FABRICS_LIST tcp
-     -genv I_MPI_FABRICS shm:tcp
-     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+     -genv I_MPI_FABRICS shm:tcp
+     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
      -host cn204 -n 4 ~/mpi-test
     : -host cn205 -n 6 ~/mpi-test
 ```
@@ -822,7 +822,7 @@ In the previous section we have compiled two binary files, one for hosts "**mpi-
     $ mpiexec.hydra
      -genv I_MPI_FABRICS_LIST tcp
      -genv I_MPI_FABRICS shm:tcp
-     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
      -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
      -host cn205 -n 2 ~/mpi-test
     : -host cn205-mic0 -n 2 ~/mpi-test-mic
@@ -851,7 +851,7 @@ An example of a machine file that uses 2 >hosts (**cn205** and **cn206**) and 2
     cn206-mic0:2
 ```
 
-In addition if a naming convention is set in a way that the name of the binary for host is **"bin_name"**  and the name of the binary for the accelerator is **"bin_name-mic"** then by setting up the environment variable **I_MPI_MIC_POSTFIX** to **"-mic"** user do not have to specify the names of booth binaries. In this case mpirun needs just the name of the host binary file (i.e. "mpi-test") and uses the suffix to get a name of the binary for accelerator (i..e. "mpi-test-mic").
+In addition, if the naming convention is such that the host binary is named **"bin_name"** and the accelerator binary is named **"bin_name-mic"**, then setting the environment variable **I_MPI_MIC_POSTFIX** to **"-mic"** means the user does not have to specify the names of both binaries. In this case mpirun needs only the name of the host binary (i.e. "mpi-test") and uses the postfix to derive the name of the accelerator binary (i.e. "mpi-test-mic").
 
 ```bash
     $ export I_MPI_MIC_POSTFIX=-mic
@@ -864,8 +864,8 @@ To run the MPI code using mpirun and the machine file "hosts_file_mix" use:
      -genv I_MPI_FABRICS shm:tcp
      -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
      -genv I_MPI_FABRICS_LIST tcp
-     -genv I_MPI_FABRICS shm:tcp
-     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+     -genv I_MPI_FABRICS shm:tcp
+     -genv I_MPI_TCP_NETMASK=10.1.0.0/16
      -machinefile hosts_file_mix
      ~/mpi-test
 ```
@@ -901,4 +901,4 @@ Please note each host or accelerator is listed only per files. User has to speci
 
 Optimization
 ------------
-For more details about optimization techniques please read Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization")
+For more details about optimization techniques, please read the Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization")
-- 
GitLab