From 47570116f2e3014f933e3d35953ba75821f0f07c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20Gajdu=C5=A1ek?= <gajdusek.pavel@gmail.com>
Date: Thu, 21 Sep 2017 09:06:14 +0200
Subject: [PATCH] the whole structure changed. IntelXeonPhi still duplicated...

---
 .../salomon/software/operating-system.md      |   5 -
 docs.it4i/software/debuggers/papi.md          |   2 +-
 .../intel-xeon-phi.anselm.md}                 | 268 +++++++++---------
 .../{salomon => }/software/intel-xeon-phi.md  |  62 ++--
 .../software/numerical-languages/octave.md    |   2 +-
 docs.it4i/software/numerical-languages/r.md   |   2 +-
 mkdocs.yml                                    |   6 +-
 7 files changed, 171 insertions(+), 176 deletions(-)
 delete mode 100644 docs.it4i/salomon/software/operating-system.md
 rename docs.it4i/{anselm/software/intel-xeon-phi.md => software/intel-xeon-phi.anselm.md} (85%)
 rename docs.it4i/{salomon => }/software/intel-xeon-phi.md (95%)

diff --git a/docs.it4i/salomon/software/operating-system.md b/docs.it4i/salomon/software/operating-system.md
deleted file mode 100644
index f68a9a97a..000000000
--- a/docs.it4i/salomon/software/operating-system.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Operating System
-
-The operating system on Salomon is Linux - [**CentOS 6.x**](https://en.wikipedia.org/wiki/CentOS)
-
-The CentOS Linux distribution is a stable, predictable, manageable and reproducible platform derived from the sources of Red Hat Enterprise Linux (RHEL).
diff --git a/docs.it4i/software/debuggers/papi.md b/docs.it4i/software/debuggers/papi.md
index c1c519ecd..15b03837e 100644
--- a/docs.it4i/software/debuggers/papi.md
+++ b/docs.it4i/software/debuggers/papi.md
@@ -193,7 +193,7 @@ $ ./matrix
 !!! note
     PAPI currently supports only a subset of counters on the Intel Xeon Phi processor compared to Intel Xeon, for example the floating point operations counter is missing.
 
-To use PAPI in [Intel Xeon Phi](../../anselm/software/intel-xeon-phi/) native applications, you need to load module with " -mic" suffix, for example " papi/5.3.2-mic" :
+To use PAPI in [Intel Xeon Phi](../intel-xeon-phi/) native applications, you need to load module with " -mic" suffix, for example " papi/5.3.2-mic" :
 
 ```console
 $ ml papi/5.3.2-mic
diff --git a/docs.it4i/anselm/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.anselm.md
similarity index 85%
rename from docs.it4i/anselm/software/intel-xeon-phi.md
rename to docs.it4i/software/intel-xeon-phi.anselm.md
index dd83a2298..b1e86256d 100644
--- a/docs.it4i/anselm/software/intel-xeon-phi.md
+++ b/docs.it4i/software/intel-xeon-phi.anselm.md
@@ -108,24 +108,24 @@ A very basic example of code that employs offload programming technique is shown
 !!! note
     This code is sequential and utilizes only single core of the accelerator.
 
-```console
+```cpp
 $ vim source-offload.cpp
 
-    #include <iostream>
+#include <iostream>
 
-    int main(int argc, char* argv[])
-    {
-        const int niter = 100000;
-        double result = 0;
+int main(int argc, char* argv[])
+{
+    const int niter = 100000;
+    double result = 0;
 
-     #pragma offload target(mic)
-        for (int i = 0; i < niter; ++i) {
-            const double t = (i + 0.5) / niter;
-            result += 4.0 / (t * t + 1.0);
-        }
-        result /= niter;
-        std::cout << "Pi ~ " << result << 'n';
+ #pragma offload target(mic)
+    for (int i = 0; i < niter; ++i) {
+        const double t = (i + 0.5) / niter;
+        result += 4.0 / (t * t + 1.0);
     }
+    result /= niter;
+    std::cout << "Pi ~ " << result << '\n';
+}
 ```
 
 To compile a code using Intel compiler run
@@ -144,82 +144,82 @@ $ ./bin-offload
 
 One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition.
 
-```console
+```cpp
 $ vim ./vect-add
 
-    #include <stdio.h>
-
-    typedef int T;
-
-    #define SIZE 1000
-
-    #pragma offload_attribute(push, target(mic))
-    T in1[SIZE];
-    T in2[SIZE];
-    T res[SIZE];
-    #pragma offload_attribute(pop)
-
-    // MIC function to add two vectors
-    __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
-      int i = 0;
-      #pragma omp parallel for
-        for (i = 0; i < size; i++)
-          c[i] = a[i] + b[i];
+#include <stdio.h>
+
+typedef int T;
+
+#define SIZE 1000
+
+#pragma offload_attribute(push, target(mic))
+T in1[SIZE];
+T in2[SIZE];
+T res[SIZE];
+#pragma offload_attribute(pop)
+
+// MIC function to add two vectors
+__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
+  int i = 0;
+  #pragma omp parallel for
+    for (i = 0; i < size; i++)
+      c[i] = a[i] + b[i];
+}
+
+// CPU function to add two vectors
+void add_cpu (T *a, T *b, T *c, int size) {
+  int i;
+  for (i = 0; i < size; i++)
+    c[i] = a[i] + b[i];
+}
+
+// CPU function to generate a vector of random numbers
+void random_T (T *a, int size) {
+  int i;
+  for (i = 0; i < size; i++)
+    a[i] = rand() % 10000; // random number between 0 and 9999
+}
+
+// CPU function to compare two vectors
+int compare(T *a, T *b, T size ){
+  int pass = 0;
+  int i;
+  for (i = 0; i < size; i++){
+    if (a[i] != b[i]) {
+      printf("Value mismatch at location %d, values %d and %d\n",i, a[i], b[i]);
+      pass = 1;
     }
+  }
+  if (pass == 0) printf ("Test passed\n"); else printf ("Test Failed\n");
+  return pass;
+}
 
-    // CPU function to add two vectors
-    void add_cpu (T *a, T *b, T *c, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        c[i] = a[i] + b[i];
-    }
+int main()
+{
+  int i;
+  random_T(in1, SIZE);
+  random_T(in2, SIZE);
 
-    // CPU function to generate a vector of random numbers
-    void random_T (T *a, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        a[i] = rand() % 10000; // random number between 0 and 9999
-    }
+  #pragma offload target(mic) in(in1,in2)  inout(res)
+  {
 
-    // CPU function to compare two vectors
-    int compare(T *a, T *b, T size ){
-      int pass = 0;
-      int i;
-      for (i = 0; i < size; i++){
-        if (a[i] != b[i]) {
-          printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
-          pass = 1;
-        }
-      }
-      if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
-      return pass;
-    }
+    // Parallel loop from main function
+    #pragma omp parallel for
+    for (i=0; i<SIZE; i++)
+      res[i] = in1[i] + in2[i];
 
-    int main()
-    {
-      int i;
-      random_T(in1, SIZE);
-      random_T(in2, SIZE);
+    // or parallel loop is called inside the function
+    add_mic(in1, in2, res, SIZE);
 
-      #pragma offload target(mic) in(in1,in2)  inout(res)
-      {
+  }
 
-        // Parallel loop from main function
-        #pragma omp parallel for
-        for (i=0; i<SIZE; i++)
-          res[i] = in1[i] + in2[i];
+  //Check the results with CPU implementation
+  T res_cpu[SIZE];
+  add_cpu(in1, in2, res_cpu, SIZE);
+  compare(res, res_cpu, SIZE);
 
-        // or parallel loop is called inside the function
-        add_mic(in1, in2, res, SIZE);
-
-      }
-
-      //Check the results with CPU implementation
-      T res_cpu[SIZE];
-      add_cpu(in1, in2, res_cpu, SIZE);
-      compare(res, res_cpu, SIZE);
-
-    }
+}
 ```
 
 During the compilation Intel compiler shows which loops have been vectorized in both host and accelerator. This can be enabled with compiler option "-vec-report2". To compile and execute the code run
@@ -271,61 +271,61 @@ $ module load intel
 
 Following example show how to automatically offload an SGEMM (single precision - general matrix multiply) function to MIC coprocessor. The code can be copied to a file and compiled without any necessary modification.
 
-```console
+```cpp
 $ vim sgemm-ao-short.c
 
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include <malloc.h>
-    #include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdint.h>
 
-    #include "mkl.h"
+#include "mkl.h"
 
-    int main(int argc, char **argv)
-    {
-            float *A, *B, *C; /* Matrices */
+int main(int argc, char **argv)
+{
+    float *A, *B, *C; /* Matrices */
 
-            MKL_INT N = 2560; /* Matrix dimensions */
-            MKL_INT LD = N; /* Leading dimension */
-            int matrix_bytes; /* Matrix size in bytes */
-            int matrix_elements; /* Matrix size in elements */
+    MKL_INT N = 2560; /* Matrix dimensions */
+    MKL_INT LD = N; /* Leading dimension */
+    int matrix_bytes; /* Matrix size in bytes */
+    int matrix_elements; /* Matrix size in elements */
 
-            float alpha = 1.0, beta = 1.0; /* Scaling factors */
-            char transa = 'N', transb = 'N'; /* Transposition options */
+    float alpha = 1.0, beta = 1.0; /* Scaling factors */
+    char transa = 'N', transb = 'N'; /* Transposition options */
 
-            int i, j; /* Counters */
+    int i, j; /* Counters */
 
-            matrix_elements = N * N;
-            matrix_bytes = sizeof(float) * matrix_elements;
+    matrix_elements = N * N;
+    matrix_bytes = sizeof(float) * matrix_elements;
 
-            /* Allocate the matrices */
-            A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+    /* Allocate the matrices */
+    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
 
-            /* Initialize the matrices */
-            for (i = 0; i < matrix_elements; i++) {
-                    A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-            }
+    /* Initialize the matrices */
+    for (i = 0; i < matrix_elements; i++) {
+            A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+    }
 
-            printf("Computing SGEMM on the hostn");
-            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM on the host\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-            printf("Enabling Automatic Offloadn");
-            /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-            mkl_mic_enable();
+    printf("Enabling Automatic Offload\n");
+    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+    mkl_mic_enable();
 
-            int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-            printf("Automatic Offload enabled: %d MIC devices presentn",   ndevices);
+    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+    printf("Automatic Offload enabled: %d MIC devices present\n",   ndevices);
 
-            printf("Computing SGEMM with automatic workdivisionn");
-            sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM with automatic workdivisionn");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-            /* Free the matrix memory */
-            free(A); free(B); free(C);
+    /* Free the matrix memory */
+    free(A); free(B); free(C);
 
-            printf("Donen");
+    printf("Done\n");
 
-        return 0;
-    }
+    return 0;
+}
 ```
 
 !!! note
@@ -573,28 +573,28 @@ $ mpiicc -mmic -o mpi-test-mic mpi-test.c
 An example of basic MPI version of "hello-world" example in C language, that can be executed on both host and Xeon Phi is (can be directly copy and pasted to a .c file)
 
 ```cpp
-    #include <stdio.h>
-    #include <mpi.h>
+#include <stdio.h>
+#include <mpi.h>
 
-    int main (argc, argv)
-         int argc;
-         char *argv[];
-    {
-      int rank, size;
+int main (argc, argv)
+     int argc;
+     char *argv[];
+{
+  int rank, size;
 
-      int len;
-      char node[MPI_MAX_PROCESSOR_NAME];
+  int len;
+  char node[MPI_MAX_PROCESSOR_NAME];
 
-      MPI_Init (&argc, &argv);      /* starts MPI */
-      MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
-      MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
+  MPI_Init (&argc, &argv);      /* starts MPI */
+  MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
+  MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */
 
-      MPI_Get_processor_name(node,&len);
+  MPI_Get_processor_name(node,&len);
 
-      printf( "Hello world from process %d of %d on host %s n", rank, size, node );
-      MPI_Finalize();
-      return 0;
-    }
+  printf( "Hello world from process %d of %d on host %s \n", rank, size, node );
+  MPI_Finalize();
+  return 0;
+}
 ```
 
 ### MPI Programming Models
diff --git a/docs.it4i/salomon/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.md
similarity index 95%
rename from docs.it4i/salomon/software/intel-xeon-phi.md
rename to docs.it4i/software/intel-xeon-phi.md
index b2b6da255..f09e343ce 100644
--- a/docs.it4i/salomon/software/intel-xeon-phi.md
+++ b/docs.it4i/software/intel-xeon-phi.md
@@ -154,7 +154,7 @@ export OFFLOAD_REPORT=3
 
 A very basic example of code that employs offload programming technique is shown in the next listing. Please note that this code is sequential and utilizes only single core of the accelerator.
 
-```console
+```cpp
 $ cat source-offload.cpp
 
 #include <iostream>
@@ -190,7 +190,7 @@ $ ./bin-offload
 
 One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition.
 
-```console
+```cpp
 $ cat ./vect-add
 
 #include <stdio.h>
@@ -317,7 +317,7 @@ $ ml intel
 
 The code can be copied to a file and compiled without any necessary modification.
 
-```console
+```cpp
 $ vim sgemm-ao-short.c
 
 #include <stdio.h>
@@ -329,46 +329,46 @@ $ vim sgemm-ao-short.c
 
 int main(int argc, char **argv)
 {
-        float *A, *B, *C; /* Matrices */
+    float *A, *B, *C; /* Matrices */
 
-        MKL_INT N = 2560; /* Matrix dimensions */
-        MKL_INT LD = N; /* Leading dimension */
-        int matrix_bytes; /* Matrix size in bytes */
-        int matrix_elements; /* Matrix size in elements */
+    MKL_INT N = 2560; /* Matrix dimensions */
+    MKL_INT LD = N; /* Leading dimension */
+    int matrix_bytes; /* Matrix size in bytes */
+    int matrix_elements; /* Matrix size in elements */
 
-        float alpha = 1.0, beta = 1.0; /* Scaling factors */
-        char transa = 'N', transb = 'N'; /* Transposition options */
+    float alpha = 1.0, beta = 1.0; /* Scaling factors */
+    char transa = 'N', transb = 'N'; /* Transposition options */
 
-        int i, j; /* Counters */
+    int i, j; /* Counters */
 
-        matrix_elements = N * N;
-        matrix_bytes = sizeof(float) * matrix_elements;
+    matrix_elements = N * N;
+    matrix_bytes = sizeof(float) * matrix_elements;
 
-        /* Allocate the matrices */
-        A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+    /* Allocate the matrices */
+    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
 
-        /* Initialize the matrices */
-        for (i = 0; i < matrix_elements; i++) {
-                A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-        }
+    /* Initialize the matrices */
+    for (i = 0; i < matrix_elements; i++) {
+            A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+    }
 
-        printf("Computing SGEMM on the host\n");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM on the host\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-        printf("Enabling Automatic Offload\n");
-        /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-        mkl_mic_enable();
+    printf("Enabling Automatic Offload\n");
+    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+    mkl_mic_enable();
 
-        int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-        printf("Automatic Offload enabled: %d MIC devices present\n",   ndevices);
+    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+    printf("Automatic Offload enabled: %d MIC devices present\n",   ndevices);
 
-        printf("Computing SGEMM with automatic workdivision\n");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM with automatic workdivision\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-        /* Free the matrix memory */
-        free(A); free(B); free(C);
+    /* Free the matrix memory */
+    free(A); free(B); free(C);
 
-        printf("Done\n");
+    printf("Done\n");
 
     return 0;
 }
diff --git a/docs.it4i/software/numerical-languages/octave.md b/docs.it4i/software/numerical-languages/octave.md
index 4d96754cc..ca785e75d 100644
--- a/docs.it4i/software/numerical-languages/octave.md
+++ b/docs.it4i/software/numerical-languages/octave.md
@@ -60,7 +60,7 @@ Octave may use MPI for interprocess communication This functionality is currentl
 
 ## Xeon Phi Support
 
-Octave may take advantage of the Xeon Phi accelerators. This will only work on the  [Intel Xeon Phi](../../salomon/software/intel-xeon-phi/)  [accelerated nodes](../../salomon/compute-nodes/).
+Octave may take advantage of the Xeon Phi accelerators. This will only work on the  [Intel Xeon Phi](../intel-xeon-phi/)  [accelerated nodes](../../salomon/compute-nodes/).
 
 ### Automatic Offload Support
 
diff --git a/docs.it4i/software/numerical-languages/r.md b/docs.it4i/software/numerical-languages/r.md
index 771c3efd5..3322a89ac 100644
--- a/docs.it4i/software/numerical-languages/r.md
+++ b/docs.it4i/software/numerical-languages/r.md
@@ -402,4 +402,4 @@ By leveraging MKL, R can accelerate certain computations, most notably linear al
 $ export MKL_MIC_ENABLE=1
 ```
 
-[Read more about automatic offload](../../anselm/software/intel-xeon-phi/)
+[Read more about automatic offload](../intel-xeon-phi/)
diff --git a/mkdocs.yml b/mkdocs.yml
index 2db0d019d..1188f3408 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -113,6 +113,9 @@ pages:
       - Intel MKL: software/intel-suite/intel-mkl.md
       - Intel TBB: software/intel-suite/intel-tbb.md
       - Intel Trace Analyzer and Collector: software/intel-suite/intel-trace-analyzer-and-collector.md
+    - 'Intel Xeon Phi':
+      - Intel Xeon Phi Salomon: software/intel-xeon-phi.md
+      - Intel Xeon Phi Anselm: software/intel-xeon-phi.anselm.md
     - ISV Licenses: software/isv_licenses.md
     - Java: software/java.md
     - 'Machine larning':
@@ -145,10 +148,7 @@ pages:
     - OpenFOAM: software/openfoam.md
     - Operating System: software/operating-system.md
     - ParaView: software/paraview.md
-    - Salomon Software:
-      - Intel Xeon Phi: salomon/software/intel-xeon-phi.md
     - Anselm Software:
-      - Intel Xeon Phi: anselm/software/intel-xeon-phi.md
       - NVIDIA CUDA: anselm/software/nvidia-cuda.md
       - Virtualization: anselm/software/virtualization.md
   - PBS Pro Documentation: pbspro.md
-- 
GitLab