Commit 47570116 authored by Pavel Gajdušek

the whole structure changed. IntelXeonPhi still duplicated...

parent ecc099ef
# Operating System
The operating system on Salomon is Linux - [**CentOS 6.x**](https://en.wikipedia.org/wiki/CentOS).
The CentOS Linux distribution is a stable, predictable, manageable and reproducible platform derived from the sources of Red Hat Enterprise Linux (RHEL).
@@ -193,7 +193,7 @@ $ ./matrix
!!! note
    PAPI currently supports only a subset of counters on the Intel Xeon Phi processor compared to the Intel Xeon, for example the floating point operations counter is missing.

To use PAPI in [Intel Xeon Phi](../intel-xeon-phi/) native applications, you need to load a module with the "-mic" suffix, for example "papi/5.3.2-mic":
```console
$ ml papi/5.3.2-mic
```
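As a quick orientation, a minimal native-mode counter measurement might look like the sketch below. This is illustrative only: it assumes the papi module provides the header and `-lpapi` at link time, and that the chosen preset counter (PAPI_TOT_INS here) is among those supported on the coprocessor, per the note above.

```cpp
// papi-min.c - illustrative sketch: count total instructions around a loop
#include <stdio.h>
#include <papi.h>

int main(void)
{
    long long count = 0;
    int eventset = PAPI_NULL;

    /* Initialize PAPI and create an event set with one preset counter. */
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
        return 1;
    PAPI_create_eventset(&eventset);
    PAPI_add_event(eventset, PAPI_TOT_INS);

    /* Measure a simple workload. */
    PAPI_start(eventset);
    volatile double x = 0.0;
    for (int i = 0; i < 1000000; i++)
        x += i * 0.5;
    PAPI_stop(eventset, &count);

    printf("Total instructions: %lld\n", count);
    return 0;
}
```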
@@ -108,24 +108,24 @@ A very basic example of code that employs offload programming technique is shown
!!! note
    This code is sequential and utilizes only a single core of the accelerator.
```cpp
// source-offload.cpp
#include <iostream>

int main(int argc, char* argv[])
{
    const int niter = 100000;
    double result = 0;

    #pragma offload target(mic)
    for (int i = 0; i < niter; ++i) {
        const double t = (i + 0.5) / niter;
        result += 4.0 / (t * t + 1.0);
    }

    result /= niter;
    std::cout << "Pi ~ " << result << '\n';
}
```
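The listing above keeps the offloaded loop on a single coprocessor core. A possible multi-core variant is sketched below; it assumes the offload region may contain OpenMP constructs (as in the OpenMP example later on this page) and that the code is compiled with the Intel compiler's OpenMP flag.

```cpp
// pi-offload-omp.cpp - illustrative sketch: the same approximation of pi,
// with the offloaded loop shared among the coprocessor cores via OpenMP.
#include <iostream>

int main()
{
    const int niter = 100000;
    double result = 0;

    #pragma offload target(mic)
    #pragma omp parallel for reduction(+:result)
    for (int i = 0; i < niter; ++i) {
        const double t = (i + 0.5) / niter;
        result += 4.0 / (t * t + 1.0);
    }

    result /= niter;
    std::cout << "Pi ~ " << result << '\n';
}
```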
To compile the code using the Intel compiler, run
@@ -144,82 +144,82 @@ $ ./bin-offload
One way of parallelizing code for the Xeon Phi is to use OpenMP directives. The following example shows code for parallel vector addition.
```cpp
// vect-add
#include <stdio.h>
#include <stdlib.h>

typedef int T;

#define SIZE 1000

#pragma offload_attribute(push, target(mic))
T in1[SIZE];
T in2[SIZE];
T res[SIZE];
#pragma offload_attribute(pop)

// MIC function to add two vectors
__attribute__((target(mic))) void add_mic(T *a, T *b, T *c, int size) {
  int i = 0;
  #pragma omp parallel for
  for (i = 0; i < size; i++)
    c[i] = a[i] + b[i];
}

// CPU function to add two vectors
void add_cpu(T *a, T *b, T *c, int size) {
  int i;
  for (i = 0; i < size; i++)
    c[i] = a[i] + b[i];
}

// CPU function to generate a vector of random numbers
void random_T(T *a, int size) {
  int i;
  for (i = 0; i < size; i++)
    a[i] = rand() % 10000; // random number between 0 and 9999
}

// CPU function to compare two vectors
int compare(T *a, T *b, T size) {
  int pass = 0;
  int i;
  for (i = 0; i < size; i++) {
    if (a[i] != b[i]) {
      printf("Value mismatch at location %d, values %d and %d\n", i, a[i], b[i]);
      pass = 1;
    }
  }
  if (pass == 0) printf("Test passed\n"); else printf("Test Failed\n");
  return pass;
}

int main()
{
  int i;
  random_T(in1, SIZE);
  random_T(in2, SIZE);

  #pragma offload target(mic) in(in1,in2) inout(res)
  {
    // Parallel loop from the main function
    #pragma omp parallel for
    for (i = 0; i < SIZE; i++)
      res[i] = in1[i] + in2[i];

    // or the parallel loop is called inside the function
    add_mic(in1, in2, res, SIZE);
  }

  // Check the results against the CPU implementation
  T res_cpu[SIZE];
  add_cpu(in1, in2, res_cpu, SIZE);
  compare(res, res_cpu, SIZE);
}
```
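The `in(in1,in2) inout(res)` clauses above work directly because the arrays are statically sized globals marked with `offload_attribute`. For heap-allocated buffers the transfer size must be given explicitly; below is a hedged sketch of the same addition using `length()` clauses (function and variable names are illustrative, not part of the example above).

```cpp
// Illustrative sketch: offloading heap-allocated buffers needs explicit
// length() clauses so the runtime knows how many elements to transfer.
void add_offload(int *a, int *b, int *c, int n)
{
    #pragma offload target(mic) in(a, b : length(n)) out(c : length(n))
    {
        #pragma omp parallel for
        for (int i = 0; i < n; i++)
            c[i] = a[i] + b[i];
    }
}
```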
During compilation, the Intel compiler reports which loops have been vectorized on both the host and the accelerator. The report can be enabled with the compiler option "-vec-report2". To compile and execute the code, run
@@ -271,61 +271,61 @@ $ module load intel
The following example shows how to automatically offload an SGEMM (single precision, general matrix multiply) call to the MIC coprocessor. The code can be copied to a file and compiled without any modification.
```cpp
// sgemm-ao-short.c
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdint.h>

#include "mkl.h"

int main(int argc, char **argv)
{
    float *A, *B, *C; /* Matrices */

    MKL_INT N = 2560; /* Matrix dimensions */
    MKL_INT LD = N; /* Leading dimension */
    int matrix_bytes; /* Matrix size in bytes */
    int matrix_elements; /* Matrix size in elements */

    float alpha = 1.0, beta = 1.0; /* Scaling factors */
    char transa = 'N', transb = 'N'; /* Transposition options */

    int i, j; /* Counters */

    matrix_elements = N * N;
    matrix_bytes = sizeof(float) * matrix_elements;

    /* Allocate the matrices */
    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);

    /* Initialize the matrices */
    for (i = 0; i < matrix_elements; i++) {
        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
    }

    printf("Computing SGEMM on the host\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    printf("Enabling Automatic Offload\n");
    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
    mkl_mic_enable();

    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);

    printf("Computing SGEMM with automatic workdivision\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    /* Free the matrix memory */
    free(A); free(B); free(C);

    printf("Done\n");

    return 0;
}
```
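Beyond the automatic workdivision used above, MKL also exposes support functions for steering Automatic Offload from the source code. The following is a hedged sketch only: the 50/50 split is an arbitrary example value, and the calls take effect only while Automatic Offload is enabled.

```cpp
// ao-workdivision.c - illustrative sketch of manual work division hints
#include <stdio.h>
#include "mkl.h"

int main(void)
{
    /* Enable Automatic Offload, then suggest keeping about half of the work
       on the host and sending the rest to the first coprocessor. */
    mkl_mic_enable();
    mkl_mic_set_workdivision(MKL_TARGET_HOST, 0, 0.5);
    mkl_mic_set_workdivision(MKL_TARGET_MIC, 0, 0.5);

    printf("Work division hints set for subsequent BLAS level-3 calls.\n");
    return 0;
}
```

Equivalent control is typically available through environment variables such as MKL_MIC_WORKDIVISION.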
!!! note
@@ -573,28 +573,28 @@ $ mpiicc -mmic -o mpi-test-mic mpi-test.c
A basic MPI "hello world" example in C, which can be executed on both the host and the Xeon Phi, is shown below (it can be copied and pasted directly into a .c file):
```cpp
#include <stdio.h>
#include <mpi.h>

int main (argc, argv)
int argc;
char *argv[];
{
  int rank, size;

  int len;
  char node[MPI_MAX_PROCESSOR_NAME];

  MPI_Init (&argc, &argv);                /* starts MPI */
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);  /* get current process id */
  MPI_Comm_size (MPI_COMM_WORLD, &size);  /* get number of processes */

  MPI_Get_processor_name(node, &len);

  printf("Hello world from process %d of %d on host %s \n", rank, size, node);
  MPI_Finalize();
  return 0;
}
```
### MPI Programming Models
@@ -154,7 +154,7 @@ export OFFLOAD_REPORT=3
A very basic example of code that employs the offload programming technique is shown in the next listing. Please note that this code is sequential and utilizes only a single core of the accelerator.
```cpp
// source-offload.cpp
#include <iostream>
@@ -190,7 +190,7 @@ $ ./bin-offload
One way of parallelizing code for the Xeon Phi is to use OpenMP directives. The following example shows code for parallel vector addition.
```cpp
// vect-add
#include <stdio.h>
@@ -317,7 +317,7 @@ $ ml intel
The code can be copied to a file and compiled without any modification.
```cpp
// sgemm-ao-short.c
#include <stdio.h>
@@ -329,46 +329,46 @@ $ vim sgemm-ao-short.c
int main(int argc, char **argv)
{
    float *A, *B, *C; /* Matrices */

    MKL_INT N = 2560; /* Matrix dimensions */
    MKL_INT LD = N; /* Leading dimension */
    int matrix_bytes; /* Matrix size in bytes */
    int matrix_elements; /* Matrix size in elements */

    float alpha = 1.0, beta = 1.0; /* Scaling factors */
    char transa = 'N', transb = 'N'; /* Transposition options */

    int i, j; /* Counters */

    matrix_elements = N * N;
    matrix_bytes = sizeof(float) * matrix_elements;

    /* Allocate the matrices */
    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);

    /* Initialize the matrices */
    for (i = 0; i < matrix_elements; i++) {
        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
    }

    printf("Computing SGEMM on the host\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    printf("Enabling Automatic Offload\n");
    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
    mkl_mic_enable();

    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);

    printf("Computing SGEMM with automatic workdivision\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    /* Free the matrix memory */
    free(A); free(B); free(C);

    printf("Done\n");

    return 0;
}
```
@@ -60,7 +60,7 @@ Octave may use MPI for interprocess communication This functionality is currentl
## Xeon Phi Support
Octave may take advantage of the Xeon Phi accelerators. This will only work on the [Intel Xeon Phi](../intel-xeon-phi/) [accelerated nodes](../../salomon/compute-nodes/).
### Automatic Offload Support
@@ -402,4 +402,4 @@ By leveraging MKL, R can accelerate certain computations, most notably linear al
$ export MKL_MIC_ENABLE=1
```
[Read more about automatic offload](../intel-xeon-phi/)
@@ -113,6 +113,9 @@ pages:
- Intel MKL: software/intel-suite/intel-mkl.md
- Intel TBB: software/intel-suite/intel-tbb.md
- Intel Trace Analyzer and Collector: software/intel-suite/intel-trace-analyzer-and-collector.md
- 'Intel Xeon Phi':
    - Intel Xeon Phi Salomon: software/intel-xeon-phi.md
    - Intel Xeon Phi Anselm: software/intel-xeon-phi.anselm.md
- ISV Licenses: software/isv_licenses.md
- Java: software/java.md
- 'Machine Learning':
@@ -145,10 +148,7 @@ pages:
- OpenFOAM: software/openfoam.md
- Operating System: software/operating-system.md
- ParaView: software/paraview.md
- Salomon Software:
    - Intel Xeon Phi: salomon/software/intel-xeon-phi.md
- Anselm Software:
    - Intel Xeon Phi: anselm/software/intel-xeon-phi.md
    - NVIDIA CUDA: anselm/software/nvidia-cuda.md
    - Virtualization: anselm/software/virtualization.md
- PBS Pro Documentation: pbspro.md