diff --git a/docs.it4i/salomon/software/operating-system.md b/docs.it4i/salomon/software/operating-system.md
deleted file mode 100644
index f68a9a97aac216dd727e0973d3ac56754726b90a..0000000000000000000000000000000000000000
--- a/docs.it4i/salomon/software/operating-system.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Operating System
-
-The operating system on Salomon is Linux - [**CentOS 6.x**](https://en.wikipedia.org/wiki/CentOS)
-
-The CentOS Linux distribution is a stable, predictable, manageable and reproducible platform derived from the sources of Red Hat Enterprise Linux (RHEL).
diff --git a/docs.it4i/software/debuggers/papi.md b/docs.it4i/software/debuggers/papi.md
index c1c519ecdfc52b5b1675b811d5a4224584a7b948..15b03837e4bb18d2b8b6357d60bb29574b30fdae 100644
--- a/docs.it4i/software/debuggers/papi.md
+++ b/docs.it4i/software/debuggers/papi.md
@@ -193,7 +193,7 @@ $ ./matrix
 !!! note
     PAPI currently supports only a subset of counters on the Intel Xeon Phi processor compared to Intel Xeon, for example the floating point operations counter is missing.
 
-To use PAPI in [Intel Xeon Phi](../../anselm/software/intel-xeon-phi/) native applications, you need to load module with " -mic" suffix, for example " papi/5.3.2-mic" :
+To use PAPI in [Intel Xeon Phi](../intel-xeon-phi/) native applications, you need to load the module with the "-mic" suffix, for example "papi/5.3.2-mic":
 
 ```console
 $ ml papi/5.3.2-mic
diff --git a/docs.it4i/anselm/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.anselm.md
similarity index 85%
rename from docs.it4i/anselm/software/intel-xeon-phi.md
rename to docs.it4i/software/intel-xeon-phi.anselm.md
index dd83a229884b4f00b5b599bb8cb603fd098f03e2..b1e86256d093b4bd34fe799e48f64d38f48d0e83 100644
--- a/docs.it4i/anselm/software/intel-xeon-phi.md
+++ b/docs.it4i/software/intel-xeon-phi.anselm.md
@@ -108,24 +108,24 @@ A very basic example of code that employs offload programming technique is shown
 !!! note
     This code is sequential and utilizes only single core of the accelerator.
 
-```console
+```cpp
 $ vim source-offload.cpp
 
-    #include <iostream>
+#include <iostream>
 
-    int main(int argc, char* argv[])
-    {
-        const int niter = 100000;
-        double result = 0;
+int main(int argc, char* argv[])
+{
+    const int niter = 100000;
+    double result = 0;
 
-        #pragma offload target(mic)
-        for (int i = 0; i < niter; ++i) {
-            const double t = (i + 0.5) / niter;
-            result += 4.0 / (t * t + 1.0);
-        }
-        result /= niter;
-        std::cout << "Pi ~ " << result << 'n';
+    #pragma offload target(mic)
+    for (int i = 0; i < niter; ++i) {
+        const double t = (i + 0.5) / niter;
+        result += 4.0 / (t * t + 1.0);
     }
+    result /= niter;
+    std::cout << "Pi ~ " << result << '\n';
+}
 ```
 
 To compile a code using Intel compiler run
@@ -144,82 +144,82 @@ $ ./bin-offload
 One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition.
 
-```console
+```cpp
 $ vim ./vect-add
 
-    #include <stdio.h>
-
-    typedef int T;
-
-    #define SIZE 1000
-
-    #pragma offload_attribute(push, target(mic))
-    T in1[SIZE];
-    T in2[SIZE];
-    T res[SIZE];
-    #pragma offload_attribute(pop)
-
-    // MIC function to add two vectors
-    __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
-      int i = 0;
-      #pragma omp parallel for
-      for (i = 0; i < size; i++)
-        c[i] = a[i] + b[i];
+#include <stdio.h>
+
+typedef int T;
+
+#define SIZE 1000
+
+#pragma offload_attribute(push, target(mic))
+T in1[SIZE];
+T in2[SIZE];
+T res[SIZE];
+#pragma offload_attribute(pop)
+
+// MIC function to add two vectors
+__attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) {
+  int i = 0;
+  #pragma omp parallel for
+  for (i = 0; i < size; i++)
+    c[i] = a[i] + b[i];
+}
+
+// CPU function to add two vectors
+void add_cpu (T *a, T *b, T *c, int size) {
+  int i;
+  for (i = 0; i < size; i++)
+    c[i] = a[i] + b[i];
+}
+
+// CPU function to generate a vector of random numbers
+void random_T (T *a, int size) {
+  int i;
+  for (i = 0; i < size; i++)
+    a[i] = rand() % 10000; // random number between 0 and 9999
+}
+
+// CPU function to compare two vectors
+int compare(T *a, T *b, T size ){
+  int pass = 0;
+  int i;
+  for (i = 0; i < size; i++){
+    if (a[i] != b[i]) {
+      printf("Value mismatch at location %d, values %d and %d\n",i, a[i], b[i]);
+      pass = 1;
     }
+  }
+  if (pass == 0) printf ("Test passed\n"); else printf ("Test Failed\n");
+  return pass;
+}
 
-    // CPU function to add two vectors
-    void add_cpu (T *a, T *b, T *c, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        c[i] = a[i] + b[i];
-    }
+int main()
+{
+  int i;
+  random_T(in1, SIZE);
+  random_T(in2, SIZE);
 
-    // CPU function to generate a vector of random numbers
-    void random_T (T *a, int size) {
-      int i;
-      for (i = 0; i < size; i++)
-        a[i] = rand() % 10000; // random number between 0 and 9999
-    }
+  #pragma offload target(mic) in(in1,in2) inout(res)
+  {
 
-    // CPU function to compare two vectors
-    int compare(T *a, T *b, T size ){
-      int pass = 0;
-      int i;
-      for (i = 0; i < size; i++){
-        if (a[i] != b[i]) {
-          printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]);
-          pass = 1;
-        }
-      }
-      if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn");
-      return pass;
-    }
+    // Parallel loop from main function
+    #pragma omp parallel for
+    for (i=0; i<SIZE; i++)
+      res[i] = in1[i] + in2[i];
 
-    int main()
-    {
-      int i;
-      random_T(in1, SIZE);
-      random_T(in2, SIZE);
+    // or parallel loop is called inside the function
+    add_mic(in1, in2, res, SIZE);
 
-      #pragma offload target(mic) in(in1,in2) inout(res)
-      {
+  }
 
-        // Parallel loop from main function
-        #pragma omp parallel for
-        for (i=0; i<SIZE; i++)
-          res[i] = in1[i] + in2[i];
+  //Check the results with CPU implementation
+  T res_cpu[SIZE];
+  add_cpu(in1, in2, res_cpu, SIZE);
+  compare(res, res_cpu, SIZE);
 
-        // or parallel loop is called inside the function
-        add_mic(in1, in2, res, SIZE);
-
-      }
-
-      //Check the results with CPU implementation
-      T res_cpu[SIZE];
-      add_cpu(in1, in2, res_cpu, SIZE);
-      compare(res, res_cpu, SIZE);
-
-    }
+}
 ```
 
 During the compilation Intel compiler shows which loops have been vectorized in both host and accelerator. This can be enabled with compiler option "-vec-report2". To compile and execute the code run
@@ -271,61 +271,61 @@ $ module load intel
 Following example show how to automatically offload an SGEMM (single precision - general matrix multiply) function to MIC coprocessor.
 
 The code can be copied to a file and compiled without any necessary modification.
 
-```console
+```cpp
 $ vim sgemm-ao-short.c
 
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include <malloc.h>
-    #include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdint.h>
 
-    #include "mkl.h"
+#include "mkl.h"
 
-    int main(int argc, char **argv)
-    {
-        float *A, *B, *C; /* Matrices */
+int main(int argc, char **argv)
+{
+    float *A, *B, *C; /* Matrices */
 
-        MKL_INT N = 2560; /* Matrix dimensions */
-        MKL_INT LD = N; /* Leading dimension */
-        int matrix_bytes; /* Matrix size in bytes */
-        int matrix_elements; /* Matrix size in elements */
+    MKL_INT N = 2560; /* Matrix dimensions */
+    MKL_INT LD = N; /* Leading dimension */
+    int matrix_bytes; /* Matrix size in bytes */
+    int matrix_elements; /* Matrix size in elements */
 
-        float alpha = 1.0, beta = 1.0; /* Scaling factors */
-        char transa = 'N', transb = 'N'; /* Transposition options */
+    float alpha = 1.0, beta = 1.0; /* Scaling factors */
+    char transa = 'N', transb = 'N'; /* Transposition options */
 
-        int i, j; /* Counters */
+    int i, j; /* Counters */
 
-        matrix_elements = N * N;
-        matrix_bytes = sizeof(float) * matrix_elements;
+    matrix_elements = N * N;
+    matrix_bytes = sizeof(float) * matrix_elements;
 
-        /* Allocate the matrices */
-        A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+    /* Allocate the matrices */
+    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
 
-        /* Initialize the matrices */
-        for (i = 0; i < matrix_elements; i++) {
-            A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-        }
+    /* Initialize the matrices */
+    for (i = 0; i < matrix_elements; i++) {
+        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+    }
 
-        printf("Computing SGEMM on the hostn");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM on the host\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-        printf("Enabling Automatic Offloadn");
-        /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-        mkl_mic_enable();
+    printf("Enabling Automatic Offload\n");
+    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+    mkl_mic_enable();
 
-        int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-        printf("Automatic Offload enabled: %d MIC devices presentn", ndevices);
+    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);
 
-        printf("Computing SGEMM with automatic workdivisionn");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM with automatic workdivision\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-        /* Free the matrix memory */
-        free(A); free(B); free(C);
+    /* Free the matrix memory */
+    free(A); free(B); free(C);
 
-        printf("Donen");
+    printf("Done\n");
 
-        return 0;
-    }
+    return 0;
+}
 ```
 
 !!! note
@@ -573,28 +573,28 @@ $ mpiicc -mmic -o mpi-test-mic mpi-test.c
 An example of basic MPI version of "hello-world" example in C language, that can be executed on both host and Xeon Phi is (can be directly copy and pasted to a .c file)
 
 ```cpp
-    #include <stdio.h>
-    #include <mpi.h>
+#include <stdio.h>
+#include <mpi.h>
 
-    int main (argc, argv)
-      int argc;
-      char *argv[];
-    {
-      int rank, size;
+int main (argc, argv)
+    int argc;
+    char *argv[];
+{
+    int rank, size;
 
-      int len;
-      char node[MPI_MAX_PROCESSOR_NAME];
+    int len;
+    char node[MPI_MAX_PROCESSOR_NAME];
 
-      MPI_Init (&argc, &argv);                /* starts MPI */
-      MPI_Comm_rank (MPI_COMM_WORLD, &rank);  /* get current process id */
-      MPI_Comm_size (MPI_COMM_WORLD, &size);  /* get number of processes */
+    MPI_Init (&argc, &argv);                /* starts MPI */
+    MPI_Comm_rank (MPI_COMM_WORLD, &rank);  /* get current process id */
+    MPI_Comm_size (MPI_COMM_WORLD, &size);  /* get number of processes */
 
-      MPI_Get_processor_name(node,&len);
+    MPI_Get_processor_name(node,&len);
 
-      printf( "Hello world from process %d of %d on host %s n", rank, size, node );
-      MPI_Finalize();
-      return 0;
-    }
+    printf( "Hello world from process %d of %d on host %s\n", rank, size, node );
+    MPI_Finalize();
+    return 0;
+}
 ```
 
 ### MPI Programming Models
diff --git a/docs.it4i/salomon/software/intel-xeon-phi.md b/docs.it4i/software/intel-xeon-phi.md
similarity index 95%
rename from docs.it4i/salomon/software/intel-xeon-phi.md
rename to docs.it4i/software/intel-xeon-phi.md
index b2b6da255803c17e58bc3158fbb25c3617e7eeaf..f09e343ce7c02c194c8d1406cc374442d0be4249 100644
--- a/docs.it4i/salomon/software/intel-xeon-phi.md
+++ b/docs.it4i/software/intel-xeon-phi.md
@@ -154,7 +154,7 @@ export OFFLOAD_REPORT=3
 
 A very basic example of code that employs offload programming technique is shown in the next listing. Please note that this code is sequential and utilizes only single core of the accelerator.
 
-```console
+```cpp
 $ cat source-offload.cpp
 
 #include <iostream>
@@ -190,7 +190,7 @@ $ ./bin-offload
 
 One way of paralelization a code for Xeon Phi is using OpenMP directives. The following example shows code for parallel vector addition.
 
-```console
+```cpp
 $ cat ./vect-add
 
 #include <stdio.h>
@@ -317,7 +317,7 @@ $ ml intel
 
 The code can be copied to a file and compiled without any necessary modification.
 
-```console
+```cpp
 $ vim sgemm-ao-short.c
 
 #include <stdio.h>
@@ -329,46 +329,46 @@ $ vim sgemm-ao-short.c
 int main(int argc, char **argv)
 {
-    float *A, *B, *C; /* Matrices */
+    float *A, *B, *C; /* Matrices */
 
-    MKL_INT N = 2560; /* Matrix dimensions */
-    MKL_INT LD = N; /* Leading dimension */
-    int matrix_bytes; /* Matrix size in bytes */
-    int matrix_elements; /* Matrix size in elements */
+    MKL_INT N = 2560; /* Matrix dimensions */
+    MKL_INT LD = N; /* Leading dimension */
+    int matrix_bytes; /* Matrix size in bytes */
+    int matrix_elements; /* Matrix size in elements */
 
-    float alpha = 1.0, beta = 1.0; /* Scaling factors */
-    char transa = 'N', transb = 'N'; /* Transposition options */
+    float alpha = 1.0, beta = 1.0; /* Scaling factors */
+    char transa = 'N', transb = 'N'; /* Transposition options */
 
-    int i, j; /* Counters */
+    int i, j; /* Counters */
 
-    matrix_elements = N * N;
-    matrix_bytes = sizeof(float) * matrix_elements;
+    matrix_elements = N * N;
+    matrix_bytes = sizeof(float) * matrix_elements;
 
-    /* Allocate the matrices */
-    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+    /* Allocate the matrices */
+    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
 
-    /* Initialize the matrices */
-    for (i = 0; i < matrix_elements; i++) {
-        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-    }
+    /* Initialize the matrices */
+    for (i = 0; i < matrix_elements; i++) {
+        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+    }
 
-    printf("Computing SGEMM on the host\n");
-    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM on the host\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-    printf("Enabling Automatic Offload\n");
-    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-    mkl_mic_enable();
+    printf("Enabling Automatic Offload\n");
+    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+    mkl_mic_enable();
 
-    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);
+    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);
 
-    printf("Computing SGEMM with automatic workdivision\n");
-    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+    printf("Computing SGEMM with automatic workdivision\n");
+    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
 
-    /* Free the matrix memory */
-    free(A); free(B); free(C);
+    /* Free the matrix memory */
+    free(A); free(B); free(C);
 
-    printf("Done\n");
+    printf("Done\n");
 
     return 0;
 }
diff --git a/docs.it4i/software/numerical-languages/octave.md b/docs.it4i/software/numerical-languages/octave.md
index 4d96754cc4eccec854c1d5c88d8b43161e65e0d4..ca785e75dca4e83cccbdf25b68800363f33a841b 100644
--- a/docs.it4i/software/numerical-languages/octave.md
+++ b/docs.it4i/software/numerical-languages/octave.md
@@ -60,7 +60,7 @@ Octave may use MPI for interprocess communication This functionality is currentl
 
 ## Xeon Phi Support
 
-Octave may take advantage of the Xeon Phi accelerators. This will only work on the [Intel Xeon Phi](../../salomon/software/intel-xeon-phi/) [accelerated nodes](../../salomon/compute-nodes/).
+Octave may take advantage of the Xeon Phi accelerators. This will only work on the [Intel Xeon Phi](../intel-xeon-phi/) [accelerated nodes](../../salomon/compute-nodes/).
 
 ### Automatic Offload Support
 
diff --git a/docs.it4i/software/numerical-languages/r.md b/docs.it4i/software/numerical-languages/r.md
index 771c3efd53bda41adb21f0e553d51ad14d5a9941..3322a89acbf62cde753cfc57adf36a001d986148 100644
--- a/docs.it4i/software/numerical-languages/r.md
+++ b/docs.it4i/software/numerical-languages/r.md
@@ -402,4 +402,4 @@ By leveraging MKL, R can accelerate certain computations, most notably linear al
 $ export MKL_MIC_ENABLE=1
 ```
 
-[Read more about automatic offload](../../anselm/software/intel-xeon-phi/)
+[Read more about automatic offload](../intel-xeon-phi/)
diff --git a/mkdocs.yml b/mkdocs.yml
index 2db0d019d3dea4f9445245801f232ccd68c954ab..1188f3408c0a2cbbedde2ae4e67d3d69ddee83e9 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -113,6 +113,9 @@ pages:
       - Intel MKL: software/intel-suite/intel-mkl.md
       - Intel TBB: software/intel-suite/intel-tbb.md
       - Intel Trace Analyzer and Collector: software/intel-suite/intel-trace-analyzer-and-collector.md
+    - 'Intel Xeon Phi':
+      - Intel Xeon Phi Salomon: software/intel-xeon-phi.md
+      - Intel Xeon Phi Anselm: software/intel-xeon-phi.anselm.md
     - ISV Licenses: software/isv_licenses.md
     - Java: software/java.md
    - 'Machine larning':
@@ -145,10 +148,7 @@ pages:
     - OpenFOAM: software/openfoam.md
     - Operating System: software/operating-system.md
     - ParaView: software/paraview.md
-    - Salomon Software:
-      - Intel Xeon Phi: salomon/software/intel-xeon-phi.md
     - Anselm Software:
-      - Intel Xeon Phi: anselm/software/intel-xeon-phi.md
      - NVIDIA CUDA: anselm/software/nvidia-cuda.md
      - Virtualization: anselm/software/virtualization.md
   - PBS Pro Documentation: pbspro.md
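
Both renamed Xeon Phi pages, as well as the R and Octave sections touched above, rely on MKL Automatic Offload. As a quick orientation, a minimal shell sketch of that workflow, using only the module and environment variables quoted in the listings above (the binary name is purely illustrative), could look like:

```console
$ ml intel                 # Intel compilers and MKL, as loaded in the examples above
$ export MKL_MIC_ENABLE=1  # let MKL offload suitable calls (e.g. SGEMM) to the Xeon Phi
$ export OFFLOAD_REPORT=2  # print a per-call report of the host/MIC work division
$ ./sgemm-ao-short         # illustrative name for a binary built from sgemm-ao-short.c
```

No source changes are required for Automatic Offload; the same binary simply runs host-only when MKL_MIC_ENABLE is unset or no MIC device is present.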