diff --git a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md index 8ea1ba38a2798804b67e164d9e97eeed623bb5d8..80e76efecfacbe0a7158a13c0ca4658a695053ff 100644 --- a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md +++ b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md @@ -19,7 +19,7 @@ To set up the environment module "Intel" has to be loaded $ module load intel/13.5.192 ``` -Information about the hardware can be obtained by running the micinfo program on the host. +Information about the hardware can be obtained by running the micinfo program on the host. ```bash $ /usr/bin/micinfo @@ -32,61 +32,61 @@ The output of the "micinfo" utility executed on one of the Anselm node is as fol Created Mon Jul 22 00:23:50 2013 - System Info - HOST OS : Linux - OS Version : 2.6.32-279.5.2.bl6.Bull.33.x86_64 - Driver Version : 6720-15 - MPSS Version : 2.1.6720-15 - Host Physical Memory : 98843 MB + System Info + HOST OS : Linux + OS Version : 2.6.32-279.5.2.bl6.Bull.33.x86_64 + Driver Version : 6720-15 + MPSS Version : 2.1.6720-15 + Host Physical Memory : 98843 MB Device No: 0, Device Name: mic0 - Version - Flash Version : 2.1.03.0386 - SMC Firmware Version : 1.15.4830 - SMC Boot Loader Version : 1.8.4326 - uOS Version : 2.6.38.8-g2593b11 - Device Serial Number : ADKC30102482 - - Board - Vendor ID : 0x8086 - Device ID : 0x2250 - Subsystem ID : 0x2500 - Coprocessor Stepping ID : 3 - PCIe Width : x16 - PCIe Speed : 5 GT/s - PCIe Max payload size : 256 bytes - PCIe Max read req size : 512 bytes - Coprocessor Model : 0x01 - Coprocessor Model Ext : 0x00 - Coprocessor Type : 0x00 - Coprocessor Family : 0x0b - Coprocessor Family Ext : 0x00 - Coprocessor Stepping : B1 - Board SKU : B1PRQ-5110P/5120D - ECC Mode : Enabled - SMC HW Revision : Product 225W Passive CS - - Cores - Total No of Active Cores : 60 - Voltage : 1032000 uV - Frequency : 1052631 kHz - - Thermal - Fan Speed Control : N/A - Fan RPM : N/A - Fan PWM : N/A - Die Temp : 49 C - - GDDR - GDDR Vendor : Elpida - GDDR Version : 0x1 - GDDR Density : 2048 Mb - GDDR Size : 7936 MB - GDDR Technology : GDDR5 - GDDR Speed : 5.000000 GT/s - GDDR Frequency : 2500000 kHz - GDDR Voltage : 1501000 uV + Version + Flash Version : 2.1.03.0386 + SMC Firmware Version : 1.15.4830 + SMC Boot Loader Version : 1.8.4326 + uOS Version : 2.6.38.8-g2593b11 + Device Serial Number : ADKC30102482 + + Board + Vendor ID : 0x8086 + Device ID : 0x2250 + Subsystem ID : 0x2500 + Coprocessor Stepping ID : 3 + PCIe Width : x16 + PCIe Speed : 5 GT/s + PCIe Max payload size : 256 bytes + PCIe Max read req size : 512 bytes + Coprocessor Model : 0x01 + Coprocessor Model Ext : 0x00 + Coprocessor Type : 0x00 + Coprocessor Family : 0x0b + Coprocessor Family Ext : 0x00 + Coprocessor Stepping : B1 + Board SKU : B1PRQ-5110P/5120D + ECC Mode : Enabled + SMC HW Revision : Product 225W Passive CS + + Cores + Total No of Active Cores : 60 + Voltage : 1032000 uV + Frequency : 1052631 kHz + + Thermal + Fan Speed Control : N/A + Fan RPM : N/A + Fan PWM : N/A + Die Temp : 49 C + + GDDR + GDDR Vendor : Elpida + GDDR Version : 0x1 + GDDR Density : 2048 Mb + GDDR Size : 7936 MB + GDDR Technology : GDDR5 + GDDR Speed : 5.000000 GT/s + GDDR Frequency : 2500000 kHz + GDDR Voltage : 1501000 uV ``` Offload Mode @@ -113,16 +113,16 @@ A very basic example of code that employs offload programming technique is shown int main(int argc, char* argv[]) { - const int niter = 100000; - double result = 0; - - 
#pragma offload target(mic) - for (int i = 0; i < niter; ++i) { - const double t = (i + 0.5) / niter; - result += 4.0 / (t * t + 1.0); - } - result /= niter; - std::cout << "Pi ~ " << result << 'n'; + const int niter = 100000; + double result = 0; + + #pragma offload target(mic) + for (int i = 0; i < niter; ++i) { + const double t = (i + 0.5) / niter; + result += 4.0 / (t * t + 1.0); + } + result /= niter; + std::cout << "Pi ~ " << result << 'n'; } ``` @@ -159,63 +159,63 @@ One way of paralelization a code for Xeon Phi is using OpenMP directives. The fo // MIC function to add two vectors __attribute__((target(mic))) add_mic(T *a, T *b, T *c, int size) { - int i = 0; - #pragma omp parallel for - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; + int i = 0; + #pragma omp parallel for + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; } // CPU function to add two vectors void add_cpu (T *a, T *b, T *c, int size) { - int i; - for (i = 0; i < size; i++) - c[i] = a[i] + b[i]; + int i; + for (i = 0; i < size; i++) + c[i] = a[i] + b[i]; } // CPU function to generate a vector of random numbers void random_T (T *a, int size) { - int i; - for (i = 0; i < size; i++) - a[i] = rand() % 10000; // random number between 0 and 9999 + int i; + for (i = 0; i < size; i++) + a[i] = rand() % 10000; // random number between 0 and 9999 } // CPU function to compare two vectors int compare(T *a, T *b, T size ){ - int pass = 0; - int i; - for (i = 0; i < size; i++){ - if (a[i] != b[i]) { - printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); - pass = 1; - } - } - if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); - return pass; + int pass = 0; + int i; + for (i = 0; i < size; i++){ + if (a[i] != b[i]) { + printf("Value mismatch at location %d, values %d and %dn",i, a[i], b[i]); + pass = 1; + } + } + if (pass == 0) printf ("Test passedn"); else printf ("Test Failedn"); + return pass; } int main() { - int i; - random_T(in1, SIZE); - random_T(in2, SIZE); + int i; + random_T(in1, SIZE); + random_T(in2, SIZE); - #pragma offload target(mic) in(in1,in2) inout(res) - { + #pragma offload target(mic) in(in1,in2) inout(res) + { - // Parallel loop from main function - #pragma omp parallel for - for (i=0; i<SIZE; i++) - res[i] = in1[i] + in2[i]; + // Parallel loop from main function + #pragma omp parallel for + for (i=0; i<SIZE; i++) + res[i] = in1[i] + in2[i]; - // or parallel loop is called inside the function - add_mic(in1, in2, res, SIZE); + // or parallel loop is called inside the function + add_mic(in1, in2, res, SIZE); - } + } - //Check the results with CPU implementation - T res_cpu[SIZE]; - add_cpu(in1, in2, res_cpu, SIZE); - compare(res, res_cpu, SIZE); + //Check the results with CPU implementation + T res_cpu[SIZE]; + add_cpu(in1, in2, res_cpu, SIZE); + compare(res, res_cpu, SIZE); } ``` @@ -282,48 +282,48 @@ Following example show how to automatically offload an SGEMM (single precision - int main(int argc, char **argv) { - float *A, *B, *C; /* Matrices */ + float *A, *B, *C; /* Matrices */ - MKL_INT N = 2560; /* Matrix dimensions */ - MKL_INT LD = N; /* Leading dimension */ - int matrix_bytes; /* Matrix size in bytes */ - int matrix_elements; /* Matrix size in elements */ + MKL_INT N = 2560; /* Matrix dimensions */ + MKL_INT LD = N; /* Leading dimension */ + int matrix_bytes; /* Matrix size in bytes */ + int matrix_elements; /* Matrix size in elements */ - float alpha = 1.0, beta = 1.0; /* Scaling factors */ - char transa = 'N', transb = 'N'; /* Transposition options */ + float 
alpha = 1.0, beta = 1.0; /* Scaling factors */ + char transa = 'N', transb = 'N'; /* Transposition options */ - int i, j; /* Counters */ + int i, j; /* Counters */ - matrix_elements = N * N; - matrix_bytes = sizeof(float) * matrix_elements; + matrix_elements = N * N; + matrix_bytes = sizeof(float) * matrix_elements; - /* Allocate the matrices */ - A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); + /* Allocate the matrices */ + A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes); - /* Initialize the matrices */ - for (i = 0; i < matrix_elements; i++) { - A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; - } + /* Initialize the matrices */ + for (i = 0; i < matrix_elements; i++) { + A[i] = 1.0; B[i] = 2.0; C[i] = 0.0; + } - printf("Computing SGEMM on the hostn"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + printf("Computing SGEMM on the hostn"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - printf("Enabling Automatic Offloadn"); - /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ - mkl_mic_enable(); + printf("Enabling Automatic Offloadn"); + /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */ + mkl_mic_enable(); - int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ - printf("Automatic Offload enabled: %d MIC devices presentn", ndevices); + int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */ + printf("Automatic Offload enabled: %d MIC devices presentn", ndevices); - printf("Computing SGEMM with automatic workdivisionn"); - sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); + printf("Computing SGEMM with automatic workdivisionn"); + sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N); - /* Free the matrix memory */ - free(A); free(B); free(C); + /* Free the matrix memory */ + free(A); free(B); free(C); - printf("Donen"); + printf("Donen"); - return 0; + return 0; } ``` @@ -349,10 +349,10 @@ The output of a code should look similar to following listing, where lines start Enabling Automatic Offload Automatic Offload enabled: 1 MIC devices present Computing SGEMM with automatic workdivision - [MKL] [MIC --] [AO Function] SGEMM - [MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00 - [MKL] [MIC 00] [AO SGEMM CPU Time] 0.463351 seconds - [MKL] [MIC 00] [AO SGEMM MIC Time] 0.179608 seconds + [MKL] [MIC --] [AO Function] SGEMM + [MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00 + [MKL] [MIC 00] [AO SGEMM CPU Time] 0.463351 seconds + [MKL] [MIC 00] [AO SGEMM MIC Time] 0.179608 seconds [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 52428800 bytes [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 26214400 bytes Done @@ -479,23 +479,23 @@ After executing the complied binary file, following output should be displayed. Number of available platforms: 1 Platform names: - [0] Intel(R) OpenCL [Selected] + [0] Intel(R) OpenCL [Selected] Number of devices available for each type: - CL_DEVICE_TYPE_CPU: 1 - CL_DEVICE_TYPE_GPU: 0 - CL_DEVICE_TYPE_ACCELERATOR: 1 + CL_DEVICE_TYPE_CPU: 1 + CL_DEVICE_TYPE_GPU: 0 + CL_DEVICE_TYPE_ACCELERATOR: 1 ** Detailed information for each device *** CL_DEVICE_TYPE_CPU[0] - CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz - CL_DEVICE_AVAILABLE: 1 + CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz + CL_DEVICE_AVAILABLE: 1 ... 
CL_DEVICE_TYPE_ACCELERATOR[0] - CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card - CL_DEVICE_AVAILABLE: 1 + CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card + CL_DEVICE_AVAILABLE: 1 ... ``` @@ -579,23 +579,23 @@ An example of basic MPI version of "hello-world" example in C language, that can #include <mpi.h> int main (argc, argv) - int argc; - char *argv[]; + int argc; + char *argv[]; { - int rank, size; + int rank, size; - int len; - char node[MPI_MAX_PROCESSOR_NAME]; + int len; + char node[MPI_MAX_PROCESSOR_NAME]; - MPI_Init (&argc, &argv); /* starts MPI */ - MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ - MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ + MPI_Init (&argc, &argv); /* starts MPI */ + MPI_Comm_rank (MPI_COMM_WORLD, &rank); /* get current process id */ + MPI_Comm_size (MPI_COMM_WORLD, &size); /* get number of processes */ - MPI_Get_processor_name(node,&len); + MPI_Get_processor_name(node,&len); - printf( "Hello world from process %d of %d on host %s n", rank, size, node ); - MPI_Finalize(); - return 0; + printf( "Hello world from process %d of %d on host %s n", rank, size, node ); + MPI_Finalize(); + return 0; } ``` @@ -722,8 +722,8 @@ The output should be again similar to: Please note that the **"mpiexec.hydra"** requires a file the MIC filesystem. If the file is missing please contact the system administrators. A simple test to see if the file is present is to execute: ```bash - $ ssh mic0 ls /bin/pmi_proxy - /bin/pmi_proxy + $ ssh mic0 ls /bin/pmi_proxy + /bin/pmi_proxy ``` **Execution on host - MPI processes distributed over multiple accelerators on multiple nodes** @@ -769,8 +769,8 @@ The launch the MPI program use: ```bash $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -genv I_MPI_FABRICS_LIST tcp - -genv I_MPI_FABRICS shm:tcp - -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_FABRICS shm:tcp + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -host cn204-mic0 -n 4 ~/mpi-test-mic : -host cn205-mic0 -n 6 ~/mpi-test-mic ``` @@ -779,8 +779,8 @@ or using mpirun: ```bash $ mpirun -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -genv I_MPI_FABRICS_LIST tcp - -genv I_MPI_FABRICS shm:tcp - -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_FABRICS shm:tcp + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -host cn204-mic0 -n 4 ~/mpi-test-mic : -host cn205-mic0 -n 6 ~/mpi-test-mic ``` @@ -805,8 +805,8 @@ The same way MPI program can be executed on multiple hosts: ```bash $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -genv I_MPI_FABRICS_LIST tcp - -genv I_MPI_FABRICS shm:tcp - -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_FABRICS shm:tcp + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -host cn204 -n 4 ~/mpi-test : -host cn205 -n 6 ~/mpi-test ``` @@ -822,7 +822,7 @@ In the previous section we have compiled two binary files, one for hosts "**mpi- $ mpiexec.hydra -genv I_MPI_FABRICS_LIST tcp -genv I_MPI_FABRICS shm:tcp - -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -host cn205 -n 2 ~/mpi-test : -host cn205-mic0 -n 2 ~/mpi-test-mic @@ -851,7 +851,7 @@ An example of a machine file that uses 2 >hosts (**cn205** and **cn206**) and 2 cn206-mic0:2 ``` -In addition if a naming convention is set in a way that the name of the binary for host is **"bin_name"** and the name of the binary for the accelerator is **"bin_name-mic"** then by setting up the environment variable **I_MPI_MIC_POSTFIX** to 
**"-mic"** user do not have to specify the names of booth binaries. In this case mpirun needs just the name of the host binary file (i.e. "mpi-test") and uses the suffix to get a name of the binary for accelerator (i..e. "mpi-test-mic"). +In addition if a naming convention is set in a way that the name of the binary for host is **"bin_name"** and the name of the binary for the accelerator is **"bin_name-mic"** then by setting up the environment variable **I_MPI_MIC_POSTFIX** to **"-mic"** user do not have to specify the names of booth binaries. In this case mpirun needs just the name of the host binary file (i.e. "mpi-test") and uses the suffix to get a name of the binary for accelerator (i..e. "mpi-test-mic"). ```bash $ export I_MPI_MIC_POSTFIX=-mic @@ -864,8 +864,8 @@ To run the MPI code using mpirun and the machine file "hosts_file_mix" use: -genv I_MPI_FABRICS shm:tcp -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -genv I_MPI_FABRICS_LIST tcp - -genv I_MPI_FABRICS shm:tcp - -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_FABRICS shm:tcp + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -machinefile hosts_file_mix ~/mpi-test ``` @@ -901,4 +901,4 @@ Please note each host or accelerator is listed only per files. User has to speci Optimization ------------ -For more details about optimization techniques please read Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization") +For more details about optimization techniques please read Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization")