From 5c20d591a4316b5befad14ff3a2a2bde6bf480c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Krup=C4=8D=C3=ADk?= <lukas.krupcik@vsb.cz> Date: Fri, 20 Jan 2017 09:38:59 +0100 Subject: [PATCH] test --- .../software/intel-xeon-phi.md | 330 +++++++++--------- 1 file changed, 165 insertions(+), 165 deletions(-) diff --git a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md index 8ea1ba38a..80e76efec 100644 --- a/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md +++ b/docs.it4i/anselm-cluster-documentation/software/intel-xeon-phi.md @@ -19,7 +19,7 @@ To set up the environment module "Intel" has to be loaded $ module load intel/13.5.192 ``` -Information about the hardware can be obtained by running the micinfo program on the host. +Information about the hardware can be obtained by running the micinfo program on the host. ```bash $ /usr/bin/micinfo @@ -32,61 +32,61 @@ The output of the "micinfo" utility executed on one of the Anselm node is as fol Created Mon Jul 22 00:23:50 2013 -        System Info -                HOST OS                : Linux -                OS Version             : 2.6.32-279.5.2.bl6.Bull.33.x86_64 -                Driver Version         : 6720-15 -                MPSS Version           : 2.1.6720-15 -                Host Physical Memory   : 98843 MB + System Info + HOST OS : Linux + OS Version : 2.6.32-279.5.2.bl6.Bull.33.x86_64 + Driver Version : 6720-15 + MPSS Version : 2.1.6720-15 + Host Physical Memory : 98843 MB Device No: 0, Device Name: mic0 -        Version -                Flash Version           : 2.1.03.0386 -                SMC Firmware Version    : 1.15.4830 -                SMC Boot Loader Version : 1.8.4326 -                uOS Version             : 2.6.38.8-g2593b11 -                Device Serial Number    : ADKC30102482 - -        Board -                Vendor ID               : 0x8086 -                Device ID               : 0x2250 -                Subsystem ID            : 0x2500 -                Coprocessor Stepping ID : 3 -                PCIe Width              : x16 -                PCIe Speed              : 5 GT/s -                PCIe Max payload size   : 256 bytes -                PCIe Max read req size  : 512 bytes -                Coprocessor Model       : 0x01 -                Coprocessor Model Ext   : 0x00 -                Coprocessor Type        : 0x00 -                Coprocessor Family      : 0x0b -                Coprocessor Family Ext  : 0x00 -                Coprocessor Stepping    : B1 -                Board SKU               : B1PRQ-5110P/5120D -                ECC Mode                : Enabled -                SMC HW Revision         : Product 225W Passive CS - -        Cores -                Total No of Active Cores : 60 -                Voltage                 : 1032000 uV -                Frequency               : 1052631 kHz - -        Thermal -                Fan Speed Control       : N/A -                Fan RPM                 : N/A -                Fan PWM                 : N/A -                Die Temp                : 49 C - -        GDDR -                GDDR Vendor             : Elpida -                GDDR Version            : 0x1 -                GDDR Density            : 2048 Mb -                GDDR Size               : 7936 MB -                GDDR Technology         : GDDR5 -                GDDR Speed              : 5.000000 GT/s -                GDDR Frequency          : 2500000 kHz -                GDDR 
Voltage            : 1501000 uV
+ Version
+ Flash Version : 2.1.03.0386
+ SMC Firmware Version : 1.15.4830
+ SMC Boot Loader Version : 1.8.4326
+ uOS Version : 2.6.38.8-g2593b11
+ Device Serial Number : ADKC30102482
+
+ Board
+ Vendor ID : 0x8086
+ Device ID : 0x2250
+ Subsystem ID : 0x2500
+ Coprocessor Stepping ID : 3
+ PCIe Width : x16
+ PCIe Speed : 5 GT/s
+ PCIe Max payload size : 256 bytes
+ PCIe Max read req size : 512 bytes
+ Coprocessor Model : 0x01
+ Coprocessor Model Ext : 0x00
+ Coprocessor Type : 0x00
+ Coprocessor Family : 0x0b
+ Coprocessor Family Ext : 0x00
+ Coprocessor Stepping : B1
+ Board SKU : B1PRQ-5110P/5120D
+ ECC Mode : Enabled
+ SMC HW Revision : Product 225W Passive CS
+
+ Cores
+ Total No of Active Cores : 60
+ Voltage : 1032000 uV
+ Frequency : 1052631 kHz
+
+ Thermal
+ Fan Speed Control : N/A
+ Fan RPM : N/A
+ Fan PWM : N/A
+ Die Temp : 49 C
+
+ GDDR
+ GDDR Vendor : Elpida
+ GDDR Version : 0x1
+ GDDR Density : 2048 Mb
+ GDDR Size : 7936 MB
+ GDDR Technology : GDDR5
+ GDDR Speed : 5.000000 GT/s
+ GDDR Frequency : 2500000 kHz
+ GDDR Voltage : 1501000 uV
 ```

 Offload Mode
@@ -113,16 +113,16 @@ A very basic example of code that employs offload programming technique is shown

 int main(int argc, char* argv[])
 {
-    const int niter = 100000;
-    double result = 0;
-
-  #pragma offload target(mic)
-    for (int i = 0; i < niter; ++i) {
-        const double t = (i + 0.5) / niter;
-        result += 4.0 / (t * t + 1.0);
-    }
-    result /= niter;
-    std::cout << "Pi ~ " << result << '\n';
+    const int niter = 100000;
+    double result = 0;
+
+    #pragma offload target(mic)
+    for (int i = 0; i < niter; ++i) {
+        const double t = (i + 0.5) / niter;
+        result += 4.0 / (t * t + 1.0);
+    }
+    result /= niter;
+    std::cout << "Pi ~ " << result << '\n';
 }
 ```
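+The offload pragma above blocks until the coprocessor finishes. As a minimal sketch of the asynchronous variant, assuming the Intel compiler's signal and offload_wait clauses and a single coprocessor (device 0), the host can overlap its own work with the offloaded loop:
+
+```c
+#include <stdio.h>
+
+static char sig; /* used only as an address tag pairing the offload with its wait */
+
+int main()
+{
+    const int niter = 100000;
+    double result = 0;
+
+    /* start the loop on the coprocessor without blocking the host */
+    #pragma offload target(mic:0) signal(&sig)
+    {
+        for (int i = 0; i < niter; ++i) {
+            const double t = (i + 0.5) / niter;
+            result += 4.0 / (t * t + 1.0);
+        }
+    }
+
+    /* ... independent host work can run here ... */
+
+    /* block until the offloaded region has finished */
+    #pragma offload_wait target(mic:0) wait(&sig)
+
+    printf("Pi ~ %f\n", result / niter);
+    return 0;
+}
+```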
@@ -159,63 +159,63 @@ One way of parallelizing a code for Xeon Phi is using OpenMP directives. The fo

 // MIC function to add two vectors
 __attribute__((target(mic))) void add_mic(T *a, T *b, T *c, int size) {
-  int i = 0;
-  #pragma omp parallel for
-    for (i = 0; i < size; i++)
-      c[i] = a[i] + b[i];
+  int i = 0;
+  #pragma omp parallel for
+  for (i = 0; i < size; i++)
+    c[i] = a[i] + b[i];
 }

 // CPU function to add two vectors
 void add_cpu (T *a, T *b, T *c, int size) {
-  int i;
-  for (i = 0; i < size; i++)
-    c[i] = a[i] + b[i];
+  int i;
+  for (i = 0; i < size; i++)
+    c[i] = a[i] + b[i];
 }

 // CPU function to generate a vector of random numbers
 void random_T (T *a, int size) {
-  int i;
-  for (i = 0; i < size; i++)
-    a[i] = rand() % 10000; // random number between 0 and 9999
+  int i;
+  for (i = 0; i < size; i++)
+    a[i] = rand() % 10000; // random number between 0 and 9999
 }

 // CPU function to compare two vectors
 int compare(T *a, T *b, T size ){
-  int pass = 0;
-  int i;
-  for (i = 0; i < size; i++){
-    if (a[i] != b[i]) {
-      printf("Value mismatch at location %d, values %d and %d\n",i, a[i], b[i]);
-      pass = 1;
-    }
-  }
-  if (pass == 0) printf ("Test passed\n"); else printf ("Test Failed\n");
-  return pass;
+  int pass = 0;
+  int i;
+  for (i = 0; i < size; i++){
+    if (a[i] != b[i]) {
+      printf("Value mismatch at location %d, values %d and %d\n",i, a[i], b[i]);
+      pass = 1;
+    }
+  }
+  if (pass == 0) printf ("Test passed\n"); else printf ("Test Failed\n");
+  return pass;
 }

 int main()
 {
-  int i;
-  random_T(in1, SIZE);
-  random_T(in2, SIZE);
+  int i;
+  random_T(in1, SIZE);
+  random_T(in2, SIZE);

-  #pragma offload target(mic) in(in1,in2) inout(res)
-  {
+  #pragma offload target(mic) in(in1,in2) inout(res)
+  {

-    // Parallel loop from main function
-    #pragma omp parallel for
-    for (i=0; i<SIZE; i++)
-      res[i] = in1[i] + in2[i];
+    // Parallel loop from main function
+    #pragma omp parallel for
+    for (i=0; i<SIZE; i++)
+      res[i] = in1[i] + in2[i];

-    // or parallel loop is called inside the function
-    add_mic(in1, in2, res, SIZE);
+    // or the parallel loop is called inside the function
+    add_mic(in1, in2, res, SIZE);

-  }
+  }

-  //Check the results with CPU implementation
-  T res_cpu[SIZE];
-  add_cpu(in1, in2, res_cpu, SIZE);
-  compare(res, res_cpu, SIZE);
+  // Check the results against the CPU implementation
+  T res_cpu[SIZE];
+  add_cpu(in1, in2, res_cpu, SIZE);
+  compare(res, res_cpu, SIZE);
 }
 ```
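+By default, the offloaded OpenMP region starts as many threads as the coprocessor runtime offers. As a quick sanity check, here is a minimal sketch (not part of the original example) that queries the thread count actually seen inside an offload region:
+
+```c
+#include <stdio.h>
+#include <omp.h>
+
+int main()
+{
+    int mic_threads = 0;
+
+    /* run a parallel region on the coprocessor and copy the
+       observed thread count back to the host */
+    #pragma offload target(mic)
+    {
+        #pragma omp parallel
+        {
+            #pragma omp master
+            mic_threads = omp_get_num_threads();
+        }
+    }
+
+    printf("OpenMP threads on the coprocessor: %d\n", mic_threads);
+    return 0;
+}
+```
+
+Assuming the Intel offload runtime conventions, the count can be steered from the host side by exporting MIC_ENV_PREFIX=MIC together with MIC_OMP_NUM_THREADS.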
@@ -282,48 +282,48 @@ The following example shows how to automatically offload an SGEMM (single precision -

 int main(int argc, char **argv)
 {
-        float *A, *B, *C; /* Matrices */
+        float *A, *B, *C; /* Matrices */

-        MKL_INT N = 2560; /* Matrix dimensions */
-        MKL_INT LD = N; /* Leading dimension */
-        int matrix_bytes; /* Matrix size in bytes */
-        int matrix_elements; /* Matrix size in elements */
+        MKL_INT N = 2560; /* Matrix dimensions */
+        MKL_INT LD = N; /* Leading dimension */
+        int matrix_bytes; /* Matrix size in bytes */
+        int matrix_elements; /* Matrix size in elements */

-        float alpha = 1.0, beta = 1.0; /* Scaling factors */
-        char transa = 'N', transb = 'N'; /* Transposition options */
+        float alpha = 1.0, beta = 1.0; /* Scaling factors */
+        char transa = 'N', transb = 'N'; /* Transposition options */

-        int i, j; /* Counters */
+        int i, j; /* Counters */

-        matrix_elements = N * N;
-        matrix_bytes = sizeof(float) * matrix_elements;
+        matrix_elements = N * N;
+        matrix_bytes = sizeof(float) * matrix_elements;

-        /* Allocate the matrices */
-        A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);
+        /* Allocate the matrices */
+        A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);

-        /* Initialize the matrices */
-        for (i = 0; i < matrix_elements; i++) {
-                A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
-        }
+        /* Initialize the matrices */
+        for (i = 0; i < matrix_elements; i++) {
+                A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
+        }

-        printf("Computing SGEMM on the host\n");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+        printf("Computing SGEMM on the host\n");
+        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

-        printf("Enabling Automatic Offload\n");
-        /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
-        mkl_mic_enable();
+        printf("Enabling Automatic Offload\n");
+        /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
+        mkl_mic_enable();

-        int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
-        printf("Automatic Offload enabled: %d MIC devices present\n",  ndevices);
+        int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
+        printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);

-        printf("Computing SGEMM with automatic workdivision\n");
-        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);
+        printf("Computing SGEMM with automatic workdivision\n");
+        sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

-        /* Free the matrix memory */
-        free(A); free(B); free(C);
+        /* Free the matrix memory */
+        free(A); free(B); free(C);

-        printf("Done\n");
+        printf("Done\n");

-    return 0;
+    return 0;
 }
 ```

@@ -349,10 +349,10 @@ The output of a code should look similar to the following listing, where lines start
 Enabling Automatic Offload
 Automatic Offload enabled: 1 MIC devices present
 Computing SGEMM with automatic workdivision
- [MKL] [MIC --] [AO Function]   SGEMM
- [MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00
- [MKL] [MIC 00] [AO SGEMM CPU Time]     0.463351 seconds
- [MKL] [MIC 00] [AO SGEMM MIC Time]     0.179608 seconds
+ [MKL] [MIC --] [AO Function] SGEMM
+ [MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00
+ [MKL] [MIC 00] [AO SGEMM CPU Time] 0.463351 seconds
+ [MKL] [MIC 00] [AO SGEMM MIC Time] 0.179608 seconds
 [MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 52428800 bytes
 [MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 26214400 bytes
 Done

@@ -479,23 +479,23 @@ After executing the compiled binary file, the following output should be displayed.

 Number of available platforms: 1
 Platform names:
-    [0] Intel(R) OpenCL [Selected]
+    [0] Intel(R) OpenCL [Selected]
 Number of devices available for each type:
-    CL_DEVICE_TYPE_CPU: 1
-    CL_DEVICE_TYPE_GPU: 0
-    CL_DEVICE_TYPE_ACCELERATOR: 1
+    CL_DEVICE_TYPE_CPU: 1
+    CL_DEVICE_TYPE_GPU: 0
+    CL_DEVICE_TYPE_ACCELERATOR: 1

 ** Detailed information for each device ***

 CL_DEVICE_TYPE_CPU[0]
-    CL_DEVICE_NAME:       Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
-    CL_DEVICE_AVAILABLE: 1
+    CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
+    CL_DEVICE_AVAILABLE: 1
 ...

 CL_DEVICE_TYPE_ACCELERATOR[0]
-    CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
-    CL_DEVICE_AVAILABLE: 1
+    CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
+    CL_DEVICE_AVAILABLE: 1
 ...
 ```
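+Since the coprocessor is exposed as a standard OpenCL device, it can be selected with the usual platform and device queries. A minimal sketch using the plain OpenCL 1.x API (error handling reduced to the essentials) that picks the first CL_DEVICE_TYPE_ACCELERATOR device:
+
+```c
+#include <stdio.h>
+#include <CL/cl.h>
+
+int main()
+{
+    cl_platform_id platform;
+    cl_device_id device;
+    char name[128];
+
+    /* take the first available platform */
+    clGetPlatformIDs(1, &platform, NULL);
+
+    /* ask specifically for an accelerator device (the Xeon Phi) */
+    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1,
+                       &device, NULL) != CL_SUCCESS) {
+        fprintf(stderr, "No accelerator device found\n");
+        return 1;
+    }
+
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
+    printf("Selected device: %s\n", name);
+    return 0;
+}
+```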
@@ -579,23 +579,23 @@ An example of a basic MPI version of the "hello-world" example in C language, that can

 #include <mpi.h>

 int main (argc, argv)
-     int argc;
-     char *argv[];
+     int argc;
+     char *argv[];
 {
-  int rank, size;
+  int rank, size;

-  int len;
-  char node[MPI_MAX_PROCESSOR_NAME];
+  int len;
+  char node[MPI_MAX_PROCESSOR_NAME];

-  MPI_Init (&argc, &argv);     /* starts MPI */
-  MPI_Comm_rank (MPI_COMM_WORLD, &rank);       /* get current process id */
-  MPI_Comm_size (MPI_COMM_WORLD, &size);       /* get number of processes */
+  MPI_Init (&argc, &argv);     /* starts MPI */
+  MPI_Comm_rank (MPI_COMM_WORLD, &rank);       /* get current process id */
+  MPI_Comm_size (MPI_COMM_WORLD, &size);       /* get number of processes */

-  MPI_Get_processor_name(node,&len);
+  MPI_Get_processor_name(node, &len);

-  printf( "Hello world from process %d of %d on host %s \n", rank, size, node );
-  MPI_Finalize();
-  return 0;
+  printf( "Hello world from process %d of %d on host %s \n", rank, size, node );
+  MPI_Finalize();
+  return 0;
 }
 ```

@@ -722,8 +722,8 @@ The output should again be similar to:

 Please note that the **"mpiexec.hydra"** requires a file on the MIC filesystem. If the file is missing, please contact the system administrators. A simple test to see if the file is present is to execute:

```bash
-   $ ssh mic0 ls /bin/pmi_proxy
-  /bin/pmi_proxy
+ $ ssh mic0 ls /bin/pmi_proxy
+ /bin/pmi_proxy
```

 **Execution on host - MPI processes distributed over multiple accelerators on multiple nodes**

@@ -769,8 +769,8 @@ To launch the MPI program use:

```bash
 $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
 -genv I_MPI_FABRICS_LIST tcp
-  -genv I_MPI_FABRICS shm:tcp
-  -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+ -genv I_MPI_FABRICS shm:tcp
+ -genv I_MPI_TCP_NETMASK=10.1.0.0/16
 -host cn204-mic0 -n 4 ~/mpi-test-mic : -host cn205-mic0 -n 6 ~/mpi-test-mic
```

@@ -779,8 +779,8 @@ or using mpirun:

```bash
 $ mpirun -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
 -genv I_MPI_FABRICS_LIST tcp
-  -genv I_MPI_FABRICS shm:tcp
-  -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+ -genv I_MPI_FABRICS shm:tcp
+ -genv I_MPI_TCP_NETMASK=10.1.0.0/16
 -host cn204-mic0 -n 4 ~/mpi-test-mic : -host cn205-mic0 -n 6 ~/mpi-test-mic
```

@@ -805,8 +805,8 @@ The same way an MPI program can be executed on multiple hosts:

```bash
 $ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
 -genv I_MPI_FABRICS_LIST tcp
-  -genv I_MPI_FABRICS shm:tcp
-  -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+ -genv I_MPI_FABRICS shm:tcp
+ -genv I_MPI_TCP_NETMASK=10.1.0.0/16
 -host cn204 -n 4 ~/mpi-test : -host cn205 -n 6 ~/mpi-test
```

@@ -822,7 +822,7 @@ In the previous section we have compiled two binary files, one for hosts "**mpi-

 $ mpiexec.hydra -genv I_MPI_FABRICS_LIST tcp
 -genv I_MPI_FABRICS shm:tcp
-  -genv I_MPI_TCP_NETMASK=10.1.0.0/16
+ -genv I_MPI_TCP_NETMASK=10.1.0.0/16
 -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/
 -host cn205 -n 2 ~/mpi-test : -host cn205-mic0 -n 2 ~/mpi-test-mic
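+When host and coprocessor ranks run in a single MPI job, it is useful for each rank to report where it is running. A minimal sketch (not part of the original text) that tells host ranks from MIC ranks by the "-mic" substring in the processor name, following the cnXXX-micY naming used above:
+
+```c
+#include <stdio.h>
+#include <string.h>
+#include <mpi.h>
+
+int main (int argc, char *argv[])
+{
+  int rank, size, len;
+  char node[MPI_MAX_PROCESSOR_NAME];
+
+  MPI_Init (&argc, &argv);
+  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
+  MPI_Comm_size (MPI_COMM_WORLD, &size);
+  MPI_Get_processor_name (node, &len);
+
+  /* MIC hostnames follow the "cnXXX-micY" convention, so a simple
+     substring test distinguishes coprocessor ranks from host ranks */
+  if (strstr (node, "-mic") != NULL)
+    printf ("Rank %d of %d runs on coprocessor %s\n", rank, size, node);
+  else
+    printf ("Rank %d of %d runs on host %s\n", rank, size, node);
+
+  MPI_Finalize ();
+  return 0;
+}
+```
+
+Compiled once for the host and once with the -mmic flag, it can stand in for the "mpi-test"/"mpi-test-mic" pair used in the examples above.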
"mpi-test") and uses the suffix to get a name of the binary for accelerator (i..e. "mpi-test-mic"). +In addition if a naming convention is set in a way that the name of the binary for host is **"bin_name"** and the name of the binary for the accelerator is **"bin_name-mic"** then by setting up the environment variable **I_MPI_MIC_POSTFIX** to **"-mic"** user do not have to specify the names of booth binaries. In this case mpirun needs just the name of the host binary file (i.e. "mpi-test") and uses the suffix to get a name of the binary for accelerator (i..e. "mpi-test-mic"). ```bash $ export I_MPI_MIC_POSTFIX=-mic @@ -864,8 +864,8 @@ To run the MPI code using mpirun and the machine file "hosts_file_mix" use: -genv I_MPI_FABRICS shm:tcp -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ -genv I_MPI_FABRICS_LIST tcp -  -genv I_MPI_FABRICS shm:tcp -  -genv I_MPI_TCP_NETMASK=10.1.0.0/16 + -genv I_MPI_FABRICS shm:tcp + -genv I_MPI_TCP_NETMASK=10.1.0.0/16 -machinefile hosts_file_mix ~/mpi-test ``` @@ -901,4 +901,4 @@ Please note each host or accelerator is listed only per files. User has to speci Optimization ------------ -For more details about optimization techniques please read Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization") +For more details about optimization techniques please read Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization") -- GitLab