To set up the environment, the "Intel" module has to be loaded:

```bash
$ module load intel/13.5.192
```
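
The exact version may change over time; the Intel modules installed on the cluster can be listed with the standard `module avail` command:

```bash
$ module avail intel
```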
Information about the hardware can be obtained by running the micinfo program on the host.
```bash
$ /usr/bin/micinfo
```

The output of the "micinfo" utility executed on one of the Anselm nodes is as follows:

```bash
Created Mon Jul 22 00:23:50 2013
System Info
HOST OS : Linux
OS Version : 2.6.32-279.5.2.bl6.Bull.33.x86_64
Driver Version : 6720-15
MPSS Version : 2.1.6720-15
Host Physical Memory : 98843 MB
Device No: 0, Device Name: mic0
Version
Flash Version : 2.1.03.0386
SMC Firmware Version : 1.15.4830
SMC Boot Loader Version : 1.8.4326
uOS Version : 2.6.38.8-g2593b11
Device Serial Number : ADKC30102482
Board
Vendor ID : 0x8086
Device ID : 0x2250
Subsystem ID : 0x2500
Coprocessor Stepping ID : 3
PCIe Width : x16
PCIe Speed : 5 GT/s
PCIe Max payload size : 256 bytes
PCIe Max read req size : 512 bytes
Coprocessor Model : 0x01
Coprocessor Model Ext : 0x00
Coprocessor Type : 0x00
Coprocessor Family : 0x0b
Coprocessor Family Ext : 0x00
Coprocessor Stepping : B1
Board SKU : B1PRQ-5110P/5120D
ECC Mode : Enabled
SMC HW Revision : Product 225W Passive CS
Cores
Total No of Active Cores : 60
Voltage : 1032000 uV
Frequency : 1052631 kHz
Thermal
Fan Speed Control : N/A
Fan RPM : N/A
Fan PWM : N/A
Die Temp : 49 C
GDDR
GDDR Vendor : Elpida
GDDR Version : 0x1
GDDR Density : 2048 Mb
GDDR Size : 7936 MB
GDDR Technology : GDDR5
GDDR Speed : 5.000000 GT/s
GDDR Frequency : 2500000 kHz
GDDR Voltage : 1501000 uV
```

Offload Mode
------------

A very basic example of code that employs the offload programming technique is shown below.

```cpp
#include <iostream>

int main(int argc, char* argv[])
{
    const int niter = 100000;
    double result = 0;

    #pragma offload target(mic)
    for (int i = 0; i < niter; ++i) {
        const double t = (i + 0.5) / niter;
        result += 4.0 / (t * t + 1.0);
    }
    result /= niter;
    std::cout << "Pi ~ " << result << '\n';
}
```
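
To build and run the example with the Intel compiler, a minimal sketch follows (the source file name `source-offload.cpp` is illustrative; when offload pragmas are present, icc compiles both the host and the MIC part of the code):

```bash
$ icc source-offload.cpp -o bin-offload
$ ./bin-offload
```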

One way of parallelizing a code for the Xeon Phi is using OpenMP directives. The following example shows parallel vector addition, both inside an offloaded function and directly in an offload region. The type `T`, the vector size, and the global vectors are declared up front so the example is self-contained.

```c
#include <stdio.h>
#include <stdlib.h>

typedef int T;

#define SIZE 1000

/* global vectors, compiled for both the host and the MIC */
#pragma offload_attribute(push, target(mic))
T in1[SIZE];
T in2[SIZE];
T res[SIZE];
#pragma offload_attribute(pop)

// MIC function to add two vectors
__attribute__((target(mic))) void add_mic(T *a, T *b, T *c, int size) {
    int i = 0;
    #pragma omp parallel for
    for (i = 0; i < size; i++)
        c[i] = a[i] + b[i];
}
// CPU function to add two vectors
void add_cpu (T *a, T *b, T *c, int size) {
    int i;
    for (i = 0; i < size; i++)
        c[i] = a[i] + b[i];
}
// CPU function to generate a vector of random numbers
void random_T (T *a, int size) {
    int i;
    for (i = 0; i < size; i++)
        a[i] = rand() % 10000; // random number between 0 and 9999
}
// CPU function to compare two vectors
int compare(T *a, T *b, int size) {
    int pass = 0;
    int i;
    for (i = 0; i < size; i++) {
        if (a[i] != b[i]) {
            printf("Value mismatch at location %d, values %d and %d\n", i, a[i], b[i]);
            pass = 1;
        }
    }
    if (pass == 0) printf("Test passed\n"); else printf("Test Failed\n");
    return pass;
}
int main()
{
    int i;

    random_T(in1, SIZE);
    random_T(in2, SIZE);

    #pragma offload target(mic) in(in1,in2) inout(res)
    {
        // Parallel loop from main function
        #pragma omp parallel for
        for (i = 0; i < SIZE; i++)
            res[i] = in1[i] + in2[i];

        // or parallel loop is called inside the function
        add_mic(in1, in2, res, SIZE);
    }

    // Check the results with CPU implementation
    T res_cpu[SIZE];
    add_cpu(in1, in2, res_cpu, SIZE);
    compare(res, res_cpu, SIZE);
}
```
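
A compile-and-run sketch for this example (the file name `source-omp.c` is illustrative; `-openmp` is the OpenMP flag of the Intel 13 compilers used here, spelled `-qopenmp` in newer releases):

```bash
$ icc -openmp source-omp.c -o bin-omp
$ ./bin-omp
```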

The following example shows how to automatically offload an SGEMM (single precision, general matrix multiply) call to the MIC.

```c
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

int main(int argc, char **argv)
{
    float *A, *B, *C; /* Matrices */

    MKL_INT N = 2560;    /* Matrix dimensions */
    MKL_INT LD = N;      /* Leading dimension */
    int matrix_bytes;    /* Matrix size in bytes */
    int matrix_elements; /* Matrix size in elements */

    float alpha = 1.0, beta = 1.0;   /* Scaling factors */
    char transa = 'N', transb = 'N'; /* Transposition options */

    int i, j; /* Counters */

    matrix_elements = N * N;
    matrix_bytes = sizeof(float) * matrix_elements;

    /* Allocate the matrices */
    A = malloc(matrix_bytes); B = malloc(matrix_bytes); C = malloc(matrix_bytes);

    /* Initialize the matrices */
    for (i = 0; i < matrix_elements; i++) {
        A[i] = 1.0; B[i] = 2.0; C[i] = 0.0;
    }

    printf("Computing SGEMM on the host\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    printf("Enabling Automatic Offload\n");
    /* Alternatively, set environment variable MKL_MIC_ENABLE=1 */
    mkl_mic_enable();

    int ndevices = mkl_mic_get_device_count(); /* Number of MIC devices */
    printf("Automatic Offload enabled: %d MIC devices present\n", ndevices);

    printf("Computing SGEMM with automatic workdivision\n");
    sgemm(&transa, &transb, &N, &N, &N, &alpha, A, &N, B, &N, &beta, C, &N);

    /* Free the matrix memory */
    free(A); free(B); free(C);

    printf("Done\n");

    return 0;
}
```
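
A minimal build-and-run sketch (the file name `sgemm-ao.c` is illustrative; `-mkl` links Intel MKL, and setting `OFFLOAD_REPORT=2` makes MKL print the `[MKL]` report lines shown in the listing below):

```bash
$ icc -mkl sgemm-ao.c -o sgemm-ao
$ export OFFLOAD_REPORT=2
$ ./sgemm-ao
```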

The output of the code should look similar to the following listing, where the lines starting with [MKL] are generated by the MKL library.

```bash
Enabling Automatic Offload
Automatic Offload enabled: 1 MIC devices present
Computing SGEMM with automatic workdivision
[MKL] [MIC --] [AO Function] SGEMM
[MKL] [MIC --] [AO SGEMM Workdivision] 0.00 1.00
[MKL] [MIC 00] [AO SGEMM CPU Time] 0.463351 seconds
[MKL] [MIC 00] [AO SGEMM MIC Time] 0.179608 seconds
[MKL] [MIC 00] [AO SGEMM CPU->MIC Data] 52428800 bytes
[MKL] [MIC 00] [AO SGEMM MIC->CPU Data] 26214400 bytes
Done
```

After executing the compiled binary file, the following output should be displayed.

```bash
Number of available platforms: 1
Platform names:
[0] Intel(R) OpenCL [Selected]
Number of devices available for each type:
CL_DEVICE_TYPE_CPU: 1
CL_DEVICE_TYPE_GPU: 0
CL_DEVICE_TYPE_ACCELERATOR: 1
*** Detailed information for each device ***
CL_DEVICE_TYPE_CPU[0]
CL_DEVICE_NAME: Intel(R) Xeon(R) CPU E5-2470 0 @ 2.30GHz
CL_DEVICE_AVAILABLE: 1
...
CL_DEVICE_TYPE_ACCELERATOR[0]
CL_DEVICE_NAME: Intel(R) Many Integrated Core Acceleration Card
CL_DEVICE_AVAILABLE: 1
...
```

An example of a basic MPI "hello world" program in C, which can be executed on both the host and the Xeon Phi, is shown below (it can be copied directly into a .c file).

```c
#include <stdio.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
    int rank, size;

    int len;
    char node[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);               /* starts MPI */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* get current process id */
    MPI_Comm_size(MPI_COMM_WORLD, &size); /* get number of processes */

    MPI_Get_processor_name(node, &len);

    printf("Hello world from process %d of %d on host %s\n", rank, size, node);
    MPI_Finalize();
    return 0;
}
```
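
The binaries used in the examples below can be produced from this source with the Intel MPI compiler wrappers (a sketch; `-mmic` cross-compiles for the coprocessor, and the names `mpi-test`/`mpi-test-mic` match the commands that follow):

```bash
$ mpiicc mpi-test.c -o mpi-test           # binary for the host
$ mpiicc -mmic mpi-test.c -o mpi-test-mic # binary for the Xeon Phi
```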
Please note that **"mpiexec.hydra"** requires the file **"/bin/pmi_proxy"** to be present on the MIC filesystem. If the file is missing, please contact the system administrators. A simple test to see if the file is present is to execute:
```bash
$ ssh mic0 ls /bin/pmi_proxy
/bin/pmi_proxy
```
**Execution on host - MPI processes distributed over multiple accelerators on multiple nodes**

To launch the MPI program use:
```bash
$ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ \
  -genv I_MPI_FABRICS_LIST tcp \
  -genv I_MPI_FABRICS shm:tcp \
  -genv I_MPI_TCP_NETMASK 10.1.0.0/16 \
  -host cn204-mic0 -n 4 ~/mpi-test-mic \
  : -host cn205-mic0 -n 6 ~/mpi-test-mic
```

or using mpirun:
```bash
$ mpirun -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ \
  -genv I_MPI_FABRICS_LIST tcp \
  -genv I_MPI_FABRICS shm:tcp \
  -genv I_MPI_TCP_NETMASK 10.1.0.0/16 \
  -host cn204-mic0 -n 4 ~/mpi-test-mic \
  : -host cn205-mic0 -n 6 ~/mpi-test-mic
```
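
With 4 ranks on cn204-mic0 and 6 ranks on cn205-mic0, the program prints ten lines of the following shape (illustrative; the ordering of ranks in the output is not deterministic):

```bash
Hello world from process 0 of 10 on host cn204-mic0
Hello world from process 4 of 10 on host cn205-mic0
...
Hello world from process 9 of 10 on host cn205-mic0
```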

The same way an MPI program can be executed on multiple hosts:
```bash
$ mpiexec.hydra -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ \
  -genv I_MPI_FABRICS_LIST tcp \
  -genv I_MPI_FABRICS shm:tcp \
  -genv I_MPI_TCP_NETMASK 10.1.0.0/16 \
  -host cn204 -n 4 ~/mpi-test \
  : -host cn205 -n 6 ~/mpi-test
```

In the previous section we compiled two binary files, one for the host ("**mpi-test**") and one for the MIC ("**mpi-test-mic**"). The two binaries can be combined in a single MPI launch:

```bash
$ mpiexec.hydra \
  -genv I_MPI_FABRICS_LIST tcp \
  -genv I_MPI_FABRICS shm:tcp \
  -genv I_MPI_TCP_NETMASK 10.1.0.0/16 \
  -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ \
  -host cn205 -n 2 ~/mpi-test \
  : -host cn205-mic0 -n 2 ~/mpi-test-mic
```

An example of a machine file that uses 2 hosts (**cn205** and **cn206**) and 2 accelerators (**mic0** on each host) to run 2 MPI processes on each of them:

```bash
$ cat hosts_file_mix
cn205:2
cn205-mic0:2
cn206:2
cn206-mic0:2
```
In addition, if the naming convention is such that the host binary is named **"bin_name"** and the accelerator binary **"bin_name-mic"**, then by setting the environment variable **I_MPI_MIC_POSTFIX** to **"-mic"** the user does not have to specify the names of both binaries. In this case mpirun needs just the name of the host binary file (i.e. "mpi-test") and uses the postfix to derive the name of the binary for the accelerator (i.e. "mpi-test-mic").
```bash
$ export I_MPI_MIC_POSTFIX=-mic
```

To run the MPI code using mpirun and the machine file "hosts_file_mix" use:

```bash
$ mpirun \
  -genv I_MPI_FABRICS shm:tcp \
  -genv LD_LIBRARY_PATH /apps/intel/impi/4.1.1.036/mic/lib/ \
  -genv I_MPI_FABRICS_LIST tcp \
  -genv I_MPI_TCP_NETMASK 10.1.0.0/16 \
  -machinefile hosts_file_mix \
  ~/mpi-test
```

Please note that each host or accelerator is listed only once per file. The number of processes to start on each of them is given by the ":n" suffix (e.g. "cn206-mic0:2").
Optimization
------------
For more details about optimization techniques please read the Intel document [Optimization and Performance Tuning for Intel® Xeon Phi™ Coprocessors](http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization "http://software.intel.com/en-us/articles/optimization-and-performance-tuning-for-intel-xeon-phi-coprocessors-part-1-optimization")