diff --git a/docs.it4i/cs/guides/power10.md b/docs.it4i/cs/guides/power10.md new file mode 100644 index 0000000000000000000000000000000000000000..102e23fc0afc59ef017303f9d45ad801f67614d2 --- /dev/null +++ b/docs.it4i/cs/guides/power10.md @@ -0,0 +1,211 @@ +# Using IBM Power Partition + +For testing your application on the IBM Power partition, +you need to prepare a job script for that partition or use an interactive job: + +```console +salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00 +``` + +where: + +- `-N 1` allocates a single node, +- `-c 192` allocates 192 cores (threads), +- `-p p07-power` selects the IBM Power partition, +- `--time=08:00:00` requests the allocation for 8 hours. + +On the partition, you should reload the list of modules: + +``` +ml architecture/ppc64le +``` + +The platform offers both `GNU`-based and proprietary IBM toolchains for building applications. IBM also provides an optimized library of BLAS routines ([ESSL](https://www.ibm.com/docs/en/essl/6.1)), which can be used by both toolchains. + +## Building Applications + +Our sample application depends on `BLAS`, therefore we start by loading the following modules (regardless of which toolchain we want to use): +``` +ml GCC OpenBLAS +``` + +### GCC Toolchain + +With the GCC toolchain, we can go ahead and compile the application using either `g++` +``` +g++ -lopenblas hello.cpp -o hello +``` +or `gfortran` +``` +gfortran -lopenblas hello.f90 -o hello +``` +as usual.
+ +### IBM Toolchain + +The IBM toolchain requires additional environment setup as it is installed in `/opt/ibm` and is not exposed as a module: +``` +IBM_ROOT=/opt/ibm +OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1 +OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1 + +export PATH=$OPENXLC_ROOT/bin:$PATH +export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH + +export PATH=$OPENXLF_ROOT/bin:$PATH +export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH +``` + +From there we can use either `ibm-clang++` +``` +ibm-clang++ -lopenblas hello.cpp -o hello +``` +or `xlf` +``` +xlf -lopenblas hello.f90 -o hello +``` +to build the application as usual. + +!!! note + The combination of `xlf` and `openblas` seems to cause severe performance degradation. Therefore, the `ESSL` library should be preferred (see below). + +### Using ESSL Library + +The [ESSL](https://www.ibm.com/docs/en/essl/6.1) library is installed in `/opt/ibm/math/essl/7.1`, so we define additional environment variables: + +``` +IBM_ROOT=/opt/ibm +ESSL_ROOT=${IBM_ROOT}/math/essl/7.1 +export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH +``` + +The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl` or `-lessl -lopenblas` (in case `ESSL` does not provide all required `BLAS` routines). +In practice, this can look like +``` +g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello +``` +or +``` +gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello +``` +and similarly for the IBM compilers (`ibm-clang++` and `xlf`). + +## Hello World Applications + +The `hello world` example application (written in `C++` and `Fortran`) uses simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS level 3 routine).
+ +Stationary probability vector estimation in `C++`: +```c++ +#include <iostream> +#include <vector> +#include <chrono> +#include "cblas.h" + +const size_t ITERATIONS = 32; +const size_t MATRIX_SIZE = 1024; + +int main(int argc, char *argv[]) +{ + const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE; + + std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE)); + + for(size_t i = 0; i < MATRIX_SIZE; ++i) + a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f); + a[0] = 0.5f; + + std::vector<float> w1(matrixElements, 0.0f); + std::vector<float> w2(matrixElements, 0.0f); + + std::copy(a.begin(), a.end(), w1.begin()); + + std::vector<float> *t1, *t2; + t1 = &w1; + t2 = &w2; + + auto c1 = std::chrono::steady_clock::now(); + + for(size_t i = 0; i < ITERATIONS; ++i) + { + std::fill(t2->begin(), t2->end(), 0.0f); + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, + 1.0f, t1->data(), MATRIX_SIZE, + a.data(), MATRIX_SIZE, + 1.0f, t2->data(), MATRIX_SIZE); + + std::swap(t1, t2); // the product becomes the input of the next iteration + } + + auto c2 = std::chrono::steady_clock::now(); + + for(size_t i = 0; i < MATRIX_SIZE; ++i) + { + std::cout << (*t1)[i*MATRIX_SIZE + i] << " "; + } + + std::cout << std::endl; + + std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl; + + return 0; +} +``` + +Stationary probability vector estimation in `Fortran`: +```fortran +program main + implicit none + + integer :: matrix_size, iterations + integer :: i + real, allocatable, target :: a(:,:), w1(:,:), w2(:,:) + real, dimension(:,:), contiguous, pointer :: t1, t2, tmp + real, pointer :: out_data(:), out_diag(:) + integer :: cr, cm, c1, c2 + + iterations = 32 + matrix_size = 1024 + + call system_clock(count_rate=cr) + call system_clock(count_max=cm) + + allocate(a(matrix_size, matrix_size)) + allocate(w1(matrix_size, matrix_size)) + allocate(w2(matrix_size, matrix_size)) + + a(:,:) = 1.0 / real(matrix_size) + a(:,1) = 0.5 / real(matrix_size - 1) + a(1,1) = 0.5 + 
w1 = a + w2(:,:) = 0.0 + + t1 => w1 + t2 => w2 + + call system_clock(c1) + + do i = 1, iterations ! exactly "iterations" products, matching the C++ sample + t2(:,:) = 0.0 + + call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size) + + tmp => t1 + t1 => t2 + t2 => tmp + end do + + call system_clock(c2) + + out_data(1:size(t1)) => t1 + out_diag => out_data(1::matrix_size+1) + + print *, out_diag + print *, "Elapsed Time: ", (c2 - c1) / real(cr) + + deallocate(a) + deallocate(w1) + deallocate(w2) +end program main +``` \ No newline at end of file