From d79e40b3faaf3fe45b2482d3113a5fca20cae5e5 Mon Sep 17 00:00:00 2001 From: Jan Siwiec <jan.siwiec@vsb.cz> Date: Wed, 17 Jan 2024 13:13:44 +0100 Subject: [PATCH] Update power10.md --- docs.it4i/cs/guides/power10.md | 71 ++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/docs.it4i/cs/guides/power10.md b/docs.it4i/cs/guides/power10.md index 265293336..28c55dad8 100644 --- a/docs.it4i/cs/guides/power10.md +++ b/docs.it4i/cs/guides/power10.md @@ -25,6 +25,7 @@ The platform offers both `GNU` based and proprietary IBM toolchains for building ## Building Applications Our sample application depends on `BLAS`, therefore we start by loading following modules (regardless of which toolchain we want to use): + ``` ml GCC OpenBLAS ``` @@ -32,10 +33,13 @@ ml GCC OpenBLAS ### GCC Toolchain In the case of GCC toolchain we can go ahead and compile the application as usual using either `g++` + ``` g++ -lopenblas hello.cpp -o hello ``` + or `gfortran` + ``` gfortran -lopenblas hello.f90 -o hello ``` @@ -44,6 +48,7 @@ as usual. ### IBM Toolchain The IBM toolchain requires additional environment setup as it is installed in `/opt/ibm` and is not exposed as a module + ``` IBM_ROOT=/opt/ibm OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1 @@ -57,10 +62,12 @@ export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH ``` from there we can use either `ibm-clang++` + ``` ibm-clang++ -lopenblas hello.cpp -o hello ``` or `xlf` + ``` xlf -lopenblas hello.f90 -o hello ``` @@ -81,10 +88,12 @@ export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH The simplest way to utilize `ESSL` in application, which already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl` or `-lessl -lopenblas` (in case `ESSL` does not provide all required `BLAS` routines). In practice this can look like + ``` g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello ``` or + ``` gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello ``` @@ -95,6 +104,7 @@ and similarly for IBM compilers (`ibm-clang++` and `xlf`). The `hello world` example application (written in `C++` and `Fortran`) uses simple stationary probability vector estimation to illustrate use of GEMM (BLAS 3 routine). Stationary probability vector estimation in `C++`: + ```c++ #include <iostream> #include <vector> @@ -107,45 +117,45 @@ const size_t MATRIX_SIZE = 1024; int main(int argc, char *argv[]) { const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE; - + std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE)); - + for(size_t i = 0; i < MATRIX_SIZE; ++i) a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f); a[0] = 0.5f; - + std::vector<float> w1(matrixElements, 0.0f); std::vector<float> w2(matrixElements, 0.0f); - + std::copy(a.begin(), a.end(), w1.begin()); - + std::vector<float> *t1, *t2; t1 = &w1; t2 = &w2; - + auto c1 = std::chrono::steady_clock::now(); - + for(size_t i = 0; i < ITERATIONS; ++i) { std::fill(t2->begin(), t2->end(), 0.0f); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, 1.0f, t1->data(), MATRIX_SIZE, - a.data(), MATRIX_SIZE, + a.data(), MATRIX_SIZE, 1.0f, t2->data(), MATRIX_SIZE); - + std::swap(t1, t2); } - + auto c2 = std::chrono::steady_clock::now(); - + for(size_t i = 0; i < MATRIX_SIZE; ++i) { std::cout << (*t1)[i*MATRIX_SIZE + i] << " "; } - + std::cout << std::endl; - + std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl; return 0; @@ -153,57 +163,58 @@ int main(int argc, char *argv[]) ``` Stationary probability vector estimation in `Fortran`: + ```fortran program main implicit none - + integer :: matrix_size, iterations integer :: i real, allocatable, target :: a(:,:), w1(:,:), w2(:,:) real, dimension(:,:), contiguous, pointer :: t1, t2, tmp real, pointer :: out_data(:), out_diag(:) integer :: cr, cm, c1, c2 - + iterations = 32 matrix_size = 1024 - + call system_clock(count_rate=cr) call system_clock(count_max=cm) - + allocate(a(matrix_size, matrix_size)) allocate(w1(matrix_size, matrix_size)) allocate(w2(matrix_size, matrix_size)) - + a(:,:) = 1.0 / real(matrix_size) a(:,1) = 0.5 / real(matrix_size - 1) a(1,1) = 0.5 - + w1 = a w2(:,:) = 0.0 - + t1 => w1 t2 => w2 - + call system_clock(c1) - + do i = 0, iterations t2(:,:) = 0.0 - + call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size) - + tmp => t1 t1 => t2 t2 => tmp end do - + call system_clock(c2) - + out_data(1:size(t1)) => t1 out_diag => out_data(1::matrix_size+1) - + print *, out_diag print *, "Elapsed Time: ", (c2 - c1) / real(cr) - + deallocate(a) deallocate(w1) deallocate(w2) -- GitLab