Update power10.md

d79e40b3 · Jan Siwiec · 51ad1f58 · d79e40b3
Commit d79e40b3 authored 1 year ago by Jan Siwiec
--- a/docs.it4i/cs/guides/power10.md
+++ b/docs.it4i/cs/guides/power10.md
@@ -25,6 +25,7 @@ The platform offers both `GNU` based and proprietary IBM toolchains for building
 ## Building Applications
 Our sample application depends on `BLAS`; therefore, we start by loading the following modules (regardless of which toolchain we want to use):
 ```
 ml GCC OpenBLAS
 ```
@@ -32,10 +33,13 @@ ml GCC OpenBLAS
 ### GCC Toolchain
 In the case of the GCC toolchain, we can go ahead and compile the application as usual using either `g++`
 ```
 g++ -lopenblas hello.cpp -o hello
 ```
 or `gfortran`
 ```
 gfortran -lopenblas hello.f90 -o hello
 ```
@@ -44,6 +48,7 @@ as usual.
 ### IBM Toolchain
 The IBM toolchain requires additional environment setup as it is installed in `/opt/ibm` and is not exposed as a module
 ```
 IBM_ROOT=/opt/ibm
 OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
@@ -57,10 +62,12 @@ export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
 ```
 from there we can use either `ibm-clang++`
 ```
 ibm-clang++ -lopenblas hello.cpp -o hello
 ```
 or `xlf`
 ```
 xlf -lopenblas hello.f90 -o hello
 ```
@@ -81,10 +88,12 @@ export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
 The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl` or `-lessl -lopenblas` (in case `ESSL` does not provide all required `BLAS` routines).
 In practice this can look like
 ```
 g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
 ```
 or
 ```
 gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
 ```
@@ -95,6 +104,7 @@ and similarly for IBM compilers (`ibm-clang++` and `xlf`).
 The `hello world` example application (written in `C++` and `Fortran`) uses simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS Level 3 routine).
 Stationary probability vector estimation in `C++`:
 ```c++
 #include <iostream>
 #include <vector>
@@ -107,45 +117,45 @@ const size_t MATRIX_SIZE = 1024;
 int main(int argc, char *argv[])
 {
    const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;
    std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));
    for(size_t i = 0; i < MATRIX_SIZE; ++i)
        a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
    a[0] = 0.5f;
    std::vector<float> w1(matrixElements, 0.0f);
    std::vector<float> w2(matrixElements, 0.0f);
    std::copy(a.begin(), a.end(), w1.begin());
    std::vector<float> *t1, *t2;
    t1 = &w1;
    t2 = &w2;
    auto c1 = std::chrono::steady_clock::now();
    for(size_t i = 0; i < ITERATIONS; ++i)
    {
        std::fill(t2->begin(), t2->end(), 0.0f);
-        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, 
+        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
                    1.0f, t1->data(), MATRIX_SIZE,
-                    a.data(), MATRIX_SIZE, 
+                    a.data(), MATRIX_SIZE,
                    1.0f, t2->data(), MATRIX_SIZE);
        std::swap(t1, t2);
    }
    auto c2 = std::chrono::steady_clock::now();
    for(size_t i = 0; i < MATRIX_SIZE; ++i)
    {
        std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
    }
    std::cout << std::endl;
    std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
    return 0;
@@ -153,57 +163,58 @@ int main(int argc, char *argv[])
 ```
 Stationary probability vector estimation in `Fortran`:
 ```fortran
 program main
    implicit none
    integer :: matrix_size, iterations
    integer :: i
    real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
    real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
    real, pointer :: out_data(:), out_diag(:)
    integer :: cr, cm, c1, c2
    iterations  = 32
    matrix_size = 1024
    call system_clock(count_rate=cr)
    call system_clock(count_max=cm)
    allocate(a(matrix_size, matrix_size))
    allocate(w1(matrix_size, matrix_size))
    allocate(w2(matrix_size, matrix_size))
    a(:,:) = 1.0 / real(matrix_size)
    a(:,1) = 0.5 / real(matrix_size - 1)
    a(1,1) = 0.5
    w1 = a
    w2(:,:) = 0.0
    t1 => w1
    t2 => w2
    call system_clock(c1)
    do i = 0, iterations
        t2(:,:) = 0.0
        call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)
        tmp => t1
        t1  => t2
        t2  => tmp
    end do
    call system_clock(c2)
    out_data(1:size(t1)) => t1
    out_diag => out_data(1::matrix_size+1)
    print *, out_diag
    print *, "Elapsed Time: ", (c2 - c1) / real(cr)
    deallocate(a)
    deallocate(w1)
    deallocate(w2)