From 33fc350fa85eedec4a7c2e31399beae99ef2e988 Mon Sep 17 00:00:00 2001
From: Jan Siwiec <jan.siwiec@vsb.cz>
Date: Wed, 13 Mar 2024 14:19:28 +0100
Subject: [PATCH] Update file hm_management.md

---
 docs.it4i/cs/guides/hm_management.md | 64 ++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/docs.it4i/cs/guides/hm_management.md b/docs.it4i/cs/guides/hm_management.md
index 2b5725801..a23382ff4 100644
--- a/docs.it4i/cs/guides/hm_management.md
+++ b/docs.it4i/cs/guides/hm_management.md
@@ -17,37 +17,47 @@ The `numactl` allows to either restrict memory pool of the process to specific s
 ```bash
 numactl --membind <node_ids_set>
 ```
+
 or select a single preferred node
+
 ```bash
 numactl --preferred <node_id>
 ```
+
 where `<node_ids_set>` is a comma-separated list (e.g. `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option instead falls back to other nodes, ordered by their NUMA distance, in the same situation.
 
 A convenient way to check the `numactl` configuration is
+
 ```bash
 numactl -s
 ```
+
 which prints the configuration of its execution environment, e.g.
+
 ```bash
 numactl --membind 8-15 numactl -s
 policy: bind
 preferred node: 0
-physcpubind: 0 1 2 ... 189 190 191 
-cpubind: 0 1 2 3 4 5 6 7 
-nodebind: 0 1 2 3 4 5 6 7 
+physcpubind: 0 1 2 ... 189 190 191
+cpubind: 0 1 2 3 4 5 6 7
+nodebind: 0 1 2 3 4 5 6 7
 membind: 8 9 10 11 12 13 14 15
 ```
+
 The last row shows that memory allocations are restricted to NUMA nodes `8-15`.
 
 ### Allocation Level (MEMKIND)
 
 The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows specifying the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:
+
 ```cpp
 void *pData = malloc(<SIZE>);
 /* ... */
 free(pData);
 ```
+
 with
+
 ```cpp
 #include <memkind.h>
 
@@ -55,6 +65,7 @@ void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
 /* ... */
 memkind_free(NULL, pData); // "kind" parameter is deduced from the address
 ```
+
 Similarly other memory types can be chosen.
 
 !!! note
@@ -63,9 +74,11 @@ Similarly other memory types can be chosen.
 ## High Bandwidth Memory (HBM)
 
 Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets, each with `128GB` of DDR and `64GB` of on-package HBM memory. The machine is configured in FLAT mode and therefore exposes the HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running
+
 ```bash
 numactl -H
 ```
+
 which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).
 
@@ -73,6 +86,7 @@ which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR
 ### Process Level
 
 With this, we can easily restrict the application to DDR DRAM or HBM memory:
+
 ```bash
 # Only DDR DRAM
 numactl --membind 0-7 ./stream
@@ -92,21 +106,28 @@ Scale: 1045065.2 0.015814 0.015310 0.016309
 Add: 1096992.2 0.022619 0.021878 0.024182
 Triad: 1065152.4 0.023449 0.022532 0.024559
 ```
+
 The DDR DRAM achieves a bandwidth of around 400GB/s, while the HBM clears the 1TB/s bar. Further improvements can be achieved by isolating a process entirely to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually.
 
 A simple wrapper script to do this may look like
+
 ```bash
 #!/bin/bash
 numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) "$@"
 ```
+
 and can be used as
+
 ```bash
 mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
 ```
-(8 tiles with 12 cores each). However, this approach assumes `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind` to force preference of local HBM with spill to DDR DRAM. Otherwise
+
+(8 tiles with 12 cores each). However, this approach assumes that the `16GB` of HBM memory local to each tile is sufficient for every process (memory cannot spill between tiles). It may be significantly more useful in combination with `--preferred` instead of `--membind`, to force a preference for local HBM with spill-over to DDR DRAM. Otherwise
+
 ```bash
 mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
 ```
+
 is most likely preferable even for MPI workloads. Applying the above approach to MPI Stream with 8 ranks and 1-24 threads per rank, we can expect these results:
 
@@ -114,6 +135,7 @@
 ### Allocation Level
 
 Allocation-level memory kind selection using the `memkind` library can be illustrated on a modified stream benchmark. The stream benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows
+
 ```cpp
 #include <memkind.h>
 // ...
@@ -125,16 +147,21 @@ memkind_free(NULL, a);
 memkind_free(NULL, b);
 memkind_free(NULL, c);
 ```
+
 Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
 The code then has to be linked with the `memkind` library
+
 ```bash
 gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
 ```
+
 and can be run as
+
 ```bash
 export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
 OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
 ```
+
 While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.
 
@@ -169,29 +196,29 @@ const size_t N_ITERS = 10;
 int main(int argc, char *argv[])
 {
     const double binWidth = 1.0 / double(N_BINS_COUNT + 1);
-    
+
     double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
     size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(size_t));
-    
+
     #pragma omp parallel
     {
         drand48_data state;
         srand48_r(omp_get_thread_num(), &state);
-    
+
         #pragma omp for
         for(size_t i = 0; i < N_DATA_SIZE; ++i)
             drand48_r(&state, &pData[i]);
     }
-    
+
     auto c1 = std::chrono::steady_clock::now();
-    
+
     for(size_t it = 0; it < N_ITERS; ++it)
     {
         #pragma omp parallel
         {
             for(size_t i = 0; i < N_BINS_COUNT; ++i)
                 pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);
-    
+
             #pragma omp for
             for(size_t i = 0; i < N_DATA_SIZE; ++i)
             {
@@ -200,35 +227,36 @@ int main(int argc, char *argv[])
             }
         }
     }
-    
+
     auto c2 = std::chrono::steady_clock::now();
-    
+
     #pragma omp parallel for
     for(size_t i = 0; i < N_BINS_COUNT; ++i)
     {
         for(size_t j = 1; j < omp_get_max_threads(); ++j)
             pBins[i] += pBins[j*N_BINS_COUNT + i];
     }
-    
+
     std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
-    
+
     size_t total = 0;
     #pragma omp parallel for reduction(+:total)
     for(size_t i = 0; i < N_BINS_COUNT; ++i)
         total += pBins[i];
-    
+
     std::cout << "Total Items: " << total << std::endl;
-    
+
     memkind_free(NULL, pData);
     memkind_free(NULL, pBins);
-    
+
     return 0;
 }
 ```
 
-### Using HBM Memory (p10-intel)
+### Using HBM Memory (P10-Intel)
 
 The following commands can be used to compile and run the example application above
+
 ```bash
 ml GCC memkind
 export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
--
GitLab
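
Since automatic HBM detection is not supported on `p10-intel`, code that selects memory kinds explicitly may also want to verify at run time that high-bandwidth memory is actually usable and fall back to DDR DRAM otherwise. A minimal sketch of such a fallback, assuming only the public `memkind_check_available`/`memkind_malloc`/`memkind_free` API with `MEMKIND_HBW_NODES` exported as above (the `select_kind` helper below is illustrative, not part of the guide), might look like:

```cpp
#include <memkind.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Pick HBM when it is usable, otherwise fall back to regular DDR DRAM.
// On p10-intel this relies on MEMKIND_HBW_NODES being exported beforehand,
// since automatic HBM detection is not supported there.
static memkind_t select_kind()
{
    if (memkind_check_available(MEMKIND_HBW) == 0)
        return MEMKIND_HBW;

    std::fprintf(stderr, "HBM not available, falling back to DDR DRAM\n");
    return MEMKIND_REGULAR;
}

int main()
{
    const std::size_t count = std::size_t(1) << 27; // 128M doubles, i.e. 1GiB

    memkind_t kind = select_kind();
    double *pData = (double *)memkind_malloc(kind, count * sizeof(double));
    if (pData == NULL)
        return EXIT_FAILURE;

    /* ... work with pData ... */

    memkind_free(NULL, pData); // "kind" is again deduced from the address
    return EXIT_SUCCESS;
}
```

Like the examples above, this is linked with the `memkind` library (`-lmemkind`); the same binary then runs unchanged whether or not HBM NUMA nodes are exposed.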