From 33fc350fa85eedec4a7c2e31399beae99ef2e988 Mon Sep 17 00:00:00 2001
From: Jan Siwiec <jan.siwiec@vsb.cz>
Date: Wed, 13 Mar 2024 14:19:28 +0100
Subject: [PATCH] Update file hm_management.md

---
 docs.it4i/cs/guides/hm_management.md | 64 ++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/docs.it4i/cs/guides/hm_management.md b/docs.it4i/cs/guides/hm_management.md
index 2b5725801..a23382ff4 100644
--- a/docs.it4i/cs/guides/hm_management.md
+++ b/docs.it4i/cs/guides/hm_management.md
@@ -17,37 +17,47 @@ The `numactl` allows to either restrict memory pool of the process to specific s
 ```bash
 numactl --membind <node_ids_set>
 ```
+
 or select a single preferred node
+
 ```bash
 numactl --preferred <node_id>
 ```
+
 where `<node_ids_set>` is a comma-separated list (e.g. `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option instead falls back to other nodes, ordered by their NUMA distance, in the same situation.
 
 A convenient way to check the `numactl` configuration is
+
 ```bash
 numactl -s
 ```
+
 which prints the configuration of its execution environment, e.g.
+
 ```bash
 numactl --membind 8-15 numactl -s
 policy: bind
 preferred node: 0
-physcpubind: 0 1 2 ... 189 190 191 
-cpubind: 0 1 2 3 4 5 6 7 
-nodebind: 0 1 2 3 4 5 6 7 
+physcpubind: 0 1 2 ... 189 190 191
+cpubind: 0 1 2 3 4 5 6 7
+nodebind: 0 1 2 3 4 5 6 7
 membind: 8 9 10 11 12 13 14 15
 ```
+
 The last row shows that memory allocations are restricted to NUMA nodes `8-15`.
 
 ### Allocation Level (MEMKIND)
 
 The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows specifying the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:
+
 ```cpp
 void *pData = malloc(<SIZE>);
 /* ... */
 free(pData);
 ```
+
 with
+
 ```cpp
 #include <memkind.h>
 
@@ -55,6 +65,7 @@ void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
 /* ... */
 memkind_free(NULL, pData); // "kind" parameter is deduced from the address
 ```
+
 Similarly other memory types can be chosen.
 
 !!! note
@@ -63,9 +74,11 @@ Similarly other memory types can be chosen.
 ## High Bandwidth Memory (HBM)
 
 Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets, each with `128GB` of DDR and `64GB` of on-package HBM memory. The machine is configured in FLAT mode and therefore exposes the HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running
+
 ```bash
 numactl -H
 ```
+
 which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).
 
@@ -73,6 +86,7 @@ which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR
 ### Process Level
 
 With this, we can easily restrict the application to DDR DRAM or HBM memory:
+
 ```bash
 # Only DDR DRAM
 numactl --membind 0-7 ./stream
@@ -92,21 +106,28 @@ Scale: 1045065.2 0.015814 0.015310 0.016309
 Add: 1096992.2 0.022619 0.021878 0.024182
 Triad: 1065152.4 0.023449 0.022532 0.024559
 ```
+
 The DDR DRAM achieves a bandwidth of around 400GB/s, while the HBM clears the 1TB/s bar. Further improvements can be achieved by isolating a process entirely to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually.
 
 A simple wrapper script to do this may look like
+
 ```bash
 #!/bin/bash
 numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) "$@"
 ```
+
 and can be used as
+
 ```bash
 mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
 ```
-(8 tiles with 12 cores each). However, this approach assumes `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind` to force preference of local HBM with spill to DDR DRAM. Otherwise
+
+(8 tiles with 12 cores each). However, this approach assumes that the `16GB` of HBM memory local to each tile is sufficient for every process (memory cannot spill between tiles). It may be significantly more useful in combination with `--preferred` instead of `--membind`, to force a preference for local HBM with spill-over to DDR DRAM. Otherwise
+
 ```bash
 mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
 ```
+
 is most likely preferable even for MPI workloads. Applying the above approach to MPI Stream with 8 ranks and 1-24 threads per rank, we can expect these results:
 
@@ -114,6 +135,7 @@
 ### Allocation Level
 
 Allocation-level memory kind selection using the `memkind` library can be illustrated on a modified stream benchmark. The stream benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows
+
 ```cpp
 #include <memkind.h>
 // ...
@@ -125,16 +147,21 @@ memkind_free(NULL, a);
 memkind_free(NULL, b);
 memkind_free(NULL, c);
 ```
+
 Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
 The code then has to be linked with the `memkind` library
+
 ```bash
 gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
 ```
+
 and can be run as
+
 ```bash
 export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
 OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
 ```
+
 While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.
 
@@ -169,29 +196,29 @@ const size_t N_ITERS = 10;
 int main(int argc, char *argv[])
 {
     const double binWidth = 1.0 / double(N_BINS_COUNT + 1);
-    
+
     double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
     size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(size_t));
-    
+
     #pragma omp parallel
     {
         drand48_data state;
         srand48_r(omp_get_thread_num(), &state);
-    
+
         #pragma omp for
         for(size_t i = 0; i < N_DATA_SIZE; ++i)
             drand48_r(&state, &pData[i]);
     }
-    
+
     auto c1 = std::chrono::steady_clock::now();
-    
+
     for(size_t it = 0; it < N_ITERS; ++it)
     {
         #pragma omp parallel
         {
             for(size_t i = 0; i < N_BINS_COUNT; ++i)
                 pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);
-    
+
             #pragma omp for
             for(size_t i = 0; i < N_DATA_SIZE; ++i)
             {
@@ -200,35 +227,36 @@ int main(int argc, char *argv[])
             }
         }
     }
-    
+
     auto c2 = std::chrono::steady_clock::now();
-    
+
     #pragma omp parallel for
     for(size_t i = 0; i < N_BINS_COUNT; ++i)
     {
         for(size_t j = 1; j < omp_get_max_threads(); ++j)
             pBins[i] += pBins[j*N_BINS_COUNT + i];
     }
-    
+
     std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
-    
+
     size_t total = 0;
     #pragma omp parallel for reduction(+:total)
     for(size_t i = 0; i < N_BINS_COUNT; ++i)
         total += pBins[i];
-    
+
     std::cout << "Total Items: " << total << std::endl;
-    
+
     memkind_free(NULL, pData);
     memkind_free(NULL, pBins);
-    
+
     return 0;
 }
 ```
 
-### Using HBM Memory (p10-intel)
+### Using HBM Memory (P10-Intel)
 
 The following commands can be used to compile and run the example application above
+
 ```bash
 ml GCC memkind
 export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
--
GitLab
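
Since automatic HBM detection is not supported on `p10-intel`, code that selects memory kinds explicitly may also want to verify at run time that high-bandwidth memory is actually usable and fall back to DDR DRAM otherwise. A minimal sketch of such a fallback, assuming only the public `memkind_check_available`/`memkind_malloc`/`memkind_free` API with `MEMKIND_HBW_NODES` exported as above (the `select_kind` helper below is illustrative, not part of the guide), might look like:

```cpp
#include <memkind.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Pick HBM when it is usable, otherwise fall back to regular DDR DRAM.
// On p10-intel this relies on MEMKIND_HBW_NODES being exported beforehand,
// since automatic HBM detection is not supported there.
static memkind_t select_kind()
{
    if (memkind_check_available(MEMKIND_HBW) == 0)
        return MEMKIND_HBW;

    std::fprintf(stderr, "HBM not available, falling back to DDR DRAM\n");
    return MEMKIND_REGULAR;
}

int main()
{
    const std::size_t count = std::size_t(1) << 27; // 128M doubles, i.e. 1GiB

    memkind_t kind = select_kind();
    double *pData = (double *)memkind_malloc(kind, count * sizeof(double));
    if (pData == NULL)
        return EXIT_FAILURE;

    /* ... work with pData ... */

    memkind_free(NULL, pData); // "kind" is again deduced from the address
    return EXIT_SUCCESS;
}
```

Like the examples above, this is linked with the `memkind` library (`-lmemkind`); the same binary then runs unchanged whether or not HBM NUMA nodes are exposed.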