Compare revisions
Commits on Source (552), showing 2292 additions and 82 deletions.
......@@ -12,7 +12,7 @@ docs:
image: it4innovations/docker-mdcheck:latest
allow_failure: true
script:
- mdl -r ~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046 *.md docs.it4i # BUGS
- find content/docs -name "*.mdx" | xargs mdl -r ~MD002,~MD007,~MD013,~MD010,~MD014,~MD024,~MD026,~MD029,~MD033,~MD036,~MD037,~MD046
pylint:
stage: test
......@@ -22,20 +22,16 @@ pylint:
script:
- pylint $(find . -name "*.py" -not -name "feslicescript.py")
pysafety:
capitalize:
stage: test
image: it4innovations/docker-pycheck:latest
image: it4innovations/docker-mkdocscheck:latest
allow_failure: true
before_script:
- source /opt/.venv3/bin/activate
- python -V # debug
- pip list | grep titlecase
script:
- cat requirements.txt | safety check --stdin --full-report
capitalize:
stage: test
image: it4innovations/docker-mkdocscheck:latest
script:
- find mkdocs.yml docs.it4i/ \( -name '*.md' -o -name '*.yml' \) -print0 | xargs -0 -n1 scripts/titlemd.py --test
- find content/docs/ \( -name '*.mdx' -o -name '*.yml' \) ! -path '*einfracz*' -print0 | xargs -0 -n1 scripts/titlemd.py --test
ext_links:
stage: after_test
......@@ -45,7 +41,7 @@ ext_links:
# remove JSON results
- rm *.json
script:
- find docs.it4i/ -name '*.md' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
- find content/docs -name '*.mdx' -exec grep --color -l http {} + | xargs awesome_bot -t 10 --allow-dupe --allow-redirect
only:
- master
......@@ -55,8 +51,8 @@ ext_links:
before_script:
- echo "192.168.101.10 docs.it4i.cz" >> /etc/hosts
- wget -V
- echo https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
- wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_BUILD_REF_NAME/ || true
- echo https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
- wget --spider -e robots=off -o wget.log -r -p https://docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/ || true
script:
- cat wget.log | awk '/^Found [0-9]+ broken link[s]?.$/,/FINISHED/ { rc=-1; print $0 }; END { exit rc }'
......@@ -75,7 +71,7 @@ mkdocs:
# get modules list from clusters
- bash scripts/get_modules.sh
# generate site_url
- (if [ "${CI_BUILD_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_BUILD_REF_NAME\//" mkdocs.yml;fi);
- (if [ "${CI_COMMIT_REF_NAME}" != 'master' ]; then sed -i "s/\(site_url.*$\)/\1devel\/$CI_COMMIT_REF_NAME\//" mkdocs.yml;fi);
# generate ULT for code link
# - sed -i "s/master/$CI_BUILD_REF_NAME/g" material/partials/toc.html
# regenerate modules matrix
......@@ -113,7 +109,7 @@ deploy to stage:
- echo -e "Host *\n\tStrictHostKeyChecking no\n\n" > ~/.ssh/config
script:
- chown nginx:nginx site -R
- rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_BUILD_REF_NAME/
- rsync -a --delete site/ root@"$SSH_HOST_STAGE":/srv/docs.it4i.cz/devel/$CI_COMMIT_REF_NAME/
only:
- branches@sccs/docs.it4i.cz
......
Quantum Scalar I6
JAN
LUMI
AI
CI/CD
AWS
CLI
FAQ
s3cmd
GUI
EESSI
hipBlas
hipSolver
LUMI
......@@ -822,3 +833,19 @@ e-INFRA CZ
DICE
qgpu
qcpu
it4i-portal-clients
it4icheckaccess
it4idedicatedtime
it4ifree
it4ifsusage
it4iuserfsusage
it4iprojectfsusage
it4imotd
e-INFRA
it4i-portal-clients
s3cmd
s5cmd
title:
e-INFRA CZ Cloud Ostrava
e-INFRA CZ Account
# User Documentation
# IT4Innovations Documentation
This project contains IT4Innovations user documentation source.
## Development
## Migration
### Install
```console
$ sudo apt install libpython-dev
$ virtualenv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
```
### Package Upgrade With pip
```console
$ pip list -o
$ pip install --upgrade package
$ pip freeze | sed '/pkg-resources==/d' > requirements.txt
```
## Environments
* [https://docs.it4i.cz](https://docs.it4i.cz) - master branch
* [https://docs.it4i.cz/devel/$BRANCH_NAME](https://docs.it4i.cz/devel/$BRANCH_NAME) - maps the branches, available only with VPN access
## URLs
* [http://facelessuser.github.io/pymdown-extensions/](http://facelessuser.github.io/pymdown-extensions/)
* [http://squidfunk.github.io/mkdocs-material/](http://squidfunk.github.io/mkdocs-material/)
```
fair-share
InfiniBand
RedHat
CentOS
Mellanox
```
## Mathematical Formulae
### Formulas Are Made With:
* [https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/](https://facelessuser.github.io/pymdown-extensions/extensions/arithmatex/)
* [https://www.mathjax.org/](https://www.mathjax.org/)
You can add formula to page like this:
```
$$
MAX\_FAIRSHARE * ( 1 - \frac{usage_{Project}}{usage_{Total}} )
$$
```
To enable MathJax on a page, add the line ```---8<--- "mathjax.md"``` at the end of the file.
* [fumadocs](https://fumadocs.vercel.app/)
\ No newline at end of file
# SCS API v2
## Info
- **OpenAPI:** 3.1.0
- **Title:** scs-api-2
- **Version:** 0.1.0
- **Server URL:** `https://scs.it4i.cz/api/v2`
## Paths
### `/dedicated-time`
**GET**
- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries, optionally filtered by cluster name or period preset
- **OperationId:** `dedicated_time_handler`
**Parameters:**
- `cluster` (query): Filter by cluster name; Available values: karolina, barbora, dgx *(optional)*
- `period` (query): Filter by time period preset; Available values: planned, active *(optional)*
**Responses:**
- `200`: List of dedicated time entries
- `400`: Failed to deserialize query, Invalid cluster, Invalid period
Example:
```json
{
"message": "Invalid cluster: el_gordo"
}
```
- `500`: Failed to retrieve dedicated time due to a server error
Example:
```json
{
"message": "Failed to retreive dedicated time"
}
```
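For illustration, a request sketch using `curl` (both query parameters are optional):
```console
$ curl "https://scs.it4i.cz/api/v2/dedicated-time?cluster=karolina&period=planned"
```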
### `/dedicated-time-calendar`
**GET**
- **Summary:** Get dedicated times
- **Description:** Retrieves dedicated time entries and generates a VCalendar response.
- **OperationId:** `dedicated_time_calendar`
**Responses:**
- `200`: Dedicated time VCalendar
Example:
```
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//SUTD Timetable Calendar//randName//EN
CALSCALE:GREGORIAN
BEGIN:VEVENT
UID:1234@example.com
DTSTAMP:20230101T000000Z
DTSTART:20230101T000000Z
DTEND:20230102T000000Z
SUMMARY:Sample Dedicated Time - Cluster Outage
DESCRIPTION:Sample Dedicated Time - Cluster Outage
END:VEVENT
END:VCALENDAR
```
- `500`: Failed to retrieve dedicated time calendar
Example:
```json
{
"message": "Failed to retreive dedicated time calendar"
}
```
### `/motd`
**GET**
- **Summary:** Get messages of the day
- **Description:** Retrieves messages of the day, optionally filtered by category
- **OperationId:** `motd`
**Parameters:**
- `category` (query): *(optional)*
**Responses:**
- `200`: List of motd entries
- `400`: Failed to deserialize query, Invalid motd category
- `500`: Failed to retrieve motd entries due to a server error
Example:
```json
{
"message": "Failed to retrieve motd"
}
```
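For illustration, a request sketch using `curl` (the `category` value below is taken from the schema examples):
```console
$ curl "https://scs.it4i.cz/api/v2/motd?category=public-service-announcement"
```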
## Components
### Schemas
#### DedicatedTime
```yaml
type: object
required:
- updated_at
properties:
cluster_type:
type: [string, 'null']
date_efficiency:
type: [string, 'null']
format: date-time
date_expiration:
type: [string, 'null']
format: date-time
updated_at:
type: string
format: date-time
```
#### Motd
```yaml
type: object
required:
- id
- author
- category
- created_at
- updated_at
- date_modification
- title
- message_body
- systems
properties:
id:
type: integer
format: int32
examples: [1]
author:
type: string
examples: [Admin]
category:
type: string
examples: [public-service-announcement]
created_at:
type: string
format: date-time
updated_at:
type: string
format: date-time
date_modification:
type: string
format: date-time
date_efficiency:
type: [string, 'null']
format: date-time
date_expiration:
type: [string, 'null']
format: date-time
date_outage_efficiency:
type: [string, 'null']
format: date-time
date_outage_expiration:
type: [string, 'null']
format: date-time
title:
type: string
examples: [Important Update]
message_body:
type: string
examples: [We are experiencing some service disruptions.]
systems:
type: array
items:
type: string
examples: [Karolina]
```
#### MsgResponse
```yaml
type: object
description: |
Common struct for DTO-less responses
e.g. ```200 {"message":"Operation succeeded"}```
required:
- message
properties:
message:
type: string
examples: [API response]
```
# Hardware Overview
!!! important Work in progress
Barbora NG documentation is a WIP.
The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.
The launch of Barbora NG is planned for October/November.
In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.
Barbora NG consists of 141 non-accelerated compute nodes named **cn[001-141]**.
Each node is a powerful x86-64 computer equipped with 192 cores
(2x Intel Xeon 6952P with 96 CPU cores) and 768 GB RAM.
User access to the Barbora NG cluster is provided by two login nodes **login[1-2]**.
The nodes are interlinked through high-speed InfiniBand NDR and Ethernet networks.
The parameters are summarized in the following tables:
| **In general** | |
| ------------------------------------ | --------------------- |
| Architecture of compute nodes | x86-64 |
| Operating system | Linux |
| [**Compute nodes**][1] | |
| Total | 141 |
| Processor Type | [Intel Xeon 6952P][b] |
| Architecture | Granite Rapids |
| Processor cores | 96 |
| Processors per node | 2 |
| RAM | 768 GB |
| Local disk drive | no |
| Compute network | InfiniBand HDR |
| non-accelerated | 141, cn[001-141] |
| **In total** | |
| Theoretical peak performance (Rpeak) | ??? TFLOP/s |
| Cores | 27072 |
| RAM | 108.288 TB |
[1]: compute-nodes.md
[2]: ../general/resources-allocation-policy.md
[3]: network.md
[4]: storage.md
[5]: ../general/shell-and-data-access.md
[6]: visualization.md
[a]: https://support.it4i.cz/rt
[b]: https://www.intel.com/content/www/us/en/products/sku/241643/intel-xeon-6952p-processor-480m-cache-2-10-ghz/specifications.html
\ No newline at end of file
# Introduction
!!! important Work in progress
Barbora NG documentation is a WIP.
The documentation is still being developed (reflecting changes in technical specifications) and may be updated frequently.
The launch of Barbora NG is planned for October/November.
In the meantime, the first computational resources have already been allocated in the latest Open Access Grant Competition.
Welcome to Barbora Next Gen (NG) supercomputer cluster.
Barbora NG is our latest supercomputer which consists of 141 compute nodes,
totaling 27072 compute cores with 108288 GB RAM, giving over ??? TFLOP/s theoretical peak performance.
Nodes are interconnected through a fully non-blocking fat-tree InfiniBand NDR network
and are equipped with Intel Granite Rapids processors.
Read more in [Hardware Overview][1].
The cluster runs with an operating system compatible with the Red Hat [Linux family][a]. We have installed a wide range of software packages targeted at different scientific domains.
These packages are accessible via the [modules environment][2].
The user data shared file system and job data shared file system are available to users.
The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].
[1]: hardware-overview.md
[2]: ../environment-and-modules.md
[3]: ../general/resources-allocation-policy.md
[4]: ../general/applying-for-resources.md
[5]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[6]: ../general/shell-and-data-access.md
[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://slurm.schedmd.com/
......@@ -8,7 +8,7 @@ The cluster runs with an operating system compatible with the Red Hat [Linux fam
The user data shared file system and job data shared file system are available to users.
The [PBS Professional Open Source Project][b] workload manager provides [computing resources allocations and job execution][3].
The [Slurm][b] workload manager provides [computing resources allocations and job execution][3].
Read more on how to [apply for resources][4], [obtain login credentials][5] and [access the cluster][6].
......@@ -22,4 +22,4 @@ Read more on how to [apply for resources][4], [obtain login credentials][5] and
[6]: ../general/shell-and-data-access.md
[a]: http://upload.wikimedia.org/wikipedia/commons/1/1b/Linux_Distribution_Timeline.svg
[b]: https://www.pbspro.org/
[b]: https://slurm.schedmd.com/
......@@ -120,7 +120,7 @@ The filesystem is backed up, so that it can be restored in case of a catastrophi
The SCRATCH is realized as Lustre parallel file system and is available from all login and computational nodes. There are 5 OSTs dedicated for the SCRATCH file system.
The SCRATCH filesystem is mounted in directory /scratch. Users may freely create subdirectories and files on the filesystem. Accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and deny service to other users. Should 10TB prove insufficient, contact [support][d], the quota may be lifted upon request.
The SCRATCH filesystem is mounted in the `/scratch/project/PROJECT_ID` directory created automatically with the `PROJECT_ID` project. Accessible capacity is 310TB, shared among all users. Individual users are restricted by filesystem usage quotas, set to 10TB per user. The purpose of this quota is to prevent runaway programs from filling the entire filesystem and deny service to other users. Should 10TB prove insufficient, contact [support][d], the quota may be lifted upon request.
!!! note
The Scratch filesystem is intended for temporary scratch data generated during the calculation as well as for high-performance access to input and output files. All I/O intensive jobs must use the SCRATCH filesystem as their working directory.
......
# e-INFRA CZ Cloud Ostrava
Ostrava cloud consists of 28 nodes from [Karolina][a] supercomputer.
Ostrava cloud consists of 22 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.
......@@ -61,15 +61,15 @@ For the list of deployed OpenStack services, see the [list of components][1].
More information can be found on the [e-INFRA CZ website][2].
[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/quota-limits/
[1]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/openstack-components
[2]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site
[3]: https://docs.account.e-infra.cz/en/docs/access/account#how-to-apply-for-the-first-time
[4]: https://docs.platforms.cloud.e-infra.cz/en/docs/getting-started/creating-first-infrastructure
[5]: https://docs.platforms.cloud.e-infra.cz/en/docs/technical-reference/ostrava-g2-site/quota-limits
[6]: https://ostrava.openstack.cloud.e-infra.cz/
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[7]: https://cyso.cloud/docs/cloud/extra/how-to-use-the-openstack-cli-tools-on-linux/
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[9]: https://docs.platforms.cloud.e-infra.cz/en/docs/how-to-guides/obtaining-api-key
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[a]: ../karolina/introduction.md
......
# IT4I Cloud
IT4I cloud consists of 14 nodes from the [Karolina][a] supercomputer.
The cloud site is built on top of OpenStack,
which is a free open standard cloud computing platform.
!!! Note
The guide describes steps for personal projects.<br>
Some steps may differ for large projects.<br>
For large projects, apply for resources to the [Allocation Committee][11].
## Access
To access the cloud, you must be a member of an active EUROHPC project
or fall into **Access Category B**, i.e. [Access For Thematic HPC Resource Utilisation][11].
A personal OpenStack project is required. Request one by contacting [IT4I Support][12].
The dashboard is available at [https://cloud.it4i.cz][6].
You can see quotas set for the IT4I Cloud in the [Quota Limits][f] section.
## Creating First Instance
To create your first VM instance, follow the steps below:
### Log In
Go to [https://cloud.it4i.cz][6], enter your LDAP username and password and choose the `IT4I_LDAP` domain. After you sign in, you will be redirected to the dashboard.
![](../img/login.png)
### Create Key Pair
SSH key is required for remote access to your instance.
1. Go to **Project > Compute > Key Pairs** and click the **Create Key Pair** button.
![](../img/keypairs.png)
1. In the Create Key Pair window, name your key pair, select `SSH Key` for key type and confirm by clicking Create Key Pair.
![](../img/keypairs1.png)
1. Download and manage the private key according to your operating system.
### Update Security Group
To be able to remotely access your VM instance, you have to allow access in the security group.
1. Go to **Project > Network > Security Groups** and click on **Manage Rules**, for the default security group.
![](../img/securityg.png)
1. Click on **Add Rule**, choose **SSH**, and leave the remaining fields unchanged.
![](../img/securityg1.png)
### Create VM Instance
1. In **Compute > Instances**, click **Launch Instance**.
![](../img/instance.png)
1. Choose Instance Name, Description, and number of instances. Click **Next**.
![](../img/instance1.png)
1. Choose an image from which to boot the instance. Choose to delete the volume after instance delete. Click **Next**.
![](../img/instance2.png)
1. Choose the hardware resources of the instance by selecting a flavor. Additional volumes for data can be attached later on. Click **Next**.
![](../img/instance3.png)
1. Select the network and continue to **Security Groups**.
![](../img/instance4.png)
1. Allocate the security group with SSH rule that you added in the [Update Security Group](it4i-cloud.md#update-security-group) step. Then click **Next** to go to the **Key Pair**.
![](../img/securityg2.png)
1. Select the key that you created in the [Create Key Pair][g] section and launch the instance.
![](../img/instance5.png)
### Associate Floating IP
1. Click on the **Associate** button next to the floating IP.
![](../img/floatingip.png)
1. Select Port to be associated with the instance, then click the **Associate** button.
Now you can join the VM using your preferred SSH client.
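For example, a connection sketch (the login user depends on the image, e.g. `ubuntu` for Ubuntu cloud images; the key path and floating IP are placeholders):
```console
$ ssh -i ~/.ssh/your_private_key ubuntu@<FLOATING_IP>
```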
## Process Automation
You can automate the process using OpenStack.
### OpenStack
Prerequisites:
* Linux/Mac/WSL terminal BASH shell
* installed [OpenStack client][7]
Follow the guide: [https://code.it4i.cz/commandline][10]
Run commands:
```console
source project_openrc.sh.inc
```
```console
./cmdline-demo.sh basic-infrastructure-1
```
[1]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/openstack-components/
[2]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-site/
[3]: https://docs.e-infra.cz/account/
[4]: https://docs.e-infra.cz/compute/openstack/getting-started/creating-first-infrastructure/
[5]: https://docs.e-infra.cz/compute/openstack/technical-reference/ostrava-g2-site/quota-limits/
[6]: https://cloud.it4i.cz
[7]: https://docs.fuga.cloud/how-to-use-the-openstack-cli-tools-on-linux
[8]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/terraform
[9]: https://docs.e-infra.cz/compute/openstack/how-to-guides/obtaining-api-key/
[10]: https://code.it4i.cz/dvo0012/infrastructure-by-script/-/tree/main/openstack-infrastructure-as-code-automation/clouds/g2/ostrava/general/commandline
[11]: https://www.it4i.cz/en/for-users/computing-resources-allocation
[12]: mailto:support@it4i.cz
[a]: ../karolina/introduction.md
[b]: ../general/access/project-access.md
[c]: einfracz-cloud.md
[d]: ../general/accessing-the-clusters/vpn-access.md
[e]: ../general/obtaining-login-credentials/obtaining-login-credentials.md
[f]: it4i-quotas.md
[g]: it4i-cloud.md#create-key-pair
# IT4I Cloud Quotas
| Resource | Quota |
|---------------------------------------|-------|
| Instances | 10 |
| VCPUs | 20 |
| RAM | 32GB |
| Volumes | 20 |
| Volume Snapshots | 12 |
| Volume Storage | 500 |
| Floating-IPs | 1 |
| Security Groups | 10 |
| Security Group Rules | 100 |
| Networks | 1 |
| Ports | 10 |
| Routers | 1 |
| Backups | 12 |
| Groups | 10 |
| rbac_policies | 10 |
| Subnets | 1 |
| Subnet_pools | -1 |
| Fixed-ips | -1 |
| Injected-file-size | 10240 |
| Injected-path-size | 255 |
| Injected-files | 5 |
| Key-pairs | 100 |
| Properties | 128 |
| Server-groups | 10 |
| Server-group-members | 10 |
| Backup-gigabytes | 1002 |
| Per-volume-gigabytes | -1 |
File moved
File moved
# Using NVIDIA Grace Partition
For testing your application on the NVIDIA Grace Partition,
you need to prepare a job script for that partition or use the interactive job:
```console
salloc -N 1 -c 144 -A PROJECT-ID -p p11-grace --time=08:00:00
```
where:
- `-N 1` allocates a single node,
- `-c 144` allocates 144 cores,
- `-p p11-grace` selects the NVIDIA Grace partition,
- `--time=08:00:00` allocates the resources for 8 hours.
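A minimal batch script for the same partition might look like the following sketch (the `./myprog` step is a placeholder for your application):
```bash
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --cpus-per-task=144
#SBATCH --account=PROJECT-ID
#SBATCH --partition=p11-grace
#SBATCH --time=08:00:00

ml NVHPC        # load one of the toolchains described below
./myprog        # placeholder for your application
```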
## Available Toolchains
The platform offers three toolchains:
- Standard GCC (as a module `ml GCC`)
- [NVHPC](https://developer.nvidia.com/hpc-sdk) (as a module `ml NVHPC`)
- [Clang for NVIDIA Grace](https://developer.nvidia.com/grace/clang) (installed in `/opt/nvidia/clang`)
!!! note
The NVHPC toolchain showed strong results with a minimal amount of tuning in our initial evaluation.
### GCC Toolchain
The GCC compiler seems to struggle with vectorization of short (constant-length) loops, which tend to get completely unrolled/eliminated instead of being vectorized. For example, a simple nested loop such as
```cpp
for(int i = 0; i < 1000000; ++i) {
// Iterations dependent in "i"
// ...
for(int j = 0; j < 8; ++j) {
// but independent in "j"
// ...
}
}
```
may emit scalar code for the inner loop, leading to no vectorization being used at all.
### Clang (For Grace) Toolchain
Clang/LLVM tends to behave similarly, but it can be guided to properly vectorize the inner loop either with the flags `-O3 -ffast-math -march=native -fno-unroll-loops -mllvm -force-vector-width=8` or with pragmas such as `#pragma clang loop vectorize_width(8)` and `#pragma clang loop unroll(disable)`.
```cpp
for(int i = 0; i < 1000000; ++i) {
// Iterations dependent in "i"
// ...
#pragma clang loop unroll(disable) vectorize_width(8)
for(int j = 0; j < 8; ++j) {
// but independent in "j"
// ...
}
}
```
!!! note
Our basic experiments show that fixed-width vectorization (NEON) tends to perform better than SVE in the case of short (register-length) loops. In cases like the one above, where the specified `vectorize_width` is larger than the available vector unit width, Clang will emit multiple NEON instructions (e.g., 4 instructions will be emitted to process 8 64-bit operations in the 128-bit units of Grace).
### NVHPC Toolchain
The NVHPC toolchain handled the aforementioned case without any additional tuning. A simple `-O3 -march=native -fast` should therefore be sufficient.
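For illustration, a minimal compile line with these flags might look as follows (a sketch; `loops.cpp` is a placeholder for a source file containing the loop nest above):
```console
ml NVHPC
nvc++ -O3 -march=native -fast loops.cpp -o loops
```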
## Basic Math Libraries
The basic libraries (BLAS and LAPACK) are included in the NVHPC toolchain and can be linked simply with `-lblas` and `-llapack`, respectively (`lp64` and `ilp64` versions are also included).
!!! note
The Grace platform doesn't include a CUDA-capable GPU, therefore `nvcc` will fail with an error. This means that `nvc`, `nvc++`, and `nvfortran` should be used instead.
### NVIDIA Performance Libraries
The [NVPL](https://developer.nvidia.com/nvpl) package includes a more extensive set of libraries in both sequential and multi-threaded versions:
- BLACS: `-lnvpl_blacs_{lp64,ilp64}_{mpich,openmpi3,openmpi4,openmpi5}`
- BLAS: `-lnvpl_blas_{lp64,ilp64}_{seq,gomp}`
- FFTW: `-lnvpl_fftw`
- LAPACK: `-lnvpl_lapack_{lp64,ilp64}_{seq,gomp}`
- ScaLAPACK: `-lnvpl_scalapack_{lp64,ilp64}`
- RAND: `-lnvpl_rand` or `-lnvpl_rand_mt`
- SPARSE: `-lnvpl_sparse`
This package should be compatible with all available toolchains and includes CMake module files for easy integration into CMake-based projects. For further documentation, see [NVPL](https://docs.nvidia.com/nvpl).
### Recommended BLAS Library
We recommend using the multi-threaded BLAS library from the NVPL package.
!!! note
It is important to pin the processes using **OMP_PROC_BIND=spread**
Example:
```console
$ ml NVHPC
$ nvc -O3 -march=native myprog.c -o myprog -lnvpl_blas_lp64_gomp
$ OMP_PROC_BIND=spread ./myprog
```
## Basic Communication Libraries
The OpenMPI 4 implementation is included with the NVHPC toolchain and is exposed as a module (`ml OpenMPI`). The following example
```cpp
#include <mpi.h>
#include <sched.h>
#include <omp.h>
int main(int argc, char **argv)
{
int rank;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#pragma omp parallel
{
printf("Hello on rank %d, thread %d on CPU %d\n", rank, omp_get_thread_num(), sched_getcpu());
}
MPI_Finalize();
}
```
can be compiled and run as follows
```console
ml OpenMPI
mpic++ -fast -fopenmp hello.cpp -o hello
OMP_PROC_BIND=close OMP_NUM_THREADS=4 mpirun -np 4 --map-by slot:pe=36 ./hello
```
In this configuration, we run 4 ranks, each bound to one quarter of the cores and running 4 OpenMP threads.
## Simple BLAS Application
The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS 3 routine).
Stationary probability vector estimation in `C++`:
```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"
const size_t ITERATIONS = 32;
const size_t MATRIX_SIZE = 1024;
int main(int argc, char *argv[])
{
const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;
std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));
for(size_t i = 0; i < MATRIX_SIZE; ++i)
a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
a[0] = 0.5f;
std::vector<float> w1(matrixElements, 0.0f);
std::vector<float> w2(matrixElements, 0.0f);
std::copy(a.begin(), a.end(), w1.begin());
std::vector<float> *t1, *t2;
t1 = &w1;
t2 = &w2;
auto c1 = std::chrono::steady_clock::now();
for(size_t i = 0; i < ITERATIONS; ++i)
{
std::fill(t2->begin(), t2->end(), 0.0f);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
1.0f, t1->data(), MATRIX_SIZE,
a.data(), MATRIX_SIZE,
1.0f, t2->data(), MATRIX_SIZE);
std::swap(t1, t2);
}
auto c2 = std::chrono::steady_clock::now();
for(size_t i = 0; i < MATRIX_SIZE; ++i)
{
std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
}
std::cout << std::endl;
std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
return 0;
}
```
Stationary probability vector estimation in `Fortran`:
```fortran
program main
implicit none
integer :: matrix_size, iterations
integer :: i
real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
real, pointer :: out_data(:), out_diag(:)
integer :: cr, cm, c1, c2
iterations = 32
matrix_size = 1024
call system_clock(count_rate=cr)
call system_clock(count_max=cm)
allocate(a(matrix_size, matrix_size))
allocate(w1(matrix_size, matrix_size))
allocate(w2(matrix_size, matrix_size))
a(:,:) = 1.0 / real(matrix_size)
a(:,1) = 0.5 / real(matrix_size - 1)
a(1,1) = 0.5
w1 = a
w2(:,:) = 0.0
t1 => w1
t2 => w2
call system_clock(c1)
do i = 0, iterations
t2(:,:) = 0.0
call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)
tmp => t1
t1 => t2
t2 => tmp
end do
call system_clock(c2)
out_data(1:size(t1)) => t1
out_diag => out_data(1::matrix_size+1)
print *, out_diag
print *, "Elapsed Time: ", (c2 - c1) / real(cr)
deallocate(a)
deallocate(w1)
deallocate(w2)
end program main
```
### Using NVHPC Toolchain
The C++ version of the example can be compiled with NVHPC and run as follows
```console
ml NVHPC
nvc++ -O3 -march=native -fast -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lblas main.cpp -o main
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main
```
The Fortran version is just as simple:
```console
ml NVHPC
nvfortran -O3 -march=native -fast -lblas main.f90 -o main.x
OMP_NUM_THREADS=144 OMP_PROC_BIND=spread ./main.x
```
!!! note
It may be advantageous to use the NVPL libraries instead of the NVHPC ones. For example, the DGEMM BLAS 3 routine from NVPL is almost 30% faster than the NVHPC one.
### Using Clang (For Grace) Toolchain
Similarly, the Clang for Grace toolchain with NVPL BLAS can be used to compile the C++ version of the example.
```console
ml NVHPC
/opt/nvidia/clang/17.23.11/bin/clang++ -O3 -march=native -ffast-math -I$NVHPC/Linux_aarch64/$EBVERSIONNVHPC/compilers/include/lp64 -lnvpl_blas_lp64_gomp main.cpp -o main
```
!!! note
The NVHPC module is used only for the `cblas.h` include in this case. This can be avoided by changing the code to use `nvpl_blas.h` instead.
## Additional Resources
- [https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/][1]
- [https://developer.nvidia.com/hpc-sdk][2]
- [https://developer.nvidia.com/grace/clang][3]
- [https://docs.nvidia.com/nvpl][4]
[1]: https://www.nvidia.com/en-us/data-center/grace-cpu-superchip/
[2]: https://developer.nvidia.com/hpc-sdk
[3]: https://developer.nvidia.com/grace/clang
[4]: https://docs.nvidia.com/nvpl
# Heterogeneous Memory Management on Intel Platforms
Partition `p10-intel` offers heterogeneous memory directly exposed to the user. This allows manually picking the appropriate kind of memory to be used at process or even single-allocation granularity. Both kinds of memory are exposed as memory-only NUMA nodes. This allows both coarse-grained (process level) and fine-grained (allocation level) control over the memory type used.
## Overview
At the process level, the `numactl` facilities can be utilized, while the Intel-provided `memkind` library allows for finer control. Both the `memkind` library and `numactl` can be accessed by loading the `memkind` module or the `OpenMPI` module (`numactl` only).
```bash
ml memkind
```
### Process Level (NUMACTL)
The `numactl` utility allows you either to restrict the memory pool of the process to a specific set of memory NUMA nodes
```bash
numactl --membind <node_ids_set>
```
or to select a single preferred node
```bash
numactl --preferred <node_id>
```
where `<node_ids_set>` is a comma-separated list (e.g., `0,2,5,...`), possibly in combination with ranges (such as `0-5`). The `membind` option kills the process if it requests more memory than can be satisfied from the specified nodes. The `preferred` option instead reverts to using other nodes according to their NUMA distance in the same situation.
A convenient way to check the `numactl` configuration is
```bash
numactl -s
```
which prints the configuration of its execution environment, e.g.
```bash
numactl --membind 8-15 numactl -s
policy: bind
preferred node: 0
physcpubind: 0 1 2 ... 189 190 191
cpubind: 0 1 2 3 4 5 6 7
nodebind: 0 1 2 3 4 5 6 7
membind: 8 9 10 11 12 13 14 15
```
The last row shows that memory allocations are restricted to NUMA nodes `8-15`.
### Allocation Level (MEMKIND)
The `memkind` library (in its simplest use case) offers a new variant of the `malloc/free` function pair, which allows specifying the kind of memory to be used for a given allocation. Moving a specific allocation from the default to the HBM memory pool can then be achieved by replacing:
```cpp
void *pData = malloc(<SIZE>);
/* ... */
free(pData);
```
with
```cpp
#include <memkind.h>
void *pData = memkind_malloc(MEMKIND_HBW, <SIZE>);
/* ... */
memkind_free(NULL, pData); // "kind" parameter is deduced from the address
```
Similarly, other memory types can be chosen.
!!! note
The allocation will return a `NULL` pointer when memory of the specified kind is not available.
## High Bandwidth Memory (HBM)
Intel Sapphire Rapids (partition `p10-intel`) consists of two sockets, each with `128GB` of DDR and `64GB` of on-package HBM memory. The machine is configured in FLAT mode and therefore exposes the HBM memory as memory-only NUMA nodes (`16GB` per 12-core tile). The configuration can be verified by running
```bash
numactl -H
```
which should show 16 NUMA nodes (`0-7` should contain 12 cores and `32GB` of DDR DRAM, while `8-15` should have no cores and `16GB` of HBM each).
![](../../img/cs/guides/p10_numa_sc4_flat.png)
### Process Level
With this we can easily restrict application to DDR DRAM or HBM memory:
```bash
# Only DDR DRAM
numactl --membind 0-7 ./stream
# ...
Function Best Rate MB/s Avg time Min time Max time
Copy: 369745.8 0.043355 0.043273 0.043588
Scale: 366989.8 0.043869 0.043598 0.045355
Add: 378054.0 0.063652 0.063483 0.063899
Triad: 377852.5 0.063621 0.063517 0.063884
# Only HBM
numactl --membind 8-15 ./stream
# ...
Function Best Rate MB/s Avg time Min time Max time
Copy: 1128430.1 0.015214 0.014179 0.015615
Scale: 1045065.2 0.015814 0.015310 0.016309
Add: 1096992.2 0.022619 0.021878 0.024182
Triad: 1065152.4 0.023449 0.022532 0.024559
```
The DDR DRAM achieves a bandwidth of around 400 GB/s, while the HBM clears the 1 TB/s bar.
Some further improvements can be achieved by entirely isolating a process to a single tile. This can be useful for MPI jobs, where `$OMPI_COMM_WORLD_RANK` can be used to bind each process individually. A simple wrapper script to do this may look like
```bash
#!/bin/bash
numactl --membind $((8 + $OMPI_COMM_WORLD_RANK)) $@
```
and can be used as
```bash
mpirun -np 8 --map-by slot:pe=12 membind_wrapper.sh ./stream_mpi
```
(8 tiles with 12 cores each). However, this approach assumes that the `16GB` of HBM memory local to the tile is sufficient for each process (memory cannot spill between tiles). This approach may be significantly more useful in combination with `--preferred` instead of `--membind`, to force a preference for local HBM with spilling to DDR DRAM. Otherwise,
```bash
mpirun -n 8 --map-by slot:pe=12 numactl --membind 8-15 ./stream_mpi
```
is most likely preferable even for MPI workloads. Applying the above approach to MPI STREAM with 8 ranks and 1-24 threads per rank, we can expect the following results:
![](../../img/cs/guides/p10_stream_dram.png)
![](../../img/cs/guides/p10_stream_hbm.png)
### Allocation Level
Allocation-level memory kind selection using the `memkind` library can be illustrated with a modified STREAM benchmark. The STREAM benchmark uses three working arrays (A, B, and C), whose allocation can be changed to `memkind_malloc` as follows
```cpp
#include <memkind.h>
// ...
STREAM_TYPE *a = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *b = (STREAM_TYPE *)memkind_malloc(MEMKIND_REGULAR, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
STREAM_TYPE *c = (STREAM_TYPE *)memkind_malloc(MEMKIND_HBW_ALL, STREAM_ARRAY_SIZE * sizeof(STREAM_TYPE));
// ...
memkind_free(NULL, a);
memkind_free(NULL, b);
memkind_free(NULL, c);
```
Arrays A and C are allocated from HBM (`MEMKIND_HBW_ALL`), while DDR DRAM (`MEMKIND_REGULAR`) is used for B.
The code then has to be linked with the `memkind` library
```bash
gcc -march=native -O3 -fopenmp -lmemkind memkind_stream.c -o memkind_stream
```
and can be run as
```bash
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
OMP_NUM_THREADS=$((N*12)) OMP_PROC_BIND=spread ./memkind_stream
```
While the `memkind` library should be able to detect HBM memory on its own (through `HMAT` and `hwloc`), this is not supported on `p10-intel`. This means that the NUMA nodes representing HBM have to be specified manually using the `MEMKIND_HBW_NODES` environment variable.
![](../../img/cs/guides/p10_stream_memkind.png)
With this setup, we can see that the simple copy operation (C[i] = A[i]) achieves bandwidth comparable to the application bound entirely to HBM memory. On the other hand, the scale operation (B[i] = s*C[i]) is mostly limited by DDR DRAM bandwidth. It's also worth noting that operations combining all three arrays perform close to the HBM-only configuration.
## Simple Application
One application that can greatly benefit from the availability of a large, slower memory and a smaller, faster memory is computing a histogram with many bins over a large dataset.
```cpp
#include <iostream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cstring>
#include <omp.h>
#include <memkind.h>
const size_t N_DATA_SIZE = 2 * 1024 * 1024 * 1024ull;
const size_t N_BINS_COUNT = 1 * 1024 * 1024ull;
const size_t N_ITERS = 10;
#if defined(HBM)
#define DATA_MEMKIND MEMKIND_REGULAR
#define BINS_MEMKIND MEMKIND_HBW_ALL
#else
#define DATA_MEMKIND MEMKIND_REGULAR
#define BINS_MEMKIND MEMKIND_REGULAR
#endif
int main(int argc, char *argv[])
{
const double binWidth = 1.0 / double(N_BINS_COUNT + 1);
double *pData = (double *)memkind_malloc(DATA_MEMKIND, N_DATA_SIZE * sizeof(double));
size_t *pBins = (size_t *)memkind_malloc(BINS_MEMKIND, N_BINS_COUNT * omp_get_max_threads() * sizeof(size_t));
#pragma omp parallel
{
drand48_data state;
srand48_r(omp_get_thread_num(), &state);
#pragma omp for
for(size_t i = 0; i < N_DATA_SIZE; ++i)
drand48_r(&state, &pData[i]);
}
auto c1 = std::chrono::steady_clock::now();
for(size_t it = 0; it < N_ITERS; ++it)
{
#pragma omp parallel
{
for(size_t i = 0; i < N_BINS_COUNT; ++i)
pBins[omp_get_thread_num()*N_BINS_COUNT + i] = size_t(0);
#pragma omp for
for(size_t i = 0; i < N_DATA_SIZE; ++i)
{
const size_t idx = size_t(pData[i] / binWidth) % N_BINS_COUNT;
pBins[omp_get_thread_num()*N_BINS_COUNT + idx]++;
}
}
}
auto c2 = std::chrono::steady_clock::now();
#pragma omp parallel for
for(size_t i = 0; i < N_BINS_COUNT; ++i)
{
for(size_t j = 1; j < omp_get_max_threads(); ++j)
pBins[i] += pBins[j*N_BINS_COUNT + i];
}
std::cout << "Elapsed Time [s]: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
size_t total = 0;
#pragma omp parallel for reduction(+:total)
for(size_t i = 0; i < N_BINS_COUNT; ++i)
total += pBins[i];
std::cout << "Total Items: " << total << std::endl;
memkind_free(NULL, pData);
memkind_free(NULL, pBins);
return 0;
}
```
### Using HBM Memory (P10-Intel)
The following commands can be used to compile and run the example application above
```bash
ml GCC memkind
export MEMKIND_HBW_NODES=8,9,10,11,12,13,14,15
g++ -O3 -fopenmp -lmemkind histogram.cpp -o histogram_dram
g++ -O3 -fopenmp -lmemkind -DHBM histogram.cpp -o histogram_hbm
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_dram
OMP_PROC_BIND=spread GOMP_CPU_AFFINITY=0-95 OMP_NUM_THREADS=96 ./histogram_hbm
```
Moving the histogram bin data into HBM memory should speed up the algorithm by more than a factor of two. It should be noted that also moving the `pData` array into HBM memory worsens this result (presumably because the algorithm can saturate both memory interfaces).
## Additional Resources
- [https://linux.die.net/man/8/numactl][1]
- [http://memkind.github.io/memkind/man_pages/memkind.html][2]
- [https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory][3]
[1]: https://linux.die.net/man/8/numactl
[2]: http://memkind.github.io/memkind/man_pages/memkind.html
[3]: https://lenovopress.lenovo.com/lp1738-implementing-intel-high-bandwidth-memory
\ No newline at end of file
# Using VMware Horizon
VMware Horizon is a virtual desktop infrastructure (VDI) solution
that enables users to access virtual desktops and applications from any device and any location.
It provides a comprehensive end-to-end solution for managing and delivering virtual desktops and applications,
including features such as session management, user authentication, and virtual desktop provisioning.
![](../../img/horizon.png)
## How to Access VMware Horizon
!!! important
Access to VMware Horizon requires IT4I VPN.
1. Contact [IT4I support][a] with a request for access and VM allocation.
1. [Download][1] and install the VMware Horizon Client for Windows.
1. Add a new server `https://vdi-cs01.msad.it4i.cz/` in the Horizon client.
1. Connect to the server using your IT4I username and password.
Username is in the `domain\username` format and the domain is `msad.it4i.cz`.
For example: `msad.it4i.cz\user123`
## Example
Below is an example of how to mount a remote folder and check the connection on Windows OS:
### Prerequisites
3D applications
* [Blender][3]
SSHFS for remote access
* [sshfs-win][4]
* [winfsp][5]
* [sshfs-win-manager][6]
* ssh keys for access to clusters
### Steps
1. Start the VPN and connect to the server via VMware Horizon Client.
![](../../img/vmware.png)
1. Mount a remote folder.
* Run sshfs-win-manager.
![](../../img/sshfs.png)
* Add a new connection.
![](../../img/sshfs1.png)
* Click on **Connect**.
![](../../img/sshfs2.png)
1. Check that the folder is mounted.
![](../../img/mount.png)
1. Check the GPU resources.
![](../../img/gpu.png)
### Blender
Now if you run, for example, Blender, you can check the available GPU resources in Blender Preferences.
![](../../img/blender.png)
[a]: mailto:support@it4i.cz
[1]: https://vdi-cs01.msad.it4i.cz/
[2]: https://www.paraview.org/download/
[3]: https://www.blender.org/download/
[4]: https://github.com/winfsp/sshfs-win/releases
[5]: https://github.com/winfsp/winfsp/releases/
[6]: https://github.com/evsar3/sshfs-win-manager/releases
# Using IBM Power Partition
For testing your application on the IBM Power partition,
you need to prepare a job script for that partition or use the interactive job:
```console
salloc -N 1 -c 192 -A PROJECT-ID -p p07-power --time=08:00:00
```
where:
- `-N 1` allocates a single node,
- `-c 192` allocates 192 cores (threads),
- `-p p07-power` selects the IBM Power partition,
- `--time=08:00:00` allocates the resources for 8 hours.
On the partition, you should reload the list of modules:
```
ml architecture/ppc64le
```
The platform offers both `GNU`-based and proprietary IBM toolchains for building applications. IBM also provides an optimized BLAS routines library ([ESSL](https://www.ibm.com/docs/en/essl/6.1)), which can be used with both toolchains.
## Building Applications
Our sample application depends on `BLAS`, so we start by loading the following modules (regardless of which toolchain we want to use):
```
ml GCC OpenBLAS
```
### GCC Toolchain
In the case of the GCC toolchain, we can compile the application using either `g++`
```
g++ -lopenblas hello.cpp -o hello
```
or `gfortran`
```
gfortran -lopenblas hello.f90 -o hello
```
as usual.
### IBM Toolchain
The IBM toolchain requires additional environment setup as it is installed in `/opt/ibm` and is not exposed as a module
```
IBM_ROOT=/opt/ibm
OPENXLC_ROOT=$IBM_ROOT/openxlC/17.1.1
OPENXLF_ROOT=$IBM_ROOT/openxlf/17.1.1
export PATH=$OPENXLC_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLC_ROOT/lib:$LD_LIBRARY_PATH
export PATH=$OPENXLF_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$OPENXLF_ROOT/lib:$LD_LIBRARY_PATH
```
From there, we can use either `ibm-clang++`
```
ibm-clang++ -lopenblas hello.cpp -o hello
```
or `xlf`
```
xlf -lopenblas hello.f90 -o hello
```
to build the application as usual.
!!! note
The combination of `xlf` and `openblas` seems to cause severe performance degradation. Therefore, the `ESSL` library should be preferred (see below).
### Using ESSL Library
The [ESSL](https://www.ibm.com/docs/en/essl/6.1) library is installed in `/opt/ibm/math/essl/7.1`, so we define additional environment variables
```
IBM_ROOT=/opt/ibm
ESSL_ROOT=${IBM_ROOT}/math/essl/7.1
export LD_LIBRARY_PATH=$ESSL_ROOT/lib64:$LD_LIBRARY_PATH
```
The simplest way to utilize `ESSL` in an application that already uses `BLAS` or `CBLAS` routines is to link with the provided `libessl.so`. This can be done by replacing `-lopenblas` with `-lessl`, or with `-lessl -lopenblas` in case `ESSL` does not provide all of the required `BLAS` routines.
In practice, this can look like
```
g++ -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.cpp -o hello
```
or
```
gfortran -L${ESSL_ROOT}/lib64 -lessl -lopenblas hello.f90 -o hello
```
and similarly for IBM compilers (`ibm-clang++` and `xlf`).
## Hello World Applications
The `hello world` example application (written in `C++` and `Fortran`) uses a simple stationary probability vector estimation to illustrate the use of GEMM (a BLAS 3 routine).
Stationary probability vector estimation in `C++`:
```c++
#include <iostream>
#include <vector>
#include <chrono>
#include "cblas.h"
const size_t ITERATIONS = 32;
const size_t MATRIX_SIZE = 1024;
int main(int argc, char *argv[])
{
const size_t matrixElements = MATRIX_SIZE*MATRIX_SIZE;
std::vector<float> a(matrixElements, 1.0f / float(MATRIX_SIZE));
for(size_t i = 0; i < MATRIX_SIZE; ++i)
a[i] = 0.5f / (float(MATRIX_SIZE) - 1.0f);
a[0] = 0.5f;
std::vector<float> w1(matrixElements, 0.0f);
std::vector<float> w2(matrixElements, 0.0f);
std::copy(a.begin(), a.end(), w1.begin());
std::vector<float> *t1, *t2;
t1 = &w1;
t2 = &w2;
auto c1 = std::chrono::steady_clock::now();
for(size_t i = 0; i < ITERATIONS; ++i)
{
std::fill(t2->begin(), t2->end(), 0.0f);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE,
1.0f, t1->data(), MATRIX_SIZE,
a.data(), MATRIX_SIZE,
1.0f, t2->data(), MATRIX_SIZE);
std::swap(t1, t2);
}
auto c2 = std::chrono::steady_clock::now();
for(size_t i = 0; i < MATRIX_SIZE; ++i)
{
std::cout << (*t1)[i*MATRIX_SIZE + i] << " ";
}
std::cout << std::endl;
std::cout << "Elapsed Time: " << std::chrono::duration<double>(c2 - c1).count() << std::endl;
return 0;
}
```
Stationary probability vector estimation in `Fortran`:
```fortran
program main
implicit none
integer :: matrix_size, iterations
integer :: i
real, allocatable, target :: a(:,:), w1(:,:), w2(:,:)
real, dimension(:,:), contiguous, pointer :: t1, t2, tmp
real, pointer :: out_data(:), out_diag(:)
integer :: cr, cm, c1, c2
iterations = 32
matrix_size = 1024
call system_clock(count_rate=cr)
call system_clock(count_max=cm)
allocate(a(matrix_size, matrix_size))
allocate(w1(matrix_size, matrix_size))
allocate(w2(matrix_size, matrix_size))
a(:,:) = 1.0 / real(matrix_size)
a(:,1) = 0.5 / real(matrix_size - 1)
a(1,1) = 0.5
w1 = a
w2(:,:) = 0.0
t1 => w1
t2 => w2
call system_clock(c1)
do i = 0, iterations
t2(:,:) = 0.0
call sgemm('N', 'N', matrix_size, matrix_size, matrix_size, 1.0, t1, matrix_size, a, matrix_size, 1.0, t2, matrix_size)
tmp => t1
t1 => t2
t2 => tmp
end do
call system_clock(c2)
out_data(1:size(t1)) => t1
out_diag => out_data(1::matrix_size+1)
print *, out_diag
print *, "Elapsed Time: ", (c2 - c1) / real(cr)
deallocate(a)
deallocate(w1)
deallocate(w2)
end program main
```
# Using Xilinx Accelerator Platform
The first step to using Xilinx accelerators is to initialize the Vitis (compiler) and XRT (runtime) environments.
```console
$ . /tools/Xilinx/Vitis/2023.1/settings64.sh
$ . /opt/xilinx/xrt/setup.sh
```
## Platform Level Accelerator Management
This should allow you to examine the current platform using `xbutil examine`,
which should output user-level information about the XRT platform and list the available devices
```
$ xbutil examine
System Configuration
OS Name : Linux
Release : 4.18.0-477.27.1.el8_8.x86_64
Version : #1 SMP Thu Aug 31 10:29:22 EDT 2023
Machine : x86_64
CPU Cores : 64
Memory : 257145 MB
Distribution : Red Hat Enterprise Linux 8.8 (Ootpa)
GLIBC : 2.28
Model : ProLiant XL675d Gen10 Plus
XRT
Version : 2.16.0
Branch : master
Hash : f2524a2fcbbabd969db19abf4d835c24379e390d
Hash Date : 2023-10-11 14:01:19
XOCL : 2.16.0, f2524a2fcbbabd969db19abf4d835c24379e390d
XCLMGMT : 2.16.0, f2524a2fcbbabd969db19abf4d835c24379e390d
Devices present
BDF : Shell Logic UUID Device ID Device Ready*
-------------------------------------------------------------------------------------------------------------------------
[0000:88:00.1] : xilinx_u280_gen3x16_xdma_base_1 283BAB8F-654D-8674-968F-4DA57F7FA5D7 user(inst=132) Yes
[0000:8c:00.1] : xilinx_u280_gen3x16_xdma_base_1 283BAB8F-654D-8674-968F-4DA57F7FA5D7 user(inst=133) Yes
* Devices that are not ready will have reduced functionality when using XRT tools
```
Here two Xilinx Alveo u280 accelerators (`0000:88:00.1` and `0000:8c:00.1`) are available.
The `xbutil` utility can also be used to query additional information about a specific device using its BDF address
```console
$ xbutil examine -d "0000:88:00.1"
-------------------------------------------------
[0000:88:00.1] : xilinx_u280_gen3x16_xdma_base_1
-------------------------------------------------
Platform
XSA Name : xilinx_u280_gen3x16_xdma_base_1
Logic UUID : 283BAB8F-654D-8674-968F-4DA57F7FA5D7
FPGA Name :
JTAG ID Code : 0x14b7d093
DDR Size : 0 Bytes
DDR Count : 0
Mig Calibrated : true
P2P Status : disabled
Performance Mode : not supported
P2P IO space required : 64 GB
Clocks
DATA_CLK (Data) : 300 MHz
KERNEL_CLK (Kernel) : 500 MHz
hbm_aclk (System) : 450 MHz
Mac Addresses : 00:0A:35:0E:20:B0
: 00:0A:35:0E:20:B1
Device Status: HEALTHY
Hardware Context ID: 0
Xclbin UUID: 6306D6AE-1D66-AEA7-B15D-446D4ECC53BD
PL Compute Units
Index Name Base Address Usage Status
-------------------------------------------------
0 vadd:vadd_1 0x800000 1 (IDLE)
```
Basic functionality of the device can be checked using `xbutil validate -d <BDF>` as
```console
$ xbutil validate -d "0000:88:00.1"
Validate Device : [0000:88:00.1]
Platform : xilinx_u280_gen3x16_xdma_base_1
SC Version : 4.3.27
Platform ID : 283BAB8F-654D-8674-968F-4DA57F7FA5D7
-------------------------------------------------------------------------------
Test 1 [0000:88:00.1] : aux-connection
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 2 [0000:88:00.1] : pcie-link
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 3 [0000:88:00.1] : sc-version
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 4 [0000:88:00.1] : verify
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 5 [0000:88:00.1] : dma
Details : Buffer size - '16 MB' Memory Tag - 'HBM[0]'
Host -> PCIe -> FPGA write bandwidth = 11988.9 MB/s
Host <- PCIe <- FPGA read bandwidth = 12571.2 MB/s
...
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 6 [0000:88:00.1] : iops
Details : IOPS: 387240(verify)
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 7 [0000:88:00.1] : mem-bw
Details : Throughput (Type: DDR) (Bank count: 2) : 33932.9MB/s
Throughput of Memory Tag: DDR[0] is 16974.1MB/s
Throughput of Memory Tag: DDR[1] is 16974.2MB/s
Throughput (Type: HBM) (Bank count: 1) : 12383.7MB/s
Test Status : [PASSED]
-------------------------------------------------------------------------------
Test 8 [0000:88:00.1] : p2p
Test 9 [0000:88:00.1] : vcu
Test 10 [0000:88:00.1] : aie
Test 11 [0000:88:00.1] : ps-aie
Test 12 [0000:88:00.1] : ps-pl-verify
Test 13 [0000:88:00.1] : ps-verify
Test 14 [0000:88:00.1] : ps-iops
```
Finally, the device can be reinitialized using `xbutil reset -d <BDF>` as
```console
$ xbutil reset -d "0000:88:00.1"
Performing 'HOT Reset' on '0000:88:00.1'
Are you sure you wish to proceed? [Y/n]: Y
Successfully reset Device[0000:88:00.1]
```
This can be useful to recover the device from states such as `HANGING`, reported by `xbutil examine -d <BDF>`.
## OpenCL Platform Level
The `clinfo` utility can be used to verify that the accelerator is visible to OpenCL
```console
$ clinfo
Number of platforms: 2
Platform Profile: FULL_PROFILE
Platform Version: OpenCL 2.1 AMD-APP (3590.0)
Platform Name: AMD Accelerated Parallel Processing
Platform Vendor: Advanced Micro Devices, Inc.
Platform Extensions: cl_khr_icd cl_amd_event_callback
Platform Profile: EMBEDDED_PROFILE
Platform Version: OpenCL 1.0
Platform Name: Xilinx
Platform Vendor: Xilinx
Platform Extensions: cl_khr_icd
<...>
Platform Name: Xilinx
Number of devices: 2
Device Type: CL_DEVICE_TYPE_ACCRLERATOR
Vendor ID: 0h
Max compute units: 0
Max work items dimensions: 3
Max work items[0]: 4294967295
Max work items[1]: 4294967295
Max work items[2]: 4294967295
Max work group size: 4294967295
Preferred vector width char: 1
Preferred vector width short: 1
Preferred vector width int: 1
Preferred vector width long: 1
Preferred vector width float: 1
Preferred vector width double: 0
Max clock frequency: 0Mhz
Address bits: 64
Max memory allocation: 4294967296
Image support: Yes
Max number of images read arguments: 128
Max number of images write arguments: 8
Max image 2D width: 8192
Max image 2D height: 8192
Max image 3D width: 2048
Max image 3D height: 2048
Max image 3D depth: 2048
Max samplers within kernel: 0
Max size of kernel argument: 2048
Alignment (bits) of base address: 32768
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: No
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: No
Round to +ve and infinity: No
IEEE754-2008 fused multiply-add: No
Cache type: None
Cache line size: 64
Cache size: 0
Global memory size: 0
Constant buffer size: 4194304
Max number of constant args: 8
Local memory type: Scratchpad
Local memory size: 16384
Error correction support: 1
Profiling timer resolution: 1
Device endianess: Little
Available: No
Compiler available: No
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue on Host properties:
Out-of-Order: Yes
Profiling: Yes
Platform ID: 0x16fbae8
Name: xilinx_u280_gen3x16_xdma_base_1
Vendor: Xilinx
Driver version: 1.0
Profile: EMBEDDED_PROFILE
Version: OpenCL 1.0
<...>
```
which shows that both `Xilinx` platform and accelerator devices are present.
## Building Applications
To simplify the build process, we define two environment variables: `IT4I_PLATFORM` and `IT4I_BUILD_MODE`.
The first, `IT4I_PLATFORM`, denotes specific accelerator hardware such as `Alveo u250` or `Alveo u280`
and its configuration (stored in `*.xpfm` files).
The list of available platforms can be obtained using the `platforminfo` utility:
```console
$ platforminfo -l
{
"platforms": [
{
"baseName": "xilinx_u280_gen3x16_xdma_1_202211_1",
"version": "202211.1",
"type": "sdaccel",
"dataCenter": "true",
"embedded": "false",
"externalHost": "true",
"serverManaged": "true",
"platformState": "impl",
"usesPR": "true",
"platformFile": "\/opt\/xilinx\/platforms\/xilinx_u280_gen3x16_xdma_1_202211_1\/xilinx_u280_gen3x16_xdma_1_202211_1.xpfm"
},
{
"baseName": "xilinx_u250_gen3x16_xdma_4_1_202210_1",
"version": "202210.1",
"type": "sdaccel",
"dataCenter": "true",
"embedded": "false",
"externalHost": "true",
"serverManaged": "true",
"platformState": "impl",
"usesPR": "true",
"platformFile": "\/opt\/xilinx\/platforms\/xilinx_u250_gen3x16_xdma_4_1_202210_1\/xilinx_u250_gen3x16_xdma_4_1_202210_1.xpfm"
}
]
}
```
Here, `baseName` and potentially `platformFile` are of interest, and either can be specified as the value of `IT4I_PLATFORM`.
In this case, we have the platforms `xilinx_u280_gen3x16_xdma_1_202211_1` (Alveo u280) and `xilinx_u250_gen3x16_xdma_4_1_202210_1` (Alveo u250).
The `IT4I_BUILD_MODE` variable is used to specify the build type (`hw`, `hw_emu`, or `sw_emu`):
- `hw` performs full synthesis for the accelerator,
- `hw_emu` runs both synthesis and emulation for debugging,
- `sw_emu` compiles kernels only for emulation (doesn't require the accelerator and allows a much faster build).
For example, to configure the build for `Alveo u280`, we set:
```console
$ export IT4I_PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1
```
### Software Emulation Mode
The software emulation mode is preferable for development, as HLS synthesis is very time-consuming. To build the following applications in this mode, we set:
```console
$ export IT4I_BUILD_MODE=sw_emu
```
and run each application with `XCL_EMULATION_MODE` set to `sw_emu`:
```console
$ XCL_EMULATION_MODE=sw_emu <application>
```
### Hardware Synthesis Mode
!!! note
The HLS of these simple applications **can take up to 2 hours** to finish.
To allow the application to utilize real hardware, we have to synthesize the FPGA design for the accelerator. This can be done by repeating the same steps used to build the kernels in emulation mode, but with `IT4I_BUILD_MODE` set to `hw`:
```console
$ export IT4I_BUILD_MODE=hw
```
The host application binary can be reused, but it has to be run without `XCL_EMULATION_MODE`:
```console
$ <application>
```
## Sample Applications
The first two samples illustrate the two main approaches to building FPGA-accelerated applications on the Xilinx platform: **XRT** and **OpenCL**.
The final example combines **HIP** with **XRT** to show the basics necessary to build an application that utilizes both GPU and FPGA accelerators.
## Using HLS and XRT
The applications are typically separated into a host side and an accelerator/kernel side.
The following host-side code should be saved as `host.cpp`:
```c++
/*
# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: X11
*/
#include <iostream>
#include <cstring>
// XRT includes
#include "xrt/xrt_bo.h"
#include <experimental/xrt_xclbin.h>
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"
#define DATA_SIZE 4096
int main(int argc, char** argv)
{
if(argc != 2)
{
std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
return EXIT_FAILURE;
}
// Read settings
std::string binaryFile = argv[1];
int device_index = 0;
std::cout << "Open the device" << device_index << std::endl;
auto device = xrt::device(device_index);
std::cout << "Load the xclbin " << binaryFile << std::endl;
    auto uuid = device.load_xclbin(binaryFile); // load the container passed on the command line
size_t vector_size_bytes = sizeof(int) * DATA_SIZE;
//auto krnl = xrt::kernel(device, uuid, "vadd");
auto krnl = xrt::kernel(device, uuid, "vadd", xrt::kernel::cu_access_mode::exclusive);
std::cout << "Allocate Buffer in Global Memory\n";
auto boIn1 = xrt::bo(device, vector_size_bytes, krnl.group_id(0)); //Match kernel arguments to RTL kernel
auto boIn2 = xrt::bo(device, vector_size_bytes, krnl.group_id(1));
auto boOut = xrt::bo(device, vector_size_bytes, krnl.group_id(2));
// Map the contents of the buffer object into host memory
auto bo0_map = boIn1.map<int*>();
auto bo1_map = boIn2.map<int*>();
auto bo2_map = boOut.map<int*>();
std::fill(bo0_map, bo0_map + DATA_SIZE, 0);
std::fill(bo1_map, bo1_map + DATA_SIZE, 0);
std::fill(bo2_map, bo2_map + DATA_SIZE, 0);
// Create the test data
int bufReference[DATA_SIZE];
for (int i = 0; i < DATA_SIZE; ++i)
{
bo0_map[i] = i;
bo1_map[i] = i;
bufReference[i] = bo0_map[i] + bo1_map[i]; //Generate check data for validation
}
// Synchronize buffer content with device side
std::cout << "synchronize input buffer data to device global memory\n";
boIn1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
boIn2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
std::cout << "Execution of the kernel\n";
auto run = krnl(boIn1, boIn2, boOut, DATA_SIZE); //DATA_SIZE=size
run.wait();
// Get the output;
std::cout << "Get the output data from the device" << std::endl;
boOut.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
// Validate results
if (std::memcmp(bo2_map, bufReference, vector_size_bytes))
throw std::runtime_error("Value read back does not match reference");
std::cout << "TEST PASSED\n";
return 0;
}
```
The host-side code can now be compiled using the GCC toolchain as:
```console
$ g++ host.cpp -I$XILINX_XRT/include -I$XILINX_VIVADO/include -L$XILINX_XRT/lib -lxrt_coreutil -o host
```
The accelerator side (simple vector-add kernel) should be saved as `vadd.cpp`.
```c++
/*
# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: X11
*/
extern "C" {
void vadd(
const unsigned int *in1, // Read-Only Vector 1
const unsigned int *in2, // Read-Only Vector 2
unsigned int *out, // Output Result
int size // Size in integer
)
{
#pragma HLS INTERFACE m_axi port=in1 bundle=aximm1 // map in1 to an AXI4 master interface
#pragma HLS INTERFACE m_axi port=in2 bundle=aximm2 // separate bundle so in2 can be read in parallel
#pragma HLS INTERFACE m_axi port=out bundle=aximm1 // out shares the first AXI4 bundle with in1
for(int i = 0; i < size; ++i)
{
out[i] = in1[i] + in2[i];
}
}
}
```
The accelerator-side code is built using Vitis `v++`.
This is a two-step process, which either builds an emulation binary or performs full HLS (depending on the value of the `-t` argument).
The platform (specific accelerator) also has to be specified at this step (both for emulation and full HLS).
```console
$ v++ -c -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM -k vadd vadd.cpp -o vadd.xo
$ v++ -l -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM vadd.xo -o vadd.xclbin
```
This process should result in `vadd.xclbin`, which can be loaded by the host-side application.
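Optionally, the produced container can be inspected from C++ before running the host application. The sketch below assumes the XRT native `xrt::xclbin` API (the `experimental/xrt_xclbin.h` header already used above) with the `get_xsa_name()` and `get_kernels()` calls available in recent XRT releases; the exact interface may differ between XRT versions.

```c++
// inspect_xclbin.cpp - list the kernels contained in an .xclbin (sketch, XRT native C++ API assumed)
#include <cstdlib>
#include <iostream>
#include <experimental/xrt_xclbin.h>

int main(int argc, char **argv)
{
    if(argc != 2)
    {
        std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
        return EXIT_FAILURE;
    }
    xrt::xclbin xclbin(argv[1]);                    // parse the container without programming the device
    std::cout << "Platform: " << xclbin.get_xsa_name() << std::endl;
    for(const auto &kernel : xclbin.get_kernels())  // enumerate kernels compiled into the container
        std::cout << "Kernel:   " << kernel.get_name() << std::endl;
    return 0;
}
```

The sketch can be compiled in the same way as the host application above (linking against `xrt_coreutil`).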
### Running the Application
With both the host application and the kernel binary at hand, the application can be launched (in emulation mode) as
```console
$ XCL_EMULATION_MODE=sw_emu ./host vadd.xclbin
```
or with real hardware (having compiled kernels with `IT4I_BUILD_MODE=hw`)
```console
$ ./host vadd.xclbin
```
## Using HLS and OpenCL
The host-side application code should be saved as `host.cpp`.
This application attempts to find the `Xilinx` OpenCL platform in the system and selects the first device in that platform.
The device is then configured with the provided kernel binary.
Other than that, the only difference from a typical OpenCL vector-add is the use of `enqueueTask(...)` to launch the kernel
(instead of the usual `enqueueNDRangeKernel`).
```c++
#include <iostream>
#include <fstream>
#include <iterator>
#include <vector>
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#include <CL/cl2.hpp>
#include <CL/cl_ext_xilinx.h>
std::vector<unsigned char> read_binary_file(const std::string &filename)
{
std::cout << "INFO: Reading " << filename << std::endl;
std::ifstream file(filename, std::ios::binary);
file.unsetf(std::ios::skipws);
std::streampos file_size;
file.seekg(0, std::ios::end);
file_size = file.tellg();
file.seekg(0, std::ios::beg);
std::vector<unsigned char> data;
data.reserve(file_size);
data.insert(data.begin(),
std::istream_iterator<unsigned char>(file),
std::istream_iterator<unsigned char>());
return data;
}
cl::Device select_device()
{
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
cl::Platform platform;
for(cl::Platform &p: platforms)
{
const std::string name = p.getInfo<CL_PLATFORM_NAME>();
std::cout << "PLATFORM: " << name << std::endl;
if(name == "Xilinx")
{
platform = p;
break;
}
}
if(platform == cl::Platform())
{
std::cout << "Xilinx platform not found!" << std::endl;
exit(EXIT_FAILURE);
}
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
return devices[0];
}
static const int DATA_SIZE = 1024;
int main(int argc, char *argv[])
{
if(argc != 2)
{
std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
return EXIT_FAILURE;
}
std::string binary_file = argv[1];
std::vector<int> source_a(DATA_SIZE, 10);
std::vector<int> source_b(DATA_SIZE, 32);
auto program_binary = read_binary_file(binary_file);
cl::Program::Binaries bins{{program_binary.data(), program_binary.size()}};
cl::Device device = select_device();
cl::Context context(device, nullptr, nullptr, nullptr);
cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
cl::Program program(context, {device}, bins, nullptr);
cl::Kernel vadd_kernel = cl::Kernel(program, "vector_add");
cl::Buffer buffer_a(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, source_a.size() * sizeof(int), source_a.data());
cl::Buffer buffer_b(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, source_b.size() * sizeof(int), source_b.data());
cl::Buffer buffer_res(context, CL_MEM_READ_WRITE, source_a.size() * sizeof(int));
int narg = 0;
vadd_kernel.setArg(narg++, buffer_res);
vadd_kernel.setArg(narg++, buffer_a);
vadd_kernel.setArg(narg++, buffer_b);
vadd_kernel.setArg(narg++, DATA_SIZE);
q.enqueueTask(vadd_kernel);
std::vector<int> result(DATA_SIZE, 0);
q.enqueueReadBuffer(buffer_res, CL_TRUE, 0, result.size() * sizeof(int), result.data());
int mismatch_count = 0;
for(size_t i = 0; i < DATA_SIZE; ++i)
{
int host_result = source_a[i] + source_b[i];
if(result[i] != host_result)
{
mismatch_count++;
std::cout << "ERROR: " << result[i] << " != " << host_result << std::endl;
break;
}
}
std::cout << "RESULT: " << (mismatch_count == 0 ? "PASSED" : "FAILED") << std::endl;
return 0;
}
```
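For reference, the `enqueueTask(...)` call used above is defined by the OpenCL specification as a single-work-item launch; a minimal sketch of the equivalent `enqueueNDRangeKernel` call (reusing the `q` and `vadd_kernel` objects from the listing above):

```c++
// Equivalent to q.enqueueTask(vadd_kernel): one work-group containing a single work-item.
q.enqueueNDRangeKernel(vadd_kernel,
                       cl::NullRange,   // no global offset
                       cl::NDRange(1),  // global size: a single work-item
                       cl::NDRange(1)); // local size: one work-item per work-group
```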
The host-side code can now be compiled using the GCC toolchain as:
```console
$ g++ host.cpp -I$XILINX_XRT/include -I$XILINX_VIVADO/include -lOpenCL -o host
```
The accelerator side (simple vector-add kernel) should be saved as `vadd.cl`.
```c++
#define BUFFER_SIZE 256
#define DATA_SIZE 1024
// TRIPCOUNT identifier
__constant uint c_len = DATA_SIZE / BUFFER_SIZE;
__constant uint c_size = BUFFER_SIZE;
__attribute__((reqd_work_group_size(1, 1, 1)))
__kernel void vector_add(__global int* c,
__global const int* a,
__global const int* b,
const int n_elements)
{
int arrayA[BUFFER_SIZE];
int arrayB[BUFFER_SIZE];
__attribute__((xcl_loop_tripcount(c_len, c_len)))
for (int i = 0; i < n_elements; i += BUFFER_SIZE)
{
int size = BUFFER_SIZE;
if(i + size > n_elements)
size = n_elements - i;
__attribute__((xcl_loop_tripcount(c_size, c_size)))
__attribute__((xcl_pipeline_loop(1))) readA:
for(int j = 0; j < size; j++)
arrayA[j] = a[i + j];
__attribute__((xcl_loop_tripcount(c_size, c_size)))
__attribute__((xcl_pipeline_loop(1))) readB:
for(int j = 0; j < size; j++)
arrayB[j] = b[i + j];
__attribute__((xcl_loop_tripcount(c_size, c_size)))
__attribute__((xcl_pipeline_loop(1))) vadd_writeC:
for(int j = 0; j < size; j++)
c[i + j] = arrayA[j] + arrayB[j];
}
}
```
The accelerator-side code is built using Vitis `v++`.
This is a three-step process, which either builds an emulation binary or performs full HLS (depending on the value of the `-t` argument).
The platform (specific accelerator) also has to be specified at this step (both for emulation and full HLS).
```console
$ v++ -c -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM -k vector_add -o vadd.xo vadd.cl
$ v++ -l -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM -o vadd.link.xclbin vadd.xo
$ v++ -p vadd.link.xclbin -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM -o vadd.xclbin
```
This process should result in `vadd.xclbin`, which can be loaded by the host-side application.
### Running the Application
With both the host application and the kernel binary at hand, the application can be launched (in emulation mode) as
```console
$ XCL_EMULATION_MODE=sw_emu ./host vadd.xclbin
```
or with real hardware (having compiled kernels with `IT4I_BUILD_MODE=hw`)
```console
$ ./host vadd.xclbin
```
## Hybrid GPU and FPGA Application (HIP+XRT)
This simple 8-bit quantized dot product (`R = sum(X[i]*Y[i])`) example illustrates a basic approach to utilizing both GPU and FPGA accelerators in a single application.
The application takes the simplest approach, where both synchronization and data transfers are handled explicitly by the host.
The HIP toolchain is used to compile the single-source host/GPU code as usual, but it is also linked with the XRT runtime, which allows the host to control the FPGA accelerator.
The FPGA kernels are built separately, as in the previous examples.
The host/GPU HIP code should be saved as `main.hip`:
```c++
#include <iostream>
#include <vector>
#include "xrt/xrt_bo.h"
#include "experimental/xrt_xclbin.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"
#include "hip/hip_runtime.h"
const size_t DATA_SIZE = 1024;
float compute_reference(const float *srcX, const float *srcY, size_t count);
__global__ void quantize(int8_t *out, const float *in, size_t count)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
for(size_t i = idx; i < count; i += blockDim.x * gridDim.x)
out[i] = int8_t(in[i] * 127);
}
__global__ void dequantize(float *out, const int16_t *in, size_t count)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
for(size_t i = idx; i < count; i += blockDim.x * gridDim.x)
out[i] = float(in[i] / float(127*127));
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
std::cout << "Usage: " << argv[0] << " <XCLBIN File>" << std::endl;
return EXIT_FAILURE;
}
// Prepare experiment data
std::vector<float> srcX(DATA_SIZE);
std::vector<float> srcY(DATA_SIZE);
float outR = 0.0f;
for(size_t i = 0; i < DATA_SIZE; ++i)
{
srcX[i] = float(rand()) / float(RAND_MAX);
srcY[i] = float(rand()) / float(RAND_MAX);
outR += srcX[i] * srcY[i];
}
float outR_quant = compute_reference(srcX.data(), srcY.data(), DATA_SIZE);
std::cout << "REFERENCE: " << outR_quant << " (" << outR << ")" << std::endl;
// Initialize XRT (FPGA device), load kernels binary and create kernel object
xrt::device device(0);
std::cout << "Loading xclbin file " << argv[1] << std::endl;
xrt::uuid xclbinId = device.load_xclbin(argv[1]);
xrt::kernel mulKernel(device, xclbinId, "multiply", xrt::kernel::cu_access_mode::exclusive);
// Allocate GPU buffers
float *srcX_gpu, *srcY_gpu, *res_gpu;
int8_t *srcX_gpu_quant, *srcY_gpu_quant;
int16_t *res_gpu_quant;
hipMalloc(&srcX_gpu, DATA_SIZE * sizeof(float));
hipMalloc(&srcY_gpu, DATA_SIZE * sizeof(float));
hipMalloc(&res_gpu, DATA_SIZE * sizeof(float));
hipMalloc(&srcX_gpu_quant, DATA_SIZE * sizeof(int8_t));
hipMalloc(&srcY_gpu_quant, DATA_SIZE * sizeof(int8_t));
hipMalloc(&res_gpu_quant, DATA_SIZE * sizeof(int16_t));
// Allocate FPGA buffers
xrt::bo srcX_fpga_quant(device, DATA_SIZE * sizeof(int8_t), mulKernel.group_id(0));
xrt::bo srcY_fpga_quant(device, DATA_SIZE * sizeof(int8_t), mulKernel.group_id(1));
xrt::bo res_fpga_quant(device, DATA_SIZE * sizeof(int16_t), mulKernel.group_id(2));
// Copy experiment data from HOST to GPU
hipMemcpy(srcX_gpu, srcX.data(), DATA_SIZE * sizeof(float), hipMemcpyHostToDevice);
hipMemcpy(srcY_gpu, srcY.data(), DATA_SIZE * sizeof(float), hipMemcpyHostToDevice);
// Execute quantization kernels on both input vectors
quantize<<<16, 256>>>(srcX_gpu_quant, srcX_gpu, DATA_SIZE);
quantize<<<16, 256>>>(srcY_gpu_quant, srcY_gpu, DATA_SIZE);
// Map FPGA buffers into HOST memory, copy data from GPU to these mapped buffers and synchronize them into FPGA memory
hipMemcpy(srcX_fpga_quant.map<int8_t *>(), srcX_gpu_quant, DATA_SIZE * sizeof(int8_t), hipMemcpyDeviceToHost);
srcX_fpga_quant.sync(XCL_BO_SYNC_BO_TO_DEVICE);
hipMemcpy(srcY_fpga_quant.map<int8_t *>(), srcY_gpu_quant, DATA_SIZE * sizeof(int8_t), hipMemcpyDeviceToHost);
srcY_fpga_quant.sync(XCL_BO_SYNC_BO_TO_DEVICE);
// Execute FPGA kernel (8-bit integer multiplication)
auto kernelRun = mulKernel(res_fpga_quant, srcX_fpga_quant, srcY_fpga_quant, DATA_SIZE);
kernelRun.wait();
// Synchronize output FPGA buffer back to HOST and copy its contents to GPU buffer for dequantization
res_fpga_quant.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
hipMemcpy(res_gpu_quant, res_fpga_quant.map<int16_t *>(), DATA_SIZE * sizeof(int16_t), hipMemcpyDeviceToHost);
// Dequantize multiplication result on GPU
dequantize<<<16, 256>>>(res_gpu, res_gpu_quant, DATA_SIZE);
// Copy dequantized results from GPU to HOST
std::vector<float> res(DATA_SIZE);
hipMemcpy(res.data(), res_gpu, DATA_SIZE * sizeof(float), hipMemcpyDeviceToHost);
// Perform simple sum on CPU
float out = 0.0;
for(size_t i = 0; i < DATA_SIZE; ++i)
out += res[i];
std::cout << "RESULT: " << out << std::endl;
hipFree(srcX_gpu);
hipFree(srcY_gpu);
hipFree(res_gpu);
hipFree(srcX_gpu_quant);
hipFree(srcY_gpu_quant);
hipFree(res_gpu_quant);
return 0;
}
float compute_reference(const float *srcX, const float *srcY, size_t count)
{
float out = 0.0f;
for(size_t i = 0; i < count; ++i)
{
int16_t quantX(srcX[i] * 127);
int16_t quantY(srcY[i] * 127);
out += float(int16_t(quantX * quantY) / float(127*127));
}
return out;
}
```
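The listing above omits return-code checking for brevity; in practice, each HIP call can be wrapped in a small helper such as the hypothetical `HIP_CHECK` macro sketched below, which only relies on the standard `hipGetErrorString()` call:

```c++
#include <cstdio>
#include <cstdlib>
#include "hip/hip_runtime.h"

// Hypothetical helper: abort with a readable message when a HIP call fails.
#define HIP_CHECK(call)                                                  \
    do {                                                                 \
        hipError_t err_ = (call);                                        \
        if(err_ != hipSuccess)                                           \
        {                                                                \
            std::fprintf(stderr, "HIP error '%s' at %s:%d\n",            \
                         hipGetErrorString(err_), __FILE__, __LINE__);   \
            std::exit(EXIT_FAILURE);                                     \
        }                                                                \
    } while(0)

// Usage in the sample above would then look like:
// HIP_CHECK(hipMalloc(&srcX_gpu, DATA_SIZE * sizeof(float)));
```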
The host/GPU application can be built using HIPCC as:
```console
$ hipcc -I$XILINX_XRT/include -I$XILINX_VIVADO/include -L$XILINX_XRT/lib -lxrt_coreutil main.hip -o host
```
The accelerator side (simple vector-multiply kernel) should be saved as `kernels.cpp`.
```c++
extern "C" {
void multiply(
short *out,
const char *inX,
const char *inY,
int size)
{
#pragma HLS INTERFACE m_axi port=inX bundle=aximm1
#pragma HLS INTERFACE m_axi port=inY bundle=aximm2
#pragma HLS INTERFACE m_axi port=out bundle=aximm1
for(int i = 0; i < size; ++i)
out[i] = short(inX[i]) * short(inY[i]);
}
}
```
Once again, the HLS kernel is built using Vitis `v++` in two steps:
```console
$ v++ -c -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM -k multiply kernels.cpp -o kernels.xo
$ v++ -l -t $IT4I_BUILD_MODE --platform $IT4I_PLATFORM kernels.xo -o kernels.xclbin
```
### Running the Application
In emulation mode (the FPGA is emulated, but GPU hardware is required), the application can be launched as:
```console
$ XCL_EMULATION_MODE=sw_emu ./host kernels.xclbin
REFERENCE: 256.554 (260.714)
Loading xclbin file ./kernels.xclbin
RESULT: 256.554
```
or, having compiled the kernels with `IT4I_BUILD_MODE=hw`, using real hardware (both FPGA and GPU hardware are required):
```console
$ ./host kernels.xclbin
REFERENCE: 256.554 (260.714)
Loading xclbin file ./kernels.xclbin
RESULT: 256.554
```
## Additional Resources
- [https://xilinx.github.io/Vitis-Tutorials/][1]
- [http://xilinx.github.io/Vitis_Accel_Examples/][2]
[1]: https://xilinx.github.io/Vitis-Tutorials/
[2]: http://xilinx.github.io/Vitis_Accel_Examples/
# Complementary Systems
# Introduction
Complementary systems offer a development environment for users
that need to port and optimize their code and applications
......@@ -26,6 +26,8 @@ Second stage of complementary systems implementation comprises of these partitio
- compute partition 7 - based on IBM Power10 architecture
- compute partition 8 - modern CPU with a very high L3 cache capacity (over 750MB)
- compute partition 9 - virtual GPU accelerated workstations
- compute partition 10 - Sapphire Rapids-HBM server
- compute partition 11 - NVIDIA Grace CPU Superchip
![](../img/cs2_2.png)
......
......@@ -20,6 +20,7 @@ p05-synt up 1-00:00:00 0/1/0/1 p05-synt01
p06-arm up 1-00:00:00 0/2/0/2 p06-arm[01-02]
p07-power up 1-00:00:00 0/1/0/1 p07-power01
p08-amd up 1-00:00:00 0/1/0/1 p08-amd01
p10-intel up 1-00:00:00 0/1/0/1 p10-intel01
```
## Getting Job Information
......@@ -89,7 +90,7 @@ set | grep ^SLURM
| variable name | description | example |
| ------ | ------ | ------ |
| SLURM_JOBID | job id of the executing job| 593 |
| SLURM_JOB_ID | job id of the executing job| 593 |
| SLURM_JOB_NODELIST | nodes allocated to the job | p03-amd[01-02] |
| SLURM_JOB_NUM_NODES | number of nodes allocated to the job | 2 |
| SLURM_STEP_NODELIST | nodes allocated to the job step | p03-amd01 |
......@@ -145,6 +146,7 @@ $ scancel JOBID
| p06-arm | 2 | yes | 80 | aarch64,ib |
| p07-power | 1 | yes | 192 | ppc64le,ib |
| p08-amd | 1 | yes | 128 | x86_64,amd,milan-x,ib,ht |
| p10-intel | 1 | yes | 96 | x86_64,intel,sapphire_rapids,ht|
Use `-t`, `--time` option to specify job run time limit. Default job time limit is 2 hours, maximum job time limit is 24 hours.
......@@ -312,6 +314,14 @@ Whole node allocation:
salloc -A PROJECT-ID -p p08-amd
```
## Partition 10 - Intel Sapphire Rapids
Whole node allocation:
```console
salloc -A PROJECT-ID -p p10-intel
```
## Features
Nodes have feature tags assigned to them.
......@@ -326,6 +336,7 @@ Users can select nodes based on the feature tags using --constraint option.
| intel | manufacturer |
| icelake | processor family |
| broadwell | processor family |
| sapphire_rapids | processor family |
| milan | processor family |
| milan-x | processor family |
| ib | Infiniband |
......@@ -342,10 +353,14 @@ p00-arm01 aarch64,cortex-a72
p01-arm[01-08] aarch64,a64fx,ib
p02-intel01 x86_64,intel,icelake,ib,fpga,bitware,nvdimm,ht
p02-intel02 x86_64,intel,icelake,ib,fpga,bitware,nvdimm,noht
p03-amd01 x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p03-amd02 x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,noht
p03-amd01 x86_64,amd,milan,ib,gpu,mi100,fpga,xilinx,ht
p04-edge01 x86_64,intel,broadwell,ib,ht
p05-synt01 x86_64,amd,milan,ib,ht
p06-arm[01-02] aarch64,ib
p07-power01 ppc64le,ib
p08-amd01 x86_64,amd,milan-x,ib,ht
p10-intel01 x86_64,intel,sapphire_rapids,ht
```
```
......